Skip to content

Commit

Permalink
Merge pull request #42 from MarginaliaSearch/no-downtime-upgrades
Browse files Browse the repository at this point in the history
Zero downtime upgrades, merge-based index construction
  • Loading branch information
vlofgren committed Aug 29, 2023
2 parents 229c63c + 3f288e2 commit bdcbfb1
Show file tree
Hide file tree
Showing 243 changed files with 5,586 additions and 5,198 deletions.
12 changes: 12 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ tasks.register('dist', Copy) {
from subprojects.collect { it.tasks.withType(Tar) }
into "$buildDir/dist"

// For local development, every process that should be triggerable
// from the control-service needs to be listed here so that it ends
// up somewhere the control-service can find it

doLast {
copy {
from tarTree("$buildDir/dist/converter-process.tar")
Expand All @@ -34,10 +38,18 @@ tasks.register('dist', Copy) {
from tarTree("$buildDir/dist/crawl-job-extractor-process.tar")
into "$projectDir/run/dist/"
}
copy {
from tarTree("$buildDir/dist/index-construction-process.tar")
into "$projectDir/run/dist/"
}
}
}
idea {
module {
// Exclude these directories from being indexed by IntelliJ
// as they tend to bring the IDE to its knees and use up all
// Inotify spots in a hurry
excludeDirs.add(file("$projectDir/run/backup"))
excludeDirs.add(file("$projectDir/run/model"))
excludeDirs.add(file("$projectDir/run/dist"))
excludeDirs.add(file("$projectDir/run/samples"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
public class IndexMqEndpoints {
public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED";
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";

public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON";
public static final String INDEX_REINDEX = "INDEX-REINDEX";
public static final String SWITCH_INDEX = "SWITCH-INDEX";

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@

import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class SearchResultItem {
/** Encoded ID that contains both the URL id and its ranking */
public class SearchResultItem implements Comparable<SearchResultItem> {
/** Encoded ID that contains both the URL id and its ranking. This is
* probably not what you want, use getDocumentId() instead */
public final long combinedId;

/** How did the subqueries match against the document ? */
Expand All @@ -20,20 +21,18 @@ public class SearchResultItem {
/** How many other potential results existed in the same domain */
public int resultsFromDomain;

public SearchResultItem(long val) {
this.combinedId = val;
public SearchResultItem(long combinedId) {
this.combinedId = combinedId;
this.keywordScores = new ArrayList<>(16);
}

public EdgeId<EdgeUrl> getUrlId() {
return new EdgeId<>(getUrlIdInt());
}

public int getUrlIdInt() {
return (int)(combinedId & 0xFFFF_FFFFL);
public long getDocumentId() {
return UrlIdCodec.removeRank(combinedId);
}

public int getRanking() {
return (int)(combinedId >>> 32);
return UrlIdCodec.getRank(combinedId);
}

/* Used for evaluation */
Expand All @@ -45,20 +44,16 @@ public SearchResultPreliminaryScore getScore() {
return scoreValue;
}

private transient int domainId = Integer.MIN_VALUE;
public void setDomainId(int domainId) {
this.domainId = domainId;
}
public int getDomainId() {
return this.domainId;
return UrlIdCodec.getDomainId(this.combinedId);
}

/**
 * Hashes on the document id (the combined id with its rank removed).
 *
 * equals() compares getDocumentId(), so the hash must be derived from the
 * same value; hashing the raw combinedId would let two items that are
 * equal() land in different hash buckets whenever their rank bits differ.
 */
@Override
public int hashCode() {
    return Long.hashCode(getDocumentId());
}

public String toString() {
return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "]";
return getClass().getSimpleName() + "[ url= " + getDocumentId() + ", rank=" + getRanking() + "]";
}

public boolean equals(Object other) {
Expand All @@ -67,18 +62,18 @@ public boolean equals(Object other) {
if (other == this)
return true;
if (other instanceof SearchResultItem o) {
return o.getUrlIdInt() == getUrlIdInt();
return o.getDocumentId() == getDocumentId();
}
return false;
}

public long deduplicationKey() {
final int domainId = getDomainId();

if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) {
return 0;
}
@Override
public int compareTo(@NotNull SearchResultItem o) {
// this looks like a bug, but we actually want this in a reversed order
int diff = o.getScore().compareTo(getScore());
if (diff != 0)
return diff;

return domainId;
return Long.compare(this.combinedId, o.combinedId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import static java.lang.Double.compare;

public record SearchResultPreliminaryScore(
boolean disqualified,
boolean hasPriorityTerm,
double searchRankingScore)
implements Comparable<SearchResultPreliminaryScore>
Expand All @@ -25,7 +24,4 @@ public int compareTo(@NotNull SearchResultPreliminaryScore other) {
return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
}

public boolean isDisqualified() {
return disqualified;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ public class ProcessInboxNames {
public static final String CONVERTER_INBOX = "converter";
public static final String LOADER_INBOX = "loader";
public static final String CRAWLER_INBOX = "crawler";

public static final String INDEX_CONSTRUCTOR_INBOX = "index_constructor";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package nu.marginalia.mqapi.index;

/**
 * Request that names a single index via {@code indexName}.
 *
 * NOTE(review): judging by the package (nu.marginalia.mqapi.index) this is
 * presumably a message-queue payload asking for that index to be built;
 * confirm against the sender and the receiving process.
 *
 * @param indexName which index the request refers to
 */
public record CreateIndexRequest(IndexName indexName) {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package nu.marginalia.mqapi.index;

/**
 * Identifies which index a {@code CreateIndexRequest} refers to.
 *
 * The declaration order is kept stable in case ordinals or serialized
 * positions are relied upon elsewhere.
 */
public enum IndexName {
    /** NOTE(review): presumably the forward index; confirm against the consumer. */
    FORWARD,

    /** NOTE(review): presumably the full reverse index; confirm against the consumer. */
    REVERSE_FULL,

    /** NOTE(review): presumably the priority reverse index; confirm against the consumer. */
    REVERSE_PRIO;
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
public class SearchMqEndpoints {
/** Flushes the URL caches, run if significant changes have occurred in the URLs database */
public static final String FLUSH_CACHES = "FLUSH_CACHES";
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}
30 changes: 15 additions & 15 deletions code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;

import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.OptionalInt;

@Singleton
public class DbDomainQueries {
private final HikariDataSource dataSource;

private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();

@Inject
public DbDomainQueries(HikariDataSource dataSource)
Expand All @@ -28,15 +28,15 @@ public DbDomainQueries(HikariDataSource dataSource)


@SneakyThrows
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
public Integer getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {

return domainIdCache.get(domain, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
return rsp.getInt(1);
}
}
throw new NoSuchElementException();
Expand All @@ -48,38 +48,38 @@ public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
}

@SneakyThrows
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
public OptionalInt tryGetDomainId(EdgeDomain domain) {

var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));

if (maybe.isPresent())
return maybe;
Integer maybeId = domainIdCache.getIfPresent(domain);
if (maybeId != null) {
return OptionalInt.of(maybeId);
}

try (var connection = dataSource.getConnection()) {

try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
var id = rsp.getInt(1);

domainIdCache.put(domain, id);
return Optional.of(id);
return OptionalInt.of(id);
}
}
return Optional.empty();
return OptionalInt.empty();
}
catch (UncheckedExecutionException ex) {
return Optional.empty();
return OptionalInt.empty();
}
}

@SneakyThrows
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
public Optional<EdgeDomain> getDomain(int id) {
try (var connection = dataSource.getConnection()) {

try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.id());
stmt.setInt(1, id);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeDomain(rsp.getString(1)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,10 @@

import com.google.inject.ImplementedBy;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;

@ImplementedBy(DomainBlacklistImpl.class)
public interface DomainBlacklist {
boolean isBlacklisted(int domainId);
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
return isBlacklisted(domainId.id());
}
default TIntHashSet getSpamDomains() {
return new TIntHashSet();
}
Expand Down
13 changes: 7 additions & 6 deletions code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
Expand Down Expand Up @@ -58,10 +59,10 @@ public List<String> getAllDomainsByType(Type type) {
return ret;
}

/** Retrieve the EdgeId of all domains of a certain type,
/** Retrieve the domain id of all domains of a certain type,
* ignoring entries that are not in the EC_DOMAIN table */
public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
public TIntList getKnownDomainsByType(Type type) {
TIntList ret = new TIntArrayList();

try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ public enum FileStorageType {
CRAWL_DATA,
PROCESSED_DATA,
INDEX_STAGING,
LEXICON_STAGING,
LINKDB_STAGING,
LINKDB_LIVE,
INDEX_LIVE,
LEXICON_LIVE,
BACKUP,
EXPORT,
SEARCH_SETS
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Redefine FILE_STORAGE.TYPE so that the enum includes the 'LINKDB_LIVE'
-- and 'LINKDB_STAGING' values used by the inserts below.
ALTER TABLE FILE_STORAGE MODIFY COLUMN TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT', 'LINKDB_LIVE', 'LINKDB_STAGING') NOT NULL;

-- Register the live linkdb storage area ('ldbr') under the storage base
-- named 'Index Storage'. If no such base exists the SELECT yields no rows
-- and nothing is inserted; INSERT IGNORE downgrades ignorable errors such
-- as duplicate-key conflicts to warnings.
-- String literals use single quotes: double-quoted text is parsed as an
-- identifier when the ANSI_QUOTES SQL mode is enabled.
INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbr', 'Linkdb Current', 'LINKDB_LIVE'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';

-- Register the linkdb staging area ('ldbw') under the same storage base.
INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbw', 'Linkdb Staging Area', 'LINKDB_STAGING'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Remove the EC_URL_VIEW view and the EC_PAGE_DATA and EC_URL tables.
-- The view is dropped before the tables.
-- NOTE(review): these drops are irreversible; presumably the URL and page
-- data now lives in the linkdb storage introduced by the same change set.
-- Confirm before applying to a database that still holds data.
DROP VIEW EC_URL_VIEW;
DROP TABLE EC_PAGE_DATA;
DROP TABLE EC_URL;
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Register a storage base named 'Backup Storage' at path '/backup' with
-- type 'BACKUP' and PERMIT_TEMP enabled. INSERT IGNORE downgrades
-- ignorable errors such as duplicate-key conflicts to warnings.
INSERT IGNORE INTO FILE_STORAGE_BASE (NAME, PATH, TYPE, PERMIT_TEMP)
VALUES ('Backup Storage', '/backup', 'BACKUP', true);
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Remove every FILE_STORAGE row whose TYPE is 'LEXICON_STAGING' or 'LEXICON_LIVE'.
DELETE FROM FILE_STORAGE WHERE TYPE IN ('LEXICON_STAGING', 'LEXICON_LIVE');

0 comments on commit bdcbfb1

Please sign in to comment.