Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero downtime upgrades, merge-based index construction #42

Merged
merged 30 commits into from
Aug 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b22f4fb
(linkdb) New Module for sqlite-backed document db
vlofgren Aug 24, 2023
b958acb
(file-storage) New File Storage type for linkdb
vlofgren Aug 24, 2023
7bb3e44
(common) Deprecate EdgeId and similar
vlofgren Aug 24, 2023
c70670b
(common) New UrlIdCodec class
vlofgren Aug 24, 2023
6a04cdf
(loader) Implement new linkdb in loader
vlofgren Aug 24, 2023
9894f37
(index) Implement new URL ID coding scheme.
vlofgren Aug 24, 2023
c909120
(search) Basic working integration of linkdb in search service
vlofgren Aug 24, 2023
1e68005
(system) Remove EdgeId<T> and similar objects
vlofgren Aug 24, 2023
56eb833
(index) Clean up result domain deduplicator
vlofgren Aug 24, 2023
b911665
(index) Clean up and optimize valuator
vlofgren Aug 24, 2023
5ed5298
(converter) Update confusing state description
vlofgren Aug 24, 2023
e741301
(search) Remove endpoint flush-search-caches
vlofgren Aug 25, 2023
460998d
(index) Move index construction to separate process.
vlofgren Aug 25, 2023
70a5df9
(control) Display progress of process tasks
vlofgren Aug 25, 2023
28188a6
(control) Simplify ConvertAndLoadActor
vlofgren Aug 25, 2023
e710e05
(db) Remove EC_URL and EC_PAGE_DATA from mariadb database
vlofgren Aug 25, 2023
194a605
(index,control) Recoverable index backups
vlofgren Aug 25, 2023
4e694fd
(minor) Comment build.gradle
vlofgren Aug 25, 2023
3101b74
(index) Move to a lexicon-free index design
vlofgren Aug 28, 2023
00c4686
(reverse-index) Fix over-allocation of the count array in merging
vlofgren Aug 28, 2023
ffa0366
(minor) Fix typo in ActorStateMachine's logging
vlofgren Aug 28, 2023
b6a9250
(index) Hook in missing DocIdRewriter
vlofgren Aug 28, 2023
6525b16
(minor) Improved logging and error messages
vlofgren Aug 28, 2023
ba4513e
(loader) Revert accidental experimental changes that slipped by in an…
vlofgren Aug 28, 2023
a2e6616
(index-reverse) Add documentation and clean up code.
vlofgren Aug 29, 2023
c57a2d0
(control-service) Remove old index journal files when restoring a bac…
vlofgren Aug 29, 2023
39c1857
(heartbeat, reverse-index) Better heartbeat mocking, improved heartbe…
vlofgren Aug 29, 2023
fa87c7e
(process) Automatic flightrecorder runs for processes when run in doc…
vlofgren Aug 29, 2023
dd593c2
(loader) Minor optimizations and bugfixes.
vlofgren Aug 29, 2023
3f288e2
(minor) Clean up dead endpoints
vlofgren Aug 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 12 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ tasks.register('dist', Copy) {
from subprojects.collect { it.tasks.withType(Tar) }
into "$buildDir/dist"

// For local development, each process that is to be triggerable
// from the control-service needs to go here so that it ends up
// somewhere the control-service can find it

doLast {
copy {
from tarTree("$buildDir/dist/converter-process.tar")
Expand All @@ -34,10 +38,18 @@ tasks.register('dist', Copy) {
from tarTree("$buildDir/dist/crawl-job-extractor-process.tar")
into "$projectDir/run/dist/"
}
copy {
from tarTree("$buildDir/dist/index-construction-process.tar")
into "$projectDir/run/dist/"
}
}
}
idea {
module {
// Exclude these directories from being indexed by IntelliJ
// as they tend to bring the IDE to its knees and use up all
// Inotify spots in a hurry
excludeDirs.add(file("$projectDir/run/backup"))
excludeDirs.add(file("$projectDir/run/model"))
excludeDirs.add(file("$projectDir/run/dist"))
excludeDirs.add(file("$projectDir/run/samples"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
/** Names of the message-queue endpoints exposed by the index service. */
public class IndexMqEndpoints {
public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED";
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";

// NOTE(review): this diff shows INDEX_RELOAD_LEXICON being removed along with the lexicon-based design
public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON";
public static final String INDEX_REINDEX = "INDEX-REINDEX";
public static final String SWITCH_INDEX = "SWITCH-INDEX";

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@

import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class SearchResultItem {
/** Encoded ID that contains both the URL id and its ranking */
public class SearchResultItem implements Comparable<SearchResultItem> {
/** Encoded ID that contains both the URL id and its ranking. This is
* probably not what you want, use getDocumentId() instead */
public final long combinedId;

/** How did the subqueries match against the document ? */
Expand All @@ -20,20 +21,18 @@ public class SearchResultItem {
/** How many other potential results existed in the same domain */
public int resultsFromDomain;

public SearchResultItem(long val) {
this.combinedId = val;
public SearchResultItem(long combinedId) {
this.combinedId = combinedId;
this.keywordScores = new ArrayList<>(16);
}

public EdgeId<EdgeUrl> getUrlId() {
return new EdgeId<>(getUrlIdInt());
}

public int getUrlIdInt() {
return (int)(combinedId & 0xFFFF_FFFFL);
public long getDocumentId() {
return UrlIdCodec.removeRank(combinedId);
}

public int getRanking() {
return (int)(combinedId >>> 32);
return UrlIdCodec.getRank(combinedId);
}

/* Used for evaluation */
Expand All @@ -45,20 +44,16 @@ public SearchResultPreliminaryScore getScore() {
return scoreValue;
}

private transient int domainId = Integer.MIN_VALUE;
public void setDomainId(int domainId) {
this.domainId = domainId;
}
public int getDomainId() {
return this.domainId;
return UrlIdCodec.getDomainId(this.combinedId);
}

public int hashCode() {
return getUrlIdInt();
return Long.hashCode(combinedId);
}

public String toString() {
return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "]";
return getClass().getSimpleName() + "[ url= " + getDocumentId() + ", rank=" + getRanking() + "]";
}

public boolean equals(Object other) {
Expand All @@ -67,18 +62,18 @@ public boolean equals(Object other) {
if (other == this)
return true;
if (other instanceof SearchResultItem o) {
return o.getUrlIdInt() == getUrlIdInt();
return o.getDocumentId() == getDocumentId();
}
return false;
}

public long deduplicationKey() {
final int domainId = getDomainId();

if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) {
return 0;
}
@Override
public int compareTo(@NotNull SearchResultItem o) {
// this looks like a bug, but we actually want this in a reversed order
int diff = o.getScore().compareTo(getScore());
if (diff != 0)
return diff;

return domainId;
return Long.compare(this.combinedId, o.combinedId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import static java.lang.Double.compare;

public record SearchResultPreliminaryScore(
boolean disqualified,
boolean hasPriorityTerm,
double searchRankingScore)
implements Comparable<SearchResultPreliminaryScore>
Expand All @@ -25,7 +24,4 @@ public int compareTo(@NotNull SearchResultPreliminaryScore other) {
return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
}

public boolean isDisqualified() {
return disqualified;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ public class ProcessInboxNames {
public static final String CONVERTER_INBOX = "converter";
public static final String LOADER_INBOX = "loader";
public static final String CRAWLER_INBOX = "crawler";

public static final String INDEX_CONSTRUCTOR_INBOX = "index_constructor";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package nu.marginalia.mqapi.index;

/** Message-queue request asking for construction of the index identified by
 * {@code indexName}.  Presumably consumed via the index_constructor inbox —
 * confirm against the process wiring. */
public record CreateIndexRequest(IndexName indexName)
{
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package nu.marginalia.mqapi.index;

/** Identifies which of the indexes a {@code CreateIndexRequest} refers to. */
public enum IndexName {
// forward index
FORWARD,
// full reverse index
REVERSE_FULL,
// priority reverse index
REVERSE_PRIO
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
/** Names of the message-queue endpoints exposed by the search service. */
public class SearchMqEndpoints {
/** Flushes the URL caches, run if significant changes have occurred in the URLs database */
public static final String FLUSH_CACHES = "FLUSH_CACHES";
/** Switches the linkdb instance in use (presumably staging to live — confirm with the sender of this message) */
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}
30 changes: 15 additions & 15 deletions code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;

import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.OptionalInt;

@Singleton
public class DbDomainQueries {
private final HikariDataSource dataSource;

private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();

@Inject
public DbDomainQueries(HikariDataSource dataSource)
Expand All @@ -28,15 +28,15 @@ public DbDomainQueries(HikariDataSource dataSource)


@SneakyThrows
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
public Integer getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {

return domainIdCache.get(domain, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
return rsp.getInt(1);
}
}
throw new NoSuchElementException();
Expand All @@ -48,38 +48,38 @@ public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
}

@SneakyThrows
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
public OptionalInt tryGetDomainId(EdgeDomain domain) {

var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));

if (maybe.isPresent())
return maybe;
Integer maybeId = domainIdCache.getIfPresent(domain);
if (maybeId != null) {
return OptionalInt.of(maybeId);
}

try (var connection = dataSource.getConnection()) {

try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
var id = rsp.getInt(1);

domainIdCache.put(domain, id);
return Optional.of(id);
return OptionalInt.of(id);
}
}
return Optional.empty();
return OptionalInt.empty();
}
catch (UncheckedExecutionException ex) {
return Optional.empty();
return OptionalInt.empty();
}
}

@SneakyThrows
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
public Optional<EdgeDomain> getDomain(int id) {
try (var connection = dataSource.getConnection()) {

try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.id());
stmt.setInt(1, id);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeDomain(rsp.getString(1)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,10 @@

import com.google.inject.ImplementedBy;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;

@ImplementedBy(DomainBlacklistImpl.class)
public interface DomainBlacklist {
boolean isBlacklisted(int domainId);
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
return isBlacklisted(domainId.id());
}
default TIntHashSet getSpamDomains() {
return new TIntHashSet();
}
Expand Down
13 changes: 7 additions & 6 deletions code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
Expand Down Expand Up @@ -58,10 +59,10 @@ public List<String> getAllDomainsByType(Type type) {
return ret;
}

/** Retrieve the EdgeId of all domains of a certain type,
/** Retrieve the domain id of all domains of a certain type,
* ignoring entries that are not in the EC_DOMAIN table */
public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
public TIntList getKnownDomainsByType(Type type) {
TIntList ret = new TIntArrayList();

try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ public enum FileStorageType {
CRAWL_DATA,
PROCESSED_DATA,
INDEX_STAGING,
LEXICON_STAGING,
LINKDB_STAGING,
LINKDB_LIVE,
INDEX_LIVE,
LEXICON_LIVE,
BACKUP,
EXPORT,
SEARCH_SETS
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Extend the FILE_STORAGE type enum with the new LINKDB_LIVE and LINKDB_STAGING values.
ALTER TABLE FILE_STORAGE MODIFY COLUMN TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT', 'LINKDB_LIVE', 'LINKDB_STAGING') NOT NULL;

-- Register the live linkdb storage area under the existing Index Storage base.
-- INSERT IGNORE makes the migration safe to re-run.
INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbr', "Linkdb Current", 'LINKDB_LIVE'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';

-- Register the staging linkdb storage area under the same base.
INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbw', "Linkdb Staging Area", 'LINKDB_STAGING'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';
Original file line number Diff line number Diff line change
-- Remove the URL tables from mariadb; per the commit log these are
-- superseded by the sqlite-backed linkdb introduced in this change set.
-- The view must be dropped before the tables it reads from.
DROP VIEW EC_URL_VIEW;
DROP TABLE EC_PAGE_DATA;
DROP TABLE EC_URL;
Original file line number Diff line number Diff line change
-- Register a storage base for index backups; INSERT IGNORE makes the
-- migration idempotent if the base already exists.
INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP)
VALUES
('Backup Storage', '/backup', 'BACKUP', true);
Original file line number Diff line number Diff line change
-- Clean up file storage records for the lexicon storage types, which are
-- retired by the move to a lexicon-free index design in this change set.
DELETE FROM FILE_STORAGE WHERE TYPE IN ('LEXICON_STAGING', 'LEXICON_LIVE');