(converter) Integrate zim->db conversion into automatic encyclopedia processing workflow

Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db file. This isn't very ergonomic, so parts of that codebase were lifted in as a third-party library, and conversion from .zim to .db is now done automatically.

The output file name is based on the original filename plus a crc32 hash and a .db extension, so the converted data can be reused on repeat loads.
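For illustration, here is a minimal sketch of that naming scheme (a hypothetical helper, not the committed code; the real logic lives in ConvertActor's getCrc32FileHash and the PredigestEncyclopedia step shown below):

    // Sketch: derive <original name>.<crc32 hex>.db from the input file.
    // Hypothetical helper for illustration only.
    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.zip.CRC32;

    class OutputNameSketch {
        static String dbNameFor(Path zimFile) throws IOException {
            CRC32 crc = new CRC32();
            ByteBuffer buffer = ByteBuffer.allocate(8192);
            try (var channel = Files.newByteChannel(zimFile)) {
                while (channel.read(buffer) > 0) {
                    buffer.flip();      // drain the bytes just read
                    crc.update(buffer); // feed them into the checksum
                    buffer.clear();     // reuse the buffer for the next read
                }
            }
            // e.g. wikipedia_en_100.zim -> wikipedia_en_100.zim.1a2b3c4d.db
            return zimFile + "." + Long.toHexString(crc.getValue()) + ".db";
        }
    }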
vlofgren committed Jan 19, 2024
1 parent 22c8fb3 commit 27ffb8f
Showing 21 changed files with 895 additions and 19 deletions.
@@ -2,6 +2,8 @@

import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.worklog.BatchingWorkLogInspector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
@@ -10,13 +12,21 @@

public class LoaderInputData {
private final List<Path> sourceDirectories;
private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class);
private final Map<Path, Integer> lastGoodBatch = new HashMap<>();

public LoaderInputData(List<Path> sourceDirectories) throws IOException {
this.sourceDirectories = sourceDirectories;

for (var source : sourceDirectories) {
lastGoodBatch.put(source, BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log")));
int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"));

this.lastGoodBatch.put(source, lastGoodBatch);

if (lastGoodBatch == 0) {
// This is useful diagnostic information, so we log it as a warning
logger.warn("No valid batches found in {}", source);
}
}

}
@@ -1,11 +1,17 @@
<h1 class="my-3">Sideload Encyclopedia</h1>

<div class="my-3 p-3 border bg-light">
<p>This will sideload a pre-converted MediaWiki-style OpenZim data set.
See the <a href="https://github.com/MarginaliaSearch/MarginaliaSearch/blob/master/doc/sideloading-howto.md">sideloading howto</a>
for instructions how to produce this file. </p>
<p>Place an articles.db file in the upload directory on the server, and select it from the list
below. </p>
<p>This will side-load a MediaWiki-style OpenZim data set. Place a zim file in the uploads directory.
For Wikipedia, the zim file can be downloaded from <a href="https://download.kiwix.org/zim/wikipedia/">https://download.kiwix.org/zim/wikipedia/</a>.
The en_all_nopic sets are recommended for Wikipedia, since they are smaller and do not contain images
(which are not used anyway). For testing, the _mini or _en_100 sets are good choices.
<p></p>
The zim file will be converted to a sqlite database (.db file) with a name derived from
the zim file's, which is then automatically turned into processed data.
<p></p>
Since the first stage of processing is very time-consuming, the sqlite database can
also be loaded directly from this form.
</p>
</div>
<form method="post" action="actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
<div class="my-3 py-3">
1 change: 1 addition & 0 deletions code/services-core/executor-service/build.gradle
@@ -45,6 +45,7 @@ dependencies {
implementation project(':code:api:query-api')
implementation project(':code:api:process-mqapi')
implementation project(':code:api:executor-api')
implementation project(':third-party:encyclopedia-marginalia-nu')

implementation libs.bundles.slf4j

@@ -7,6 +7,7 @@
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.encyclopedia.EncyclopediaConverter;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.storage.FileStorageService;
@@ -16,21 +17,27 @@
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.mqapi.converting.ConvertAction;
import nu.marginalia.mqapi.converting.ConvertRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;

@Singleton
public class ConvertActor extends RecordActorPrototype {

private static final Logger logger = LoggerFactory.getLogger(ConvertActor.class);
private final ActorProcessWatcher processWatcher;
private final MqOutbox mqConverterOutbox;
private final FileStorageService storageService;
private final Gson gson;

public record Convert(FileStorageId fid) implements ActorStep {};
public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
public record PredigestEncyclopedia(String source, String dest, String baseUrl) implements ActorStep {};
public record ConvertDirtree(String source) implements ActorStep {};
public record ConvertWarc(String source) implements ActorStep {};
public record ConvertStackexchange(String source) implements ActorStep {};
@@ -100,6 +107,19 @@ case ConvertEncyclopedia(String source, String baseUrl) -> {
if (!Files.exists(sourcePath))
yield new Error("Source path does not exist: " + sourcePath);

if (source.toLowerCase().endsWith(".zim")) {
// If we're fed a ZIM file, we need to convert it to a sqlite database first
String hash = getCrc32FileHash(sourcePath);

// To avoid re-converting the same file, we'll assign the file a name based on its hash
// and the original filename. This way, if we're fed the same file again, we'll be able to just
// re-use the predigested database file.
yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
} else if (!source.endsWith(".db")) {
yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
}


String fileName = sourcePath.toFile().getName();

var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
@@ -114,6 +134,36 @@ yield new ConvertWait(
mqConverterOutbox.sendAsync(ConvertRequest.forEncyclopedia(sourcePath, baseUrl, processedArea.id()))
);
}
case PredigestEncyclopedia(String source, String dest, String baseUrl) -> {
Path sourcePath = Path.of(source);

if (!Files.exists(sourcePath)) {
yield new Error("Source path does not exist: " + sourcePath);
}

Path destPath = Path.of(dest);
if (Files.exists(destPath)) {
// Already predigested, go straight to convert step
yield new ConvertEncyclopedia(dest, baseUrl);
}

Path tempFile = Files.createTempFile(destPath.getParent(), "encyclopedia", "db.tmp");

try {
EncyclopediaConverter.convert(sourcePath, tempFile);
Files.move(tempFile, destPath);
}
catch (Exception e) {
logger.error("Failed to convert ZIM file to sqlite database", e);
Files.deleteIfExists(tempFile);
Files.deleteIfExists(destPath);

yield new Error("Failed to convert ZIM file to sqlite database: " + e.getMessage());
}

// Go back to convert step with the new database file
yield new ConvertEncyclopedia(dest, baseUrl);
}
case ConvertStackexchange(String source) -> {

Path sourcePath = Path.of(source);
@@ -150,6 +200,22 @@ case ConvertWait(FileStorageId destFid, long msgId) -> {
};
}

private String getCrc32FileHash(Path file) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(8192);

try (var channel = Files.newByteChannel(file)) {
CRC32 crc = new CRC32();

while (channel.read(buffer) > 0) {
buffer.flip();
crc.update(buffer);
buffer.clear();
}

return Long.toHexString(crc.getValue());
}
}

@Override
public String describe() {
return "Convert a set of crawl data into a format suitable for loading into the database.";
@@ -165,6 +231,5 @@ public ConvertActor(ActorProcessWatcher processWatcher,
this.processWatcher = processWatcher;
this.mqConverterOutbox = processOutboxes.getConverterOutbox();
this.storageService = storageService;
this.gson = gson;
}
}
1 change: 1 addition & 0 deletions settings.gradle
@@ -99,6 +99,7 @@ include 'third-party:monkey-patch-opennlp'
include 'third-party:monkey-patch-gson'
include 'third-party:commons-codec'
include 'third-party:parquet-floor'
include 'third-party:encyclopedia-marginalia-nu'


dependencyResolutionManagement {
4 changes: 2 additions & 2 deletions third-party/README.md
@@ -9,9 +9,9 @@ or lack an artifact, or to override some default that is inappropriate for the t
* [RDRPosTagger](rdrpostagger/) - GPL3
* [PorterStemmer](porterstemmer/) - LGPL3
* [Uppend](uppend/) - MIT
* [OpenZIM](openzim/) - GPL-2.0
* [OpenZIM](openzim/) - GPL-2.0+
* [Commons Codec](commons-codec/) - Apache 2.0
* [encyclopedia.marginalia.nu](encyclopedia-marginalia-nu/) - GPL-2.0+

### Repackaged
* [SymSpell](symspell/) - LGPL-3.0
* [Count-Min-Sketch](count-min-sketch/) - Apache 2.0
26 changes: 26 additions & 0 deletions third-party/encyclopedia-marginalia-nu/build.gradle
@@ -0,0 +1,26 @@
plugins {
id 'java'
}

java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}

dependencies {
implementation libs.jsoup
implementation libs.notnull
implementation libs.bundles.gson
implementation libs.zstd
implementation libs.bundles.slf4j

implementation project(':code:libraries:blocking-thread-pool')

implementation project(':third-party:xz')
implementation project(':third-party:openzim')
}

test {
useJUnitPlatform()
}
5 changes: 5 additions & 0 deletions third-party/encyclopedia-marginalia-nu/readme.md
@@ -0,0 +1,5 @@
This package contains a severely stripped-down version of the codebase from
[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/).

The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the
encyclopedia side-loader.
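A minimal usage sketch (the file paths here are hypothetical examples; the convert signature matches EncyclopediaConverter, shown below):

    // Usage sketch; the file paths are hypothetical examples.
    import nu.marginalia.encyclopedia.EncyclopediaConverter;
    import java.nio.file.Path;

    class ConvertZimExample {
        public static void main(String[] args) throws Exception {
            Path zim = Path.of("uploads/wikipedia_en_100.zim"); // hypothetical input
            Path db = Path.of("uploads/wikipedia_en_100.db");   // hypothetical output
            EncyclopediaConverter.convert(zim, db); // blocks until all articles are written
        }
    }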
@@ -0,0 +1,67 @@
package nu.marginalia.encyclopedia;

import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
import java.util.function.Predicate;

/** Converts an OpenZim file with Wikipedia articles to a SQLite database
* with cleaned-up MediaWiki HTML
*/
public class EncyclopediaConverter {
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class);

public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
var wc = new WikiCleaner();
var pool = new SimpleBlockingThreadPool("Convert ZIM",
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
2);
var size = new AtomicInteger();

if (!Files.exists(inputFile)) {
throw new IllegalStateException("ZIM file not found: " + inputFile);
}
Files.deleteIfExists(outputFile);

try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
Predicate<Integer> keepGoing = (s) -> true;

BiConsumer<String, String> handleArticle = (url, html) -> {
if (pool.isTerminated())
return;

pool.submitQuietly(() -> {
int sz = size.incrementAndGet();
if (sz % 1000 == 0) {
System.out.printf("\u001b[2K\r%d", sz);
}
asw.add(wc.cleanWikiJunk(url, html));
});

};

new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);

pool.shutDown();
logger.info("Waiting for pool to finish");

while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
// ...
}
}
}
}
@@ -0,0 +1,60 @@
package nu.marginalia.encyclopedia.cleaner;

import lombok.Builder;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;

import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;

@Builder
public class CleanerFilter implements NodeFilter {
final Set<String> badTags;
final Set<String> badIds;
final Set<String> badClasses;

final Set<Predicate<Element>> predicates;

private static final Pattern spacePattern = Pattern.compile("\\s+");

@Override
public FilterResult head(Node node, int depth) {
if (node instanceof Element el) {
if (badTags != null && badTags.contains(el.tagName()))
return FilterResult.REMOVE;

if (badIds != null && badIds.contains(el.id()))
return FilterResult.REMOVE;

if (badClasses != null) {
String className = el.className();
if (className.contains(" ")) {
String[] parts = spacePattern.split(className);
for (var c : parts) {
if (badClasses.contains(c))
return FilterResult.REMOVE;
}
}
else if (badClasses.contains(className)) {
return FilterResult.REMOVE;
}
}

if (predicates != null) {
for (var pred : predicates) {
if (pred.test(el))
return FilterResult.REMOVE;
}
}
}

if (node instanceof Comment) {
return FilterResult.REMOVE;
}

return FilterResult.CONTINUE;
}
}
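A usage sketch for the filter (the tag, id, and class names here are hypothetical examples; builder() is generated by Lombok's @Builder, and the example assumes it lives alongside CleanerFilter in nu.marginalia.encyclopedia.cleaner):

    // Usage sketch; the selectors are hypothetical examples.
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import java.util.Set;

    class CleanerFilterExample {
        public static void main(String[] args) {
            CleanerFilter filter = CleanerFilter.builder()
                    .badTags(Set.of("script", "style"))
                    .badIds(Set.of("mw-navigation"))
                    .badClasses(Set.of("infobox", "navbox"))
                    .build();
            Document doc = Jsoup.parse("<p>article html...</p>");
            doc.filter(filter); // strips matching elements and all comments in place
        }
    }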
