Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(converter) Integrate zim->db conversion into automatic encyclopedia …
…processing workflow Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db file. This isn't very ergonomic, so parts of that code base were lifted in as a third-party library, and conversion from .zim to .db is now done automatically. The output file name is based on the original filename, plus a CRC32 hash and a .db extension, to ensure we can recycle the data on repeat loads.
- Loading branch information
Showing
21 changed files
with
895 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
16 changes: 11 additions & 5 deletions
16
...rvice/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// Gradle build script for the stripped-down encyclopedia converter module.
// Applies the plain 'java' plugin; no other plugins are configured here.
plugins { | ||
id 'java' | ||
} | ||
|
||
// Build with a Java 21 toolchain.
java { | ||
toolchain { | ||
languageVersion.set(JavaLanguageVersion.of(21)) | ||
} | ||
} | ||
|
||
dependencies { | ||
// External libraries, referenced through the `libs` accessor.
implementation libs.jsoup | ||
implementation libs.notnull | ||
implementation libs.bundles.gson | ||
implementation libs.zstd | ||
implementation libs.bundles.slf4j | ||
|
||
// In-repo library module.
implementation project(':code:libraries:blocking-thread-pool') | ||
|
||
// In-repo third-party modules.
implementation project(':third-party:xz') | ||
implementation project(':third-party:openzim') | ||
} | ||
|
||
// Run tests on the JUnit Platform.
test { | ||
useJUnitPlatform() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
This package contains a severely stripped-down version of the codebase from | ||
[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/). | ||
|
||
The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the | ||
encyclopedia side-loader. |
67 changes: 67 additions & 0 deletions
67
...lopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/EncyclopediaConverter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package nu.marginalia.encyclopedia; | ||
|
||
import nu.marginalia.encyclopedia.cleaner.WikiCleaner; | ||
import nu.marginalia.encyclopedia.store.ArticleDbProvider; | ||
import nu.marginalia.encyclopedia.store.ArticleStoreWriter; | ||
import nu.marginalia.util.SimpleBlockingThreadPool; | ||
import org.openzim.ZIMTypes.ZIMFile; | ||
import org.openzim.ZIMTypes.ZIMReader; | ||
import org.slf4j.LoggerFactory; | ||
import org.slf4j.Logger; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.sql.SQLException; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.atomic.AtomicInteger; | ||
import java.util.function.BiConsumer; | ||
import java.util.function.Predicate; | ||
|
||
/** Converts an OpenZim file with Wikipedia articles to a SQLite database | ||
* with cleaned-up MediaWiki HTML | ||
*/ | ||
public class EncyclopediaConverter { | ||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class); | ||
|
||
public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException { | ||
var wc = new WikiCleaner(); | ||
var pool = new SimpleBlockingThreadPool("Convert ZIM", | ||
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32), | ||
2); | ||
var size = new AtomicInteger(); | ||
|
||
if (!Files.exists(inputFile)) { | ||
throw new IllegalStateException("ZIM file not found: " + inputFile); | ||
} | ||
Files.deleteIfExists(outputFile); | ||
|
||
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) { | ||
Predicate<Integer> keepGoing = (s) -> true; | ||
|
||
BiConsumer<String, String> handleArticle = (url, html) -> { | ||
if (pool.isTerminated()) | ||
return; | ||
|
||
pool.submitQuietly(() -> { | ||
int sz = size.incrementAndGet(); | ||
if (sz % 1000 == 0) { | ||
System.out.printf("\u001b[2K\r%d", sz); | ||
} | ||
asw.add(wc.cleanWikiJunk(url, html)); | ||
}); | ||
|
||
size.incrementAndGet(); | ||
}; | ||
|
||
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing); | ||
|
||
pool.shutDown(); | ||
logger.info("Waiting for pool to finish"); | ||
|
||
while (!pool.awaitTermination(1, TimeUnit.SECONDS)) { | ||
// ... | ||
} | ||
} | ||
} | ||
} |
60 changes: 60 additions & 0 deletions
60
...lopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/CleanerFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package nu.marginalia.encyclopedia.cleaner; | ||
|
||
import lombok.Builder; | ||
import org.jsoup.nodes.Comment; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.nodes.Node; | ||
import org.jsoup.select.NodeFilter; | ||
|
||
import java.util.Set; | ||
import java.util.function.Predicate; | ||
import java.util.regex.Pattern; | ||
|
||
@Builder | ||
public class CleanerFilter implements NodeFilter { | ||
final Set<String> badTags; | ||
final Set<String> badIds; | ||
final Set<String> badClasses; | ||
|
||
final Set<Predicate<Element>> predicates; | ||
|
||
private static final Pattern spacePattern = Pattern.compile("\\s+"); | ||
|
||
@Override | ||
public FilterResult head(Node node, int depth) { | ||
if (node instanceof Element el) { | ||
if (badTags != null && badTags.contains(el.tagName())) | ||
return FilterResult.REMOVE; | ||
|
||
if (badIds != null && badIds.contains(el.id())) | ||
return FilterResult.REMOVE; | ||
|
||
if (badClasses != null) { | ||
String className = el.className(); | ||
if (className.contains(" ")) { | ||
String[] parts = spacePattern.split(className); | ||
for (var c : parts) { | ||
if (badClasses.contains(c)) | ||
return FilterResult.REMOVE; | ||
} | ||
} | ||
else if (badClasses.contains(className)) { | ||
return FilterResult.REMOVE; | ||
} | ||
} | ||
|
||
if (predicates != null) { | ||
for (var pred : predicates) { | ||
if (pred.test(el)) | ||
return FilterResult.REMOVE; | ||
} | ||
} | ||
} | ||
|
||
if (node instanceof Comment) { | ||
return FilterResult.REMOVE; | ||
} | ||
|
||
return FilterResult.CONTINUE; | ||
} | ||
} |
Oops, something went wrong.