Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(control) New export actions for RSS/Atom feeds and term frequency data
This commit also refactors the executor a bit, and introduces a new converter-feature called data-extractors for this class of jobs.
- Loading branch information
Showing
35 changed files
with
864 additions
and
214 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
plugins {
    id 'java'

    // Used to fetch model/data files at build time in sibling modules; kept for consistency
    id "de.undercouch.download" version "5.1.0"

    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    // Project-internal modules: configuration, process plumbing, shared models
    implementation project(':code:common:config')
    implementation project(':code:common:process')
    implementation project(':code:common:model')

    // Language/keyword machinery used by the term-frequency and anchor-text exporters
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-convert:anchor-keywords')

    // Crawl data access for reading the crawler's output
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:processes:converting-process')
    implementation project(':third-party:commons-codec')

    // Third-party: logging, DI, primitive collections, utilities, HTML parsing
    implementation libs.bundles.slf4j
    implementation libs.guice
    implementation libs.trove
    implementation libs.commons.lang3
    implementation libs.notnull
    implementation libs.jsoup

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
Contains converter-*like* extraction jobs that operate on crawled data to produce export files. | ||
|
||
## Important classes | ||
|
||
* [AtagExporter](src/main/java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data. | ||
* [FeedExporter](src/main/java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data. | ||
* [TermFrequencyExporter](src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF. |
196 changes: 196 additions & 0 deletions
196
.../features-convert/data-extractors/src/main/java/nu/marginalia/extractor/AtagExporter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
package nu.marginalia.extractor; | ||
|
||
import com.google.inject.Inject; | ||
import gnu.trove.set.hash.TLongHashSet; | ||
import lombok.SneakyThrows; | ||
import nu.marginalia.crawling.io.CrawledDomainReader; | ||
import nu.marginalia.crawling.io.SerializableCrawlDataStream; | ||
import nu.marginalia.crawling.model.CrawledDocument; | ||
import nu.marginalia.hash.MurmurHash3_128; | ||
import nu.marginalia.link_parser.LinkParser; | ||
import nu.marginalia.model.EdgeDomain; | ||
import nu.marginalia.model.EdgeUrl; | ||
import nu.marginalia.process.log.WorkLog; | ||
import nu.marginalia.storage.FileStorageService; | ||
import nu.marginalia.storage.model.FileStorage; | ||
import nu.marginalia.storage.model.FileStorageId; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.jsoup.Jsoup; | ||
|
||
import java.io.BufferedWriter; | ||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.net.URISyntaxException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardCopyOption; | ||
import java.nio.file.StandardOpenOption; | ||
import java.nio.file.attribute.PosixFilePermissions; | ||
import java.util.Objects; | ||
import java.util.zip.GZIPOutputStream; | ||
|
||
public class AtagExporter implements ExporterIf { | ||
private static final LinkParser linkParser = new LinkParser(); | ||
private static final MurmurHash3_128 hash = new MurmurHash3_128(); | ||
private final FileStorageService storageService; | ||
|
||
@Inject | ||
public AtagExporter(FileStorageService storageService) { | ||
this.storageService = storageService; | ||
} | ||
|
||
@Override | ||
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception { | ||
FileStorage destStorage = storageService.getStorage(destId); | ||
|
||
var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz", | ||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); | ||
|
||
Path inputDir = storageService.getStorage(crawlId).asPath(); | ||
|
||
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); | ||
) | ||
{ | ||
Path crawlerLogFile = inputDir.resolve("crawler.log"); | ||
|
||
var tagWriter = new ATagCsvWriter(bw); | ||
|
||
for (var item : WorkLog.iterable(crawlerLogFile)) { | ||
if (Thread.interrupted()) { | ||
throw new InterruptedException(); | ||
} | ||
|
||
Path crawlDataPath = inputDir.resolve(item.relPath()); | ||
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { | ||
exportLinks(tagWriter, stream); | ||
} | ||
catch (Exception ex) { | ||
ex.printStackTrace(); | ||
} | ||
} | ||
|
||
Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); | ||
|
||
} | ||
catch (Exception ex) { | ||
|
||
} | ||
finally { | ||
Files.deleteIfExists(tmpFile); | ||
} | ||
|
||
} | ||
|
||
|
||
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException { | ||
ATagLinkFilter linkFilter = new ATagLinkFilter(); | ||
|
||
while (stream.hasNext()) { | ||
if (!(stream.next() instanceof CrawledDocument doc)) | ||
continue; | ||
if (null == doc.documentBody) | ||
continue; | ||
|
||
var baseUrl = new EdgeUrl(doc.url); | ||
var parsed = Jsoup.parse(doc.documentBody); | ||
|
||
for (var atag : parsed.getElementsByTag("a")) { | ||
String linkText = atag.text(); | ||
|
||
if (!linkFilter.isLinkTextEligible(linkText)) { | ||
continue; | ||
} | ||
|
||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); | ||
linkOpt | ||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText)) | ||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText)); | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
private static class ATagLinkFilter { | ||
private final TLongHashSet hashes = new TLongHashSet(); | ||
|
||
private boolean isLinkTextEligible(String linkText) { | ||
// Filter out the most obviously uninteresting anchor texts | ||
|
||
if (linkText.isBlank()) | ||
return false; | ||
if (linkText.startsWith("this")) | ||
return false; | ||
if (linkText.equalsIgnoreCase("here")) | ||
return false; | ||
if (linkText.equalsIgnoreCase("click here")) | ||
return false; | ||
|
||
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing! | ||
return false; | ||
|
||
return true; | ||
} | ||
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) { | ||
if (!"http".equals(url.proto) && !"https".equals(url.proto)) | ||
return false; | ||
|
||
// This is an artifact of the link parser typically | ||
if ("example.com".equals(url.domain.topDomain)) | ||
return false; | ||
|
||
if (linkText.contains(url.domain.toString())) | ||
return false; | ||
if (Objects.equals(url.domain, baseUrl.domain)) | ||
return false; | ||
|
||
String urlString = url.toString(); | ||
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing! | ||
return false; | ||
} | ||
|
||
// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't | ||
// need to be concerned about using the fast ASCII hash | ||
if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) { | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
} | ||
|
||
|
||
private static class ATagCsvWriter { | ||
private final BufferedWriter writer; | ||
|
||
private ATagCsvWriter(BufferedWriter writer) { | ||
this.writer = writer; | ||
} | ||
|
||
@SneakyThrows | ||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) { | ||
final String urlString = urlWithNoSchema(url); | ||
|
||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n", | ||
csvify(urlString), | ||
csvify(linkText), | ||
csvify(sourceDomain))); | ||
} | ||
|
||
private static String urlWithNoSchema(EdgeUrl url) { | ||
StringBuilder sb = new StringBuilder(); | ||
|
||
sb.append(url.domain).append(url.path); | ||
|
||
if (url.param != null) | ||
sb.append('?').append(url.param); | ||
|
||
return sb.toString(); | ||
} | ||
|
||
private static String csvify(Object field) { | ||
return field.toString().replace("\"", "\"\""); | ||
} | ||
|
||
} | ||
} |
7 changes: 7 additions & 0 deletions
7
code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/ExporterIf.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package nu.marginalia.extractor; | ||
|
||
import nu.marginalia.storage.model.FileStorageId; | ||
|
||
/**
 * Common interface for export jobs that read crawled data from one file storage
 * and write a derived export artifact into another.
 */
public interface ExporterIf {
    /**
     * Runs the export.
     *
     * @param crawlId storage id holding the crawler's output to read from
     * @param destId  storage id that receives the exported file(s)
     * @throws Exception if the export fails
     */
    void export(FileStorageId crawlId, FileStorageId destId) throws Exception;
}
Oops, something went wrong.