diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java
index 51fa06c6e..d5fee1c46 100644
--- a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java
+++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java
@@ -100,6 +100,13 @@ public void createCrawlSpecFromDownload(Context context, int node, String descri
     public void exportAtags(Context ctx, int node, String fid) {
         post(ctx, node, "/export/atags?fid="+fid, "").blockingSubscribe();
     }
+    public void exportRssFeeds(Context ctx, int node, String fid) {
+        post(ctx, node, "/export/feeds?fid="+fid, "").blockingSubscribe();
+    }
+    public void exportTermFrequencies(Context ctx, int node, String fid) {
+        post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe();
+    }
+
     public void exportData(Context ctx, int node) {
         post(ctx, node, "/export/data", "").blockingSubscribe();
     }
diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle
new file mode 100644
index 000000000..217d50443
--- /dev/null
+++ b/code/features-convert/data-extractors/build.gradle
@@ -0,0 +1,40 @@
+plugins {
+    id 'java'
+
+
+    id "de.undercouch.download" version "5.1.0"
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation project(':code:common:process')
+    implementation project(':code:common:model')
+    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-convert:anchor-keywords')
+    implementation project(':code:process-models:crawling-model')
+    implementation project(':code:processes:converting-process')
+    implementation project(':third-party:commons-codec')
+
+
+    implementation libs.bundles.slf4j
+    implementation libs.guice
+    implementation libs.trove
+    implementation libs.commons.lang3
+    implementation libs.notnull
+    implementation libs.jsoup
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
diff --git a/code/features-convert/data-extractors/readme.md b/code/features-convert/data-extractors/readme.md
new file mode 100644
index 000000000..d8c9fc0d3
--- /dev/null
+++ b/code/features-convert/data-extractors/readme.md
@@ -0,0 +1,7 @@
+Contains converter-*like* extraction jobs that operate on crawled data to produce export files.
+
+## Important classes
+
+* [AtagExporter](src/main/java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
+* [FeedExporter](src/main/java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
+* [TermFrequencyExporter](src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
\ No newline at end of file
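All three jobs implement the small `ExporterIf` interface introduced further down, so they can be driven uniformly from the executor's actors. A minimal usage sketch, assuming Guice wiring as elsewhere in this diff (the storage ids are placeholders, not values from the diff):

```java
// Sketch only: drive one of the exporters by hand.
ExporterIf exporter = injector.getInstance(AtagExporter.class);

FileStorageId crawlId = FileStorageId.parse("17"); // an existing crawl-data storage (placeholder)
FileStorageId destId  = FileStorageId.parse("18"); // a freshly allocated EXPORT storage (placeholder)

// Walks crawler.log in the crawl storage, parses each domain's documents,
// and atomically moves the finished atags.csv.gz into the destination storage.
exporter.export(crawlId, destId);
```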
diff --git a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/AtagExporter.java b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/AtagExporter.java
new file mode 100644
index 000000000..dc286323c
--- /dev/null
+++ b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/AtagExporter.java
@@ -0,0 +1,196 @@
+package nu.marginalia.extractor;
+
+import com.google.inject.Inject;
+import gnu.trove.set.hash.TLongHashSet;
+import lombok.SneakyThrows;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorage;
+import nu.marginalia.storage.model.FileStorageId;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.nio.file.StandardOpenOption;
+import java.nio.file.attribute.PosixFilePermissions;
+import java.util.Objects;
+import java.util.zip.GZIPOutputStream;
+
+public class AtagExporter implements ExporterIf {
+    private static final LinkParser linkParser = new LinkParser();
+    private static final MurmurHash3_128 hash = new MurmurHash3_128();
+    private final FileStorageService storageService;
+
+    @Inject
+    public AtagExporter(FileStorageService storageService) {
+        this.storageService = storageService;
+    }
+
+    @Override
+    public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
+        FileStorage destStorage = storageService.getStorage(destId);
+
+        var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
+                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
+
+        Path inputDir = storageService.getStorage(crawlId).asPath();
+
+        try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))))
+        )
+        {
+            Path crawlerLogFile = inputDir.resolve("crawler.log");
+
+            var tagWriter = new ATagCsvWriter(bw);
+
+            for (var item : WorkLog.iterable(crawlerLogFile)) {
+                if (Thread.interrupted()) {
+                    throw new InterruptedException();
+                }
+
+                Path crawlDataPath = inputDir.resolve(item.relPath());
+                try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
+                    exportLinks(tagWriter, stream);
+                }
+                catch (Exception ex) {
+                    ex.printStackTrace();
+                }
+            }
+
+            Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
+
+        }
+        catch (Exception ex) {
+            // Don't swallow the failure silently; the temp file is cleaned up in the finally block
+            ex.printStackTrace();
+        }
+        finally {
+            Files.deleteIfExists(tmpFile);
+        }
+
+    }
+
+
+    private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
+        ATagLinkFilter linkFilter = new ATagLinkFilter();
+
+        while (stream.hasNext()) {
+            if (!(stream.next() instanceof CrawledDocument doc))
+                continue;
+            if (null == doc.documentBody)
+                continue;
+
+            var baseUrl = new EdgeUrl(doc.url);
+            var parsed = Jsoup.parse(doc.documentBody);
+
+            for (var atag : parsed.getElementsByTag("a")) {
+                String linkText = atag.text();
+
+                if (!linkFilter.isLinkTextEligible(linkText)) {
+                    continue;
+                }
+
+                var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
+                linkOpt
+                        .filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
+                        .ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
+            }
+        }
+
+        return true;
+    }
+
+    private static class ATagLinkFilter {
+        private final TLongHashSet hashes = new TLongHashSet();
+
+        private boolean isLinkTextEligible(String linkText) {
+            // Filter out the most obviously uninteresting anchor texts
+
+            if (linkText.isBlank())
+                return false;
+            if (linkText.startsWith("this"))
+                return false;
+            if (linkText.equalsIgnoreCase("here"))
+                return false;
+            if (linkText.equalsIgnoreCase("click here"))
+                return false;
+
+            if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
+                return false;
+
+            return true;
+        }
+        private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
+            if (!"http".equals(url.proto) && !"https".equals(url.proto))
+                return false;
+
+            // This is an artifact of the link parser typically
+            if ("example.com".equals(url.domain.topDomain))
+                return false;
+
+            if (linkText.contains(url.domain.toString()))
+                return false;
+            if (Objects.equals(url.domain, baseUrl.domain))
+                return false;
+
+            String urlString = url.toString();
+            if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
+                return false;
+            }
+
+            // Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
+            // need to be concerned about using the fast ASCII hash.  add() returns false when the hash
+            // was already present, i.e. when this (text, url) pair is a duplicate.
+            if (!hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
+                return false;
+            }
+
+            return true;
+        }
+    }
+
+
+    private static class ATagCsvWriter {
+        private final BufferedWriter writer;
+
+        private ATagCsvWriter(BufferedWriter writer) {
+            this.writer = writer;
+        }
+
+        @SneakyThrows
+        public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
+            final String urlString = urlWithNoSchema(url);
+
+            writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
+                    csvify(urlString),
+                    csvify(linkText),
+                    csvify(sourceDomain)));
+        }
+
+        private static String urlWithNoSchema(EdgeUrl url) {
+            StringBuilder sb = new StringBuilder();
+
+            sb.append(url.domain).append(url.path);
+
+            if (url.param != null)
+                sb.append('?').append(url.param);
+
+            return sb.toString();
+        }
+
+        private static String csvify(Object field) {
+            return field.toString().replace("\"", "\"\"");
+        }
+
+    }
+}
diff --git a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/ExporterIf.java b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/ExporterIf.java
new file mode 100644
index 000000000..c2f50eab8
--- /dev/null
+++ b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/ExporterIf.java
@@ -0,0 +1,7 @@
+package nu.marginalia.extractor;
+
+import nu.marginalia.storage.model.FileStorageId;
+
+public interface ExporterIf {
+    void export(FileStorageId crawlId, FileStorageId destId) throws Exception;
+}
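The CSV writer above emits three quoted fields per row — URL with the scheme stripped, anchor text, source domain — doubling any embedded quotes. An illustration of the escaping rule (the values are invented):

```java
// Mirrors ATagCsvWriter.csvify(): wrap each field in quotes, double embedded quotes.
String linkText = "a \"quoted\" anchor";
String row = String.format("\"%s\",\"%s\",\"%s\"\n",
        "www.destination.org/page",      // url with no schema
        linkText.replace("\"", "\"\""),  // -> a ""quoted"" anchor
        "linking-site.net");
// row: "www.destination.org/page","a ""quoted"" anchor","linking-site.net"
```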
diff --git a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/FeedExporter.java b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/FeedExporter.java
new file mode 100644
index 000000000..28a299063
--- /dev/null
+++ b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/FeedExporter.java
@@ -0,0 +1,131 @@
+package nu.marginalia.extractor;
+
+import com.google.inject.Inject;
+import lombok.SneakyThrows;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.link_parser.FeedExtractor;
+import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorage;
+import nu.marginalia.storage.model.FileStorageId;
+import org.jsoup.Jsoup;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.nio.file.StandardOpenOption;
+import java.nio.file.attribute.PosixFilePermissions;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.zip.GZIPOutputStream;
+
+public class FeedExporter implements ExporterIf {
+    private final FileStorageService storageService;
+
+
+    @Inject
+    public FeedExporter(FileStorageService storageService) {
+        this.storageService = storageService;
+    }
+
+    public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
+        FileStorage destStorage = storageService.getStorage(destId);
+
+        var tmpFile = Files.createTempFile(destStorage.asPath(), "feeds", ".csv.gz",
+                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
+
+        Path inputDir = storageService.getStorage(crawlId).asPath();
+
+        try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
+        {
+            Path crawlerLogFile = inputDir.resolve("crawler.log");
+
+            var tagWriter = new FeedCsvWriter(bw);
+
+            for (var item : WorkLog.iterable(crawlerLogFile)) {
+                if (Thread.interrupted()) {
+                    throw new InterruptedException();
+                }
+
+                Path crawlDataPath = inputDir.resolve(item.relPath());
+                try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, crawlDataPath)) {
+                    exportFeeds(tagWriter, stream);
+                }
+                catch (Exception ex) {
+                    ex.printStackTrace();
+                }
+            }
+
+            Files.move(tmpFile, destStorage.asPath().resolve("feeds.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
+        }
+        finally {
+            Files.deleteIfExists(tmpFile);
+        }
+
+    }
+
+    private boolean exportFeeds(FeedCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
+        FeedExtractor feedExtractor = new FeedExtractor(new LinkParser());
+
+        int size = stream.sizeHint();
+
+        while (stream.hasNext()) {
+            if (!(stream.next() instanceof CrawledDocument doc))
+                continue;
+            if (null == doc.documentBody)
+                continue;
+
+            var baseUrl = new EdgeUrl(doc.url);
+            var parsed = Jsoup.parse(doc.documentBody);
+
+            List<EdgeUrl> feedUrls = new ArrayList<>();
+            for (var link : parsed.select("link[rel=alternate]")) {
+                feedExtractor
+                        .getFeedFromAlternateTag(baseUrl, link)
+                        .ifPresent(feedUrls::add);
+            }
+
+            // Take the shortest path if there are multiple
+            if (!feedUrls.isEmpty()) {
+                feedUrls.sort(Comparator.comparing(url -> url.path.length()));
+                exporter.accept(baseUrl.domain, size, feedUrls.getFirst());
+            }
+
+            // Only consider the first viable document, otherwise this will be very slow
+            break;
+        }
+
+        return true;
+    }
+
+    private static class FeedCsvWriter {
+        private final BufferedWriter writer;
+
+        private FeedCsvWriter(BufferedWriter writer) {
+            this.writer = writer;
+        }
+
+        @SneakyThrows
+        public void accept(EdgeDomain domain, int size, EdgeUrl path) {
+            writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
+                    csvify(domain),
+                    csvify(size),
+                    csvify(path)));
+        }
+
+        private static String csvify(Object field) {
+            return field.toString().replace("\"", "\"\"");
+        }
+    }
+
+}
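Each feeds.csv row thus carries the domain, the crawl's size hint, and the single shortest feed URL found on the root document. A hedged parsing sketch (the row contents are invented; a robust reader would also need to undo the doubled-quote escaping):

```java
String line = "\"blog.example.org\",\"120\",\"https://blog.example.org/feed.xml\"";

// Strip the outer quotes, then split on the quote-comma-quote separators.
String[] fields = line.substring(1, line.length() - 1).split("\",\"");

String domain  = fields[0];                    // blog.example.org
int sizeHint   = Integer.parseInt(fields[1]);  // 120 documents in that domain's crawl set
String feedUrl = fields[2];                    // https://blog.example.org/feed.xml
```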
diff --git a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java
new file mode 100644
index 000000000..df1e56a93
--- /dev/null
+++ b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java
@@ -0,0 +1,147 @@
+package nu.marginalia.extractor;
+
+import com.google.inject.Inject;
+import gnu.trove.map.hash.TLongIntHashMap;
+import gnu.trove.set.hash.TLongHashSet;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorage;
+import nu.marginalia.storage.model.FileStorageId;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.nio.file.attribute.PosixFilePermissions;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static nu.marginalia.term_frequency_dict.TermFrequencyDict.DOC_COUNT_KEY;
+import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
+
+public class TermFrequencyExporter implements ExporterIf {
+    private final FileStorageService storageService;
+    private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
+    private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
+
+    @Inject
+    public TermFrequencyExporter(FileStorageService storageService) {
+        this.storageService = storageService;
+    }
+
+    @Override
+    public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
+        Path inputDir = storageService.getStorage(crawlId).asPath();
+        FileStorage destStorage = storageService.getStorage(destId);
+
+        ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
+
+        TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
+        AtomicInteger docCount = new AtomicInteger();
+
+        try (ForkJoinPool fjp = new ForkJoinPool(Math.max(2, Runtime.getRuntime().availableProcessors() / 2))) {
+
+            Path crawlerLogFile = inputDir.resolve("crawler.log");
+
+            for (var item : WorkLog.iterable(crawlerLogFile)) {
+                if (Thread.interrupted()) {
+                    fjp.shutdownNow();
+
+                    throw new InterruptedException();
+                }
+
+                Path crawlDataPath = inputDir.resolve(item.relPath());
+                fjp.execute(() -> processFile(crawlDataPath, counts, docCount, se.get()));
+            }
+
+            while (!fjp.isQuiescent()) {
+                if (fjp.awaitQuiescence(10, TimeUnit.SECONDS))
+                    break;
+            }
+        }
+
+        var tmpFile = Files.createTempFile(destStorage.asPath(), "freqs", ".dat.tmp",
+                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
+
+        try (var dos = new DataOutputStream(Files.newOutputStream(tmpFile))) {
+            synchronized (counts) {
+                counts.put(DOC_COUNT_KEY, docCount.get());
+
+                counts.forEachEntry((hash, cnt) -> {
+                    try {
+                        dos.writeLong(hash);
+                        dos.writeLong(cnt);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                    return true;
+                });
+            }
+            Files.move(tmpFile, destStorage.asPath().resolve("freqs.dat"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
+        }
+        catch (Exception ex) {
+            logger.error("Error writing file {}", tmpFile, ex);
+            Files.deleteIfExists(tmpFile);
+        }
+
+    }
+
+    private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) {
+        TLongHashSet words = new TLongHashSet(10_000);
+        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
+            while (stream.hasNext()) {
+                if (Thread.interrupted())
+                    return;
+
+                if (!(stream.next() instanceof CrawledDocument doc)) continue;
+                if (doc.documentBody == null) continue;
+                if (!doc.contentType.startsWith("text/html"))
+                    continue;
+
+                docCount.incrementAndGet();
+
+                Document parsed = Jsoup.parse(doc.documentBody);
+                parsed.body().filter(new DomPruningFilter(0.5));
+
+                DocumentLanguageData dld = se.extractSentences(parsed);
+
+                if (lf.dictionaryAgreement(dld) < 0.1) {
+                    return;
+                }
+
+                for (var sent : dld.sentences) {
+                    for (var word : sent) {
+                        words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
+                    }
+                }
+
+                synchronized (counts) {
+                    words.forEach(w -> {
+                        counts.adjustOrPutValue(w, 1, 1);
+                        return true;
+                    });
+                }
+
+                words.clear();
+            }
+        }
+        catch (Exception ex) {
+            logger.error("Error processing file {}", crawlDataPath, ex);
+        }
+    }
+
+}
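The freqs.dat file written above is a flat sequence of 16-byte records: a 64-bit term hash followed by a 64-bit count of the documents the term occurred in, with the DOC_COUNT_KEY entry holding the total document count. A minimal read-back sketch, assuming only that layout (not part of the diff; `available()` is adequate for a local file, a hardened reader would catch `EOFException` instead):

```java
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class FreqsReader {
    // Reads (hash, count) pairs until EOF.  The DOC_COUNT_KEY entry carries the
    // total number of documents rather than an individual term's document count.
    static void dump(Path freqsFile) throws IOException {
        try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(freqsFile)))) {
            while (dis.available() > 0) {
                long termHash = dis.readLong();
                long docCount = dis.readLong();
                System.out.printf("%016x %d%n", termHash, docCount);
            }
        }
    }
}
```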
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java b/code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/FeedExtractor.java
similarity index 96%
rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java
rename to code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/FeedExtractor.java
index 36878dad8..566029cde 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java
+++ b/code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/FeedExtractor.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic;
+package nu.marginalia.link_parser;
 
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeUrl;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/FasttextLanguagePredictionModel.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java
similarity index 94%
rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/language/FasttextLanguagePredictionModel.java
rename to code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java
index ab3c934f2..5eca3c763 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/FasttextLanguagePredictionModel.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.language;
+package nu.marginalia.language.filter;
 
 import com.github.jfasttext.JFastText;
 import nu.marginalia.LanguageModels;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/LanguageFilter.java
similarity index 97%
rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java
rename to code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/LanguageFilter.java
index 524bfa1f0..bf817735c 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/LanguageFilter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.language;
+package nu.marginalia.language.filter;
 
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguagePredictionModel.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/LanguagePredictionModel.java
similarity index 85%
rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguagePredictionModel.java
rename to code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/LanguagePredictionModel.java
index e2b907c42..7371cef5b 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguagePredictionModel.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/LanguagePredictionModel.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.language;
+package nu.marginalia.language.filter;
 
 import nu.marginalia.language.model.DocumentLanguageData;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/UngaBungaLanguagePredictionModel.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java
similarity index 97%
rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/language/UngaBungaLanguagePredictionModel.java
rename to code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java
index 21c4f0a4c..8b3c45671 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/UngaBungaLanguagePredictionModel.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.language;
+package nu.marginalia.language.filter;
 
 import nu.marginalia.language.model.DocumentLanguageData;
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/language/LanguageFilterTest.java b/code/libraries/language-processing/src/test/java/nu/marginalia/language/filter/LanguageFilterTest.java
similarity index 87%
rename from code/processes/converting-process/src/test/java/nu/marginalia/converting/language/LanguageFilterTest.java
rename to code/libraries/language-processing/src/test/java/nu/marginalia/language/filter/LanguageFilterTest.java
index dd3a89366..0cc864250 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/language/LanguageFilterTest.java
+++ b/code/libraries/language-processing/src/test/java/nu/marginalia/language/filter/LanguageFilterTest.java
@@ -1,6 +1,5 @@
-package nu.marginalia.converting.language;
+package nu.marginalia.language.filter;
 
-import nu.marginalia.converting.util.TestLanguageModels;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
diff --git a/code/libraries/language-processing/src/test/java/nu/marginalia/language/filter/TestLanguageModels.java b/code/libraries/language-processing/src/test/java/nu/marginalia/language/filter/TestLanguageModels.java
new file mode 100644
index 000000000..2b7bf0e29
--- /dev/null
+++ b/code/libraries/language-processing/src/test/java/nu/marginalia/language/filter/TestLanguageModels.java
@@ -0,0 +1,38 @@
+package nu.marginalia.language.filter;
+
+import nu.marginalia.LanguageModels;
+import nu.marginalia.WmsaHome;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+public class TestLanguageModels {
+    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
+
+    public static Path getLanguageModelsPath() {
+        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
+                .map(Path::of)
+                .orElse(LANGUAGE_MODELS_DEFAULT);
+
+        if (!Files.isDirectory(languageModelsHome)) {
+            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
+        }
+        return languageModelsHome;
+    }
+
+    public static LanguageModels getLanguageModels() {
+
+        var languageModelsHome = getLanguageModelsPath();
+
+        return new LanguageModels(
+                languageModelsHome.resolve("ngrams.bin"),
+                languageModelsHome.resolve("tfreq-new-algo3.bin"),
+                languageModelsHome.resolve("opennlp-sentence.bin"),
+                languageModelsHome.resolve("English.RDR"),
+                languageModelsHome.resolve("English.DICT"),
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
+        );
+    }
+}
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
index a92a4af79..59b095e76 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
@@ -2,7 +2,7 @@
 import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.converting.language.LanguageFilter;
+import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 44da6008a..7cf1069a9 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -2,7 +2,7 @@
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.converting.language.LanguageFilter;
+import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.processor.MetaRobotsTag;
@@ -12,6 +12,7 @@
 import nu.marginalia.converting.processor.plugin.specialization.*;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
index 05a9a210c..7bb94eaca 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
@@ -2,7 +2,7 @@
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.converting.language.LanguageFilter;
+import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
 import nu.marginalia.crawling.model.CrawledDocument;
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java
index a3acec45e..4145f25cb 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java
@@ -79,8 +79,11 @@ public void register() {
         Spark.post("/public/nodes/:id/actions/new-crawl-specs", this::createNewSpecsAction,
                 redirectControl.renderRedirectAcknowledgement("Creating", "../actions?view=new-crawl")
         );
-        Spark.post("/public/nodes/:id/actions/export-data", this::exportData,
-                redirectControl.renderRedirectAcknowledgement("Exporting", "../storage/exports")
+        Spark.post("/public/nodes/:id/actions/export-db-data", this::exportDbData,
+                redirectControl.renderRedirectAcknowledgement("Exporting", "..")
+        );
+        Spark.post("/public/nodes/:id/actions/export-from-crawl-data", this::exportFromCrawlData,
+                redirectControl.renderRedirectAcknowledgement("Exporting", "..")
         );
     }
@@ -233,8 +236,29 @@ private Object createNewSpecsAction(Request request, Response response) {
         return "";
     }
 
-    private Object exportData(Request req, Response rsp) {
+    private Object exportDbData(Request req, Response rsp) {
         executorClient.exportData(Context.fromRequest(req), Integer.parseInt(req.params("id")));
+
+        return "";
+    }
+
+    private Object exportFromCrawlData(Request req, Response rsp) {
+        String exportType = req.queryParams("exportType");
+        String source = req.queryParams("source");
+
+        if (exportType.equals("atags")) {
+            executorClient.exportAtags(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
+        }
+        else if (exportType.equals("rss")) {
+            executorClient.exportRssFeeds(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
+        }
+        else if (exportType.equals("termFreq")) {
+            executorClient.exportTermFrequencies(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
+        }
+        else {
+            rsp.status(404);
+        }
 
         return "";
     }
 }
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeService.java
index 18d8aea3b..6883d70ac 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeService.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeService.java
@@ -97,9 +97,6 @@ public void register() throws IOException {
         Spark.post("/public/nodes/:id/storage/reset-state/:fid", this::resetState,
                 redirectControl.renderRedirectAcknowledgement("Restoring", "..")
         );
-        Spark.post("/public/nodes/:id/storage/:fid/export-atags", this::exportAtags,
-                redirectControl.renderRedirectAcknowledgement("Exporting", "../../storage/exports")
-        );
         Spark.post("/public/nodes/:id/fsms/:fsm/start", this::startFsm);
         Spark.post("/public/nodes/:id/fsms/:fsm/stop", this::stopFsm);
     }
@@ -109,11 +106,6 @@ private Object resetState(Request request, Response response) throws SQLExceptio
         return "";
     }
 
-    private Object exportAtags(Request req, Response rsp) {
-        executorClient.exportAtags(Context.fromRequest(req), Integer.parseInt(req.params("id")), req.params("fid"));
-        return "";
-    }
-
     public Object startFsm(Request req, Response rsp) throws Exception {
         executorClient.startFsm(Context.fromRequest(req), Integer.parseInt(req.params("id")), req.params("fsm").toUpperCase());
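The dispatch above completes the path from the control UI to the executor: each exportType value selects one of the client methods added at the top of this diff (ctx, node and fid are placeholders):

```java
executorClient.exportAtags(ctx, node, fid);           // "atags"    -> POST /export/atags?fid=...
executorClient.exportRssFeeds(ctx, node, fid);        // "rss"      -> POST /export/feeds?fid=...
executorClient.exportTermFrequencies(ctx, node, fid); // "termFreq" -> POST /export/termfreq?fid=...
```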

diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-data.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-db-data.hdb
similarity index 75%
rename from code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-data.hdb
rename to code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-db-data.hdb
index 7dff4ab1c..885f31260 100644
--- a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-data.hdb
+++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-db-data.hdb
@@ -1,11 +1,11 @@
-Export Data
+Create Data Export
 This will export database data: Domains, blacklist and domain links. The exported data will be saved as a new exports storage object.
-[form → actions/export-data]
+[form → actions/export-db-data]
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-from-crawl-data.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-from-crawl-data.hdb
new file mode 100644
index 000000000..f17baf904
--- /dev/null
+++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-from-crawl-data.hdb
@@ -0,0 +1,78 @@
+Export From Crawl Data
+
+This will run an extraction job against a crawl data set. The generated data will be available as
+an export object.
+
+Select a source
+
+Use | Path | Description | Details
+{{#each allCrawlData}}
+  {{description}}
+  {{#if new}}[CREATING]{{/if}}
+  {{#if delete}}[DELETING]{{/if}}
+  [Details]
+{{/each}}
+
+Select the export operation to run
+
+exportType=atags
+  Creates a CSV file with information related to external anchor tags. External anchor tags can be
+  used to improve search result accuracy, since they often describe what they are linking to better
+  than the destination page itself.
+
+exportType=rss
+  Runs a best-effort attempt at extracting RSS and Atom feeds from the crawl data. The operation
+  will only consider the root page of each crawl data set, and will only keep the feed URL with the
+  shortest path from each document. The result is a CSV containing the domain name, the feed URL,
+  and the number of documents in the crawl data set for that particular domain.
+
+exportType=termFreq
+  Creates a binary data file consisting of term hashes and frequencies. This is the TF side of
+  TF-IDF, used to weight how important a term is relative to how common it is across the crawled
+  documents.
+
+[Export]
\ No newline at end of file
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb
index 441de7fd3..aa5de1a99 100644
--- a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb
+++ b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb
@@ -18,7 +18,8 @@
     {{#if view.sideload-encyclopedia}} {{> control/node/actions/partial-sideload-encyclopedia }} {{/if}}
     {{#if view.sideload-stackexchange}} {{> control/node/actions/partial-sideload-stackexchange }} {{/if}}
     {{#if view.sideload-warc}} {{> control/node/actions/partial-sideload-warc }} {{/if}}
-    {{#if view.export-data}} {{> control/node/actions/partial-export-data }} {{/if}}
+    {{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}}
+    {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}}
     {{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
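For reference, the weighting the term-frequency description alludes to is the standard TF-IDF scheme; this formula is background, not taken from the diff:

```latex
\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \log \frac{N}{\mathrm{df}(t)}
```

In TermFrequencyExporter above, df(t) corresponds to the per-term document count written to freqs.dat, and N to the value stored under DOC_COUNT_KEY.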
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/node-storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/node-storage-details.hdb
index 30a28ac80..981826831 100644
--- a/code/services-core/control-service/src/main/resources/templates/control/node/node-storage-details.hdb
+++ b/code/services-core/control-service/src/main/resources/templates/control/node/node-storage-details.hdb
@@ -51,15 +51,6 @@
     {{/if}}
 
-    {{#if isAtagsExportable}}
-    Export anchor tags from this crawl data
-    {{/if}}
-
     {{#if isDeletable}}
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb
index 6ba528b34..636d232b4 100644
--- a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb
+++ b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb
@@ -23,7 +23,8 @@
 • Sideload Stackexchange
 • Sideload WARC Files
-• Export Database Data
+• Export Database Data
+• Export From Crawl Data
 • Restore Index Backup
diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle
index 5471a6a8f..be550b7f2 100644
--- a/code/services-core/executor-service/build.gradle
+++ b/code/services-core/executor-service/build.gradle
@@ -39,6 +39,7 @@ dependencies {
     implementation project(':code:process-models:crawl-spec')
     implementation project(':code:process-models:crawling-model')
     implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-convert:data-extractors')
     implementation project(':code:features-index:index-journal')
     implementation project(':code:api:index-api')
     implementation project(':code:api:query-api')
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java
index 91d3956c7..53d9601e1 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java
@@ -13,6 +13,8 @@ public enum ExecutorActor {
     CRAWL_JOB_EXTRACTOR,
     EXPORT_DATA,
     EXPORT_ATAGS,
+    EXPORT_TERM_FREQUENCIES,
+    EXPORT_FEEDS,
     PROC_INDEX_CONSTRUCTOR_SPAWNER,
     CONVERT,
     RESTORE_BACKUP;
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java
index c9c8bf6d9..b02fc12ac 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java
@@ -44,6 +44,8 @@ public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
                                        CrawlJobExtractorActor crawlJobExtractorActor,
                                        ExportDataActor exportDataActor,
                                        ExportAtagsActor exportAtagsActor,
+                                       ExportFeedsActor exportFeedsActor,
+                                       ExportTermFreqActor exportTermFrequenciesActor,
                                        ExecutorActorStateMachines stateMachines) {
         this.messageQueueFactory = messageQueueFactory;
         this.eventLog = baseServiceParams.eventLog;
@@ -68,6 +70,8 @@ public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
         register(ExecutorActor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor);
         register(ExecutorActor.EXPORT_DATA, exportDataActor);
         register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor);
+        register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
+        register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
     }
 
     private void register(ExecutorActor process, RecordActorPrototype graph) {
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java
index c45adbaad..3a06fecbd 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java
@@ -3,44 +3,19 @@
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import gnu.trove.set.hash.TLongHashSet;
-import lombok.SneakyThrows;
-import nu.marginalia.hash.MurmurHash3_128;
-import nu.marginalia.link_parser.LinkParser;
-import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.extractor.AtagExporter;
+import nu.marginalia.extractor.ExporterIf;
 import nu.marginalia.storage.model.*;
-import org.apache.commons.lang3.StringUtils;
-import org.jsoup.Jsoup;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.storage.FileStorageService;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.net.URISyntaxException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-import java.nio.file.StandardOpenOption;
-import java.nio.file.attribute.PosixFilePermissions;
 import java.time.LocalDateTime;
-import java.util.Objects;
-import java.util.zip.GZIPOutputStream;
 
 @Singleton
 public class ExportAtagsActor extends RecordActorPrototype {
-    private static final LinkParser linkParser = new LinkParser();
-    private static final MurmurHash3_128 hash = new MurmurHash3_128();
     private final FileStorageService storageService;
-    private final Logger logger = LoggerFactory.getLogger(getClass());
+    private final ExporterIf atagExporter;
     public record Export(FileStorageId crawlId) implements ActorStep {}
     public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
@@ -55,46 +30,15 @@ case Export(FileStorageId crawlId) -> {
                 yield new Run(crawlId, storage.id());
             }
             case Run(FileStorageId crawlId, FileStorageId destId) -> {
-                FileStorage destStorage = storageService.getStorage(destId);
                 storageService.setFileStorageState(destId, FileStorageState.NEW);
 
-                var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
-                        PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
-
-                Path inputDir = storageService.getStorage(crawlId).asPath();
-
-                try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))));
-                )
-                {
-                    Path crawlerLogFile = inputDir.resolve("crawler.log");
-
-                    var tagWriter = new ATagCsvWriter(bw);
-
-                    for (var item : WorkLog.iterable(crawlerLogFile)) {
-                        if (Thread.interrupted()) {
-                            throw new InterruptedException();
-                        }
-
-                        Path crawlDataPath = inputDir.resolve(item.relPath());
-                        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
-                            exportLinks(tagWriter, stream);
-                        }
-                        catch (Exception ex) {
-                            ex.printStackTrace();
-                        }
-                    }
-
-                    Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
-
+                try {
+                    atagExporter.export(crawlId, destId);
                     storageService.setFileStorageState(destId, FileStorageState.UNSET);
                 }
                 catch (Exception ex) {
-                    logger.error("Failed to export blacklist", ex);
                     storageService.setFileStorageState(destId, FileStorageState.DELETE);
-                    yield new Error("Failed to export blacklist");
-                }
-                finally {
-                    Files.deleteIfExists(tmpFile);
+                    yield new Error("Failed to export data");
                 }
 
                 yield new End();
@@ -103,117 +47,6 @@ case Run(FileStorageId crawlId, FileStorageId destId) -> {
         };
     }
 
-    private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
-        ATagLinkFilter linkFilter = new ATagLinkFilter();
-
-        while (stream.hasNext()) {
-            if (!(stream.next() instanceof CrawledDocument doc))
-                continue;
-            if (null == doc.documentBody)
-                continue;
-
-            var baseUrl = new EdgeUrl(doc.url);
-            var parsed = Jsoup.parse(doc.documentBody);
-
-            for (var atag : parsed.getElementsByTag("a")) {
-                String linkText = atag.text();
-
-                if (!linkFilter.isLinkTextEligible(linkText)) {
-                    continue;
-                }
-
-                var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
-                linkOpt
-                        .filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
-                        .ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
-            }
-        }
-
-        return true;
-    }
-
-    private static class ATagLinkFilter {
-        private final TLongHashSet hashes = new TLongHashSet();
-
-        private boolean isLinkTextEligible(String linkText) {
-            // Filter out the most obviously uninteresting anchor texts
-
-            if (linkText.isBlank())
-                return false;
-            if (linkText.startsWith("this"))
-                return false;
-            if (linkText.equalsIgnoreCase("here"))
-                return false;
-            if (linkText.equalsIgnoreCase("click here"))
-                return false;
-
-            if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
-                return false;
-
-            return true;
-        }
-        private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
-            if (!"http".equals(url.proto) && !"https".equals(url.proto))
-                return false;
-
-            // This is an artifact of the link parser typically
-            if ("example.com".equals(url.domain.topDomain))
-                return false;
-
-            if (linkText.contains(url.domain.toString()))
-                return false;
-            if (Objects.equals(url.domain, baseUrl.domain))
-                return false;
-
-            String urlString = url.toString();
-            if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
-                return false;
-            }
-
-            // Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
-            // need to be concerned about using the fast ASCII hash
-            if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
-                return false;
-            }
-
-            return true;
-        }
-    }
-
-
-    private static class ATagCsvWriter {
-        private final BufferedWriter writer;
-
-        private ATagCsvWriter(BufferedWriter writer) {
-            this.writer = writer;
-        }
-
-        @SneakyThrows
-        public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
-            final String urlString = urlWithNoSchema(url);
-
-            writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
-                    csvify(urlString),
-                    csvify(linkText),
-                    csvify(sourceDomain)));
-        }
-
-        private static String urlWithNoSchema(EdgeUrl url) {
-            StringBuilder sb = new StringBuilder();
-
-            sb.append(url.domain).append(url.path);
-
-            if (url.param != null)
-                sb.append('?').append(url.param);
-
-            return sb.toString();
-        }
-
-        private static String csvify(Object field) {
-            return field.toString().replace("\"", "\"\"");
-        }
-
-    }
 
     @Override
     public String describe() {
@@ -222,10 +55,12 @@
 
     @Inject
     public ExportAtagsActor(Gson gson,
-                            FileStorageService storageService)
+                            FileStorageService storageService,
+                            AtagExporter atagExporter)
     {
         super(gson);
         this.storageService = storageService;
+        this.atagExporter = atagExporter;
     }
 
 }
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java
new file mode 100644
index 000000000..5df5b2365
--- /dev/null
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java
@@ -0,0 +1,72 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.extractor.ExporterIf;
+import nu.marginalia.extractor.FeedExporter;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageBaseType;
+import nu.marginalia.storage.model.FileStorageId;
+import nu.marginalia.storage.model.FileStorageState;
+import nu.marginalia.storage.model.FileStorageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.LocalDateTime;
+
+@Singleton
+public class ExportFeedsActor extends RecordActorPrototype {
+    private final FileStorageService storageService;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    private final ExporterIf feedExporter;
+    public record Export(FileStorageId crawlId) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch(self) {
+            case Export(FileStorageId crawlId) -> {
+                var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
+                var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
+
+                if (storage == null) yield new Error("Bad storage id");
+                yield new Run(crawlId, storage.id());
+            }
+            case Run(FileStorageId crawlId, FileStorageId destId) -> {
+                storageService.setFileStorageState(destId, FileStorageState.NEW);
+
+                try {
+                    feedExporter.export(crawlId, destId);
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
+                }
+                catch (Exception ex) {
+                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
+                    yield new Error("Failed to export data");
+                }
+
+                yield new End();
+            }
+            default -> new Error();
+        };
+    }
+
+
+    @Override
+    public String describe() {
+        return "Export RSS/Atom feeds from crawl data";
+    }
+
+    @Inject
+    public ExportFeedsActor(Gson gson,
+                            FileStorageService storageService,
+                            FeedExporter feedExporter)
+    {
+        super(gson);
+        this.storageService = storageService;
+        this.feedExporter = feedExporter;
+    }
+
+}
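Both new actors share the same two-step lifecycle; a sketch of the state transitions as implemented above (the storage id is a placeholder):

```java
// Export(crawlId)                 allocate an EXPORT storage area under the STORAGE base
//   -> Run(crawlId, destId)       mark the area NEW, run the exporter
//        -> End                   success: area state set to UNSET (kept)
//        -> Error("...")          failure: area state set to DELETE (cleaned up)
ActorStep start = new ExportFeedsActor.Export(FileStorageId.parse("17"));
```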
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java
new file mode 100644
index 000000000..d04b75d24
--- /dev/null
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java
@@ -0,0 +1,68 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.extractor.ExporterIf;
+import nu.marginalia.extractor.TermFrequencyExporter;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageBaseType;
+import nu.marginalia.storage.model.FileStorageId;
+import nu.marginalia.storage.model.FileStorageState;
+import nu.marginalia.storage.model.FileStorageType;
+
+import java.time.LocalDateTime;
+
+@Singleton
+public class ExportTermFreqActor extends RecordActorPrototype {
+    private final FileStorageService storageService;
+    private final ExporterIf exporter;
+    public record Export(FileStorageId crawlId) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch(self) {
+            case Export(FileStorageId crawlId) -> {
+                var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
+                var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
+
+                if (storage == null) yield new Error("Bad storage id");
+                yield new Run(crawlId, storage.id());
+            }
+            case Run(FileStorageId crawlId, FileStorageId destId) -> {
+                storageService.setFileStorageState(destId, FileStorageState.NEW);
+
+                try {
+                    exporter.export(crawlId, destId);
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
+                }
+                catch (Exception ex) {
+                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
+                    yield new Error("Failed to export data");
+                }
+
+                yield new End();
+            }
+            default -> new Error();
+        };
+    }
+
+
+    @Override
+    public String describe() {
+        return "Export term frequencies from crawl data";
+    }
+
+    @Inject
+    public ExportTermFreqActor(Gson gson,
+                               FileStorageService storageService,
+                               TermFrequencyExporter exporter)
+    {
+        super(gson);
+        this.storageService = storageService;
+        this.exporter = exporter;
+    }
+
+}
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java
index 83909ee41..ee4d08916 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java
@@ -72,6 +72,8 @@ public ExecutorSvc(BaseServiceParams params,
         Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia);
 
         Spark.post("/export/atags", exportService::exportAtags);
+        Spark.post("/export/feeds", exportService::exportFeeds);
+        Spark.post("/export/termfreq", exportService::exportTermFrequencies);
         Spark.post("/export/data", exportService::exportData);
 
         Spark.post("/backup/:fid/restore", backupService::restore);
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java
index 6baaf5de7..2ada2d7ce 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java
@@ -28,4 +28,14 @@ public Object exportAtags(Request request, Response response) throws Exception {
         return "";
     }
 
+    public Object exportFeeds(Request request, Response response) throws Exception {
+        actorControlService.startFrom(ExecutorActor.EXPORT_FEEDS, new ExportFeedsActor.Export(FileStorageId.parse(request.queryParams("fid"))));
+        return "";
+    }
+    public Object exportTermFrequencies(Request request, Response response) throws Exception {
+        actorControlService.startFrom(ExecutorActor.EXPORT_TERM_FREQUENCIES, new ExportTermFreqActor.Export(FileStorageId.parse(request.queryParams("fid"))));
+        return "";
+    }
+
+
 }
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java
index 52abe4123..0d99356a9 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java
@@ -2,20 +2,15 @@
 import com.google.inject.Inject;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.adblock.AdblockSimulator;
 import nu.marginalia.adblock.GoogleAnwersSpamDetector;
-import nu.marginalia.converting.processor.DocumentProcessor;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.tools.Experiment;
 import nu.marginalia.tools.LegacyExperiment;
 import nu.marginalia.topic.RecipeDetector;
 import nu.marginalia.topic.TextileCraftDetector;
 import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
 
 public class TopicExperiment extends LegacyExperiment {
diff --git a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
index 4a41cb559..c625208f7 100644
--- a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
+++ b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
@@ -3,7 +3,7 @@
 import gnu.trove.map.hash.TLongIntHashMap;
 import gnu.trove.set.hash.TLongHashSet;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.language.LanguageFilter;
+import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.SentenceExtractor;
diff --git a/settings.gradle b/settings.gradle
index bcd1f7552..779ab288b 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -35,6 +35,7 @@ include 'code:features-index:result-ranking'
 
 include 'code:features-convert:adblock'
 include 'code:features-convert:anchor-keywords'
+include 'code:features-convert:data-extractors'
 include 'code:features-convert:stackexchange-xml'
 include 'code:features-convert:pubdate'
 include 'code:features-convert:summary-extraction'