(control) New export actions for RSS/Atom feeds and term frequency data
This commit also refactors the executor a bit, and introduces a new converter-feature called data-extractors for this class of jobs.
vlofgren committed Jan 15, 2024
1 parent 4665af6 commit c41e68a
Showing 35 changed files with 864 additions and 214 deletions.
@@ -100,6 +100,13 @@ public void createCrawlSpecFromDownload(Context context, int node, String descri
public void exportAtags(Context ctx, int node, String fid) {
post(ctx, node, "/export/atags?fid="+fid, "").blockingSubscribe();
}
public void exportRssFeeds(Context ctx, int node, String fid) {
post(ctx, node, "/export/feeds?fid="+fid, "").blockingSubscribe();
}
public void exportTermFrequencies(Context ctx, int node, String fid) {
post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe();
}

public void exportData(Context ctx, int node) {
post(ctx, node, "/export/data", "").blockingSubscribe();
}
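For context, a control-side caller would trigger the new actions the same way as the existing atags export; a hedged sketch (the client instance, node id, and storage id below are illustrative, not part of this commit):

    // Hypothetical call site: 'client' is an instance of the executor client
    // shown above, 'node' the target executor node, and 'fid' the id of the
    // crawl-data file storage to export from.
    client.exportRssFeeds(ctx, node, fid);        // POSTs /export/feeds?fid=...
    client.exportTermFrequencies(ctx, node, fid); // POSTs /export/termfreq?fid=...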
40 changes: 40 additions & 0 deletions code/features-convert/data-extractors/build.gradle
@@ -0,0 +1,40 @@
plugins {
id 'java'

id "de.undercouch.download" version "5.1.0"

id 'jvm-test-suite'
}

java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}

dependencies {
implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:common:model')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:process-models:crawling-model')
implementation project(':code:processes:converting-process')
implementation project(':third-party:commons-codec')

implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.trove
implementation libs.commons.lang3
implementation libs.notnull
implementation libs.jsoup

testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
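A module wanting to use these exporters would presumably depend on it the same way this file depends on its sibling projects (assumed, following the project-path convention above):

    implementation project(':code:features-convert:data-extractors')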

7 changes: 7 additions & 0 deletions code/features-convert/data-extractors/readme.md
@@ -0,0 +1,7 @@
Contains converter-*like* extraction jobs that operate on crawled data to produce export files.

## Important classes

* [AtagExporter](src/main/java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
* [FeedExporter](src/main/java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
* [TermFrequencyExporter](src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
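All three exporters implement the small ExporterIf interface introduced at the end of this diff. As a rough sketch of the shared pattern (the ExampleExporter below is hypothetical; the real classes are linked above):

    // Hypothetical ExampleExporter illustrating the shared pattern; assumes the
    // same package and storage types as the real exporters in this diff.
    package nu.marginalia.extractor;

    import nu.marginalia.storage.model.FileStorageId;

    public class ExampleExporter implements ExporterIf {
        @Override
        public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
            // Read crawl data from crawlId's storage via the crawler work log,
            // derive records from each domain's documents, and write them to a
            // gzipped file that is atomically moved into destId's storage.
        }
    }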
196 changes: 196 additions & 0 deletions code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/AtagExporter.java
@@ -0,0 +1,196 @@
package nu.marginalia.extractor;

import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.Objects;
import java.util.zip.GZIPOutputStream;

public class AtagExporter implements ExporterIf {
private static final LinkParser linkParser = new LinkParser();
private static final MurmurHash3_128 hash = new MurmurHash3_128();
private final FileStorageService storageService;

@Inject
public AtagExporter(FileStorageService storageService) {
this.storageService = storageService;
}

@Override
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
FileStorage destStorage = storageService.getStorage(destId);

var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

Path inputDir = storageService.getStorage(crawlId).asPath();

try (var bw = new BufferedWriter(new OutputStreamWriter(
        new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
{
Path crawlerLogFile = inputDir.resolve("crawler.log");

var tagWriter = new ATagCsvWriter(bw);

for (var item : WorkLog.iterable(crawlerLogFile)) {
if (Thread.interrupted()) {
throw new InterruptedException();
}

Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
exportLinks(tagWriter, stream);
}
catch (Exception ex) {
    // Log and skip this domain; keep exporting the rest
    ex.printStackTrace();
}
}

Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);

}
catch (Exception ex) {
    // Best-effort: log the failure and fall through to temp-file clean-up
    ex.printStackTrace();
}
finally {
Files.deleteIfExists(tmpFile);
}

}


private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
ATagLinkFilter linkFilter = new ATagLinkFilter();

while (stream.hasNext()) {
if (!(stream.next() instanceof CrawledDocument doc))
continue;
if (null == doc.documentBody)
continue;

var baseUrl = new EdgeUrl(doc.url);
var parsed = Jsoup.parse(doc.documentBody);

for (var atag : parsed.getElementsByTag("a")) {
String linkText = atag.text();

if (!linkFilter.isLinkTextEligible(linkText)) {
continue;
}

var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
linkOpt
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
}
}

return true;
}

private static class ATagLinkFilter {
private final TLongHashSet hashes = new TLongHashSet();

private boolean isLinkTextEligible(String linkText) {
// Filter out the most obviously uninteresting anchor texts

if (linkText.isBlank())
return false;
if (linkText.startsWith("this"))
return false;
if (linkText.equalsIgnoreCase("here"))
return false;
if (linkText.equalsIgnoreCase("click here"))
return false;

if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
return false;

return true;
}
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
if (!"http".equals(url.proto) && !"https".equals(url.proto))
return false;

// Links to example.com are typically an artifact of the link parser
if ("example.com".equals(url.domain.topDomain))
return false;

if (linkText.contains(url.domain.toString()))
return false;
if (Objects.equals(url.domain, baseUrl.domain))
return false;

String urlString = url.toString();
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
return false;
}

// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
// need to be concerned about using the fast ASCII hash.
// TLongHashSet.add() returns false when the value was already present, i.e. a duplicate.
if (!hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
    return false;
}

return true;
}
}


private static class ATagCsvWriter {
private final BufferedWriter writer;

private ATagCsvWriter(BufferedWriter writer) {
this.writer = writer;
}

@SneakyThrows
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
final String urlString = urlWithNoSchema(url);

writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
csvify(urlString),
csvify(linkText),
csvify(sourceDomain)));
}

private static String urlWithNoSchema(EdgeUrl url) {
StringBuilder sb = new StringBuilder();

sb.append(url.domain).append(url.path);

if (url.param != null)
sb.append('?').append(url.param);

return sb.toString();
}

private static String csvify(Object field) {
return field.toString().replace("\"", "\"\"");
}

}
}
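For illustration, a row emitted by ATagCsvWriter might look as follows (values hypothetical): the columns are the schema-less destination URL, the anchor text, and the source domain, with any embedded double quotes escaped by doubling:

    "www.somesite.net/articles/foo?page=2","a thorough guide","blog.othersite.org"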
7 changes: 7 additions & 0 deletions code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/ExporterIf.java
@@ -0,0 +1,7 @@
package nu.marginalia.extractor;

import nu.marginalia.storage.model.FileStorageId;

public interface ExporterIf {
void export(FileStorageId crawlId, FileStorageId destId) throws Exception;
}
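Given the @Inject constructor on AtagExporter and the Guice dependency in the new module, the executor presumably obtains exporters via injection. A hedged sketch of driving one by hand; the module set-up and the two FileStorageId values are illustrative, not part of this commit:

    // Assumed wiring (not shown here): any Guice injector whose modules can
    // provide a FileStorageService can construct the exporter.
    Injector injector = Guice.createInjector(/* modules providing FileStorageService */);
    ExporterIf exporter = injector.getInstance(AtagExporter.class);
    exporter.export(crawlDataStorageId, destinationStorageId);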
