Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(control) New export actions for RSS/Atom feeds and term frequency data
This commit also refactors the executor a bit, and introduces a new converter-feature called data-extractors for this class of jobs.
- Loading branch information
Showing
35 changed files
with
864 additions
and
214 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
plugins {
    id 'java'

    // Used to fetch model/data files at build time in sibling modules; kept for consistency
    id "de.undercouch.download" version "5.1.0"

    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    // Project-internal modules: configuration, process plumbing, shared models
    implementation project(':code:common:config')
    implementation project(':code:common:process')
    implementation project(':code:common:model')

    // Language/keyword machinery used by the term-frequency and anchor-text exporters
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-convert:anchor-keywords')

    // Crawl data access for reading the crawler's output
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:processes:converting-process')
    implementation project(':third-party:commons-codec')

    // Third-party: logging, DI, primitive collections, utilities, HTML parsing
    implementation libs.bundles.slf4j
    implementation libs.guice
    implementation libs.trove
    implementation libs.commons.lang3
    implementation libs.notnull
    implementation libs.jsoup

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
Contains converter-*like* extraction jobs that operate on crawled data to produce export files. | ||
|
||
## Important classes | ||
|
||
* [AtagExporter](src/main/java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data. | ||
* [FeedExporter](src/main/java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data. | ||
* [TermFrequencyExporter](src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF. |
196 changes: 196 additions & 0 deletions
196
.../features-convert/data-extractors/src/main/java/nu/marginalia/extractor/AtagExporter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
package nu.marginalia.extractor; | ||
|
||
import com.google.inject.Inject; | ||
import gnu.trove.set.hash.TLongHashSet; | ||
import lombok.SneakyThrows; | ||
import nu.marginalia.crawling.io.CrawledDomainReader; | ||
import nu.marginalia.crawling.io.SerializableCrawlDataStream; | ||
import nu.marginalia.crawling.model.CrawledDocument; | ||
import nu.marginalia.hash.MurmurHash3_128; | ||
import nu.marginalia.link_parser.LinkParser; | ||
import nu.marginalia.model.EdgeDomain; | ||
import nu.marginalia.model.EdgeUrl; | ||
import nu.marginalia.process.log.WorkLog; | ||
import nu.marginalia.storage.FileStorageService; | ||
import nu.marginalia.storage.model.FileStorage; | ||
import nu.marginalia.storage.model.FileStorageId; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.jsoup.Jsoup; | ||
|
||
import java.io.BufferedWriter; | ||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.net.URISyntaxException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardCopyOption; | ||
import java.nio.file.StandardOpenOption; | ||
import java.nio.file.attribute.PosixFilePermissions; | ||
import java.util.Objects; | ||
import java.util.zip.GZIPOutputStream; | ||
|
||
public class AtagExporter implements ExporterIf { | ||
private static final LinkParser linkParser = new LinkParser(); | ||
private static final MurmurHash3_128 hash = new MurmurHash3_128(); | ||
private final FileStorageService storageService; | ||
|
||
@Inject | ||
public AtagExporter(FileStorageService storageService) { | ||
this.storageService = storageService; | ||
} | ||
|
||
@Override | ||
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception { | ||
FileStorage destStorage = storageService.getStorage(destId); | ||
|
||
var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz", | ||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); | ||
|
||
Path inputDir = storageService.getStorage(crawlId).asPath(); | ||
|
||
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); | ||
) | ||
{ | ||
Path crawlerLogFile = inputDir.resolve("crawler.log"); | ||
|
||
var tagWriter = new ATagCsvWriter(bw); | ||
|
||
for (var item : WorkLog.iterable(crawlerLogFile)) { | ||
if (Thread.interrupted()) { | ||
throw new InterruptedException(); | ||
} | ||
|
||
Path crawlDataPath = inputDir.resolve(item.relPath()); | ||
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { | ||
exportLinks(tagWriter, stream); | ||
} | ||
catch (Exception ex) { | ||
ex.printStackTrace(); | ||
} | ||
} | ||
|
||
Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); | ||
|
||
} | ||
catch (Exception ex) { | ||
|
||
} | ||
finally { | ||
Files.deleteIfExists(tmpFile); | ||
} | ||
|
||
} | ||
|
||
|
||
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException { | ||
ATagLinkFilter linkFilter = new ATagLinkFilter(); | ||
|
||
while (stream.hasNext()) { | ||
if (!(stream.next() instanceof CrawledDocument doc)) | ||
continue; | ||
if (null == doc.documentBody) | ||
continue; | ||
|
||
var baseUrl = new EdgeUrl(doc.url); | ||
var parsed = Jsoup.parse(doc.documentBody); | ||
|
||
for (var atag : parsed.getElementsByTag("a")) { | ||
String linkText = atag.text(); | ||
|
||
if (!linkFilter.isLinkTextEligible(linkText)) { | ||
continue; | ||
} | ||
|
||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); | ||
linkOpt | ||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText)) | ||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText)); | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
private static class ATagLinkFilter { | ||
private final TLongHashSet hashes = new TLongHashSet(); | ||
|
||
private boolean isLinkTextEligible(String linkText) { | ||
// Filter out the most obviously uninteresting anchor texts | ||
|
||
if (linkText.isBlank()) | ||
return false; | ||
if (linkText.startsWith("this")) | ||
return false; | ||
if (linkText.equalsIgnoreCase("here")) | ||
return false; | ||
if (linkText.equalsIgnoreCase("click here")) | ||
return false; | ||
|
||
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing! | ||
return false; | ||
|
||
return true; | ||
} | ||
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) { | ||
if (!"http".equals(url.proto) && !"https".equals(url.proto)) | ||
return false; | ||
|
||
// This is an artifact of the link parser typically | ||
if ("example.com".equals(url.domain.topDomain)) | ||
return false; | ||
|
||
if (linkText.contains(url.domain.toString())) | ||
return false; | ||
if (Objects.equals(url.domain, baseUrl.domain)) | ||
return false; | ||
|
||
String urlString = url.toString(); | ||
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing! | ||
return false; | ||
} | ||
|
||
// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't | ||
// need to be concerned about using the fast ASCII hash | ||
if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) { | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
} | ||
|
||
|
||
private static class ATagCsvWriter { | ||
private final BufferedWriter writer; | ||
|
||
private ATagCsvWriter(BufferedWriter writer) { | ||
this.writer = writer; | ||
} | ||
|
||
@SneakyThrows | ||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) { | ||
final String urlString = urlWithNoSchema(url); | ||
|
||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n", | ||
csvify(urlString), | ||
csvify(linkText), | ||
csvify(sourceDomain))); | ||
} | ||
|
||
private static String urlWithNoSchema(EdgeUrl url) { | ||
StringBuilder sb = new StringBuilder(); | ||
|
||
sb.append(url.domain).append(url.path); | ||
|
||
if (url.param != null) | ||
sb.append('?').append(url.param); | ||
|
||
return sb.toString(); | ||
} | ||
|
||
private static String csvify(Object field) { | ||
return field.toString().replace("\"", "\"\""); | ||
} | ||
|
||
} | ||
} |
7 changes: 7 additions & 0 deletions
7
code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/ExporterIf.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package nu.marginalia.extractor; | ||
|
||
import nu.marginalia.storage.model.FileStorageId; | ||
|
||
/**
 * Common interface for export jobs that read crawled data from one file storage
 * and write a derived export artifact into another.
 */
public interface ExporterIf {
    /**
     * Runs the export.
     *
     * @param crawlId storage id holding the crawler's output to read from
     * @param destId  storage id that receives the exported file(s)
     * @throws Exception if the export fails
     */
    void export(FileStorageId crawlId, FileStorageId destId) throws Exception;
}
Oops, something went wrong.