Skip to content

Commit

Permalink
(converter) Make it possible to sideload documents from a directory tree
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Sep 17, 2023
1 parent 5c040f7 commit 9b385ec
Show file tree
Hide file tree
Showing 18 changed files with 386 additions and 138 deletions.
Expand Up @@ -3,5 +3,6 @@
/**
 * The kind of work a conversion request asks the converter process to perform.
 * The converter dispatches on this value when it decodes an incoming request.
 */
public enum ConvertAction {
/** Convert previously crawled data, using the request's crawl storage and processed-data storage. */
ConvertCrawlData,
/** Sideload encyclopedia data; the request's input source is the path handed to the encyclopedia sideloader. */
SideloadEncyclopedia,
/** Sideload documents from directory trees; the request's input source is the path handed to the dirtree sideloader factory. */
SideloadDirtree,
/** Sideload a stackexchange file; the domain name is derived from the input file name with its extension stripped. */
SideloadStackexchange
}
2 changes: 2 additions & 0 deletions code/processes/converting-process/build.gradle
Expand Up @@ -79,6 +79,8 @@ dependencies {
implementation libs.trove
implementation libs.fastutil

implementation libs.snakeyaml

implementation libs.crawlercommons

implementation libs.commons.lang3
Expand Down
Expand Up @@ -29,6 +29,8 @@
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
Expand Down Expand Up @@ -84,11 +86,14 @@ public ConverterMain(
heartbeat.start();
}

public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception {
public void convert(Collection<? extends SideloadSource> sideloadSources, Path writeDir) throws Exception {
try (var writer = new ConverterBatchWriter(writeDir, 0);
BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log"))
) {
writer.write(sideloadSource);
for (var sideloadSource : sideloadSources) {
logger.info("Sideloading {}", sideloadSource.getDomain());
writer.write(sideloadSource);
}

// We write an empty log with just a finish marker for the sideloading action
batchingWorkLog.logFinishedBatch();
Expand Down Expand Up @@ -149,21 +154,27 @@ public void err() {

private static class SideloadAction extends ConvertRequest {

private final SideloadSource sideloadSource;
private final Collection<? extends SideloadSource> sideloadSources;
private final Path workDir;

SideloadAction(SideloadSource sideloadSource,
Path workDir,
MqMessage message, MqSingleShotInbox inbox) {
super(message, inbox);
this.sideloadSource = sideloadSource;
this.sideloadSources = List.of(sideloadSource);
this.workDir = workDir;
}
SideloadAction(Collection<? extends SideloadSource> sideloadSources,
Path workDir,
MqMessage message, MqSingleShotInbox inbox) {
super(message, inbox);
this.sideloadSources = sideloadSources;
this.workDir = workDir;
}

@Override
public void execute(ConverterMain converterMain) throws Exception {
try {
converterMain.convert(sideloadSource, workDir);
converterMain.convert(sideloadSources, workDir);
ok();
}
catch (Exception ex) {
Expand Down Expand Up @@ -203,39 +214,43 @@ private ConvertRequest fetchInstructions() throws Exception {

var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class);

if (request.action == ConvertAction.ConvertCrawlData) {
var filePath = Path.of(request.inputSource);

var crawlData = fileStorageService.getStorage(request.crawlStorage);
var processData = fileStorageService.getStorage(request.processedDataStorage);
return switch(request.action) {
case ConvertCrawlData -> {
var crawlData = fileStorageService.getStorage(request.crawlStorage);
var processData = fileStorageService.getStorage(request.processedDataStorage);

var plan = new CrawlPlan(null,
new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"),
new CrawlPlan.WorkDir(processData.path(), "processor.log"));

return new ConvertCrawlDataAction(plan, msg, inbox);
}
var plan = new CrawlPlan(null,
new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"),
new CrawlPlan.WorkDir(processData.path(), "processor.log"));

if (request.action == ConvertAction.SideloadEncyclopedia) {
var processData = fileStorageService.getStorage(request.processedDataStorage);
var filePath = Path.of(request.inputSource);
yield new ConvertCrawlDataAction(plan, msg, inbox);
}
case SideloadEncyclopedia -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);

return new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(filePath),
processData.asPath(),
msg, inbox);
}
yield new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(filePath),
processData.asPath(),
msg, inbox);
}
case SideloadDirtree -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);

if (request.action == ConvertAction.SideloadStackexchange) {
var processData = fileStorageService.getStorage(request.processedDataStorage);
var filePath = Path.of(request.inputSource);
var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.'));
return new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName),
processData.asPath(),
msg, inbox);
}
yield new SideloadAction(
sideloadSourceFactory.sideloadDirtree(filePath),
processData.asPath(),
msg, inbox);
}
case SideloadStackexchange -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);
var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.'));

else {
throw new RuntimeException("Unknown action: " + request.action);
}
yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName),
processData.asPath(),
msg, inbox);
}
};
}

private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {
Expand Down
Expand Up @@ -9,6 +9,4 @@
/**
 * A source of already-processed data that is fed to the converter's batch
 * writer directly, one domain at a time.
 */
public interface SideloadSource {
/** Returns the processed-domain record describing the domain this source provides. */
ProcessedDomain getDomain();
/** Returns an iterator over the processed documents belonging to the domain. */
Iterator<ProcessedDocument> getDocumentsStream();

String getId();
}
Expand Up @@ -2,29 +2,43 @@

import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Collection;

public class SideloadSourceFactory {
private final Gson gson;
private final HtmlDocumentProcessorPlugin htmlProcessorPlugin;
private final SideloaderProcessing sideloaderProcessing;
private final SentenceExtractor sentenceExtractor;
private final DocumentKeywordExtractor documentKeywordExtractor;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;

@Inject
public SideloadSourceFactory(Gson gson, HtmlDocumentProcessorPlugin htmlProcessorPlugin, SentenceExtractor sentenceExtractor, DocumentKeywordExtractor documentKeywordExtractor) {
public SideloadSourceFactory(Gson gson,
SideloaderProcessing sideloaderProcessing,
SentenceExtractor sentenceExtractor,
DocumentKeywordExtractor documentKeywordExtractor,
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
this.gson = gson;
this.htmlProcessorPlugin = htmlProcessorPlugin;
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractor = sentenceExtractor;
this.documentKeywordExtractor = documentKeywordExtractor;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
}

public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile) throws SQLException {
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, gson, htmlProcessorPlugin);
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, gson, sideloaderProcessing);
}

public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {
return dirtreeSideloaderFactory.createSideloaders(pathToYamlFile);
}

/** Do not use, this code isn't finished */
Expand Down
@@ -0,0 +1,67 @@
package nu.marginalia.converting.sideload;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;

import java.net.URISyntaxException;
import java.time.LocalDateTime;
import java.util.List;

/**
 * Shared processing step for sideloaded HTML documents.
 *
 * <p>Wraps a raw HTML body in a synthetic {@link CrawledDocument}, runs it
 * through the {@link HtmlDocumentProcessorPlugin}, and packages the outcome as
 * a {@link ProcessedDocument} tagged with the state reason {@code "SIDELOAD"}.
 */
@Singleton
public class SideloaderProcessing {
    private final HtmlDocumentProcessorPlugin htmlProcessorPlugin;

    @Inject
    public SideloaderProcessing(HtmlDocumentProcessorPlugin htmlProcessorPlugin) {
        this.htmlProcessorPlugin = htmlProcessorPlugin;
    }

    /**
     * Processes a single sideloaded HTML document.
     *
     * <p>If processing fails for any reason, the returned document carries the
     * state {@link UrlIndexingState#DISQUALIFIED} instead of an exception being
     * thrown; only an unparseable URL is reported to the caller.
     *
     * @param url           the URL the document is to be indexed under
     * @param body          the HTML body of the document
     * @param extraKeywords additional keywords, each added with the
     *                      {@code Subjects} word flag; {@code null} is treated
     *                      as "no extra keywords"
     * @param size          size value passed on to the document metadata
     * @return the processed document, in state {@code OK} or {@code DISQUALIFIED}
     * @throws URISyntaxException if {@code url} cannot be parsed
     */
    public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException {
        // NOTE(review): the first argument is the same literal for every sideloaded
        // document, including non-encyclopedia sources — confirm this is intended.
        var crawledDoc = new CrawledDocument(
                "encyclopedia.marginalia.nu",
                url,
                "text/html",
                LocalDateTime.now().toString(),
                200,
                "OK",
                "NP",
                "",
                body,
                Integer.toHexString(url.hashCode()),
                url,
                "",
                "SIDELOAD"
        );

        var ret = new ProcessedDocument();

        // Parse the URL once, before the expensive processing. An invalid URL
        // propagates as URISyntaxException either way, so there is no point in
        // processing the document first and re-parsing inside the catch block.
        ret.url = new EdgeUrl(url);
        ret.stateReason = "SIDELOAD";

        // A missing keyword list must not disqualify an otherwise valid document.
        final List<String> keywords = (extraKeywords == null) ? List.of() : extraKeywords;

        try {
            var details = htmlProcessorPlugin.createDetails(crawledDoc);

            ret.words = details.words();

            for (String keyword : keywords) {
                ret.words.add(keyword, WordFlags.Subjects.asBit());
            }

            ret.details = details.details();
            ret.details.metadata = ret.details.metadata
                    .withSize(size, Math.max(0, 255 - url.length()));
            ret.state = UrlIndexingState.OK;
        }
        catch (Exception e) {
            // Deliberate best-effort: a document that fails processing is
            // recorded as disqualified rather than aborting the whole sideload.
            ret.state = UrlIndexingState.DISQUALIFIED;
        }

        return ret;
    }
}
@@ -0,0 +1,20 @@
package nu.marginalia.converting.sideload.dirtree;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

import java.util.List;

/**
 * One entry of a dirtree sideload specification: a plain data holder with a
 * no-args constructor, an all-args constructor, and accessors for every field.
 *
 * <p>The declaration order of the fields defines the parameter order of the
 * generated all-args constructor, so it must not be changed.
 */
@NoArgsConstructor @AllArgsConstructor
@Getter @Setter
class DirtreeSideloadSpec {
    /** Name of this entry. */
    public String name;

    /** Domain name for the sideloaded documents — presumably the domain they are indexed under; confirm against the factory. */
    public String domainName;

    /** Directory for this entry — presumably the root of the tree to walk; confirm against the factory. */
    public String dir;

    /** Base URL for this entry — presumably prefixed to each file's relative path; confirm against the factory. */
    public String baseUrl;

    /** Keywords for this entry; may be {@code null} when the no-args constructor is used and nothing sets it. */
    public List<String> keywords;
}
@@ -0,0 +1,14 @@
package nu.marginalia.converting.sideload.dirtree;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

import java.util.List;

/**
 * Top-level holder for a list of {@link DirtreeSideloadSpec} entries, exposed
 * through the {@code sources} field and its generated accessors.
 */
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
class DirtreeSideloadSpecList {
    /** The individual sideload specifications; {@code null} until assigned. */
    public List<DirtreeSideloadSpec> sources;
}
@@ -0,0 +1,83 @@
package nu.marginalia.converting.sideload.dirtree;

import lombok.SneakyThrows;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Stream;

/**
 * Sideload source that serves every HTML file found beneath a directory as a
 * processed document of a single domain.
 *
 * <p>The directory walk is opened in the constructor and held until
 * {@link #close()} is called, so instances must be closed by their owner.
 * The underlying stream can be consumed only once: {@link #getDocumentsStream()}
 * may be called a single time per instance.
 */
public class DirtreeSideloader implements SideloadSource, AutoCloseable {
    private final Path dirBase;
    private final String domainName;
    private final String urlBase;

    private final List<String> extraKeywords;
    private final SideloaderProcessing sideloaderProcessing;

    private final Stream<Path> filesStream;

    /**
     * @param dirBase              root directory to walk
     * @param domainName           domain name reported by {@link #getDomain()}
     * @param urlBase              URL prefix for the documents; a trailing '/' is
     *                             appended when missing
     * @param extraKeywords        keywords handed to the processing step for every document
     * @param sideloaderProcessing processing step applied to each HTML file
     * @throws IOException if the directory walk cannot be started
     */
    public DirtreeSideloader(Path dirBase,
                             String domainName,
                             String urlBase,
                             List<String> extraKeywords,
                             SideloaderProcessing sideloaderProcessing)
            throws IOException
    {
        this.dirBase = dirBase;
        this.domainName = domainName;
        this.urlBase = urlBase + (urlBase.endsWith("/") ? "" : "/");
        this.filesStream = Files.walk(dirBase);
        this.extraKeywords = extraKeywords;
        this.sideloaderProcessing = sideloaderProcessing;
    }

    /** Builds a fresh domain record: the configured domain name, IP "0.0.0.0", state ACTIVE. */
    @Override
    public ProcessedDomain getDomain() {
        var ret = new ProcessedDomain();

        ret.domain = new EdgeDomain(domainName);
        ret.ip = "0.0.0.0";
        ret.state = DomainIndexingState.ACTIVE;

        return ret;
    }

    /**
     * Returns a lazy iterator over the processed HTML files of the tree.
     * Files are read and processed as the iterator advances; a failure while
     * reading a file is rethrown from the iterator.
     */
    @Override
    public Iterator<ProcessedDocument> getDocumentsStream() {
        return filesStream
                .filter(Files::isRegularFile)
                .filter(this::isHtmlFile)
                .map(this::process)
                .iterator();
    }

    /** Accepts files whose name ends in ".html" or ".htm", ignoring case. */
    private boolean isHtmlFile(Path path) {
        // Path API instead of a java.io.File round trip
        final String name = path.getFileName().toString().toLowerCase();

        return name.endsWith(".html") || name.endsWith(".htm");
    }

    /** Reads one file and processes it under the URL derived from its location in the tree. */
    @SneakyThrows
    private ProcessedDocument process(Path path) {
        String body = Files.readString(path);
        String url = urlBase + toUrlPath(dirBase.relativize(path));

        return sideloaderProcessing
                .processDocument(url, body, extraKeywords, 10_000);
    }

    /**
     * Renders a relative path with '/' between its elements. Path.toString()
     * uses the platform separator, which would put backslashes into the URL
     * on file systems that do not use '/'.
     */
    private static String toUrlPath(Path relativePath) {
        String separator = relativePath.getFileSystem().getSeparator();

        return relativePath.toString().replace(separator, "/");
    }

    /** Closes the directory walk opened by the constructor. */
    @Override
    public void close() throws Exception {
        filesStream.close();
    }

}

0 comments on commit 9b385ec

Please sign in to comment.