diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 7afab7409..99a468930 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -7,6 +7,7 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSourceFactory; +import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterWriter; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; @@ -25,6 +26,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.file.Path; import java.sql.SQLException; import java.util.Optional; @@ -82,10 +84,10 @@ public ConverterMain( heartbeat.start(); } - public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception { - int maxPoolSize = 16; - - // FIXME + public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException { + try (var writer = new ConverterBatchWriter(writeDir, 0)) { + writer.write(sideloadSource); + } } public void convert(CrawlPlan plan) throws Exception { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index cea46f20a..cc9f04678 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -2,7 +2,9 @@ import gnu.trove.list.TLongList; import 
gnu.trove.list.array.TLongArrayList; +import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; import nu.marginalia.io.processed.DomainRecordParquetFileWriter; @@ -24,14 +26,15 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.Future; -public class ConverterBatchWriter { +/** Writer for a single batch of converter parquet files */ +public class ConverterBatchWriter implements AutoCloseable { private final DomainRecordParquetFileWriter domainWriter; private final DomainLinkRecordParquetFileWriter domainLinkWriter; private final DocumentRecordParquetFileWriter documentWriter; private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class); - ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { + public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { domainWriter = new DomainRecordParquetFileWriter( ProcessedDataFileNames.domainFileName(basePath, batchNumber) ); @@ -43,6 +46,14 @@ public class ConverterBatchWriter { ); } + public void write(SideloadSource sideloadSource) throws IOException { + var domain = sideloadSource.getDomain(); + + writeDomainData(domain); + + writeDocumentData(domain.domain, sideloadSource.getDocumentsStream()); + } + public void write(ProcessedDomain domain) { var results = ForkJoinPool.commonPool().invokeAll( writeTasks(domain) @@ -67,10 +78,22 @@ private Object writeDocumentData(ProcessedDomain domain) throws IOException { if (domain.documents == null) return this; - String domainName = domain.domain.toString(); + writeDocumentData(domain.domain, domain.documents.iterator()); + + return this; + } + + private void writeDocumentData(EdgeDomain domain, + Iterator documentIterator) + throws IOException + { + 
int ordinal = 0; - for (var document : domain.documents) { + String domainName = domain.toString(); + + while (documentIterator.hasNext()) { + var document = documentIterator.next(); if (document.details == null) { new DocumentRecord( domainName, @@ -119,7 +142,6 @@ private Object writeDocumentData(ProcessedDomain domain) throws IOException { ordinal++; } - return this; } private Object writeLinkData(ProcessedDomain domain) throws IOException { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 43e9d9856..2b6b6d580 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -188,6 +188,7 @@ public ControlService(BaseServiceParams params, Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors); Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors); Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors); + Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors); // Review Random Domains Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java index a8267d40d..337a05bfd 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -4,18 +4,20 @@ import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; import nu.marginalia.control.actor.Actor; +import nu.marginalia.control.actor.task.ConvertActor; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.search.client.SearchClient; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.id.ServiceId; import spark.Request; import spark.Response; import spark.Spark; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.UUID; @Singleton @@ -97,6 +99,22 @@ public Object truncateLinkDatabase(Request request, Response response) throws Ex return ""; } + public Object sideloadEncyclopedia(Request request, Response response) throws Exception { + + Path sourcePath = Path.of(request.queryParams("source")); + if (!Files.exists(sourcePath)) { + Spark.halt(404); + return "No such file " + sourcePath; + } + + eventLog.logEvent("USER-ACTION", "SIDELOAD ENCYCLOPEDIA"); + + actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_ENCYCLOPEDIA, sourcePath.toString()); + + return ""; + } + + public Object triggerRepartition(Request request, Response response) throws Exception { indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb index 9ec528d4a..e97683226 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -24,6 +24,21 @@ + + Sideload Encyclopedia

+ This will load pre-digested encyclopedia data + from an encyclopedia.marginalia.nu-style database. + +

+
+ +

+ + +
+ + Reload Blogs List