Skip to content

Commit

Permalink
(converter, control) Re-enable sideloading encyclopedia data
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Sep 14, 2023
1 parent 35996d0 commit 5e5aaf9
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 10 deletions.
Expand Up @@ -7,6 +7,7 @@
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloadSourceFactory;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
Expand All @@ -25,6 +26,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Optional;
Expand Down Expand Up @@ -82,10 +84,10 @@ public ConverterMain(
heartbeat.start();
}

public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception {
int maxPoolSize = 16;

// FIXME
/**
 * Converts a sideloaded source by writing it out as a single batch.
 *
 * <p>A {@link ConverterBatchWriter} is opened on {@code writeDir} with batch
 * number 0, the source is written through it, and the writer is closed when
 * the try-with-resources block exits.
 *
 * @param sideloadSource the source to write
 * @param writeDir       base path handed to the batch writer
 * @throws IOException   propagated from creating, writing to, or closing the writer
 */
public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException {
    // The whole sideload goes into one batch, so the batch number is fixed.
    final int batchNumber = 0;

    try (ConverterBatchWriter batchWriter = new ConverterBatchWriter(writeDir, batchNumber)) {
        batchWriter.write(sideloadSource);
    }
}

public void convert(CrawlPlan plan) throws Exception {
Expand Down
Expand Up @@ -2,7 +2,9 @@

import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
Expand All @@ -24,14 +26,15 @@
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;

public class ConverterBatchWriter {
/** Writer for a single batch of converter parquet files */
public class ConverterBatchWriter implements AutoCloseable {
private final DomainRecordParquetFileWriter domainWriter;
private final DomainLinkRecordParquetFileWriter domainLinkWriter;
private final DocumentRecordParquetFileWriter documentWriter;

private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);

ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
domainWriter = new DomainRecordParquetFileWriter(
ProcessedDataFileNames.domainFileName(basePath, batchNumber)
);
Expand All @@ -43,6 +46,14 @@ public class ConverterBatchWriter {
);
}

/**
 * Writes a sideloaded source into this batch: first the domain-level data,
 * then every document the source provides.
 *
 * @param sideloadSource supplier of the domain and of its documents
 * @throws IOException   propagated from the underlying write helpers
 */
public void write(SideloadSource sideloadSource) throws IOException {
    final var sideloadedDomain = sideloadSource.getDomain();
    writeDomainData(sideloadedDomain);

    // The documents are requested only after the domain data has been written.
    final var domainId = sideloadedDomain.domain;
    final var documents = sideloadSource.getDocumentsStream();
    writeDocumentData(domainId, documents);
}

public void write(ProcessedDomain domain) {
var results = ForkJoinPool.commonPool().invokeAll(
writeTasks(domain)
Expand All @@ -67,10 +78,22 @@ private Object writeDocumentData(ProcessedDomain domain) throws IOException {
if (domain.documents == null)
return this;

String domainName = domain.domain.toString();
writeDocumentData(domain.domain, domain.documents.iterator());

return this;
}

private void writeDocumentData(EdgeDomain domain,
Iterator<ProcessedDocument> documentIterator)
throws IOException
{

int ordinal = 0;

for (var document : domain.documents) {
String domainName = domain.toString();

while (documentIterator.hasNext()) {
var document = documentIterator.next();
if (document.details == null) {
new DocumentRecord(
domainName,
Expand Down Expand Up @@ -119,7 +142,6 @@ private Object writeDocumentData(ProcessedDomain domain) throws IOException {
ordinal++;
}

return this;
}

private Object writeLinkData(ProcessedDomain domain) throws IOException {
Expand Down
Expand Up @@ -188,6 +188,7 @@ public ControlService(BaseServiceParams params,
Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors);
Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors);
Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);

// Review Random Domains
Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);
Expand Down
Expand Up @@ -4,18 +4,20 @@
import com.google.inject.Singleton;
import nu.marginalia.control.actor.ControlActors;
import nu.marginalia.control.actor.Actor;
import nu.marginalia.control.actor.task.ConvertActor;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.IndexMqEndpoints;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.search.client.SearchClient;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.id.ServiceId;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.UUID;

@Singleton
Expand Down Expand Up @@ -97,6 +99,22 @@ public Object truncateLinkDatabase(Request request, Response response) throws Ex
return "";
}

/**
 * Handles the "sideload encyclopedia" user action.
 *
 * <p>Reads the {@code source} request parameter as a filesystem path, checks
 * that the path exists, logs the user action and starts the convert actor at
 * its {@code CONVERT_ENCYCLOPEDIA} state with the path as argument.
 *
 * <p>The request is halted with status 400 when {@code source} is missing or
 * blank, and with status 404 when the path does not exist. In both cases the
 * explanation is sent as the response body.
 *
 * @param request  the incoming request; must carry a {@code source} parameter
 * @param response the outgoing response (unused here)
 * @return an empty string on success
 * @throws Exception propagated from path parsing or from starting the actor
 */
public Object sideloadEncyclopedia(Request request, Response response) throws Exception {

    String source = request.queryParams("source");
    if (source == null || source.isBlank()) {
        // Without this guard Path.of(null) would fail with a NullPointerException.
        Spark.halt(400, "Missing required parameter 'source'");
        return ""; // not reached: halt() aborts the request by throwing
    }

    Path sourcePath = Path.of(source);
    if (!Files.exists(sourcePath)) {
        // Pass the message to halt() so it actually reaches the client.
        Spark.halt(404, "No such file " + sourcePath);
        return ""; // not reached: halt() aborts the request by throwing
    }

    eventLog.logEvent("USER-ACTION", "SIDELOAD ENCYCLOPEDIA");

    actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_ENCYCLOPEDIA, sourcePath.toString());

    return "";
}


public Object triggerRepartition(Request request, Response response) throws Exception {
indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");

Expand Down
Expand Up @@ -24,6 +24,21 @@
</form>
</td>
</tr>
<tr>
<td><b>Sideload Encyclopedia</b><p>
This will load pre-digested encyclopedia data
from an encyclopedia.marginalia.nu-style database.
</td>
<td>
<form method="post" action="/actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
<label for="source">articles.db location on server</label><br>
<input id="source" name="source" value="">
<br><br>

<input type="submit" value="Sideload Encyclopedia">
</form>
</td>
</tr>
<tr>
<td>
<b>Reload Blogs List</b>
Expand Down

0 comments on commit 5e5aaf9

Please sign in to comment.