Skip to content

Commit

Permalink
(converter, stackexchange-xml) Add the ability to sideload stackexchange data
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Sep 21, 2023
1 parent 4aa47e8 commit 70aa04c
Show file tree
Hide file tree
Showing 13 changed files with 290 additions and 307 deletions.
Expand Up @@ -155,13 +155,20 @@ public static void forEachPost(
String title = "";
int year = 2023;

String tags = "";

List<Future<String>> partWork = new ArrayList<>();
var commonPool = ForkJoinPool.commonPool();
while (rs.next()) {
String maybeTitle = rs.getString("title");

if (maybeTitle != null && !maybeTitle.isBlank())
title = maybeTitle;

String maybeTags = rs.getString("tags");
if (maybeTags != null && !maybeTags.isBlank())
tags = maybeTags;

int origSize = rs.getInt("origSize");

year = Math.min(year, rs.getInt("postYear"));
Expand All @@ -177,7 +184,7 @@ public static void forEachPost(
parts.add(workItem.get());
}

if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts, tags)))
break;
}

Expand All @@ -188,11 +195,27 @@ public static void forEachPost(

}

/**
 * Reads the domain name stored in the {@code metadata} table of a SQLite database file.
 *
 * @param pathToDbFile path of the SQLite database file to open
 * @return the {@code domainName} value of the first row returned by the query,
 *         exactly as the JDBC driver reports it
 * @throws SQLException if the database cannot be opened or the query fails
 * @throws IllegalArgumentException if the {@code metadata} table contains no rows
 */
public static String getDomainName(Path pathToDbFile) throws SQLException {
    String connStr = "jdbc:sqlite:" + pathToDbFile;

    // All three JDBC handles are declared as resources so each one is closed
    // explicitly (in reverse order) on both the return and the exception path,
    // rather than relying on the statement to clean up its result set.
    try (var connection = DriverManager.getConnection(connStr);
         var stmt = connection.prepareStatement("SELECT domainName FROM metadata");
         var rs = stmt.executeQuery()
    ) {
        if (!rs.next()) {
            throw new IllegalArgumentException("No metadata in db file " + pathToDbFile);
        }
        return rs.getString(1);
    }
}

public record CombinedPostModel(int ordinal,
int threadId,
String title,
int year,
List<String> bodies)
List<String> bodies,
String tags)
{ }

}
1 change: 1 addition & 0 deletions code/processes/converting-process/build.gradle
Expand Up @@ -53,6 +53,7 @@ dependencies {
implementation project(':code:features-convert:pubdate')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:features-convert:summary-extraction')
implementation project(':code:features-convert:stackexchange-xml')

implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')
Expand Down
Expand Up @@ -25,6 +25,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Collection;
Expand Down Expand Up @@ -86,12 +87,19 @@ public ConverterMain(

public void convert(Collection<? extends SideloadSource> sideloadSources, Path writeDir) throws Exception {
try (var writer = new ConverterBatchWriter(writeDir, 0);
var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("Sideloading");
BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log"))
) {

int i = 0;
for (var sideloadSource : sideloadSources) {
logger.info("Sideloading {}", sideloadSource.getDomain());

taskHeartbeat.progress(sideloadSource.getDomain().toString(), i++, sideloadSources.size());

writer.write(sideloadSource);
}
taskHeartbeat.progress("Finished", i, sideloadSources.size());

// We write an empty log with just a finish marker for the sideloading action
batchingWorkLog.logFinishedBatch();
Expand Down Expand Up @@ -242,9 +250,8 @@ yield new SideloadAction(
}
case SideloadStackexchange -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);
var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.'));

yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName),
yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath),
processData.asPath(),
msg, inbox);
}
Expand Down
Expand Up @@ -9,6 +9,7 @@
import nu.marginalia.language.sentence.SentenceExtractor;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Collection;
Expand Down Expand Up @@ -42,8 +43,13 @@ public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile)
}

/** Do not use, this code isn't finished */
@Deprecated()
public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) {
return new StackexchangeSideloader(pathTo7zFile, domainName, sentenceExtractor, documentKeywordExtractor);
/**
 * Builds one sideload source per {@code .db} file found beneath the given root.
 *
 * @param pathToDbFileRoot file or directory at which the file tree walk starts
 * @return a {@link StackexchangeSideloader} for every regular file in the tree
 *         whose file name ends with {@code .db}
 * @throws IOException if the file tree walk cannot be started
 */
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
    // The walk keeps directory handles open, so the result list is fully
    // materialised before the try block closes the stream.
    try (var walkedPaths = Files.walk(pathToDbFileRoot)) {
        var dbFiles = walkedPaths.filter(
                candidate -> Files.isRegularFile(candidate)
                        && candidate.toFile().getName().endsWith(".db"));

        return dbFiles
                .map(path -> new StackexchangeSideloader(path, sentenceExtractor, documentKeywordExtractor))
                .toList();
    }
}
}

This file was deleted.

0 comments on commit 70aa04c

Please sign in to comment.