Skip to content

Commit

Permalink
(converter) Fix rare sentence extractor bug
Browse files Browse the repository at this point in the history
It was caused by non-thread-safe concurrent memory access inside SentenceExtractor.
  • Loading branch information
vlofgren committed Sep 24, 2023
1 parent 8ca20f1 commit a433bbb
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 15 deletions.
Expand Up @@ -22,6 +22,11 @@
import java.io.InputStream;
import java.util.*;

/** This class still isn't thread safe!! If you use it concurrently, it won't throw exceptions,
* it will just haunt your code and cause unpredictable mysterious errors.
*
* Use {@link ThreadLocalSentenceExtractorProvider} instead to avoid falling into the twilight zone!
*/
public class SentenceExtractor {

private SentenceDetectorME sentenceDetector;
Expand Down
@@ -0,0 +1,19 @@
package nu.marginalia.language.sentence;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;

@Singleton
public class ThreadLocalSentenceExtractorProvider {
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;

@Inject
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
}

public SentenceExtractor get() {
return sentenceExtractorThreadLocal.get();
}
}
Expand Up @@ -10,6 +10,7 @@
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.plugin.specialization.*;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
Expand Down Expand Up @@ -44,7 +45,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final Logger logger = LoggerFactory.getLogger(getClass());
private final double minDocumentQuality;

private final SentenceExtractor sentenceExtractor;
private final FeatureExtractor featureExtractor;
private final TitleExtractor titleExtractor;
private final DocumentKeywordExtractor keywordExtractor;
Expand All @@ -59,6 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final LinkParser linkParser = new LinkParser();
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);

private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final HtmlProcessorSpecializations htmlProcessorSpecializations;

@Inject
Expand All @@ -73,13 +74,13 @@ public HtmlDocumentProcessorPlugin(
DocumentLengthLogic documentLengthLogic,
MetaRobotsTag metaRobotsTag,
DocumentGeneratorExtractor documentGeneratorExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
HtmlProcessorSpecializations specializations)
{
super(languageFilter);

this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor;
this.featureExtractor = featureExtractor;

this.titleExtractor = titleExtractor;
Expand All @@ -88,6 +89,7 @@ public HtmlDocumentProcessorPlugin(
this.metaRobotsTag = metaRobotsTag;

this.documentGeneratorExtractor = documentGeneratorExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.htmlProcessorSpecializations = specializations;
}

Expand Down Expand Up @@ -122,7 +124,8 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument)
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
}

DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
DocumentLanguageData dld =
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));

checkDocumentLanguage(dld);

Expand Down
Expand Up @@ -6,7 +6,7 @@
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;

import java.io.IOException;
import java.nio.file.Files;
Expand All @@ -17,19 +17,19 @@
public class SideloadSourceFactory {
private final Gson gson;
private final SideloaderProcessing sideloaderProcessing;
private final SentenceExtractor sentenceExtractor;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor documentKeywordExtractor;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;

@Inject
public SideloadSourceFactory(Gson gson,
SideloaderProcessing sideloaderProcessing,
SentenceExtractor sentenceExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor,
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
this.gson = gson;
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractor = sentenceExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentKeywordExtractor = documentKeywordExtractor;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
}
Expand All @@ -48,7 +48,7 @@ public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbF
return dirs
.filter(Files::isRegularFile)
.filter(f -> f.toFile().getName().endsWith(".db"))
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
.toList();
}
}
Expand Down
Expand Up @@ -8,7 +8,6 @@
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;

import java.net.URISyntaxException;
import java.time.LocalDateTime;
Expand All @@ -23,7 +22,10 @@ public SideloaderProcessing(HtmlDocumentProcessorPlugin htmlProcessorPlugin) {
this.htmlProcessorPlugin = htmlProcessorPlugin;
}

public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException {
public ProcessedDocument processDocument(String url,
String body,
List<String> extraKeywords,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument(
"encyclopedia.marginalia.nu",
url,
Expand Down
Expand Up @@ -5,7 +5,7 @@
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
Expand All @@ -29,20 +29,20 @@
import java.util.concurrent.TimeUnit;

public class StackexchangeSideloader implements SideloadSource {
private final SentenceExtractor sentenceExtractor;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor keywordExtractor;
private final String domainName;

private final Path dbFile;

@SneakyThrows
public StackexchangeSideloader(Path pathToDbFile,
SentenceExtractor sentenceExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor
) {
this.dbFile = pathToDbFile;
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.sentenceExtractor = sentenceExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor;
}

Expand Down Expand Up @@ -109,7 +109,7 @@ private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) {

var url = new EdgeUrl(fullUrl);
var doc = Jsoup.parse(fullHtml.toString());
var dld = sentenceExtractor.extractSentences(doc);
var dld = sentenceExtractorProvider.get().extractSentences(doc);

ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, url);
Expand Down

0 comments on commit a433bbb

Please sign in to comment.