Skip to content

Commit

Permalink
limit number of in-memory Simplemma dictionaries to at most 5
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Aug 10, 2023
1 parent 60331a0 commit 7b002f5
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 3 deletions.
7 changes: 6 additions & 1 deletion annif/analyzer/simplemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import simplemma

import annif.langsupport

from . import analyzer


Expand All @@ -11,7 +13,10 @@ class SimplemmaAnalyzer(analyzer.Analyzer):

def __init__(self, param: str, **kwargs) -> None:
# ``param`` is kept as the language that is passed to the lemmatizer on each call.
self.lang = param
# Build the lemmatizer on the strategy object defined in annif.langsupport, so
# that this analyzer goes through that module's dictionary factory rather than
# one created separately for this instance.
self.lemmatizer = simplemma.Lemmatizer(
lemmatization_strategy=annif.langsupport.lemmatization_strategy
)
super().__init__(**kwargs)

def _normalize_word(self, word: str) -> str:
return simplemma.lemmatize(word, lang=self.lang)
return self.lemmatizer.lemmatize(word, lang=self.lang)
9 changes: 9 additions & 0 deletions annif/langsupport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Language support and language detection functionality for Annif"""

from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once (max)

# Module-level objects, created once when this module is imported. The Simplemma
# analyzer and the language filter transform both pass ``lemmatization_strategy``
# to simplemma, so they go through this single factory (and its cache size limit)
# instead of each setting up their own.
dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
lemmatization_strategy = DefaultStrategy(dictionary_factory=dictionary_factory)
9 changes: 7 additions & 2 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

from typing import TYPE_CHECKING

from simplemma.langdetect import in_target_language
from simplemma import LanguageDetector

import annif
import annif.langsupport

from . import transform

Expand All @@ -30,6 +31,10 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = LanguageDetector(
self.project.language,
lemmatization_strategy=annif.langsupport.lemmatization_strategy,
)

def transform_fn(self, text: str) -> str:
if len(text) < self.text_min_length:
Expand All @@ -40,7 +45,7 @@ def transform_fn(self, text: str) -> str:
if len(sent) < self.sentence_min_length:
retained_sentences.append(sent)
continue
proportion = in_target_language(sent, lang=(self.project.language,))
proportion = self.language_detector.proportion_in_target_languages(sent)
if proportion >= self.min_ratio:
retained_sentences.append(sent)
return " ".join(retained_sentences)

0 comments on commit 7b002f5

Please sign in to comment.