Skip to content

Commit

Permalink
access simplemma functionality only via annif.simplemma_util
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Aug 10, 2023
1 parent 7b002f5 commit ec7d640
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 22 deletions.
9 changes: 2 additions & 7 deletions annif/analyzer/simplemma.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
"""Simplemma analyzer for Annif, based on simplemma lemmatizer."""
from __future__ import annotations

import simplemma

import annif.langsupport
import annif.simplemma_util

from . import analyzer

Expand All @@ -13,10 +11,7 @@ class SimplemmaAnalyzer(analyzer.Analyzer):

def __init__(self, param: str, **kwargs) -> None:
self.lang = param
self.lemmatizer = simplemma.Lemmatizer(
lemmatization_strategy=annif.langsupport.lemmatization_strategy
)
super().__init__(**kwargs)

def _normalize_word(self, word: str) -> str:
return self.lemmatizer.lemmatize(word, lang=self.lang)
return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
9 changes: 0 additions & 9 deletions annif/langsupport.py

This file was deleted.

15 changes: 15 additions & 0 deletions annif/simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

# How many language dictionaries to keep in memory at once (max)
LANG_CACHE_SIZE = 5

# Dictionary factory whose cache is capped at LANG_CACHE_SIZE.
_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
# Single strategy object built on that factory; it is passed both to the
# lemmatizer below and to every detector made by get_language_detector().
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
# Module-level Lemmatizer instance, created once when this module is imported.
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: str) -> LanguageDetector:
    """Create a Simplemma language detector for the given language.

    The detector is constructed with the module-level lemmatization strategy,
    the same strategy object that the module's ``lemmatizer`` is built with.

    :param lang: language passed on unchanged to ``LanguageDetector``
    :return: a newly constructed ``LanguageDetector``
    """
    detector = LanguageDetector(
        lang,
        lemmatization_strategy=_lemmatization_strategy,
    )
    return detector
9 changes: 3 additions & 6 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@

from typing import TYPE_CHECKING

from simplemma import LanguageDetector

import annif
import annif.langsupport
import annif.simplemma_util

from . import transform

Expand All @@ -31,9 +29,8 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = LanguageDetector(
self.project.language,
lemmatization_strategy=annif.langsupport.lemmatization_strategy,
self.language_detector = annif.simplemma_util.get_language_detector(
self.project.language
)

def transform_fn(self, text: str) -> str:
Expand Down

0 comments on commit ec7d640

Please sign in to comment.