YAKE backend #461

Merged · 57 commits · May 14, 2021

Commits
7c4fad8
Initial YAKE integration commit
juhoinkinen Jan 5, 2021
45b18a6
Cleanup & pep8 fixes
juhoinkinen Jan 12, 2021
0995535
Increase keyphrase word number to 4
juhoinkinen Jan 12, 2021
1c6e0e8
Use sets of uris instead of lists of uris in index
juhoinkinen Jan 12, 2021
4bf08da
Also put alt and hidden labels in index instead of just prefs
juhoinkinen Jan 12, 2021
85eb8f7
More straightforward score transformation
juhoinkinen Jan 12, 2021
5a97da5
Shorten & simplify code
juhoinkinen Jan 12, 2021
583272a
Remove unused import
juhoinkinen Jan 12, 2021
db9d4b0
Load graph using as_graph method of the vocab module
juhoinkinen Jan 26, 2021
b697918
Configurable label types for index creation
juhoinkinen Jan 26, 2021
630af04
Don't unnecessarily pass a label to SubjectSuggestion
juhoinkinen Jan 26, 2021
44611e9
Replace negative Yake scores with zero
juhoinkinen Jan 28, 2021
4169945
Omit processing vocabulary label (it's fetched when creating ListSugg…
juhoinkinen Jan 28, 2021
ea8d005
Improve variable names; remove unnecessary comments
juhoinkinen Jan 28, 2021
3d0afb9
Get language for label picking from params (to ease testing)
juhoinkinen Jan 29, 2021
f9354a0
Config switch for removing a specifier in parenthesis from index labels
juhoinkinen Jan 29, 2021
7116aa0
Test for suggest method of Yake
juhoinkinen Jan 29, 2021
154ae55
Skip Yake tests when Yake is not installed (Python 3.7 in Travis)
juhoinkinen Jan 29, 2021
9ce122f
Try to simplify _create_index method; reorder methods
juhoinkinen Feb 1, 2021
3651488
Better name for option (remove_parentheses)
juhoinkinen Feb 3, 2021
4b5d730
Install Yake from PyPI, not from GitHub
juhoinkinen Feb 5, 2021
b3578ef
Get labels of a concept using a helper method in index creation
juhoinkinen Feb 5, 2021
64e015a
Return index from create_index() instead setting the index field in t…
juhoinkinen Feb 5, 2021
7741097
Combine scores using "additive conflation"
juhoinkinen Feb 5, 2021
5bde900
Create Yake object on suggest (allows setting Yake params on runtime)
juhoinkinen Feb 5, 2021
b2c08cf
Avoid crash for empty or non-alphanumeric input
juhoinkinen Feb 5, 2021
daee74b
Add unit tests
juhoinkinen Feb 5, 2021
c19e8b7
Make graph_project a common pytest fixture (move it to conftest.py)
juhoinkinen Feb 11, 2021
af1129e
Avoid need for clumsy mapping for labeltypes by using directly SKOS n…
juhoinkinen Feb 11, 2021
dca4474
Avoid need for "default_label_types" name for defaults
juhoinkinen Feb 11, 2021
2917628
Refactor attempting to resolve complexity complaints by CodeClimate
juhoinkinen Feb 12, 2021
6a3aebd
Add test for invalid label types
juhoinkinen Feb 12, 2021
b06ca9c
Remove pointless test
juhoinkinen Feb 12, 2021
79e225b
Test for removing parentheses from label when creating index
juhoinkinen Feb 12, 2021
f091119
Add methods for accessing SKOS concepts & labels via AnnifVocabulary
juhoinkinen Feb 17, 2021
fb0b7b5
Access SKOS concepts & labels via AnnifVocabulary in Yake
juhoinkinen Feb 17, 2021
d1c2af5
Access SKOS graph via skos_vocab in AnnifVocabulary
juhoinkinen Feb 17, 2021
f080d02
Reduce code duplication by using the skos_concepts property
juhoinkinen Feb 17, 2021
1fdf803
Update YAKE to 0.4.5 (eliminates warnings on input with no keywords)
juhoinkinen Mar 10, 2021
2fb2244
Install Yake in GH Actions jobs for unit tests for Python 3.6 & 3.8
juhoinkinen Mar 17, 2021
0b1cacd
Remove condition and debug message for neg. Yake score, use max() ins…
juhoinkinen Mar 17, 2021
975b1bc
Adapt to current master: remove unnecessary skos_project fixture
juhoinkinen Apr 30, 2021
f354015
Adapt to current master: altLabels in archaeology corpus have changed
juhoinkinen Apr 30, 2021
d83b9fb
Adapt to current master: use project fixture in test_stwfsa, remove g…
juhoinkinen Apr 30, 2021
315b474
Remove test for removing parentheses from labels
juhoinkinen Apr 30, 2021
c797b56
Implement get_skos_concept_labels using list comprehension
juhoinkinen Apr 30, 2021
fcacef8
Rename & refactor methods for SKOS vocabulary
juhoinkinen Apr 30, 2021
d6c2aa6
Rename method for accessing SKOS vocab as a file object
juhoinkinen Apr 30, 2021
6ddb557
Adjust license explanation
juhoinkinen Apr 30, 2021
27e0cdc
Better name and docstring for the property for accessing SKOS vocabulary
juhoinkinen May 5, 2021
338c9b6
Change log message for loading index to debug level
juhoinkinen May 5, 2021
d09e8e3
Readjust license explanation for Yake backend
juhoinkinen May 11, 2021
06fe6ef
Pass project's language to AnnifVocabulary and adapt fixtures as needed
juhoinkinen May 11, 2021
4d67d44
Rename lemmatize_phrase function to normalize_phrase
juhoinkinen May 12, 2021
9597526
Use atomic_save for saving YAKE index
juhoinkinen May 12, 2021
2dafa54
Adjust license explanation comment to point to license section in REA…
juhoinkinen May 12, 2021
9a2127a
Truncate long log messages for objects to be saved
juhoinkinen May 12, 2021
4 changes: 2 additions & 2 deletions .github/workflows/python-package.yml
@@ -32,9 +32,9 @@ jobs:
 # Install the optional neural network dependencies (TensorFlow and LMDB)
 # - except for one Python version (3.7) so that we can test also without them
 if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[nn]; fi
-# Install the optional Omikuji dependency
+# Install the optional Omikuji and YAKE dependencies
 # - except for one Python version (3.7) so that we can test also without them
-if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[omikuji]; fi
+if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[omikuji,yake]; fi
 # Install the optional fastText dependencies for Python 3.7 only
 if [[ ${{ matrix.python-version }} == '3.7' ]]; then pip install .[fasttext]; fi
 # For Python 3.6
9 changes: 8 additions & 1 deletion README.md
@@ -133,4 +133,11 @@ Zenodo DOI:

 The code in this repository is licensed under Apache License 2.0, except for the
 dependencies included under `annif/static/css` and `annif/static/js`,
-which have their own licenses. See the file headers for details.
+which have their own licenses; see the file headers for details.
+Please note that the [YAKE](https://github.com/LIAAD/yake) library is licensed
+under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt), while Annif is
+licensed under the Apache License 2.0. The licenses are compatible, but
+depending on legal interpretation, the terms of the GPLv3 (for example the
+requirement to publish corresponding source code when publishing an executable
+application) may be considered to apply to the whole of Annif+Yake if you
+decide to install the optional Yake dependency.
6 changes: 6 additions & 0 deletions annif/backend/__init__.py
@@ -60,3 +60,9 @@ def get_backend(backend_id):
     register_backend(omikuji.OmikujiBackend)
 except ImportError:
     annif.logger.debug("Omikuji not available, not enabling omikuji backend")
+
+try:
+    from . import yake
+    register_backend(yake.YakeBackend)
+except ImportError:
+    annif.logger.debug("YAKE not available, not enabling yake backend")
2 changes: 1 addition & 1 deletion annif/backend/maui.py
@@ -101,7 +101,7 @@ def _upload_vocabulary(self, params):
         json = {}
         try:
             resp = requests.put(self.tagger_url(params) + '/vocab',
-                                data=self.project.vocab.as_skos())
+                                data=self.project.vocab.as_skos_file())
             try:
                 json = resp.json()
             except ValueError:
184 changes: 184 additions & 0 deletions annif/backend/yake.py
@@ -0,0 +1,184 @@
"""Annif backend using Yake keyword extraction"""
# For license remarks of this backend see README.md:
# https://github.com/NatLibFi/Annif#license.

import yake
import joblib
import os.path
import re
from collections import defaultdict
from rdflib.namespace import SKOS
import annif.util
from . import backend
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from annif.exception import ConfigurationException


class YakeBackend(backend.AnnifBackend):
    """Yake based backend for Annif"""
    name = "yake"
    needs_subject_index = False

    # defaults for uninitialized instances
    _index = None
    _graph = None
    INDEX_FILE = 'yake-index'

    DEFAULT_PARAMETERS = {
        'max_ngram_size': 4,
        'deduplication_threshold': 0.9,
        'deduplication_algo': 'levs',
        'window_size': 1,
        'num_keywords': 100,
        'features': None,
        'label_types': ['prefLabel', 'altLabel'],
        'remove_parentheses': False
    }

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    @property
    def is_trained(self):
        return True

    @property
    def label_types(self):
        if type(self.params['label_types']) == str:  # Label types set by user
            label_types = [lt.strip() for lt
                           in self.params['label_types'].split(',')]
            self._validate_label_types(label_types)
        else:
            label_types = self.params['label_types']  # The defaults
        return [getattr(SKOS, lt) for lt in label_types]

    def _validate_label_types(self, label_types):
        for lt in label_types:
            if lt not in ('prefLabel', 'altLabel', 'hiddenLabel'):
                raise ConfigurationException(
                    f'invalid label type {lt}', backend_id=self.backend_id)

    def initialize(self):
        self._initialize_index()

    def _initialize_index(self):
        if self._index is None:
            path = os.path.join(self.datadir, self.INDEX_FILE)
            if os.path.exists(path):
                self._index = joblib.load(path)
                self.debug(
                    f'Loaded index from {path} with {len(self._index)} labels')
            else:
                self.info('Creating index')
                self._index = self._create_index()
                self._save_index(path)
                self.info(f'Created index with {len(self._index)} labels')

    def _save_index(self, path):
        annif.util.atomic_save(
            self._index,
            self.datadir,
            self.INDEX_FILE,
            method=joblib.dump)

    def _create_index(self):
        index = defaultdict(set)
        skos_vocab = self.project.vocab.skos
        for concept in skos_vocab.concepts:
            uri = str(concept)
            labels = skos_vocab.get_concept_labels(
                concept, self.label_types, self.params['language'])
            for label in labels:
                label = self._normalize_label(label)
                index[label].add(uri)
        index.pop('', None)  # Remove possible empty string entry
        return dict(index)

    def _normalize_label(self, label):
        label = str(label)
        if annif.util.boolean(self.params['remove_parentheses']):
            label = re.sub(r' \(.*\)', '', label)
        normalized_label = self._normalize_phrase(label)
        return self._sort_phrase(normalized_label)

    def _normalize_phrase(self, phrase):
        normalized = []
        for word in phrase.split():
            normalized.append(
                self.project.analyzer.normalize_word(word).lower())
        return ' '.join(normalized)

    def _sort_phrase(self, phrase):
        words = phrase.split()
        return ' '.join(sorted(words))

    def _suggest(self, text, params):
        self.debug(
            f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
        limit = int(params['limit'])

        self._kw_extractor = yake.KeywordExtractor(
            lan=params['language'],
            n=int(params['max_ngram_size']),
            dedupLim=float(params['deduplication_threshold']),
            dedupFunc=params['deduplication_algo'],
            windowsSize=int(params['window_size']),
            top=int(params['num_keywords']),
            features=self.params['features'])
        keyphrases = self._kw_extractor.extract_keywords(text)
        suggestions = self._keyphrases2suggestions(keyphrases)

        subject_suggestions = [SubjectSuggestion(
            uri=uri,
            label=None,
            notation=None,
            score=score)
            for uri, score in suggestions[:limit] if score > 0.0]
        return ListSuggestionResult.create_from_index(subject_suggestions,
                                                      self.project.subjects)

    def _keyphrases2suggestions(self, keyphrases):
        suggestions = []
        not_matched = []
        for kp, score in keyphrases:
            uris = self._keyphrase2uris(kp)
            for uri in uris:
                suggestions.append(
                    (uri, self._transform_score(score)))
            if not uris:
                not_matched.append((kp, self._transform_score(score)))
        # Remove duplicate uris, conflating the scores
        suggestions = self._combine_suggestions(suggestions)
        # [Reviewer comment from a project member on the debug message below:
        # "In a future version, I think these non-matched keyphrases should be
        # propagated back to the user as well, but it could be done in a
        # subsequent PR as it requires a lot more scaffolding."]
        self.debug('Keyphrases not matched:\n' + '\t'.join(
            [kp[0] + ' ' + str(kp[1]) for kp
             in sorted(not_matched, reverse=True, key=lambda kp: kp[1])]))
        return suggestions

    def _keyphrase2uris(self, keyphrase):
        keyphrase = self._normalize_phrase(keyphrase)
        keyphrase = self._sort_phrase(keyphrase)
        return self._index.get(keyphrase, [])

    def _transform_score(self, score):
        score = max(score, 0)
        return 1.0 / (score + 1)

    def _combine_suggestions(self, suggestions):
        combined_suggestions = {}
        for uri, score in suggestions:
            if uri not in combined_suggestions:
                combined_suggestions[uri] = score
            else:
                old_score = combined_suggestions[uri]
                combined_suggestions[uri] = self._combine_scores(
                    score, old_score)
        return list(combined_suggestions.items())

    def _combine_scores(self, score1, score2):
        # The result is never smaller than the greater input
        score1 = score1/2 + 0.5
        score2 = score2/2 + 0.5
        confl = score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
        return (confl-0.5) * 2
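
For reference, a standalone sketch of the scoring math above (not part of the diff): Yake scores are "lower is better", so _transform_score flips them into the (0, 1] range, and _combine_scores implements the "additive conflation" from commit 7741097, which never returns less than the greater of its two inputs.

def transform_score(score):
    # Clamp negative Yake scores to zero (commit 44611e9), then invert: a
    # Yake score of 0 (the best possible) maps to 1.0, larger scores to ~0
    score = max(score, 0)
    return 1.0 / (score + 1)

def combine_scores(score1, score2):
    # Map [0, 1] scores into [0.5, 1], conflate, and map back to [0, 1]
    score1 = score1 / 2 + 0.5
    score2 = score2 / 2 + 0.5
    confl = score1 * score2 / (score1 * score2 + (1 - score1) * (1 - score2))
    return (confl - 0.5) * 2

assert transform_score(-0.2) == 1.0    # negative scores clamped to the maximum
assert abs(transform_score(1.0) - 0.5) < 1e-12
assert combine_scores(0.5, 0.8) > 0.8  # never below the greater input
print(combine_scores(0.5, 0.8))        # ~0.93
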
17 changes: 14 additions & 3 deletions annif/corpus/skos.py
@@ -35,9 +35,7 @@ def __init__(self, path, language):

     @property
     def subjects(self):
-        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
-            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
-                continue
+        for concept in self.concepts:
             labels = self.graph.preferredLabel(concept, lang=self.language)
             notation = self.graph.value(concept, SKOS.notation, None, any=True)
             if not labels:
@@ -48,6 +46,19 @@ def subjects(self):
             yield Subject(uri=str(concept), label=label, notation=notation,
                           text=None)

+    @property
+    def concepts(self):
+        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
+            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
+                continue
+            yield concept
+
+    def get_concept_labels(self, concept, label_types, language):
+        return [str(label)
+                for label_type in label_types
+                for label in self.graph.objects(concept, label_type)
+                if label.language == language]
+
     @staticmethod
     def is_rdf_file(path):
         """return True if the path looks like an RDF file that can be loaded
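
A minimal usage sketch of the two new helpers (hypothetical file name and language, assuming a SKOS/Turtle vocabulary on disk; not part of the diff):

from rdflib.namespace import SKOS
from annif.corpus import SubjectFileSKOS

vocab = SubjectFileSKOS('subjects.ttl', 'en')
for concept in vocab.concepts:  # owl:deprecated concepts are skipped
    # prefLabels and altLabels of the concept in the requested language
    labels = vocab.get_concept_labels(
        concept, [SKOS.prefLabel, SKOS.altLabel], 'en')
    print(str(concept), labels)
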
3 changes: 2 additions & 1 deletion annif/project.py
@@ -143,7 +143,8 @@ def vocab(self):
             raise ConfigurationException("vocab setting is missing",
                                          project_id=self.project_id)
         self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
-                                                  self._base_datadir)
+                                                  self._base_datadir,
+                                                  self.language)
         return self._vocab

     @property
2 changes: 1 addition & 1 deletion annif/util.py
@@ -19,7 +19,7 @@ def atomic_save(obj, dirname, filename, method=None):
     tempfd, tempfilename = tempfile.mkstemp(
         prefix=prefix, suffix=suffix, dir=dirname)
     os.close(tempfd)
-    logger.debug('saving %s to temporary file %s', str(obj), tempfilename)
+    logger.debug('saving %s to temporary file %s', str(obj)[:90], tempfilename)
     if method is not None:
         method(obj, tempfilename)
     else:
27 changes: 18 additions & 9 deletions annif/vocab.py
@@ -1,7 +1,6 @@
 """Vocabulary management functionality for Annif"""

 import os.path
-import rdflib.graph
 import annif
 import annif.corpus
 import annif.util
@@ -18,9 +17,11 @@ class AnnifVocabulary(DatadirMixin):
     # defaults for uninitialized instances
     _subjects = None

-    def __init__(self, vocab_id, datadir):
+    def __init__(self, vocab_id, datadir, language):
         DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
         self.vocab_id = vocab_id
+        self.language = language
+        self._skos_vocab = None

     def _create_subject_index(self, subject_corpus):
         self._subjects = annif.corpus.SubjectIndex(subject_corpus)
@@ -55,6 +56,19 @@ def subjects(self):
                 "subject file {} not found".format(path))
         return self._subjects

+    @property
+    def skos(self):
+        """return the subject vocabulary from SKOS file"""
+        if self._skos_vocab is None:
+            path = os.path.join(self.datadir, 'subjects.ttl')
+            if os.path.exists(path):
+                logger.debug(f'loading graph from {path}')
+                self._skos_vocab = annif.corpus.SubjectFileSKOS(path,
+                                                                self.language)
+            else:
+                raise NotInitializedException(f'graph file {path} not found')
+        return self._skos_vocab
+
     def load_vocabulary(self, subject_corpus, language):
         """load subjects from a subject corpus and save them into a
         SKOS/Turtle file for later use"""
@@ -67,15 +81,10 @@ def load_vocabulary(self, subject_corpus, language):
         subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'),
                                  language)

-    def as_skos(self):
+    def as_skos_file(self):
         """return the vocabulary as a file object, in SKOS/Turtle syntax"""
         return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb')

     def as_graph(self):
         """return the vocabulary as an rdflib graph"""
-        g = rdflib.graph.Graph()
-        g.load(
-            os.path.join(self.datadir, 'subjects.ttl'),
-            format='ttl'
-        )
-        return g
+        return self.skos.graph
8 changes: 8 additions & 0 deletions projects.cfg.dist
@@ -111,6 +111,14 @@ backend=omikuji
 analyzer=snowball(english)
 vocab=yso-en

+[yake-fi]
+name=YAKE Finnish
+language=fi
+backend=yake
+vocab=yso-fi
+analyzer=voikko(fi)
+input_limit=20000
+
 [ensemble-fi]
 name=Ensemble Finnish
 language=fi
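
Since the backend is always considered trained (its is_trained property returns True), a project configured like the above should be usable as soon as the yso-fi vocabulary has been loaded; for example, with something like `annif suggest yake-fi < document.txt`.
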
1 change: 1 addition & 0 deletions setup.py
@@ -44,6 +44,7 @@ def read(fname):
         'vw': ['vowpalwabbit==8.8.1'],
         'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'],
         'omikuji': ['omikuji==0.3.*'],
+        'yake': ['yake==0.4.5'],
         'dev': [
             'codecov',
             'pytest-cov',
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -68,7 +68,7 @@ def subject_file():

 @pytest.fixture(scope='module')
 def vocabulary(datadir):
-    vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir)
+    vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir, 'fi')
     subjfile = os.path.join(
         os.path.dirname(__file__),
         'corpora',