From 3fb6176b64021ac4495ce05b7bbce6d0ad5c7afb Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 29 Jun 2021 22:49:11 +0300 Subject: [PATCH 1/3] Convert set of uris from DocumentDirectory to list of uris --- annif/backend/svc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/svc.py b/annif/backend/svc.py index e8e1f8617..38f886d8d 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -53,7 +53,7 @@ def _corpus_to_texts_and_classes(corpus): classes = [] for doc in corpus.documents: texts.append(doc.text) - classes.append(doc.uris[0]) + classes.append(list(doc.uris)[0]) return texts, classes def _train_classifier(self, veccorpus, classes): From 38d7aec2be477a5fbaeb26d51a1a466169725ab3 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 30 Jun 2021 16:30:15 +0300 Subject: [PATCH 2/3] Raise NotSupportedException for training SVC on docs with many subjects --- annif/backend/svc.py | 4 ++++ tests/test_backend_svc.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/annif/backend/svc.py b/annif/backend/svc.py index 38f886d8d..626f42de3 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -53,6 +53,10 @@ def _corpus_to_texts_and_classes(corpus): classes = [] for doc in corpus.documents: texts.append(doc.text) + if len(doc.uris) > 1: + raise NotSupportedException( + 'SVC backend does not support training on documents ' + + 'with multiple subjects.') classes.append(list(doc.uris)[0]) return texts, classes diff --git a/tests/test_backend_svc.py b/tests/test_backend_svc.py index 9cc5245d0..55810dbe8 100644 --- a/tests/test_backend_svc.py +++ b/tests/test_backend_svc.py @@ -69,6 +69,17 @@ def test_svc_train_cached(datadir, project): svc.train("cached") +def test_svc_train_multiple_subjects(datadir, document_corpus, project): + svc_type = annif.backend.get_backend('svc') + svc = svc_type( + backend_id='svc', + config_params={}, + project=project) + + with pytest.raises(NotSupportedException): + svc.train(document_corpus) + + def test_svc_train_nodocuments(datadir, project, empty_corpus): svc_type = annif.backend.get_backend('svc') svc = svc_type( From efced0aa1aa8bc7350738cee52224b042b061669 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 1 Jul 2021 09:20:43 +0300 Subject: [PATCH 3/3] Fixture of corpus of docs with one subject only for SVC backend training --- tests/conftest.py | 11 +++++++++++ tests/test_backend_svc.py | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f92e40e92..5e8f1ab45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -96,6 +96,17 @@ def document_corpus(subject_index): return doc_corpus +@pytest.fixture(scope='module') +def document_corpus_single_subject(document_corpus): + docs_single_subj = [] + for doc in document_corpus.documents: + uri = list(doc.uris)[0] if len(doc.uris) > 0 else None + label = list(doc.labels)[0] if len(doc.labels) > 0 else None + docs_single_subj.append( + annif.corpus.Document(doc.text, {uri}, {label})) + return annif.corpus.DocumentList(docs_single_subj) + + @pytest.fixture(scope='module') def pretrained_vectors(): return py.path.local(os.path.join( diff --git a/tests/test_backend_svc.py b/tests/test_backend_svc.py index 55810dbe8..df6928ba1 100644 --- a/tests/test_backend_svc.py +++ b/tests/test_backend_svc.py @@ -34,26 +34,26 @@ def test_svc_suggest_no_vectorizer(project): svc.suggest("example text") -def test_svc_train(datadir, document_corpus, project): +def test_svc_train(datadir, document_corpus_single_subject, project): svc_type = annif.backend.get_backend('svc') svc = svc_type( backend_id='svc', config_params={}, project=project) - svc.train(document_corpus) + svc.train(document_corpus_single_subject) assert svc._model is not None assert datadir.join('svc-model.gz').exists() -def test_svc_train_ngram(datadir, document_corpus, project): +def test_svc_train_ngram(datadir, document_corpus_single_subject, project): svc_type = annif.backend.get_backend('svc') svc = svc_type( backend_id='svc', config_params={'ngram': 2}, project=project) - svc.train(document_corpus) + svc.train(document_corpus_single_subject) assert svc._model is not None assert datadir.join('svc-model.gz').exists()