Skip to content

Commit

Permalink
Merge pull request #501 from NatLibFi/fix-training-svc-on-fulltext-corpus
Browse files Browse the repository at this point in the history

Fix training SVC backend on fulltext corpus
  • Loading branch information
juhoinkinen committed Jul 1, 2021
2 parents 2b8e3c9 + efced0a commit ec8762d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 5 deletions.
6 changes: 5 additions & 1 deletion annif/backend/svc.py
Expand Up @@ -53,7 +53,11 @@ def _corpus_to_texts_and_classes(corpus):
classes = []
for doc in corpus.documents:
texts.append(doc.text)
classes.append(doc.uris[0])
if len(doc.uris) > 1:
raise NotSupportedException(
'SVC backend does not support training on documents ' +
'with multiple subjects.')
classes.append(list(doc.uris)[0])
return texts, classes

def _train_classifier(self, veccorpus, classes):
Expand Down
11 changes: 11 additions & 0 deletions tests/conftest.py
Expand Up @@ -96,6 +96,17 @@ def document_corpus(subject_index):
return doc_corpus


@pytest.fixture(scope='module')
def document_corpus_single_subject(document_corpus):
    """Build a corpus in which every document carries a single subject.

    Each document of ``document_corpus`` is re-created with its original
    text, a one-element set holding one of its URIs and a one-element set
    holding one of its labels. When a document has no URIs (or no labels),
    the corresponding set contains ``None`` instead.
    """
    def _pick_one(values):
        # First element in iteration order, or None for an empty collection.
        return next(iter(values), None)

    # NOTE(review): the URI and the label are picked independently of each
    # other, so they are not guaranteed to describe the same subject --
    # confirm this is acceptable for the tests relying on this fixture.
    return annif.corpus.DocumentList([
        annif.corpus.Document(
            doc.text, {_pick_one(doc.uris)}, {_pick_one(doc.labels)})
        for doc in document_corpus.documents])


@pytest.fixture(scope='module')
def pretrained_vectors():
return py.path.local(os.path.join(
Expand Down
19 changes: 15 additions & 4 deletions tests/test_backend_svc.py
Expand Up @@ -34,26 +34,26 @@ def test_svc_suggest_no_vectorizer(project):
svc.suggest("example text")


def test_svc_train(datadir, document_corpus, project):
def test_svc_train(datadir, document_corpus_single_subject, project):
svc_type = annif.backend.get_backend('svc')
svc = svc_type(
backend_id='svc',
config_params={},
project=project)

svc.train(document_corpus)
svc.train(document_corpus_single_subject)
assert svc._model is not None
assert datadir.join('svc-model.gz').exists()


def test_svc_train_ngram(datadir, document_corpus, project):
def test_svc_train_ngram(datadir, document_corpus_single_subject, project):
svc_type = annif.backend.get_backend('svc')
svc = svc_type(
backend_id='svc',
config_params={'ngram': 2},
project=project)

svc.train(document_corpus)
svc.train(document_corpus_single_subject)
assert svc._model is not None
assert datadir.join('svc-model.gz').exists()

Expand All @@ -69,6 +69,17 @@ def test_svc_train_cached(datadir, project):
svc.train("cached")


def test_svc_train_multiple_subjects(datadir, document_corpus, project):
    """Training the SVC backend on ``document_corpus`` must be rejected.

    The backend is created with an empty configuration and trained on the
    unmodified ``document_corpus`` fixture (presumably containing documents
    with more than one subject -- verify against the fixture data); the
    call is expected to raise ``NotSupportedException``.
    """
    backend = annif.backend.get_backend('svc')(
        backend_id='svc',
        config_params={},
        project=project)

    with pytest.raises(NotSupportedException):
        backend.train(document_corpus)


def test_svc_train_nodocuments(datadir, project, empty_corpus):
svc_type = annif.backend.get_backend('svc')
svc = svc_type(
Expand Down

0 comments on commit ec8762d

Please sign in to comment.