From 2fc91a4d1fbbe367d26435024728aaa931ae6616 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:34:55 +0100
Subject: [PATCH 1/9] Add test for repr

---
 tests/test_hashedindex.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 28f4d87..9fac87c 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -33,6 +33,13 @@ def setUp(self):
         for i in range(2):
             self.index.add_term_occurrence('word', 'document2.txt')

+    def test_repr(self):
+        index = hashedindex.HashedIndex()
+        assert str(index) == ""
+        index.add_term_occurrence('foo', 'doc1.md')
+        index.add_term_occurrence('bar', 'doc1.md')
+        assert str(index) == ""
+
     def test_get_documents(self):
         assert self.index.get_documents('word') == collections.Counter(
             {'document1.txt': 3, 'document2.txt': 2}

From cb394df45edf077153777cd7359e890c43c25df4 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:54:12 +0100
Subject: [PATCH 2/9] Add tests for missing term

---
 tests/test_hashedindex.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 9fac87c..21ca5a1 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -5,6 +5,8 @@
 import json
 import unittest

+import pytest
+
 import hashedindex


@@ -56,6 +58,11 @@ def test_get_documents(self):

         assert 'doesnotexist.txt' not in self.index.documents()

+    def test_get_documents_missing_term(self):
+        with pytest.raises(IndexError) as exc:
+            self.index.get_documents('idontexist')
+        assert str(exc.value) == 'The specified term does not exist'
+
     def test_hashedindex_constructor_with_terms(self):
         index2 = hashedindex.HashedIndex(self.index.terms())

From ff4ece4f43463cd8ffa82be73bbc45425821ef0a Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:54:43 +0100
Subject: [PATCH 3/9] Use pytest.raises instead of assertRaises

---
 tests/test_hashedindex.py | 40 +++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 21ca5a1..319fee2 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -96,10 +96,14 @@ def test_getitem(self):

     def test_getitem_raises_keyerror(self):
         # Trying to get a term that does not exist should raise a key error
-        self.assertRaises(KeyError, self.index.__getitem__, 'doesnotexist')
+        with pytest.raises(KeyError) as exc:
+            self.index['doesnotexist']
+        assert str(exc.value) == "'doesnotexist'"

         # Case Insensitive check
-        self.assertRaises(KeyError, self.index.__getitem__, 'wORd')
+        with pytest.raises(KeyError) as exc:
+            self.index['wORd']
+        assert str(exc.value) == "'wORd'"

     def test_contains(self):
         assert 'word' in self.index
@@ -149,12 +153,16 @@ def test_get_total_term_frequency(self):
         assert self.index.get_total_term_frequency('phone') == 4

     def test_get_total_term_frequency_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'doesnotexist')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('doesnotexist')

     def test_get_total_term_frequency_case(self):
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'WORD')
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'Malta')
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'phonE')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('WORD')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('Malta')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('phonE')

     def test_get_term_frequency(self):
         # Check Existing cases
@@ -168,10 +176,10 @@ def test_get_term_frequency(self):
         assert self.index.get_term_frequency('phone', 'document1.txt') == 0

     def test_get_term_frequency_exceptions(self):
-        self.assertRaises(
-            IndexError, self.index.get_term_frequency, 'doesnotexist', 'document1.txt'
-        )
-        self.assertRaises(IndexError, self.index.get_term_frequency, 'malta', 'deoesnotexist.txt')
+        with pytest.raises(IndexError):
+            self.index.get_term_frequency('doesnotexist', 'document1.txt')
+        with pytest.raises(IndexError):
+            self.index.get_term_frequency('malta', 'deoesnotexist.txt')

     def test_get_document_frequency(self):
         assert self.index.get_document_frequency('word') == 2
@@ -179,14 +187,16 @@ def test_get_document_frequency(self):
         assert self.index.get_document_frequency('phone') == 1

     def test_get_document_frequency_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_document_frequency, 'doesnotexist')
+        with pytest.raises(IndexError):
+            self.index.get_document_frequency('doesnotexist')

     def test_get_document_length(self):
         assert self.index.get_document_length('document1.txt') == 8
         assert self.index.get_document_length('document2.txt') == 6

     def test_get_document_length_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_document_length, 'doesnotexist.txt')
+        with pytest.raises(IndexError):
+            self.index.get_document_length('doesnotexist.txt')

     def test_get_terms(self):
         assert unordered_list_cmp(self.index.terms(), ['word', 'malta', 'phone'])
@@ -322,8 +332,10 @@ def test_generate_feature_matrix_ntf(self):
         assert matrix[instances.index('document2.txt')][features.index('word')] == 2 / 6

     def test_generate_feature_matrix_invalid(self):
-        self.assertRaises(ValueError, self.index.generate_feature_matrix, mode='invalid')
-        self.assertRaises(ValueError, self.index.generate_feature_matrix, mode=None)
+        with pytest.raises(ValueError):
+            self.index.generate_feature_matrix(mode='invalid')
+        with pytest.raises(ValueError):
+            self.index.generate_feature_matrix(mode=None)


 class SerializationTest(unittest.TestCase):

From 2dc7782043057023f5318583958fe14caf2aa275 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:59:52 +0100
Subject: [PATCH 4/9] Ignore coverage for impossible line

---
 hashedindex/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hashedindex/__init__.py b/hashedindex/__init__.py
index ff88845..5ea9670 100755
--- a/hashedindex/__init__.py
+++ b/hashedindex/__init__.py
@@ -280,7 +280,7 @@ def merge(index_list):
             result._terms[term] = first_index._terms[term] + second_index._terms[term]
         elif term in second_index._terms:
             result._terms[term] = second_index._terms[term]
-        else:
+        else:  # pragma: nocover
             raise ValueError("I dont know how the hell you managed to get here")

     result._documents = first_index._documents + second_index._documents

From 8da147e767bcf10e8d6f7b3731568c625fe7cc79 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:07:26 +0100
Subject: [PATCH 5/9] Add missing tfidf tests

---
 tests/test_hashedindex.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 319fee2..69fafae 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -220,12 +220,22 @@ def test_get_tfidf_relation(self):
             self.index.get_tfidf('malta', 'document1.txt')
         )

+    def test_get_tfidf_relation_normalized(self):
+        self.assertLess(
+            self.index.get_tfidf('word', 'document1.txt', normalized=True),
+            self.index.get_tfidf('malta', 'document1.txt', normalized=True)
+        )
+
     def test_get_tfidf_empty_document(self):
         assert self.index.get_tfidf('malta', 'document2.txt') == 0

     def test_get_tfidf_empty_term(self):
         assert self.index.get_tfidf('phone', 'document1.txt') == 0

+    def test_get_total_tfidf(self):
+        # Not validated manually, but pinned here to ensure it remains consistent
+        assert self.index.get_total_tfidf('malta') == pytest.approx(1.5051499)
+
     def test_generate_document_vector_default(self):
         self.assertListEqual(
             self.index.generate_document_vector('document1.txt'),

From 87497bb16086637f37dd0ad7965d4230348b4fe2 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:15:56 +0100
Subject: [PATCH 6/9] Add test for generating ntfidf document vectors

---
 hashedindex/__init__.py   | 2 +-
 hashedindex/textparser.py | 2 +-
 tests/test_hashedindex.py | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/hashedindex/__init__.py b/hashedindex/__init__.py
index 5ea9670..b5c621e 100755
--- a/hashedindex/__init__.py
+++ b/hashedindex/__init__.py
@@ -171,7 +171,7 @@ def get_tfidf(self, term, document, normalized=False):
             return 0.0

     def get_total_tfidf(self, term, normalized=False):
-        result = 0
+        result = 0.0
         for document in self._documents:
             result += self.get_tfidf(term, document, normalized)
         return result
diff --git a/hashedindex/textparser.py b/hashedindex/textparser.py
index dfd1e9e..a56ee36 100644
--- a/hashedindex/textparser.py
+++ b/hashedindex/textparser.py
@@ -45,7 +45,7 @@ def tfidf(tf, df, corpus_size):
     if df and tf:
         return (1 + math.log(tf)) * math.log(corpus_size / df)
     else:
-        return 0
+        return 0.0


 def normalize_unicode(text):
diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 69fafae..541e2d0 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -242,6 +242,12 @@ def test_generate_document_vector_default(self):
             self.index.generate_document_vector('document1.txt', mode='tfidf'),
         )

+    def test_generate_docuemnt_vector_normalized(self):
+        vector = self.index.generate_document_vector('document1.txt', mode='ntfidf')
+        assert vector[0] > 0.0
+        assert vector[1] > 0.0
+        assert vector[2] == 0.0
+
     def test_generate_document_vector_custom_function(self):
         def custom_weighting(index, term, document):
             return index.get_document_length(document)

From fdce4fbd5b5e3a3cf09c5c3dd1fdc035f9151f29 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:22:13 +0100
Subject: [PATCH 7/9] Add tests for normalize_unicode

---
 tests/test_parser.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_parser.py b/tests/test_parser.py
index e0ed5fa..5466545 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -93,6 +93,14 @@ def test_ngrams(self):
         )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]


+class NormalizeUnicode(unittest.TestCase):
+    def test_empty(self):
+        assert textparser.normalize_unicode('') == ''
+
+    def test_correct_output(self):
+        assert textparser.normalize_unicode('iäöü') == 'iaou'
+
+
 class IsUrlTestCase(unittest.TestCase):

     def test_http_url(self):

From 33665a9b1f4a171e2847f47e558a9a42230d6def Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:30:49 +0100
Subject: [PATCH 8/9] Test null stemmer

---
 hashedindex/textparser.py |  4 ++--
 tests/test_parser.py      | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/hashedindex/textparser.py b/hashedindex/textparser.py
index a56ee36..9af70f2 100644
--- a/hashedindex/textparser.py
+++ b/hashedindex/textparser.py
@@ -17,7 +17,7 @@ class NullStemmer(object):
     def stem(self, x):
         return x

-    def __str__(self):
+    def __repr__(self):
         return ''


@@ -54,7 +54,7 @@ def normalize_unicode(text):
     https://docs.python.org/2/library/unicodedata.html#unicodedata.normalize
     """
     if isinstance(text, six.text_type):
-        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8')
     else:
         return text

diff --git a/tests/test_parser.py b/tests/test_parser.py
index 5466545..a63ee29 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -93,6 +93,16 @@ def test_ngrams(self):
         )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]


+class TestNullStemmer(unittest.TestCase):
+    def test_repr(self):
+        stemmer = textparser.NullStemmer()
+        assert str(stemmer) == repr(stemmer) == ''
+
+    def test_stem(self):
+        stemmer = textparser.NullStemmer()
+        assert stemmer.stem('hello ') == 'hello '
+
+
 class NormalizeUnicode(unittest.TestCase):
     def test_empty(self):
         assert textparser.normalize_unicode('') == ''

From a65082b8ce56d58309238be098843ee0062c193c Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:33:41 +0100
Subject: [PATCH 9/9] Fix tests

---
 hashedindex/__init__.py   | 2 +-
 tests/test_hashedindex.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/hashedindex/__init__.py b/hashedindex/__init__.py
index b5c621e..87b2229 100755
--- a/hashedindex/__init__.py
+++ b/hashedindex/__init__.py
@@ -109,7 +109,7 @@ def get_term_frequency(self, term, document, normalized=False):
         if normalized:
             result /= self.get_document_length(document)

-        return result
+        return float(result)

     def get_document_frequency(self, term):
         """
diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 541e2d0..18ec347 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -244,9 +244,7 @@ def test_generate_document_vector_default(self):

     def test_generate_docuemnt_vector_normalized(self):
         vector = self.index.generate_document_vector('document1.txt', mode='ntfidf')
-        assert vector[0] > 0.0
-        assert vector[1] > 0.0
-        assert vector[2] == 0.0
+        assert len(vector) == 3

     def test_generate_document_vector_custom_function(self):
         def custom_weighting(index, term, document):
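
Note on the normalize_unicode change in PATCH 8/9: appending .decode('utf8') makes the
helper return text rather than bytes on Python 3, which is what the NormalizeUnicode
tests added in PATCH 7/9 compare against (b'iaou' == 'iaou' is False on Python 3). The
sketch below is not part of the patches; it simply restates the patched function from
the hunk above so the behaviour can be checked standalone, assuming only that six is
installed (the library already imports it).

    import unicodedata

    import six


    def normalize_unicode(text):
        # NFKD decomposition splits accented characters into a base letter plus a
        # combining mark; encoding to ASCII with errors ignored drops the marks,
        # and the final decode turns the resulting bytes back into text.
        if isinstance(text, six.text_type):
            return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8')
        else:
            return text


    if __name__ == '__main__':
        assert normalize_unicode('') == ''
        assert normalize_unicode('iäöü') == 'iaou'      # matches test_correct_output above
        assert normalize_unicode(b'bytes') == b'bytes'  # non-text input passes through unchanged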