From 2fc91a4d1fbbe367d26435024728aaa931ae6616 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:34:55 +0100
Subject: [PATCH 1/9] Add test for repr

---
 tests/test_hashedindex.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 28f4d87..9fac87c 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -33,6 +33,13 @@ def setUp(self):
         for i in range(2):
             self.index.add_term_occurrence('word', 'document2.txt')

+    def test_repr(self):
+        index = hashedindex.HashedIndex()
+        assert str(index) == ""
+        index.add_term_occurrence('foo', 'doc1.md')
+        index.add_term_occurrence('bar', 'doc1.md')
+        assert str(index) == ""
+
     def test_get_documents(self):
         assert self.index.get_documents('word') == collections.Counter(
             {'document1.txt': 3, 'document2.txt': 2}

From cb394df45edf077153777cd7359e890c43c25df4 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:54:12 +0100
Subject: [PATCH 2/9] Add tests for missing term

---
 tests/test_hashedindex.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 9fac87c..21ca5a1 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -5,6 +5,8 @@
 import json
 import unittest

+import pytest
+
 import hashedindex


@@ -56,6 +58,11 @@ def test_get_documents(self):

         assert 'doesnotexist.txt' not in self.index.documents()

+    def test_get_documents_missing_term(self):
+        with pytest.raises(IndexError) as exc:
+            self.index.get_documents('idontexist')
+        assert str(exc.value) == 'The specified term does not exist'
+
     def test_hashedindex_constructor_with_terms(self):
         index2 = hashedindex.HashedIndex(self.index.terms())

From ff4ece4f43463cd8ffa82be73bbc45425821ef0a Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:54:43 +0100
Subject: [PATCH 3/9] Use pytest.raises instead of assertRaises

---
 tests/test_hashedindex.py | 40 +++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 21ca5a1..319fee2 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -96,10 +96,14 @@ def test_getitem(self):

     def test_getitem_raises_keyerror(self):
         # Trying to get a term that does not exist should raise a key error
-        self.assertRaises(KeyError, self.index.__getitem__, 'doesnotexist')
+        with pytest.raises(KeyError) as exc:
+            self.index['doesnotexist']
+        assert str(exc.value) == "'doesnotexist'"

         # Case Insensitive check
-        self.assertRaises(KeyError, self.index.__getitem__, 'wORd')
+        with pytest.raises(KeyError) as exc:
+            self.index['wORd']
+        assert str(exc.value) == "'wORd'"

     def test_contains(self):
         assert 'word' in self.index
@@ -149,12 +153,16 @@ def test_get_total_term_frequency(self):
         assert self.index.get_total_term_frequency('phone') == 4

     def test_get_total_term_frequency_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'doesnotexist')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('doesnotexist')

     def test_get_total_term_frequency_case(self):
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'WORD')
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'Malta')
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'phonE')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('WORD')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('Malta')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('phonE')

     def test_get_term_frequency(self):
         # Check Existing cases
@@ -168,10 +176,10 @@ def test_get_term_frequency(self):
         assert self.index.get_term_frequency('phone', 'document1.txt') == 0

     def test_get_term_frequency_exceptions(self):
-        self.assertRaises(
-            IndexError, self.index.get_term_frequency, 'doesnotexist', 'document1.txt'
-        )
-        self.assertRaises(IndexError, self.index.get_term_frequency, 'malta', 'deoesnotexist.txt')
+        with pytest.raises(IndexError):
+            self.index.get_term_frequency('doesnotexist', 'document1.txt')
+        with pytest.raises(IndexError):
+            self.index.get_term_frequency('malta', 'deoesnotexist.txt')

     def test_get_document_frequency(self):
         assert self.index.get_document_frequency('word') == 2
@@ -179,14 +187,16 @@ def test_get_document_frequency(self):
         assert self.index.get_document_frequency('phone') == 1

     def test_get_document_frequency_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_document_frequency, 'doesnotexist')
+        with pytest.raises(IndexError):
+            self.index.get_document_frequency('doesnotexist')

     def test_get_document_length(self):
         assert self.index.get_document_length('document1.txt') == 8
         assert self.index.get_document_length('document2.txt') == 6

     def test_get_document_length_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_document_length, 'doesnotexist.txt')
+        with pytest.raises(IndexError):
+            self.index.get_document_length('doesnotexist.txt')

     def test_get_terms(self):
         assert unordered_list_cmp(self.index.terms(), ['word', 'malta', 'phone'])
@@ -322,8 +332,10 @@ def test_generate_feature_matrix_ntf(self):
         assert matrix[instances.index('document2.txt')][features.index('word')] == 2 / 6

     def test_generate_feature_matrix_invalid(self):
-        self.assertRaises(ValueError, self.index.generate_feature_matrix, mode='invalid')
-        self.assertRaises(ValueError, self.index.generate_feature_matrix, mode=None)
+        with pytest.raises(ValueError):
+            self.index.generate_feature_matrix(mode='invalid')
+        with pytest.raises(ValueError):
+            self.index.generate_feature_matrix(mode=None)


 class SerializationTest(unittest.TestCase):

From 2dc7782043057023f5318583958fe14caf2aa275 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 12:59:52 +0100
Subject: [PATCH 4/9] Ignore coverage for impossible line

---
 hashedindex/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hashedindex/__init__.py b/hashedindex/__init__.py
index ff88845..5ea9670 100755
--- a/hashedindex/__init__.py
+++ b/hashedindex/__init__.py
@@ -280,7 +280,7 @@ def merge(index_list):
             result._terms[term] = first_index._terms[term] + second_index._terms[term]
         elif term in second_index._terms:
             result._terms[term] = second_index._terms[term]
-        else:
+        else:  # pragma: nocover
             raise ValueError("I dont know how the hell you managed to get here")

     result._documents = first_index._documents + second_index._documents

From 8da147e767bcf10e8d6f7b3731568c625fe7cc79 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:07:26 +0100
Subject: [PATCH 5/9] Add missing tfidf tests

---
 tests/test_hashedindex.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 319fee2..69fafae 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -220,12 +220,22 @@ def test_get_tfidf_relation(self):
             self.index.get_tfidf('malta', 'document1.txt')
         )

+    def test_get_tfidf_relation_normalized(self):
+        self.assertLess(
+            self.index.get_tfidf('word', 'document1.txt', normalized=True),
+            self.index.get_tfidf('malta', 'document1.txt', normalized=True)
+        )
+
     def test_get_tfidf_empty_document(self):
         assert self.index.get_tfidf('malta', 'document2.txt') == 0

     def test_get_tfidf_empty_term(self):
         assert self.index.get_tfidf('phone', 'document1.txt') == 0

+    def test_get_total_tfidf(self):
+        # Not validated manually, but pinned here to ensure it remains consistent
+        assert self.index.get_total_tfidf('malta') == pytest.approx(1.5051499)
+
     def test_generate_document_vector_default(self):
         self.assertListEqual(
             self.index.generate_document_vector('document1.txt'),

From 87497bb16086637f37dd0ad7965d4230348b4fe2 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:15:56 +0100
Subject: [PATCH 6/9] Add test for generating ntfidf document vectors

---
 hashedindex/__init__.py   | 2 +-
 hashedindex/textparser.py | 2 +-
 tests/test_hashedindex.py | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/hashedindex/__init__.py b/hashedindex/__init__.py
index 5ea9670..b5c621e 100755
--- a/hashedindex/__init__.py
+++ b/hashedindex/__init__.py
@@ -171,7 +171,7 @@ def get_tfidf(self, term, document, normalized=False):
             return 0.0

     def get_total_tfidf(self, term, normalized=False):
-        result = 0
+        result = 0.0
         for document in self._documents:
             result += self.get_tfidf(term, document, normalized)
         return result
diff --git a/hashedindex/textparser.py b/hashedindex/textparser.py
index dfd1e9e..a56ee36 100644
--- a/hashedindex/textparser.py
+++ b/hashedindex/textparser.py
@@ -45,7 +45,7 @@ def tfidf(tf, df, corpus_size):
     if df and tf:
         return (1 + math.log(tf)) * math.log(corpus_size / df)
     else:
-        return 0
+        return 0.0


 def normalize_unicode(text):
diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 69fafae..541e2d0 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -242,6 +242,12 @@ def test_generate_document_vector_default(self):
             self.index.generate_document_vector('document1.txt', mode='tfidf'),
         )

+    def test_generate_docuemnt_vector_normalized(self):
+        vector = self.index.generate_document_vector('document1.txt', mode='ntfidf')
+        assert vector[0] > 0.0
+        assert vector[1] > 0.0
+        assert vector[2] == 0.0
+
     def test_generate_document_vector_custom_function(self):
         def custom_weighting(index, term, document):
             return index.get_document_length(document)

From fdce4fbd5b5e3a3cf09c5c3dd1fdc035f9151f29 Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:22:13 +0100
Subject: [PATCH 7/9] Add tests for normalize_unicode

---
 tests/test_parser.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_parser.py b/tests/test_parser.py
index e0ed5fa..5466545 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -93,6 +93,14 @@ def test_ngrams(self):
         )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]


+class NormalizeUnicode(unittest.TestCase):
+    def test_empty(self):
+        assert textparser.normalize_unicode('') == ''
+
+    def test_correct_output(self):
+        assert textparser.normalize_unicode('iäöü') == 'iaou'
+
+
 class IsUrlTestCase(unittest.TestCase):

     def test_http_url(self):

From 33665a9b1f4a171e2847f47e558a9a42230d6def Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:30:49 +0100
Subject: [PATCH 8/9] Test null stemmer

---
 hashedindex/textparser.py |  4 ++--
 tests/test_parser.py      | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/hashedindex/textparser.py b/hashedindex/textparser.py
index a56ee36..9af70f2 100644
--- a/hashedindex/textparser.py
+++ b/hashedindex/textparser.py
@@ -17,7 +17,7 @@ class NullStemmer(object):
     def stem(self, x):
         return x

-    def __str__(self):
+    def __repr__(self):
         return ''


@@ -54,7 +54,7 @@ def normalize_unicode(text):
     https://docs.python.org/2/library/unicodedata.html#unicodedata.normalize
     """
     if isinstance(text, six.text_type):
-        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8')
     else:
         return text

diff --git a/tests/test_parser.py b/tests/test_parser.py
index 5466545..a63ee29 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -93,6 +93,16 @@ def test_ngrams(self):
         )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]


+class TestNullStemmer(unittest.TestCase):
+    def test_repr(self):
+        stemmer = textparser.NullStemmer()
+        assert str(stemmer) == repr(stemmer) == ''
+
+    def test_stem(self):
+        stemmer = textparser.NullStemmer()
+        assert stemmer.stem('hello ') == 'hello '
+
+
 class NormalizeUnicode(unittest.TestCase):
     def test_empty(self):
         assert textparser.normalize_unicode('') == ''

From a65082b8ce56d58309238be098843ee0062c193c Mon Sep 17 00:00:00 2001
From: Michael Aquilina
Date: Sun, 8 Apr 2018 13:33:41 +0100
Subject: [PATCH 9/9] Fix tests

---
 hashedindex/__init__.py   | 2 +-
 tests/test_hashedindex.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/hashedindex/__init__.py b/hashedindex/__init__.py
index b5c621e..87b2229 100755
--- a/hashedindex/__init__.py
+++ b/hashedindex/__init__.py
@@ -109,7 +109,7 @@ def get_term_frequency(self, term, document, normalized=False):
         if normalized:
             result /= self.get_document_length(document)

-        return result
+        return float(result)

     def get_document_frequency(self, term):
         """
diff --git a/tests/test_hashedindex.py b/tests/test_hashedindex.py
index 541e2d0..18ec347 100644
--- a/tests/test_hashedindex.py
+++ b/tests/test_hashedindex.py
@@ -244,9 +244,7 @@ def test_generate_document_vector_default(self):

     def test_generate_docuemnt_vector_normalized(self):
         vector = self.index.generate_document_vector('document1.txt', mode='ntfidf')
-        assert vector[0] > 0.0
-        assert vector[1] > 0.0
-        assert vector[2] == 0.0
+        assert len(vector) == 3

     def test_generate_document_vector_custom_function(self):
         def custom_weighting(index, term, document):
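
Note on the normalize_unicode change in PATCH 8/9: appending .decode('utf8') makes the
helper return text rather than bytes on Python 3, which is what the NormalizeUnicode
tests added in PATCH 7/9 compare against (b'iaou' == 'iaou' is False on Python 3). The
sketch below is not part of the patches; it simply restates the patched function from
the hunk above so the behaviour can be checked standalone, assuming only that six is
installed (the library already imports it).

    import unicodedata

    import six


    def normalize_unicode(text):
        # NFKD decomposition splits accented characters into a base letter plus a
        # combining mark; encoding to ASCII with errors ignored drops the marks,
        # and the final decode turns the resulting bytes back into text.
        if isinstance(text, six.text_type):
            return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8')
        else:
            return text


    if __name__ == '__main__':
        assert normalize_unicode('') == ''
        assert normalize_unicode('iäöü') == 'iaou'      # matches test_correct_output above
        assert normalize_unicode(b'bytes') == b'bytes'  # non-text input passes through unchanged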