
Commit

Merge pull request #10 from MichaelAquilina/100cov
Improve test coverage
MichaelAquilina committed Apr 8, 2018
2 parents (f0f2b19 + a65082b), commit 1bd8758
Showing 4 changed files with 78 additions and 20 deletions.
hashedindex/__init__.py (6 changes: 3 additions & 3 deletions)
@@ -109,7 +109,7 @@ def get_term_frequency(self, term, document, normalized=False):
         if normalized:
             result /= self.get_document_length(document)

-        return result
+        return float(result)

     def get_document_frequency(self, term):
         """
@@ -171,7 +171,7 @@ def get_tfidf(self, term, document, normalized=False):
             return 0.0

     def get_total_tfidf(self, term, normalized=False):
-        result = 0
+        result = 0.0
         for document in self._documents:
             result += self.get_tfidf(term, document, normalized)
         return result
@@ -280,7 +280,7 @@ def merge(index_list):
             result._terms[term] = first_index._terms[term] + second_index._terms[term]
         elif term in second_index._terms:
             result._terms[term] = second_index._terms[term]
-        else:
+        else:  # pragma: nocover
             raise ValueError("I dont know how the hell you managed to get here")

     result._documents = first_index._documents + second_index._documents
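For context, here is a minimal usage sketch (not part of the commit) of the behaviour the hashedindex/__init__.py changes pin down: get_term_frequency now always returns a float, and get_total_tfidf accumulates into a float. It uses only API visible in this diff; the document and term names are illustrative.

    import hashedindex

    index = hashedindex.HashedIndex()
    index.add_term_occurrence('word', 'document1.txt')
    index.add_term_occurrence('word', 'document1.txt')
    index.add_term_occurrence('malta', 'document1.txt')

    # With the change above, the raw count is wrapped in float(), so the
    # normalized and unnormalized variants share one return type.
    print(index.get_term_frequency('word', 'document1.txt'))                   # 2.0 (previously 2)
    print(index.get_term_frequency('word', 'document1.txt', normalized=True))  # 2/3 ≈ 0.6667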
hashedindex/textparser.py (6 changes: 3 additions & 3 deletions)
@@ -17,7 +17,7 @@ class NullStemmer(object):
     def stem(self, x):
         return x

-    def __str__(self):
+    def __repr__(self):
         return '<NullStemmer>'


@@ -45,7 +45,7 @@ def tfidf(tf, df, corpus_size):
     if df and tf:
         return (1 + math.log(tf)) * math.log(corpus_size / df)
     else:
-        return 0
+        return 0.0


 def normalize_unicode(text):
@@ -54,7 +54,7 @@ def normalize_unicode(text):
     https://docs.python.org/2/library/unicodedata.html#unicodedata.normalize
     """
     if isinstance(text, six.text_type):
-        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8')
     else:
         return text

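A similar quick sketch for the two textparser.py fixes (not part of the commit; the import path is an assumption based on how the test module uses textparser): tfidf now returns 0.0 rather than the int 0 when either count is zero, and normalize_unicode returns a str instead of bytes on Python 3.

    from hashedindex import textparser  # import path assumed, matching the tests

    print(textparser.tfidf(tf=0, df=1, corpus_size=10))  # 0.0 (previously the int 0)
    print(textparser.normalize_unicode('iäöü'))          # 'iaou' as str, no longer b'iaou'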
tests/test_hashedindex.py (68 changes: 54 additions & 14 deletions)
@@ -5,6 +5,8 @@
 import json
 import unittest

+import pytest
+
 import hashedindex


@@ -33,6 +35,13 @@ def setUp(self):
         for i in range(2):
             self.index.add_term_occurrence('word', 'document2.txt')

+    def test_repr(self):
+        index = hashedindex.HashedIndex()
+        assert str(index) == "<HashedIndex: 0 terms, 0 documents>"
+        index.add_term_occurrence('foo', 'doc1.md')
+        index.add_term_occurrence('bar', 'doc1.md')
+        assert str(index) == "<HashedIndex: 2 terms, 1 documents>"
+
     def test_get_documents(self):
         assert self.index.get_documents('word') == collections.Counter(
             {'document1.txt': 3, 'document2.txt': 2}
@@ -49,6 +58,11 @@ def test_get_documents(self):

         assert 'doesnotexist.txt' not in self.index.documents()

+    def test_get_documents_missing_term(self):
+        with pytest.raises(IndexError) as exc:
+            self.index.get_documents('idontexist')
+        assert str(exc.value) == 'The specified term does not exist'
+
     def test_hashedindex_constructor_with_terms(self):
         index2 = hashedindex.HashedIndex(self.index.terms())

@@ -82,10 +96,14 @@ def test_getitem(self):

     def test_getitem_raises_keyerror(self):
         # Trying to get a term that does not exist should raise a key error
-        self.assertRaises(KeyError, self.index.__getitem__, 'doesnotexist')
+        with pytest.raises(KeyError) as exc:
+            self.index['doesnotexist']
+        assert str(exc.value) == "'doesnotexist'"

         # Case Insensitive check
-        self.assertRaises(KeyError, self.index.__getitem__, 'wORd')
+        with pytest.raises(KeyError) as exc:
+            self.index['wORd']
+        assert str(exc.value) == "'wORd'"

     def test_contains(self):
         assert 'word' in self.index
@@ -135,12 +153,16 @@ def test_get_total_term_frequency(self):
         assert self.index.get_total_term_frequency('phone') == 4

     def test_get_total_term_frequency_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'doesnotexist')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('doesnotexist')

     def test_get_total_term_frequency_case(self):
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'WORD')
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'Malta')
-        self.assertRaises(IndexError, self.index.get_total_term_frequency, 'phonE')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('WORD')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('Malta')
+        with pytest.raises(IndexError):
+            self.index.get_total_term_frequency('phonE')

     def test_get_term_frequency(self):
         # Check Existing cases
@@ -154,25 +176,27 @@ def test_get_term_frequency(self):
         assert self.index.get_term_frequency('phone', 'document1.txt') == 0

     def test_get_term_frequency_exceptions(self):
-        self.assertRaises(
-            IndexError, self.index.get_term_frequency, 'doesnotexist', 'document1.txt'
-        )
-        self.assertRaises(IndexError, self.index.get_term_frequency, 'malta', 'deoesnotexist.txt')
+        with pytest.raises(IndexError):
+            self.index.get_term_frequency('doesnotexist', 'document1.txt')
+        with pytest.raises(IndexError):
+            self.index.get_term_frequency('malta', 'deoesnotexist.txt')

     def test_get_document_frequency(self):
         assert self.index.get_document_frequency('word') == 2
         assert self.index.get_document_frequency('malta') == 1
         assert self.index.get_document_frequency('phone') == 1

     def test_get_document_frequency_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_document_frequency, 'doesnotexist')
+        with pytest.raises(IndexError):
+            self.index.get_document_frequency('doesnotexist')

     def test_get_document_length(self):
         assert self.index.get_document_length('document1.txt') == 8
         assert self.index.get_document_length('document2.txt') == 6

     def test_get_document_length_exceptions(self):
-        self.assertRaises(IndexError, self.index.get_document_length, 'doesnotexist.txt')
+        with pytest.raises(IndexError):
+            self.index.get_document_length('doesnotexist.txt')

     def test_get_terms(self):
         assert unordered_list_cmp(self.index.terms(), ['word', 'malta', 'phone'])
@@ -196,18 +220,32 @@ def test_get_tfidf_relation(self):
             self.index.get_tfidf('malta', 'document1.txt')
         )

+    def test_get_tfidf_relation_normalized(self):
+        self.assertLess(
+            self.index.get_tfidf('word', 'document1.txt', normalized=True),
+            self.index.get_tfidf('malta', 'document1.txt', normalized=True)
+        )
+
     def test_get_tfidf_empty_document(self):
         assert self.index.get_tfidf('malta', 'document2.txt') == 0

     def test_get_tfidf_empty_term(self):
         assert self.index.get_tfidf('phone', 'document1.txt') == 0

+    def test_get_total_tfidf(self):
+        # Not validated manually, but pinned here to ensure it remains consistent
+        assert self.index.get_total_tfidf('malta') == pytest.approx(1.5051499)
+
     def test_generate_document_vector_default(self):
         self.assertListEqual(
             self.index.generate_document_vector('document1.txt'),
             self.index.generate_document_vector('document1.txt', mode='tfidf'),
         )

+    def test_generate_docuemnt_vector_normalized(self):
+        vector = self.index.generate_document_vector('document1.txt', mode='ntfidf')
+        assert len(vector) == 3
+
     def test_generate_document_vector_custom_function(self):
         def custom_weighting(index, term, document):
             return index.get_document_length(document)
@@ -308,8 +346,10 @@ def test_generate_feature_matrix_ntf(self):
         assert matrix[instances.index('document2.txt')][features.index('word')] == 2 / 6

     def test_generate_feature_matrix_invalid(self):
-        self.assertRaises(ValueError, self.index.generate_feature_matrix, mode='invalid')
-        self.assertRaises(ValueError, self.index.generate_feature_matrix, mode=None)
+        with pytest.raises(ValueError):
+            self.index.generate_feature_matrix(mode='invalid')
+        with pytest.raises(ValueError):
+            self.index.generate_feature_matrix(mode=None)


 class SerializationTest(unittest.TestCase):
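All of the test changes above follow the same migration from self.assertRaises to pytest's context manager, which also makes the raised exception available for message assertions. A generic, self-contained sketch of that pattern (divide and test_divide_by_zero are made-up names, not part of hashedindex):

    import pytest


    def divide(a, b):
        return a / b


    def test_divide_by_zero():
        # pytest.raises captures the exception; exc.value is the exception instance,
        # so its message can be checked, as the new tests do with str(exc.value).
        with pytest.raises(ZeroDivisionError) as exc:
            divide(1, 0)
        assert 'division by zero' in str(exc.value)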
tests/test_parser.py (18 changes: 18 additions & 0 deletions)
@@ -93,6 +93,24 @@ def test_ngrams(self):
         )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]


+class TestNullStemmer(unittest.TestCase):
+    def test_repr(self):
+        stemmer = textparser.NullStemmer()
+        assert str(stemmer) == repr(stemmer) == '<NullStemmer>'
+
+    def test_stem(self):
+        stemmer = textparser.NullStemmer()
+        assert stemmer.stem('hello ') == 'hello '
+
+
+class NormalizeUnicode(unittest.TestCase):
+    def test_empty(self):
+        assert textparser.normalize_unicode('') == ''
+
+    def test_correct_output(self):
+        assert textparser.normalize_unicode('iäöü') == 'iaou'
+
+
 class IsUrlTestCase(unittest.TestCase):

     def test_http_url(self):
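For reference, the behaviour exercised by the new NormalizeUnicode tests can be reproduced with the standard library alone; this mirrors the normalize_unicode change shown earlier in the diff (a sketch, not part of the commit):

    import unicodedata

    # NFKD splits accented characters into a base letter plus combining marks;
    # encoding to ASCII with 'ignore' then drops the marks, and the added
    # .decode('utf8') step returns a str rather than bytes.
    decomposed = unicodedata.normalize('NFKD', 'iäöü')
    print(decomposed.encode('ascii', 'ignore').decode('utf8'))  # 'iaou'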

