Skip to content

Commit

Permalink
Add and restructure compare tests
Browse files Browse the repository at this point in the history
  • Loading branch information
J535D165 committed Sep 9, 2016
1 parent c825ecc commit 6e21964
Showing 1 changed file with 116 additions and 64 deletions.
180 changes: 116 additions & 64 deletions tests/test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,105 +2,157 @@

import pandas.util.testing as pdt
import recordlinkage
import numpy as np
import pandas as pd

TEST_DATA_1 = pd.DataFrame([
[u'Donell', u'Gerlach', 20, u'New York'],
[np.nan, u'Smit', 17, u'Boston'],
[u'Kalie', u'Flatley', 33, u'Boston'],
[u'Kittie', u'Schuster', 27, np.nan],
[np.nan, np.nan, np.nan, u'South Devyn']
],
columns=['name', 'lastname', 'age', 'place'])

TEST_DATA_1.index.name = 'index_df1'

TEST_DATA_2 = pd.DataFrame([
[u'Donel', u'Gerleach', 20, u'New York'],
[np.nan, u'Smith', 17, u'Boston'],
[u'Kaly', u'Flatley', 33, u'Boston'],
[u'Kittie', np.nan, 20, np.nan],
[u'Bob', u'Armstrong', 70, u'Lake Gavinmouth']
],
columns=['name', 'lastname', 'age', 'place'])

TEST_DATA_2.index.name = 'index_df2'

TEST_INDEX_DEDUP = pd.MultiIndex.from_arrays(
[np.arange(len(TEST_DATA_1)), np.arange(len(TEST_DATA_1))],
names=[TEST_DATA_1.index.name, TEST_DATA_1.index.name])

i_l = recordlinkage.Pairs(TEST_DATA_1, TEST_DATA_2)
TEST_INDEX_LINKING = i_l.eye()

from numpy import nan, arange
import pandas

STRING_SIM_ALGORITHMS = ['jaro', 'jaro_winkler', 'dameraulevenshtein', 'levenshtein', 'q_gram', 'cosine']

# nosetests tests/test_compare.py:TestCompare
class TestCompare(unittest.TestCase):

def test_link_exact_basic(self):
@classmethod
def setUpClass(self):

comp = recordlinkage.Compare(TEST_INDEX_LINKING, TEST_DATA_1, TEST_DATA_2)
self.A = pandas.DataFrame([
[u'Donell', u'Gerlach', 20, u'New York'],
[nan, u'Smit', 17, u'Boston'],
[u'Kalie', u'Flatley', 33, u'Boston'],
[u'Kittie', u'Schuster', 27, nan],
[nan, nan, nan, u'South Devyn']
],
columns=['given_name', 'lastname', 'age', 'place'])

# Missing values
result = comp.exact('name', 'name', name='y_name')
expected = pd.Series([0,0,0,1,0], index=TEST_INDEX_LINKING, name='y_name')
self.A.index.name = 'index_df1'

self.B = pandas.DataFrame([
[u'Donel', u'Gerleach', 20, u'New York'],
[nan, u'Smith', 17, u'Boston'],
[u'Kaly', u'Flatley', 33, u'Boston'],
[u'Kittie', nan, 20, nan],
[u'Bob', u'Armstrong', 70, u'Lake Gavinmouth']
],
columns=['given_name', 'lastname', 'age', 'place'])

self.B.index.name = 'index_df2'

self.index_AB = pandas.MultiIndex.from_arrays(
[arange(len(self.A)), arange(len(self.B))],
names=[self.A.index.name, self.B.index.name])

# nosetests tests/test_compare.py:TestCompareAPI
class TestCompareAPI(TestCompare):
    """API-level checks on the results returned by ``recordlinkage.Compare``.

    The tests only look at the type and the ``name`` attribute of the
    returned object, not at the comparison values themselves.
    """

    def _check_result(self, result, expected_name, label):
        """Assert that ``result`` is a ``pandas.Series`` named ``expected_name``.

        ``label`` is passed as the assertion message so that a failure
        inside a loop over algorithms reports which comparison broke.
        """
        self.assertIsInstance(result, pandas.Series, msg=label)
        self.assertEqual(result.name, expected_name, msg=label)

    def test_instance_linking(self):
        """Results are Series without a name when ``name`` is not passed."""

        comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

        self._check_result(
            comp.exact('given_name', 'given_name'), None, 'exact')
        self._check_result(
            comp.numeric('age', 'age', 2), None, 'numeric')

        for alg in STRING_SIM_ALGORITHMS:
            result = comp.fuzzy('given_name', 'given_name', method=alg)
            self._check_result(result, None, 'fuzzy: ' + alg)

    def test_name_series_linking(self):
        """Results are Series carrying the label passed through ``name``."""

        comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

        self._check_result(
            comp.exact('given_name', 'given_name', name="given_name_comp"),
            "given_name_comp", 'exact')
        self._check_result(
            comp.numeric('age', 'age', 2, name="given_name_comp"),
            "given_name_comp", 'numeric')

        for alg in STRING_SIM_ALGORITHMS:
            result = comp.fuzzy(
                'given_name', 'given_name', method=alg, name="given_name_comp")
            self._check_result(result, "given_name_comp", 'fuzzy: ' + alg)

# nosetests tests/test_compare.py:TestCompareAlgorithms
class TestCompareAlgorithms(TestCompare):

def test_exact(self):
    """Exact comparison yields 1 for identical strings and 0 otherwise."""

    # Work on copies: self.A and self.B are built once in setUpClass and
    # shared by every test of the class, so adding a column in place would
    # leak into the other tests and make them order-dependent.
    df_a = self.A.copy()
    df_b = self.B.copy()

    df_a['test'] = ['Bob', 'Myrthe', 'Ally', 'John', 'Rose']
    df_b['test'] = ['Bob', 'Myrte', 'Ally', 'John', 'Roze']

    comp = recordlinkage.Compare(self.index_AB, df_a, df_b)

    result = comp.exact('test', 'test')
    expected = pandas.Series([1, 0, 1, 1, 0], index=self.index_AB)

    pdt.assert_series_equal(result, expected)

def test_link_exact_missing(self):

comp = recordlinkage.Compare(TEST_INDEX_LINKING, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

# Missing values as 0
result = comp.exact('name', 'name', missing_value=0, name='y_name')
expected = pd.Series([0,0,0,1,0], index=TEST_INDEX_LINKING, name='y_name')
result = comp.exact('given_name', 'given_name', missing_value=0)
expected = pandas.Series([0,0,0,1,0], index=self.index_AB)

pdt.assert_series_equal(result, expected)

# Missing values as np.nan
result = comp.exact('name', 'name', missing_value=np.nan, name='y_name')
expected = pd.Series([0,np.nan,0,1,np.nan], index=TEST_INDEX_LINKING, name='y_name')
# Missing values as nan
result = comp.exact('given_name', 'given_name', missing_value=nan)
expected = pandas.Series([0,nan,0,1,nan], index=self.index_AB)

pdt.assert_series_equal(result, expected)

# Missing values as np.nan
result = comp.exact('name', 'name', missing_value=9, name='y_name')
expected = pd.Series([0,9,0,1,9], index=TEST_INDEX_LINKING, name='y_name')
# Missing values as 9
result = comp.exact('given_name', 'given_name', missing_value=9)
expected = pandas.Series([0,9,0,1,9], index=self.index_AB)

pdt.assert_series_equal(result, expected)

def test_link_exact_disagree(self):

comp = recordlinkage.Compare(TEST_INDEX_LINKING, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

# Missing values 0 and disagreement as 2
result = comp.exact('name', 'name', disagree_value=2, missing_value=0, name='y_name')
expected = pd.Series([2,0,2,1,0], index=TEST_INDEX_LINKING, name='y_name')
result = comp.exact('given_name', 'given_name', disagree_value=2, missing_value=0, name='y_name')
expected = pandas.Series([2,0,2,1,0], index=self.index_AB, name='y_name')

pdt.assert_series_equal(result, expected)

def test_dedup_exact_basic(self):

comp = recordlinkage.Compare(TEST_INDEX_DEDUP, TEST_DATA_1, TEST_DATA_1)
comp = recordlinkage.Compare(self.index_AB, self.A, self.A)

# Missing values
result = comp.exact('name', 'name')#, name='y_name')
expected = pd.Series([1,0,1,1,0], index=TEST_INDEX_DEDUP)#, name='y_name')
result = comp.exact('given_name', 'given_name')#, name='y_name')
expected = pandas.Series([1,0,1,1,0], index=self.index_AB)#, name='y_name')

pdt.assert_series_equal(result, expected)

def test_numeric(self):

comp = recordlinkage.Compare(TEST_INDEX_LINKING, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

# Missing values
result = comp.numeric('age', 'age', 2)
expected = pd.Series([1,1,1,0,0], index=TEST_INDEX_LINKING)#, name='age')
expected = pandas.Series([1,1,1,0,0], index=self.index_AB)#, name='age')

pdt.assert_series_equal(result, expected)

def test_geo(self):

comp = recordlinkage.Compare(TEST_INDEX_LINKING, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

# Missing values
result = comp.geo('age', 'age', 'age', 'age', 2)
Expand All @@ -111,7 +163,7 @@ def test_geo(self):

def test_numeric_batch(self):

comp = recordlinkage.Compare(TEST_INDEX_DEDUP, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

for alg in ['step', 'linear', 'squared']:

Expand All @@ -128,21 +180,21 @@ def test_numeric_batch(self):

def test_fuzzy_does_not_exist(self):

comp = recordlinkage.Compare(TEST_INDEX_DEDUP, TEST_DATA_1, TEST_DATA_1)
comp = recordlinkage.Compare(self.index_AB, self.A, self.A)

self.assertRaises(ValueError, comp.fuzzy, 'name', 'name', name='y_name', method='unknown_algorithm')
self.assertRaises(ValueError, comp.fuzzy, 'given_name', 'given_name', name='y_name', method='unknown_algorithm')

def test_fuzzy_same_labels(self):

comp = recordlinkage.Compare(TEST_INDEX_DEDUP, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

for alg in ['jaro', 'jaro_winkler', 'dameraulevenshtein', 'levenshtein', 'q_gram', 'cosine']:

print (alg)

# Missing values
result = comp.fuzzy('name', 'name', method=alg, missing_value=0)
result = comp.fuzzy('name', 'name', alg, missing_value=0)
result = comp.fuzzy('given_name', 'given_name', method=alg, missing_value=0)
result = comp.fuzzy('given_name', 'given_name', alg, missing_value=0)

print (result)

Expand All @@ -152,14 +204,14 @@ def test_fuzzy_same_labels(self):

def test_fuzzy_different_labels(self):

comp = recordlinkage.Compare(TEST_INDEX_DEDUP, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

for alg in ['jaro', 'jaro_winkler', 'dameraulevenshtein', 'levenshtein', 'q_gram', 'cosine']:

print (alg)

# Missing values
result = comp.fuzzy('name', 'name', method=alg, missing_value=0) # Change in future (should work without method)
result = comp.fuzzy('given_name', 'given_name', method=alg, missing_value=0) # Change in future (should work without method)

print (result)

Expand All @@ -173,10 +225,10 @@ def test_fuzzy_different_labels(self):

def test_batch_compare(self):

comp = recordlinkage.Compare(TEST_INDEX_LINKING, TEST_DATA_1, TEST_DATA_2)
comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

# Missing values as 0
result = comp.exact('name', 'name', missing_value=0, name='y_name')
result = comp.exact('given_name', 'given_name', missing_value=0, name='y_name')



Expand Down

0 comments on commit 6e21964

Please sign in to comment.