Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix broken build and remove some warnings #168

Merged
merged 1 commit into from Apr 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions recordlinkage/algorithms/string.py
Expand Up @@ -16,12 +16,12 @@ def jaro_similarity(s1, s2):

conc = pandas.Series(list(zip(s1, s2)))

from jellyfish import jaro_distance
from jellyfish import jaro_similarity

def jaro_apply(x):

try:
return jaro_distance(x[0], x[1])
return jaro_similarity(x[0], x[1])
except Exception as err:
if pandas.isnull(x[0]) or pandas.isnull(x[1]):
return np.nan
Expand All @@ -35,12 +35,12 @@ def jarowinkler_similarity(s1, s2):

conc = pandas.Series(list(zip(s1, s2)))

from jellyfish import jaro_winkler
from jellyfish import jaro_winkler_similarity

def jaro_winkler_apply(x):

try:
return jaro_winkler(x[0], x[1])
return jaro_winkler_similarity(x[0], x[1])
except Exception as err:
if pandas.isnull(x[0]) or pandas.isnull(x[1]):
return np.nan
Expand Down
3 changes: 3 additions & 0 deletions recordlinkage/classifiers.py
Expand Up @@ -297,6 +297,9 @@ def match_cluster_center(self, value):
if value is None:
return

# this attribute is filled in KMeans.fit and is required for predict
self.kernel._n_threads = 1

if not hasattr(self.kernel, 'cluster_centers_'):
self.kernel.cluster_centers_ = numpy.empty((2, len(value)))
self.kernel.cluster_centers_[:] = numpy.nan
Expand Down
10 changes: 5 additions & 5 deletions recordlinkage/preprocessing/cleaning.py
Expand Up @@ -111,17 +111,17 @@ def strip_accents_fn_wrapper(x):

# Remove all content between brackets
if remove_brackets is True:
s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '', regex=True)

# Remove the special characters
if replace_by_none:
s = s.str.replace(replace_by_none, '')
s = s.str.replace(replace_by_none, '', regex=True)

if replace_by_whitespace:
s = s.str.replace(replace_by_whitespace, ' ')
s = s.str.replace(replace_by_whitespace, ' ', regex=True)

# Remove multiple whitespaces
s = s.str.replace(r'\s\s+', ' ')
s = s.str.replace(r'\s\s+', ' ', regex=True)

# Strip s
s = s.str.lstrip().str.rstrip()
Expand All @@ -145,7 +145,7 @@ def phonenumbers(s):
"""

# Remove all special tokens
s = s.astype(object).str.replace('[^0-9+]+', '')
s = s.astype(object).str.replace('[^0-9+]+', '', regex=True)

return s

Expand Down
2 changes: 1 addition & 1 deletion recordlinkage/preprocessing/encoding.py
Expand Up @@ -77,7 +77,7 @@ def phonetic(s, method, concat=True, encoding='utf-8', decode_error='strict'):
if type(x) == bytes else x)

if concat:
s = s.str.replace(r"[\-\_\s]", "")
s = s.str.replace(r"[\-\_\s]", "", regex=True)

for alg in _phonetic_algorithms:
if method in alg['argument_names']:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -47,7 +47,7 @@ def read(fname):
],
python_requires=">=3.5",
install_requires=[
"jellyfish>=0.5.4",
"jellyfish>=0.8.0",
"numpy>=1.13.0",
"pandas>=1,<2",
"scipy>=1",
Expand Down
3 changes: 2 additions & 1 deletion tests/test_preprocessing.py
Expand Up @@ -210,8 +210,9 @@ def test_encode_match_rating(self):
np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan', u'Gretha',
u'Micheal', u'Sjors'
])
# the output of jellyfish.match_rating_codex changed in version 0.8.0
expected = pd.Series([
np.nan, u'JHN', u'MRYNN', u'BLLY', u'JNTHN', u'GRTH', u'MCHL',
np.nan, u'JHN', u'MRYN', u'BLY', u'JNTHN', u'GRTH', u'MCHL',
u'SJRS'
])

Expand Down