diff --git a/recordlinkage/algorithms/string.py b/recordlinkage/algorithms/string.py
index c006f1b6..535fa76a 100644
--- a/recordlinkage/algorithms/string.py
+++ b/recordlinkage/algorithms/string.py
@@ -16,12 +16,12 @@ def jaro_similarity(s1, s2):
 
     conc = pandas.Series(list(zip(s1, s2)))
 
-    from jellyfish import jaro_distance
+    from jellyfish import jaro_similarity
 
     def jaro_apply(x):
 
         try:
-            return jaro_distance(x[0], x[1])
+            return jaro_similarity(x[0], x[1])
         except Exception as err:
             if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                 return np.nan
@@ -35,12 +35,12 @@ def jarowinkler_similarity(s1, s2):
 
     conc = pandas.Series(list(zip(s1, s2)))
 
-    from jellyfish import jaro_winkler
+    from jellyfish import jaro_winkler_similarity
 
     def jaro_winkler_apply(x):
 
         try:
-            return jaro_winkler(x[0], x[1])
+            return jaro_winkler_similarity(x[0], x[1])
         except Exception as err:
             if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                 return np.nan
diff --git a/recordlinkage/classifiers.py b/recordlinkage/classifiers.py
index ab3fac19..c657c20e 100644
--- a/recordlinkage/classifiers.py
+++ b/recordlinkage/classifiers.py
@@ -297,6 +297,9 @@ def match_cluster_center(self, value):
         if value is None:
             return
 
+        # this attribute is filled in KMeans.fit and is required for predict
+        self.kernel._n_threads = 1
+
         if not hasattr(self.kernel, 'cluster_centers_'):
             self.kernel.cluster_centers_ = numpy.empty((2, len(value)))
             self.kernel.cluster_centers_[:] = numpy.nan
diff --git a/recordlinkage/preprocessing/cleaning.py b/recordlinkage/preprocessing/cleaning.py
index 0ddf4496..32070576 100644
--- a/recordlinkage/preprocessing/cleaning.py
+++ b/recordlinkage/preprocessing/cleaning.py
@@ -111,17 +111,17 @@ def strip_accents_fn_wrapper(x):
 
     # Remove all content between brackets
     if remove_brackets is True:
-        s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
+        s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '', regex=True)
 
     # Remove the special characters
     if replace_by_none:
-        s = s.str.replace(replace_by_none, '')
+        s = s.str.replace(replace_by_none, '', regex=True)
 
     if replace_by_whitespace:
-        s = s.str.replace(replace_by_whitespace, ' ')
+        s = s.str.replace(replace_by_whitespace, ' ', regex=True)
 
     # Remove multiple whitespaces
-    s = s.str.replace(r'\s\s+', ' ')
+    s = s.str.replace(r'\s\s+', ' ', regex=True)
 
     # Strip s
     s = s.str.lstrip().str.rstrip()
@@ -145,7 +145,7 @@ def phonenumbers(s):
     """
 
     # Remove all special tokens
-    s = s.astype(object).str.replace('[^0-9+]+', '')
+    s = s.astype(object).str.replace('[^0-9+]+', '', regex=True)
 
     return s
 
diff --git a/recordlinkage/preprocessing/encoding.py b/recordlinkage/preprocessing/encoding.py
index 69e7d77a..85f12815 100644
--- a/recordlinkage/preprocessing/encoding.py
+++ b/recordlinkage/preprocessing/encoding.py
@@ -77,7 +77,7 @@ def phonetic(s, method, concat=True, encoding='utf-8', decode_error='strict'):
                if type(x) == bytes else x)
 
     if concat:
-        s = s.str.replace(r"[\-\_\s]", "")
+        s = s.str.replace(r"[\-\_\s]", "", regex=True)
 
     for alg in _phonetic_algorithms:
         if method in alg['argument_names']:
diff --git a/setup.py b/setup.py
index 640ef326..ff7420a0 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@ def read(fname):
     ],
     python_requires=">=3.5",
     install_requires=[
-        "jellyfish>=0.5.4",
+        "jellyfish>=0.8.0",
         "numpy>=1.13.0",
         "pandas>=1,<2",
         "scipy>=1",
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index c746be8b..47a95c3b 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -210,8 +210,9 @@ def test_encode_match_rating(self):
             np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan', u'Gretha',
             u'Micheal', u'Sjors'
         ])
+        # in jellyfish.match_rating_codex version 0.8.0 results have changed
         expected = pd.Series([
-            np.nan, u'JHN', u'MRYNN', u'BLLY', u'JNTHN', u'GRTH', u'MCHL',
+            np.nan, u'JHN', u'MRYN', u'BLY', u'JNTHN', u'GRTH', u'MCHL',
             u'SJRS'
         ])
 