# polyfuzz.py
import joblib
import logging
import pandas as pd
from typing import List, Mapping, Union, Iterable
from polyfuzz.linkage import single_linkage
from polyfuzz.utils import check_matches, check_grouped, create_logger
from polyfuzz.models import TFIDF, RapidFuzz, Embeddings, BaseMatcher
from polyfuzz.metrics import precision_recall_curve, visualize_precision_recall
# Module-level logger used by the PolyFuzz class below; PolyFuzz.__init__ sets
# its level to DEBUG or WARNING depending on the `verbose` flag.
logger = create_logger()
class PolyFuzz:
    """
    PolyFuzz class for Fuzzy string matching, grouping, and evaluation.

    Arguments:
        method: the method(s) used for matching. For quick selection of models
                select one of the following: "EditDistance", "TF-IDF" or "Embeddings".
                If you want more control over the models above, pass
                in a model from polyfuzz.models. For examples, see
                usage below.
        verbose: Changes the verbosity of the model, Set to True if you want
                 to track the stages of the model.

    Usage:

    For basic, out-of-the-box usage, run the code below. You can replace "TF-IDF"
    with either "EditDistance" or "Embeddings" for quick access to these models:

    ```python
    import polyfuzz as pf
    model = pf.PolyFuzz("TF-IDF")
    ```

    If you want more control over the String Matching models, you can load
    in these models separately:

    ```python
    tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF-Sklearn")
    model = pf.PolyFuzz(tfidf)
    ```

    You can also select multiple models in order to compare performance:

    ```python
    tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF-Sklearn")
    edit = EditDistance(n_jobs=-1)
    model = pf.PolyFuzz([tfidf, edit])
    ```

    You can use embedding model, like Flair:

    ```python
    from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings
    fasttext_embedding = WordEmbeddings('news')
    bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')
    embedding = Embeddings([fasttext_embedding, bert_embedding ], min_similarity=0.0)
    model = pf.PolyFuzz(embedding)
    ```
    """

    def __init__(self,
                 method: Union[str,
                               BaseMatcher,
                               List[BaseMatcher]] = "TF-IDF",
                 verbose: bool = False):
        self.method = method
        self.matches = None

        # The reference list that `transform` maps onto; set by `fit`
        self.to_list = None

        # Metrics
        self.min_precisions = None
        self.recalls = None
        self.average_precisions = None

        # Cluster
        self.clusters = None
        self.cluster_mappings = None
        self.grouped_matches = None

        # NOTE: the logger is module-level, so this affects every instance
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.WARNING)

    def match(self,
              from_list: List[str],
              to_list: List[str] = None,
              top_n: int = 1):
        """ Match the from_list of strings to the to_list of strings with whatever models
        you have initialized

        Arguments:
            from_list: The list from which you want mappings.
                       If you want to map items within a list, and not map the
                       items to themselves, you can supply only the `from_list` and
                       ignore the `to_list`.
            to_list: The list where you want to map to
            top_n: The number of matches you want returned. This is only passed on
                   to the quick-access "TF-IDF" and "Embeddings" models; custom
                   models keep whatever they were configured with.

        Updates:
            self.matches: A dictionary with the matches from all models, can
                          be accessed with `model.get_matches()` or
                          `model.get_matches("TF-IDF")`

        Returns:
            The PolyFuzz instance itself, so calls can be chained.

        Raises:
            ValueError: if `method` is a string that is not one of the
                        quick-access model names.

        Usage:

        After having initialized your models, you can pass through lists of strings:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        model.match(from_list = ["string_one", "string_two"],
                    to_list = ["string_three", "string_four"])
        ```

        You can access the results matches with `model.get_matches()` or a specific
        model with `model.get_matches("TF-IDF")` based on their model_id.
        """
        # Standard models - quick access
        if isinstance(self.method, str):
            # NOTE: `self.method` is replaced by the instantiated model, so any
            # later call to `match` takes the "Custom models" branch below.
            if self.method in ["TF-IDF", "TFIDF"]:
                name, self.method = "TF-IDF", TFIDF(min_similarity=0, top_n=top_n)
            elif self.method in ["EditDistance", "Edit Distance"]:
                name, self.method = "EditDistance", RapidFuzz()
            elif self.method in ["Embeddings", "Embedding"]:
                name, self.method = "Embeddings", Embeddings(min_similarity=0, top_n=top_n)
            else:
                raise ValueError("Please instantiate the model with one of the following methods: \n"
                                 "* 'TF-IDF'\n"
                                 "* 'EditDistance'\n"
                                 "* 'Embeddings'\n")

            self.matches = {name: self.method.match(from_list, to_list)}
            # Log the key the results are stored under, not the model object
            logger.info("Ran model with model id = %s", name)

        # Custom models
        elif isinstance(self.method, BaseMatcher):
            self.matches = {self.method.model_id: self.method.match(from_list, to_list)}
            logger.info("Ran model with model id = %s", self.method.model_id)

        # Multiple custom models
        elif isinstance(self.method, Iterable):
            self._update_model_ids()
            self.matches = {}
            for model in self.method:
                self.matches[model.model_id] = model.match(from_list, to_list)
                logger.info("Ran model with model id = %s", model.model_id)

        return self

    def fit(self,
            from_list: List[str],
            to_list: List[str] = None):
        """ Fit one or more distance models on `from_list` if no `to_list` is given
        or fit them on `to_list` if both `from_list` and `to_list` are given.

        Typically, the `to_list` will be tracked as the list that we want to transform
        our `from_list` to. In other words, it is the golden list of words that we
        want the words in the `from_list` mapped to.

        However, you can also choose a single `from_list` and leave `to_list` empty
        to map all words from within `from_list` to each other. Then, `from_list`
        will be tracked instead as the golden list of words.

        Thus, if you want to train on a single list instead, use only `from_list`
        and keep `to_list` empty.

        Arguments:
            from_list: The list from which you want mappings.
                       If you want to map items within a list, and not map the
                       items to themselves, you can supply only the `from_list` and
                       ignore the `to_list`.
            to_list: The list where you want to map to

        Returns:
            The PolyFuzz instance itself, so calls can be chained.

        Usage:

        After having initialized your models, you can pass through lists of strings:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        model.fit(from_list = ["string_one", "string_two"],
                  to_list = ["string_three", "string_four"])
        ```

        Now, whenever you apply `.transform(new_list)`, the `new_list` will be mapped
        to the words in `to_list`.

        You can also fit on a single list of words:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        model.fit(["string_three", "string_four"])
        ```
        """
        self.match(from_list, to_list)

        # Remember the reference list so that `transform` can map new strings onto it
        self.to_list = to_list if to_list is not None else from_list
        return self

    def transform(self, from_list: List[str]) -> Mapping[str, pd.DataFrame]:
        """ After fitting your model, match all words in `from_list`
        to the words that were fitted on previously.

        Arguments:
            from_list: The list from which you want mappings.

        Returns:
            A dictionary of matches keyed by each model's `type` attribute
            (not its `model_id`); models that share a `type` therefore
            overwrite each other's entry.

        Raises:
            ValueError: if `.fit` was not called before `.transform`.

        Usage:

        After having initialized your models, you can pass through lists of strings:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        model.fit(["input_string_1", "input_string2"])
        ```

        Then, you can transform and normalize new strings:

        ```python
        results = model.transform(["input_string_1", "input_string2"])
        ```
        """
        # `getattr` keeps instances that never had `to_list` set (e.g. loaded
        # from an older saved model) on the same, explicit error path.
        to_list = getattr(self, "to_list", None)
        if to_list is None:
            raise ValueError("There is nothing to transform to. Please run `.fit` "
                             "before running `.transform`.")

        all_matches = {}

        if isinstance(self.method, BaseMatcher):
            matches = self.method.match(from_list, to_list, re_train=False)
            all_matches[self.method.type] = matches

        elif isinstance(self.method, Iterable):
            for model in self.method:
                all_matches[model.type] = model.match(from_list, to_list, re_train=False)

        return all_matches

    def fit_transform(self,
                      from_list: List[str],
                      to_list: List[str] = None) -> Mapping[str, pd.DataFrame]:
        """ Fit and transform lists of words on one or more distance models.

        Typically, the `to_list` will be tracked as the list that we want to transform
        our `from_list` to. In other words, it is the golden list of words that we
        want the words in the `from_list` mapped to.

        However, you can also choose a single `from_list` and leave `to_list` empty
        to map all words from within `from_list` to each other. Then, `from_list`
        will be tracked instead as the golden list of words.

        Arguments:
            from_list: The list from which you want mappings.
                       If you want to map items within a list, and not map the
                       items to themselves, you can supply only the `from_list` and
                       ignore the `to_list`.
            to_list: The list where you want to map to

        Returns:
            The result of `.transform(from_list)` after fitting.

        Usage:

        After having initialized your models, you can pass through lists of strings:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        results = model.fit_transform(from_list = ["string_one", "string_two"],
                                      to_list = ["string_three", "string_four"])
        ```

        You can also fit and transform a single list of words:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        results = model.fit_transform(["string_three", "string_four"])
        ```
        """
        self.fit(from_list, to_list)
        return self.transform(from_list)

    def visualize_precision_recall(self,
                                   kde: bool = False,
                                   save_path: str = None
                                   ):
        """ Calculate and visualize precision-recall curves

        A minimum similarity score might be used to identify
        when a match could be considered to be correct. For example,
        we can assume that if a similarity score pass 0.95 we are
        quite confident that the matches are correct. This minimum
        similarity score can be defined as **precision** since it shows
        you how precise we believe the matches are at a minimum.

        **Recall** can then be defined as the percentage of matches
        found at a certain minimum similarity score. A high recall means
        that for a certain minimum precision score, we find many matches.

        Arguments:
            kde: whether to also visualize the kde plot
            save_path: the path to save the resulting image to

        Updates:
            self.min_precisions, self.recalls, self.average_precisions:
                dictionaries keyed by model id, recomputed on every call.

        Usage:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF")
        model.match(from_list = ["string_one", "string_two"],
                    to_list = ["string_three", "string_four"])
        model.visualize_precision_recall(save_path="results.png")
        ```
        """
        check_matches(self)

        self.min_precisions = {}
        self.recalls = {}
        self.average_precisions = {}

        for name, match in self.matches.items():
            min_precision, recall, average_precision = precision_recall_curve(match)
            self.min_precisions[name] = min_precision
            self.recalls[name] = recall
            self.average_precisions[name] = average_precision

        # Refers to the module-level plotting function, not this method
        visualize_precision_recall(self.matches, self.min_precisions, self.recalls, kde, save_path)

    def group(self,
              model: Union[str, BaseMatcher] = None,
              link_min_similarity: float = 0.75,
              group_all_strings: bool = False):
        """ From the matches, group the `To` matches together using single linkage

        Arguments:
            model: you can choose one of the models in `polyfuzz.models` to be used as a grouper,
                   or one of the quick-access names "TF-IDF", "EditDistance" or "Embeddings".
                   TF-IDF is used when no model is given.
            link_min_similarity: the minimum similarity between strings before they are grouped
                                 in a single linkage fashion
            group_all_strings: if you want to compare a list of strings with itself and then cluster
                               those strings, set this to True. Otherwise, only the strings that
                               were mapped To are clustered.

        Updates:
            self.matches: Adds a column `Group` that is the grouped version of the `To` column

        Raises:
            ValueError: if `model` is a string that is not one of the
                        quick-access model names.
        """
        check_matches(self)
        self.clusters = {}
        self.cluster_mappings = {}

        # Standard models - quick access
        # The requested grouper is `model`; `self.method` is the matcher and
        # must not be consulted here.
        if isinstance(model, str):
            if model in ["TF-IDF", "TFIDF"]:
                model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
            elif model in ["EditDistance", "Edit Distance"]:
                model = RapidFuzz()
            elif model in ["Embeddings", "Embedding"]:
                model = Embeddings(min_similarity=link_min_similarity)
            else:
                raise ValueError("Please instantiate the model with one of the following methods: \n"
                                 "* 'TF-IDF'\n"
                                 "* 'EditDistance'\n"
                                 "* 'Embeddings'\n"
                                 "* Or None if you want to automatically use TF-IDF")

        # Use TF-IDF if no model is specified
        elif not model:
            model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

        # Group per model
        # Materialise the keys first: `_create_groups` reassigns entries of `self.matches`
        for name in list(self.matches):
            self._create_groups(name, model, link_min_similarity, group_all_strings)

    def get_ids(self) -> Union[str, List[str], None]:
        """ Get all model ids for easier access

        Returns:
            The id under which the matches are stored for a single model,
            a list of model ids for multiple models, or None if `method`
            is of an unsupported type.
        """
        check_matches(self)

        if isinstance(self.method, str):
            return self.method

        # A single model stores its results under exactly one key in `self.matches`.
        # This also covers quick-access models, which `match` replaces by an instance.
        if isinstance(self.method, BaseMatcher):
            return next(iter(self.matches or {}), None)

        if isinstance(self.method, Iterable):
            return [model.model_id for model in self.method]

        return None

    def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
                                                         Mapping[str, pd.DataFrame]]:
        """ Get the matches from one or more models

        Arguments:
            model_id: the model id of the model if you have specified multiple models;
                      ignored when there is only a single model.

        Returns:
            The matches of the only model, the matches of `model_id`, or
            the dictionary with all matches when no `model_id` is given.
        """
        check_matches(self)

        if len(self.matches) == 1:
            return list(self.matches.values())[0]

        elif len(self.matches) > 1 and model_id:
            return self.matches[model_id]

        return self.matches

    def get_clusters(self, model_id: str = None) -> Mapping[str, List[str]]:
        """ Get the groupings/clusters from a single model

        Arguments:
            model_id: the model id of the model if you have specified multiple models
        """
        check_matches(self)
        check_grouped(self)

        if len(self.matches) == 1:
            return list(self.clusters.values())[0]

        elif len(self.matches) > 1 and model_id:
            return self.clusters[model_id]

        return self.clusters

    def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
        """ Get the mappings from the `To` column to its respective cluster

        Arguments:
            name: the model id of the model if you have specified multiple models
        """
        check_matches(self)
        check_grouped(self)

        if len(self.matches) == 1:
            return list(self.cluster_mappings.values())[0]

        elif len(self.matches) > 1 and name:
            return self.cluster_mappings[name]

        return self.cluster_mappings

    def save(self, path: str) -> None:
        """ Saves the model to the specified path

        Arguments:
            path: the location and name of the file you want to save

        Usage:

        ```python
        model.save("my_model")
        ```
        """
        with open(path, 'wb') as file:
            joblib.dump(self, file)

    @classmethod
    def load(cls, path: str):
        """ Loads the model from the specified path

        Arguments:
            path: the location and name of the PolyFuzz file you want to load

        Returns:
            The object that was stored at `path`.

        Usage:

        ```python
        PolyFuzz.load("my_model")
        ```
        """
        # SECURITY: joblib.load unpickles the file and can therefore execute
        # arbitrary code; only load files that come from a trusted source.
        with open(path, 'rb') as file:
            model = joblib.load(file)
        return model

    def _create_groups(self,
                       name: str,
                       model: BaseMatcher,
                       link_min_similarity: float,
                       group_all_strings: bool):
        """ Create groups based on either the To mappings if you compare two different lists of strings, or
        the From mappings if you compare lists of strings that are equal (set group_all_strings to True)

        Arguments:
            name: key in `self.matches` of the matches to group
            model: the model used to match the selected strings against each other
            link_min_similarity: the minimum similarity passed on to `single_linkage`
            group_all_strings: cluster the unique `From` strings instead of the unique `To` strings
        """
        if group_all_strings:
            strings = list(self.matches[name].From.dropna().unique())
        else:
            strings = list(self.matches[name].To.dropna().unique())

        # Create clusters
        matches = model.match(strings)
        clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)

        # Map the `to` list to groups; strings without a cluster keep their own `To` value
        df = self.matches[name]
        df["Group"] = df['To'].map(cluster_name_map).fillna(df['To'])
        self.matches[name] = df

        # Track clusters and their ids
        self.clusters[name] = clusters
        self.cluster_mappings[name] = cluster_id_map

    def _update_model_ids(self):
        """ Update model ids such that there is no overlap between ids """
        # Give models a model_id if it didn't already exist
        for index, model in enumerate(self.method):
            if not model.model_id:
                model.model_id = f"Model {index}"

        # Update duplicate names: if any two ids clash, every model is renumbered
        model_ids = [model.model_id for model in self.method]
        if len(set(model_ids)) != len(model_ids):
            for index, model in enumerate(self.method):
                model.model_id = f"Model {index}"