Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]

steps:
- uses: actions/checkout@v2
Expand Down
3 changes: 3 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
v0.2.2
- Update grouping: all strings are included in the groups only when a list of strings is compared with itself (set `group_all_strings=True`)

v0.2.0
- Update naming convention matcher --> model
- Update documentation
Expand Down
2 changes: 1 addition & 1 deletion polyfuzz/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .polyfuzz import PolyFuzz
__version__ = "0.2.1"
__version__ = "0.2.2"
44 changes: 30 additions & 14 deletions polyfuzz/polyfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,17 @@ def visualize_precision_recall(self,

def group(self,
model: Union[str, BaseMatcher] = None,
link_min_similarity: float = 0.75):
link_min_similarity: float = 0.75,
group_all_strings: bool = False):
""" From the matches, group the `To` matches together using single linkage

Arguments:
model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
link_min_similarity: the minimum similarity between strings before they are grouped
in a single linkage fashion
group_all_strings: if you want to compare a list of strings with itself and then cluster
those strings, set this to True. Otherwise, only the strings that
were mapped To are clustered.

Updates:
self.matches: Adds a column `Group` that is the grouped version of the `To` column
Expand Down Expand Up @@ -223,13 +227,9 @@ def group(self,
elif not model:
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

# Group per model
for name, match in self.matches.items():
strings = list(self.matches[name].To.dropna().unique())
matches = model.match(strings, strings)
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
self._map_groups(name, cluster_name_map)
self.clusters[name] = clusters
self.cluster_mappings[name] = cluster_id_map
self._create_groups(name, model, link_min_similarity, group_all_strings)

def get_ids(self) -> Union[str, List[str], None]:
""" Get all model ids for easier access """
Expand Down Expand Up @@ -285,17 +285,33 @@ def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:

return self.cluster_mappings

def _map_groups(self, name: str, cluster_name_map: Mapping[str, str]):
""" Map the 'to' list to groups """
def _create_groups(self,
name: str,
model: BaseMatcher,
link_min_similarity: float,
group_all_strings: bool):
""" Create groups based on either the To mappings if you compare two different lists of strings, or
the From mappings if you compare lists of strings that are equal (set group_all_strings to True)
"""

if group_all_strings:
strings = list(self.matches[name].From.dropna().unique())
else:
strings = list(self.matches[name].To.dropna().unique())

# Create clusters
matches = model.match(strings, strings)
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)

# Map the `to` list to groups
df = self.matches[name]
df["Group"] = df['To'].map(cluster_name_map).fillna(df['To'])

# Fix that some mappings from "From" end up in "Group"
df.loc[(df.From != df.To) &
(df.From == df.Group), "Group"] = df.loc[(df.From != df.To) &
(df.From == df.Group), "To"]
self.matches[name] = df

# Track clusters and their ids
self.clusters[name] = clusters
self.cluster_mappings[name] = cluster_id_map

def _update_model_ids(self):
""" Update model ids such that there is no overlap between ids """
# Give models a model_id if it didn't already exist
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
]

base_packages = [
"numpy>= 1.18.5",
"numpy>= 1.18.5,<=1.19.4",
"scipy>= 1.3.1",
"pandas>= 0.25.3",
"tqdm>=4.41.1",
Expand All @@ -25,7 +25,7 @@
]

fast_cosine = ["sparse_dot_topn>=0.2.9"]
embeddings_packages = ["flair>= 0.6.1.post1"]
embeddings_packages = ["torch>=1.2.0", "flair>= 0.7"]

extra_packages = embeddings_packages + fast_cosine

Expand All @@ -37,7 +37,7 @@
setup(
name="polyfuzz",
packages=find_packages(exclude=["notebooks", "docs"]),
version="0.2.1",
version="0.2.2",
author="Maarten Grootendorst",
author_email="maartengrootendorst@gmail.com",
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",
Expand Down
14 changes: 14 additions & 0 deletions tests/test_polyfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ def test_grouper(method):
assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1}


def test_grouper_same_list():
    """ Match a list of strings against itself and group with `group_all_strings=True` """
    fuzzer = PolyFuzz("TF-IDF").match(from_list, from_list)
    fuzzer.group(link_min_similarity=0.75, group_all_strings=True)
    result = fuzzer.get_matches()

    # The match table keeps its shape and gains the `Group` column
    assert isinstance(result, pd.DataFrame)
    assert result.Similarity.mean() > 0.3
    assert len(result) == 6
    assert list(result.columns) == ['From', 'To', 'Similarity', 'Group']

    # The three "apple" variants are expected to share cluster 1
    expected_members = ['apples', 'apple', 'appl']
    assert fuzzer.get_clusters() == {1: expected_members}
    assert fuzzer.get_cluster_mappings() == {member: 1 for member in expected_members}


@pytest.mark.parametrize("method", ["Unknown Model"])
def test_wrongbase_model(method):
with pytest.raises(ValueError):
Expand Down