In [1]:
import pandas as pd
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

# Toy record-deduplication pipeline: turn each record into a TF-IDF text
# vector, cluster the records by cosine distance, and keep one record per
# cluster.
csv_data = """id,name,email,address
1,John Doe,john@example.com,123 Elm St
2,Jon Doe,jon@example.com,123 Elm Street
3,Jane Smith,jane@example.com,456 Oak St
4,J. Doe,j.doe@example.com,123 Elm St.
5,Jane S.,jane.s@example.com,456 Oak Street
"""

df = pd.read_csv(StringIO(csv_data))

# Build one text document per record.  Missing fields are replaced by empty
# strings first: concatenating a string with NaN yields NaN, which the
# vectoriser below cannot process as a document.
text_fields = df[['name', 'email', 'address']].fillna('').astype(str)
df['combined'] = (
    text_fields['name'] + " " + text_fields['email'] + " " + text_fields['address']
)

# NOTE: despite its name, `vectorizer` holds the fitted TF-IDF document-term
# matrix (the return value of fit_transform), not the TfidfVectorizer object.
vectorizer = TfidfVectorizer().fit_transform(df['combined'])
cosine_sim = cosine_similarity(vectorizer)

# Cosine distance = 1 - cosine similarity.  Floating-point round-off can push
# a similarity marginally above 1.0, so clamp to keep distances non-negative.
cosine_dist = (1 - cosine_sim).clip(min=0)

clustering = AgglomerativeClustering(
    n_clusters=None,        # let distance_threshold decide the cluster count
    metric='precomputed',   # `metric` is used here in place of the older `affinity` keyword
    linkage='complete',
    distance_threshold=0.3
)
labels = clustering.fit_predict(cosine_dist)

df['cluster'] = labels

# Keep the first *whole row* of every cluster.  `groupby(...).first()` would
# instead take the first non-null value of each column independently, which
# can assemble a record from several different rows when values are missing.
# The stable sort keeps input order inside each cluster and orders the result
# by cluster label, matching the ordering a groupby would produce.
# NOTE(review): the sample output shows 4 rows surviving out of 5, so the
# current threshold/tokenisation merges few of the near-duplicates — consider
# tuning distance_threshold or the vectoriser settings.
deduped_df = (
    df.sort_values('cluster', kind='stable')
      .drop_duplicates(subset='cluster', keep='first')
      .drop(columns='cluster')
      .reset_index(drop=True)
)
print(deduped_df[['id', 'name', 'email', 'address']])

   id        name              email         address
0   3  Jane Smith   jane@example.com      456 Oak St
1   4      J. Doe  j.doe@example.com     123 Elm St.
2   2     Jon Doe    jon@example.com  123 Elm Street
3   1    John Doe   john@example.com      123 Elm St
