In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Near-duplicate detection demo: vectorize each record's name + email with
# TF-IDF, compute pairwise cosine distances, and let DBSCAN group records
# that lie within `eps` of each other. Records sharing a cluster label are
# duplicate candidates; label -1 means "no close match found".

# Sample data: Replace this with your actual dataset
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['John Smith', 'Jon Smith', 'Johnny Smith', 'Alice Johnson', 'Alicia Johnson'],
    'email': ['john.smith@example.com', 'jon.smith@example.com', 'johnny.smith@example.com',
              'alice.johnson@example.com', 'alicia.johnson@example.com'],
}

df = pd.DataFrame(data)

# Step 1: Feature extraction - For deduplication, use text similarity on 'name' and 'email'

# Combine columns for similarity check.
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN for the whole row, and TfidfVectorizer rejects NaN documents.
df['combined'] = df['name'].fillna('') + ' ' + df['email'].fillna('')

# Use TF-IDF vectorization for text data.
# fit_transform learns the vocabulary and builds the matrix in one pass
# instead of a separate fit() followed by transform() on the same corpus.
# NOTE(review): the default analyzer is word-level, so 'Jon' and 'John' are
# unrelated tokens and only shared words (surname, email domain) contribute
# to similarity. Character n-grams (e.g. analyzer='char_wb') may catch
# spelling variants better -- evaluate on real data and retune eps with it.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['combined'])

# Step 2: Compute pairwise distance matrix based on cosine similarity
# DBSCAN expects a distance matrix or feature vectors; here we pass the
# full n x n distance matrix, which is why metric='precomputed' is used below.
distance_matrix = cosine_distances(X_tfidf)

# Step 3: Apply DBSCAN
# eps: threshold distance for cluster inclusion; tune based on dataset
# min_samples: minimum points to form a cluster (the point itself counts,
#              so 2 means any two records within eps form a cluster)

db = DBSCAN(eps=0.3, min_samples=2, metric='precomputed')
clusters = db.fit_predict(distance_matrix)

df['cluster'] = clusters

# Step 4: View cluster assignments (duplicates should have the same cluster label)
print("Cluster assignments for deduplication:")
print(df[['id', 'name', 'email', 'cluster']])

# Step 5: (Optional) Group duplicates for inspection
# Drop noise points (label -1). A stable sort keeps rows within each cluster
# in their original order, so the report is deterministic.
duplicates = df[df['cluster'] != -1].sort_values('cluster', kind='stable')
print("\nPotential duplicates groups:")
# groupby walks the clusters in ascending label order in a single pass,
# rather than re-filtering the frame once per cluster id.
for cluster_id, members in duplicates.groupby('cluster'):
    print(f"\nCluster {cluster_id}:")
    print(members)