# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: DBSCAN for Data Deduplication

**Steps**:
1. Data Set: Download a dataset containing duplicate entries for event registrations.
2. DBSCAN Clustering: Apply the DBSCAN algorithm to cluster similar registrations.
3. Identify Duplicates: Detect duplicates based on density of the clusters.
4. Refinement: Validate clusters and remove any erroneous duplicates.

In [1]:
# write your code from here

import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

def deduplicate_events(df, features, eps, min_samples):
    text_features = df[features].astype(str).agg(' '.join, axis=1)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_features)
    distance_matrix = pairwise_distances(tfidf_matrix, metric='cosine')
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    clusters = dbscan.fit_predict(distance_matrix)
    df['cluster'] = clusters
    duplicate_clusters = df['cluster'].value_counts()[df['cluster'].value_counts() > 1].index.tolist()
    duplicates = df[df['cluster'].isin(duplicate_clusters)].sort_values(by='cluster')
    deduplicated_df = df.drop_duplicates(subset=features, keep='first')
    return duplicates, deduplicated_df

if __name__ == '__main__':
    data = {'RegistrationID': [1, 2, 3, 4, 5, 6, 7],
            'Event': ['Tech Conference', 'AI Workshop', 'Data Science Meetup', 'Tech Conf', 'AI Training', 'Data Meetup', 'AI Workshop'],
            'Name': ['Alice Brown', 'Bob Green', 'Charlie White', 'Alice B.', 'Robert Green', 'Charles White', 'Bob Green'],
            'Email': ['alice.b@example.com', 'bob.g@example.com', 'charlie.w@example.com', 'a.brown@example.com', 'bobgreen@example.com', 'c.white@example.com', 'b.green@example.com']}
    event_df = pd.DataFrame(data)
    features_to_consider = ['Event', 'Name', 'Email']
    epsilon = 0.3
    min_pts = 2

    duplicate_records, cleaned_df = deduplicate_events(event_df.copy(), features_to_consider, epsilon, min_pts)

    print("Duplicate Records:")
    print(duplicate_records)
    print("\nDeduplicated DataFrame:")
    print(cleaned_df)

Duplicate Records:
   RegistrationID                Event           Name                  Email  \
2               3  Data Science Meetup  Charlie White  charlie.w@example.com   
4               5          AI Training   Robert Green   bobgreen@example.com   
5               6          Data Meetup  Charles White    c.white@example.com   
0               1      Tech Conference    Alice Brown    alice.b@example.com   
3               4            Tech Conf       Alice B.    a.brown@example.com   
1               2          AI Workshop      Bob Green      bob.g@example.com   
6               7          AI Workshop      Bob Green    b.green@example.com   

   cluster  
2       -1  
4       -1  
5       -1  
0        0  
3        0  
1        1  
6        1  

Deduplicated DataFrame:
   RegistrationID                Event           Name                  Email  \
0               1      Tech Conference    Alice Brown    alice.b@example.com   
1               2          AI Workshop      Bob Gre