# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Hierarchical Clustering for Deduplication

**Steps**:
1. Data Set: Obtain a dataset containing duplicate employee information.
2. Perform Clustering: Use hierarchical agglomerative clustering to cluster the employee
records.
3. Evaluate Duplicates: Determine duplicates by analyzing the clusters formed.
4. Clean Data: Remove duplicate employee records found during clustering.

In [1]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

def deduplicate_employees(df, features, threshold):
    text_features = df[features].astype(str).agg(' '.join, axis=1)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_features)
    distance_matrix = pairwise_distances(tfidf_matrix, metric='cosine')
    clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='average', distance_threshold=threshold)
    clusters = clustering.fit_predict(distance_matrix)
    df['cluster'] = clusters
    duplicate_clusters = df['cluster'].value_counts()[df['cluster'].value_counts() > 1].index.tolist()
    duplicates = df[df['cluster'].isin(duplicate_clusters)].sort_values(by='cluster')
    deduplicated_df = df.drop_duplicates(subset=features, keep='first')
    return duplicates, deduplicated_df

if __name__ == '__main__':
    data = {'EmployeeID': [1, 2, 3, 4, 5, 6, 7],
            'Name': ['John Smith', 'Jane Doe', 'Peter Jones', 'John Smith', 'Jane D', 'Peter J.', 'Jane Doe'],
            'Email': ['john.123@example.com', 'jane.doe@example.com', 'peter.j@example.com', 'john.smith@example.com', 'jane.d@example.com', 'peter.jones@example.com', 'jane.doe@example.com'],
            'Phone': ['123-456-7890', '987-654-3210', '111-222-3333', '1234567890', '9876543210', '1112223333', '987-654-3210']}
    employee_df = pd.DataFrame(data)
    features_to_consider = ['Name', 'Email', 'Phone']
    threshold_distance = 0.2

    duplicate_records, cleaned_df = deduplicate_employees(employee_df.copy(), features_to_consider, threshold_distance)

    print("Duplicate Records:")
    print(duplicate_records)
    print("\nDeduplicated DataFrame:")
    print(cleaned_df)

TypeError: AgglomerativeClustering.__init__() got an unexpected keyword argument 'affinity'