In [3]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import DBSCAN
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Step 1: Data Loading
# Load your dataset
df = pd.read_csv("your_dataset.csv")

# Step 2: Data Preprocessing
# Fill missing numeric values with the mean of their column.
# numeric_only=True is required here: the frame still contains text and
# categorical columns at this point, and pandas >= 2.0 raises a TypeError when
# asked for the mean of non-numeric data instead of silently skipping it.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Fill missing categories with the most frequent value (the mode) so that a
# missing value is not passed on to the label encoder below.
# mode() is empty when the column holds no values at all; in that case the
# column is left as it is.
category_mode = df['category_column'].mode()
if not category_mode.empty:
    df['category_column'] = df['category_column'].fillna(category_mode.iloc[0])

# Convert categorical columns to numerical ones (using label encoding)
label_encoder = LabelEncoder()
df['category_column'] = label_encoder.fit_transform(df['category_column'])

# Step 3: Feature Engineering (if you have text columns, you can create text similarity features)
# Example: Calculate text similarity using fuzzywuzzy

def calculate_text_similarity(text1, text2):
    """Return the fuzzy similarity of two texts as a score between 0 and 1.

    The score is ``fuzz.ratio`` (a 0-100 value) scaled down by 100.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A float between 0 and 1. If either argument is not a string (for
        example ``None`` or the float NaN that pandas uses for a missing
        cell), 0.0 is returned: a missing text is treated as matching nothing
        rather than being handed to ``fuzz.ratio``, which cannot take the
        length of a float.
    """
    if not isinstance(text1, str) or not isinstance(text2, str):
        return 0.0
    return fuzz.ratio(text1, text2) / 100.0

# Compare every pair of rows. (Comparing a text with itself, as a per-row
# apply would do, always yields the same score and carries no information.)
# Missing texts are replaced by '' so they are always handed over as strings.
# NOTE: this is O(n^2) in time and memory, so it only suits datasets small
# enough for an n x n matrix.
texts = df['text_column'].fillna('').astype(str).tolist()
n_rows = len(texts)
pairwise_text_similarity = np.eye(n_rows)  # a row is identical to itself
for i, j in zip(*np.triu_indices(n_rows, k=1)):
    score = calculate_text_similarity(texts[i], texts[j])
    pairwise_text_similarity[i, j] = score
    pairwise_text_similarity[j, i] = score

# Per-row summary feature: the best text match found among the *other* rows.
similarity_to_others = pairwise_text_similarity.copy()
np.fill_diagonal(similarity_to_others, 0.0)
df['text_similarity'] = similarity_to_others.max(axis=1)

# Step 4: Clustering to Detect Duplicates

# Per-row feature matrix (numeric features plus the text summary feature).
X = df[['numerical_feature1', 'numerical_feature2', 'text_similarity']].values

# Pairwise distance between rows: the Euclidean distance over the numeric
# features, extended by the text dissimilarity (1 - similarity) as one more
# squared term. Two rows are close only if both their numbers and their texts
# are close.
# NOTE(review): the numeric features are used unscaled, as before; consider
# standardizing them so that eps means the same for every column.
numeric_features = X[:, :2].astype(float)
numeric_diff = numeric_features[:, None, :] - numeric_features[None, :, :]
distance_matrix = np.sqrt(
    (numeric_diff ** 2).sum(axis=2) + (1.0 - pairwise_text_similarity) ** 2
)

# Using DBSCAN for clustering similar rows.
# eps is the maximum distance between two samples for them to be considered as
# in the same neighborhood. min_samples=2 lets a single pair of near-identical
# rows form a cluster; with a larger value a duplicate would only be detected
# once it occurs that many times.
db = DBSCAN(eps=0.5, min_samples=2, metric='precomputed')
df['cluster'] = db.fit_predict(distance_matrix)

# Step 5: Deduplication

# In the DBSCAN clustering result, -1 means that the point is considered as
# noise (not part of any cluster). Noise rows have no near-duplicate, so they
# are exactly the unique records and must all be kept. Rows sharing a cluster
# label are treated as duplicates of each other, so only the first row of each
# cluster is kept. The original row order is preserved.
is_unique_row = df['cluster'] == -1
is_first_in_cluster = ~df['cluster'].duplicated()
deduplicated_df = df[is_unique_row | is_first_in_cluster]

# Alternatively, use the similarity matrix to drop duplicates by comparing rows with a similarity threshold.
# For example:
# similarity_matrix = cosine_similarity(X)
# threshold = 0.9
# duplicate_pairs = np.where(similarity_matrix > threshold)

# Step 6: Save the deduplicated dataset
deduplicated_df.to_csv("deduplicated_dataset.csv", index=False)






ModuleNotFoundError: No module named 'fuzzywuzzy'