In [1]:
import pandas as pd

# Toy records for the walkthrough: row 4 repeats row 1 exactly, while row 2
# differs from row 0 only in the letter case of the name.
data = dict(
    name=["John Doe", "Jane Smith", "john doe", "Alice Brown", "Jane Smith"],
    age=[28, 34, 28, 45, 34],
    city=["New York", "Chicago", "New York", "Los Angeles", "Chicago"],
)

# Column order follows the key order of `data`.
df = pd.DataFrame(data)
print("Original Dataset:", df, sep="\n")


Original Dataset:
          name  age         city
0     John Doe   28     New York
1   Jane Smith   34      Chicago
2     john doe   28     New York
3  Alice Brown   45  Los Angeles
4   Jane Smith   34      Chicago


Identify Exact Duplicates

In [2]:
# Boolean Series: True for every row that exactly repeats an earlier row
# (the first occurrence of each row stays False)
exact_duplicates = df.duplicated()
print("Exact duplicates boolean mask:")
print(exact_duplicates)

# Count exact duplicates by summing the mask computed above (True counts as 1),
# so the frame is not scanned a second time and the count always matches the
# mask that was just printed
print("Number of exact duplicates:", exact_duplicates.sum())


Exact duplicates boolean mask:
0    False
1    False
2    False
3    False
4     True
dtype: bool
Number of exact duplicates: 1


Remove Exact Duplicates

In [3]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
# The result is assigned back to `df` instead of using inplace=True: the call
# then returns the new frame explicitly (chainable, no silent mutation of the
# object the first cell displayed), and re-running this cell is still harmless.
df = df.drop_duplicates()
print("Dataset After Removing Exact Duplicates:")
print(df)


Dataset After Removing Exact Duplicates:
          name  age         city
0     John Doe   28     New York
1   Jane Smith   34      Chicago
2     john doe   28     New York
3  Alice Brown   45  Los Angeles


Handle Partial / Fuzzy Duplicates

In [4]:
# Lowercase every name so spellings that differ only by letter case
# collapse into a single value
lowered_names = df['name'].str.lower()
df['name'] = lowered_names

# Tally rows per standardized name, most frequent first
rows_per_name = df.groupby('name').size()
name_counts = rows_per_name.sort_values(ascending=False)
print("Counts of standardized names:", name_counts, sep="\n")


Counts of standardized names:
name
john doe       2
alice brown    1
jane smith     1
dtype: int64


Aggregate or Keep One Record for Partial Duplicates

In [5]:
# Example: Keep the first occurrence of each standardized name.
#
# drop_duplicates(subset='name', keep='first') retains whole rows. The previous
# groupby('name').first() takes the first *non-null* value of each column
# independently, so with any missing field it could assemble a record that mixes
# values from different rows; grouping also drops rows whose name is missing.
#
# Sorting by name and resetting the index keeps the name-ordered layout with a
# fresh 0..n-1 index.
df = (
    df.drop_duplicates(subset='name', keep='first')
      .sort_values('name')
      .reset_index(drop=True)
)
print("Dataset After Handling Partial Duplicates:")
print(df)


Dataset After Handling Partial Duplicates:
          name  age         city
0  alice brown   45  Los Angeles
1   jane smith   34      Chicago
2     john doe   28     New York


Final Check for Duplicates

In [6]:
# Recount fully identical rows as a last sanity check on the cleaned frame
remaining_duplicates = df.duplicated().sum()
print("Number of duplicates remaining:", remaining_duplicates)


Number of duplicates remaining: 0
