gfhfh

In [7]:
# Ques_12.ipynb - Dealing with Duplicates & Redundancy

import pandas as pd
import numpy as np

# === Step 1: Generate or Load Data ===
def generate_sample_data():
    """Build the demo DataFrame used by this exercise.

    The rows are written record-by-record so the repeated entries are easy
    to spot: IDs 101, 103 and 107 each appear twice with identical values.

    Returns:
        A DataFrame with columns ``ID``, ``Name``, ``Age`` and ``City``.
    """
    column_names = ('ID', 'Name', 'Age', 'City')
    records = [
        (101, 'Alice', 25, 'NY'),
        (102, 'Bob', 30, 'LA'),
        (103, 'Charlie', 35, 'Chicago'),
        (104, 'David', 40, 'Houston'),
        (105, 'Eva', 28, 'LA'),
        (101, 'Alice', 25, 'NY'),
        (103, 'Charlie', 35, 'Chicago'),
        (106, 'Frank', 33, 'Seattle'),
        (107, 'Grace', 45, 'Boston'),
        (107, 'Grace', 45, 'Boston'),
    ]
    # Transpose the records back into one list per column.
    by_column = {
        name: list(values)
        for name, values in zip(column_names, zip(*records))
    }
    return pd.DataFrame(by_column)


# === Step 2: Validate Data ===
def validate_data(df, required_columns=('ID',)):
    """Run basic sanity checks on the input DataFrame.

    Args:
        df: The DataFrame to check.
        required_columns: Column label, or iterable of column labels, that
            must be present in ``df``. Defaults to ``('ID',)``, which is the
            check this function performed before it was parameterized.

    Returns:
        True when all checks pass.

    Raises:
        ValueError: If ``df`` is empty.
        KeyError: If one or more required columns are missing.
    """
    if df.empty:
        raise ValueError("DataFrame is empty. Please check the input source.")
    if df.isnull().all().any():
        # Not fatal: an all-missing column is only reported, not rejected.
        print("⚠️ Warning: Some columns have only missing values.")

    # A bare string would otherwise be iterated character by character.
    if isinstance(required_columns, str):
        required_columns = (required_columns,)

    missing = [col for col in required_columns if col not in df.columns]
    if len(missing) == 1:
        # Same message as the original single-column check.
        raise KeyError(f"Missing required column: {missing[0]!r}")
    if missing:
        raise KeyError(f"Missing required columns: {missing}")
    return True


# === Step 3: Identify Full Duplicates ===
def get_full_duplicates(df):
    """Return every row whose values are repeated in another row.

    All members of a duplicate group are returned (first occurrence
    included), with their original index labels.
    """
    repeated_rows = df.duplicated(keep=False)
    return df[repeated_rows]


# === Step 4: Remove Full Duplicates ===
def remove_full_duplicates(df):
    """Return ``df`` with fully repeated rows removed.

    The first occurrence of each repeated row is kept and all columns are
    compared. The input frame is not modified.
    """
    deduplicated = df.drop_duplicates(subset=None, keep='first')
    return deduplicated


# === Step 5: Identify Partial Duplicates by Column (e.g., ID) ===
def get_partial_duplicates(df, subset_col):
    """Return every row whose ``subset_col`` value occurs more than once.

    Only ``subset_col`` is compared; other columns may differ. All members
    of each duplicate group are returned, first occurrence included.
    """
    key_columns = [subset_col]
    shares_key = df.duplicated(subset=key_columns, keep=False)
    return df[shares_key]


# === Step 6: Remove Partial Duplicates by Column (keeping first) ===
def drop_partial_duplicates(df, subset_col):
    """Return ``df`` keeping only the first row for each ``subset_col`` value.

    Later rows that repeat an already-seen value in ``subset_col`` are
    dropped, regardless of what the other columns contain.
    """
    key_columns = [subset_col]
    first_per_key = df.drop_duplicates(subset=key_columns, keep='first')
    return first_per_key


# === Step 7: Flag Duplicates in a Column ===
def flag_duplicates(df, subset_col):
    """Return a copy of ``df`` with a boolean ``is_duplicate`` column.

    A row is flagged True when its ``subset_col`` value appears in at least
    one other row. The caller's frame is left unmodified.
    """
    shares_key = df.duplicated(subset=[subset_col], keep=False)
    flagged = df.copy()
    flagged['is_duplicate'] = shares_key
    return flagged


# === Step 8: Save to CSV ===
def save_cleaned_data(df, path="Q12_cleaned_deduplicated.csv"):
    """Write ``df`` to ``path`` as CSV (without the index) and report it.

    Args:
        df: The DataFrame to persist.
        path: Destination file path for the CSV output.
    """
    df.to_csv(path_or_buf=path, index=False)
    confirmation = f"✅ Cleaned data saved to: {path}"
    print(confirmation)


# === MAIN EXECUTION ===
if __name__ == "__main__":
    # Demo pipeline: build the sample data, validate it, then show each
    # de-duplication step and save the final ID-unique frame to CSV.

    # Step 1: Load Data
    df = generate_sample_data()
    print("🔹 Original Data:\n", df)

    # Step 2: Validate
    try:
        validate_data(df)
    except (ValueError, KeyError) as e:
        # validate_data signals bad input with ValueError / KeyError only;
        # anything else is a genuine bug and should surface with a traceback.
        print(f"❌ Validation Error: {e}")
        # `exit()` is an interactive helper injected by the `site` module and
        # is not guaranteed to exist; SystemExit is the portable equivalent.
        raise SystemExit(1)

    # Step 3: Display full duplicates
    full_dupes = get_full_duplicates(df)
    print("\n🔸 Full Row Duplicates:\n", full_dupes)

    # Step 4: Remove full duplicates
    df_no_full_dupes = remove_full_duplicates(df)
    print("\n✅ After Removing Full Row Duplicates:\n", df_no_full_dupes)

    # Step 5: Show partial duplicates based on 'ID'
    partial_dupes = get_partial_duplicates(df_no_full_dupes, 'ID')
    print("\n🔸 Partial Duplicates Based on 'ID':\n", partial_dupes)

    # Step 6: Drop partial duplicates (keep first ID)
    df_unique_ids = drop_partial_duplicates(df_no_full_dupes, 'ID')
    print("\n✅ After Removing Duplicate 'ID's (Keep First):\n", df_unique_ids)

    # Step 7: Add a duplicate flag for transparency
    # Flags are computed on the frame from Step 4, i.e. before Step 6 drops
    # repeated IDs, so any remaining ID collisions stay visible.
    df_flagged = flag_duplicates(df_no_full_dupes, 'ID')
    print("\n🔸 Data with 'is_duplicate' flag:\n", df_flagged)

    # Step 8: Save cleaned output
    save_cleaned_data(df_unique_ids)


🔹 Original Data:
     ID     Name  Age     City
0  101    Alice   25       NY
1  102      Bob   30       LA
2  103  Charlie   35  Chicago
3  104    David   40  Houston
4  105      Eva   28       LA
5  101    Alice   25       NY
6  103  Charlie   35  Chicago
7  106    Frank   33  Seattle
8  107    Grace   45   Boston
9  107    Grace   45   Boston

🔸 Full Row Duplicates:
     ID     Name  Age     City
0  101    Alice   25       NY
2  103  Charlie   35  Chicago
5  101    Alice   25       NY
6  103  Charlie   35  Chicago
8  107    Grace   45   Boston
9  107    Grace   45   Boston

✅ After Removing Full Row Duplicates:
     ID     Name  Age     City
0  101    Alice   25       NY
1  102      Bob   30       LA
2  103  Charlie   35  Chicago
3  104    David   40  Houston
4  105      Eva   28       LA
7  106    Frank   33  Seattle
8  107    Grace   45   Boston

🔸 Partial Duplicates Based on 'ID':
 Empty DataFrame
Columns: [ID, Name, Age, City]
Index: []

✅ After Removing Duplicate 'ID's (Keep Fi