# 🧹 Data Cleaning Steps (Customer Personality Dataset)
This notebook demonstrates **6 key preprocessing steps** performed on the dataset.

In [None]:
import pandas as pd

# Read the raw campaign export into a DataFrame.
# The file carries a .csv extension but is parsed with a tab delimiter.
df = pd.read_csv(
    "marketing_campaign_Task-1(dataset).csv",
    sep="\t",
)
# Last expression of the cell: preview of the first rows.
df.head()

In [None]:
# Step 1️⃣ - Handle Missing Values
print("Missing values before:\n", df.isnull().sum())

# Fix Income missing values
# Coerce to numeric first so non-numeric entries become NaN and are filled too.
income_numeric = pd.to_numeric(df['Income'], errors='coerce')
# Assign the filled Series back to the frame. Calling fillna(inplace=True) on
# df['Income'] acts on a column selection: pandas 2.x warns about it and under
# Copy-on-Write the original frame is left unchanged.
df['Income'] = income_numeric.fillna(income_numeric.median())

print("Missing values after:\n", df.isnull().sum())

In [None]:
# Step 2️⃣ - Remove Duplicates
# Report the row count on either side of dropping fully identical rows.
rows_before = len(df)
print("Rows before:", rows_before)
df = df.drop_duplicates()
rows_after = len(df)
print("Rows after:", rows_after)

In [None]:
# Step 3️⃣ - Standardize Text Values
# Work only on the candidate text columns that actually exist in the frame;
# a missing column is skipped rather than raising a KeyError.
text_columns = [name for name in ('Gender', 'Marital_Status') if name in df.columns]

print("Before standardization:")
for name in text_columns:
    print(f"{name}:", df[name].unique())

# Trim surrounding whitespace and upper-case every value.
for name in text_columns:
    df[name] = df[name].str.strip().str.upper()

print("After standardization:")
for name in text_columns:
    print(f"{name}:", df[name].unique())

In [None]:
# Step 4️⃣ - Convert Date Formats
if 'Dt_Customer' in df.columns:
    raw_dates = df['Dt_Customer']
    # NOTE(review): no format/dayfirst is given, so pandas uses its month-first
    # default for ambiguous values. Confirm the file's date layout and pass an
    # explicit format= if the dates are stored day-first.
    parsed_dates = pd.to_datetime(raw_dates, errors='coerce')

    # errors='coerce' turns unparseable values into NaT without any message, so
    # count the entries that were present before parsing but are missing after.
    n_unparsed = int((parsed_dates.isna() & raw_dates.notna()).sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} Dt_Customer value(s) could not be parsed and became NaT")

    df['Dt_Customer'] = parsed_dates
    print("Dt_Customer type:", df['Dt_Customer'].dtype)
    print("Sample Dt_Customer values:\n", df['Dt_Customer'].head())

In [None]:
# Step 5️⃣ - Rename Column Headers
# Normalise headers in two passes: lower-case, then spaces to underscores.
names_before = df.columns.tolist()
print("Before renaming:\n", names_before)

lowercased = df.columns.str.lower()
df.columns = lowercased.str.replace(" ", "_")

names_after = df.columns.tolist()
print("After renaming:\n", names_after)

In [None]:
# Step 6️⃣ - Fix Data Types
print("Before conversion:\n", df.dtypes)

# year_birth → nullable int
# Parse as a plain number. pd.to_datetime reads a numeric value such as 1957 as
# an offset from the Unix epoch (nanoseconds by default), so .dt.year would
# report 1970 for every row instead of the birth year.
if 'year_birth' in df.columns:
    df['year_birth'] = pd.to_numeric(df['year_birth'], errors='coerce').astype('Int64')

# income → nullable int
# Round before casting: astype('Int64') raises on floats with a fractional
# part, which a median-filled column can contain (e.g. a median ending in .5).
if 'income' in df.columns:
    df['income'] = pd.to_numeric(df['income'], errors='coerce').round().astype('Int64')

# dt_customer → keep datetime and add integer version
if 'dt_customer' in df.columns:
    df['dt_customer'] = pd.to_datetime(df['dt_customer'], errors='coerce')
    # The integer is the raw epoch offset in the column's datetime unit
    # (nanoseconds for datetime64[ns]).
    epoch_offset = df['dt_customer'].astype('int64').astype('Int64')
    # A plain int64 cast maps NaT to the minimum int64 value; mask those rows so
    # missing dates stay missing (<NA>) in the integer column.
    df['dt_customer_int'] = epoch_offset.mask(df['dt_customer'].isna())

print("After conversion:\n", df.dtypes)
# Guard the sample prints the same way the conversions are guarded, so a
# missing column is skipped instead of raising a KeyError.
if 'year_birth' in df.columns:
    print("\nYear_Birth sample:", df['year_birth'].head().tolist())
if 'income' in df.columns:
    print("Income sample:", df['income'].head().tolist())
if 'dt_customer_int' in df.columns:
    print("Dt_Customer as int sample:", df['dt_customer_int'].head().tolist())

In [None]:
# ✅ Final Summary
print("Final dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing values after cleaning:\n", df.isnull().sum())
# Last expression of the cell: preview of the cleaned frame.
df.head()