# Data Cleaning Task - Customer Dataset

This notebook shows the full step-by-step cleaning process for the raw dataset (`raw_customer_data.csv`).
We identify issues (missing values, duplicates, inconsistent formats) and clean them to produce `cleaned_customer_data.csv`.

In [None]:
import pandas as pd
import numpy as np

# Read the raw customer export into `raw_df`, the frame every later cell works on,
# then report its (rows, columns) shape and preview the first five rows.
raw_df = pd.read_csv(filepath_or_buffer='raw_customer_data.csv')
print('Raw dataset shape:', raw_df.shape)
raw_df.head(n=5)

## Step 1: Inspect dataset info

In [None]:
# Print column dtypes and non-null counts.
raw_df.info()
# Per-column count of missing values, shown as the cell's output.
raw_df.isna().sum()

## Step 2: Remove duplicate rows

In [None]:
# Count rows on either side of the de-duplication so the cell can report how
# many fully identical rows were dropped (the first occurrence is kept).
before = len(raw_df)
raw_df = raw_df.drop_duplicates(keep='first')
after = len(raw_df)
print(f'Removed {before-after} duplicate rows.')

## Step 3: Handle missing values
- Fill numeric columns with median
- Fill categorical (text) columns with 'unknown', and normalise their text by stripping surrounding whitespace and converting to lowercase

In [None]:
# Column groups; both names are reused later when the summary report is built.
num_cols = raw_df.select_dtypes(include=['int64','float64']).columns
cat_cols = raw_df.select_dtypes(include=['object']).columns


def _clean_text(value):
    """Return `value` stripped and lower-cased if it is a str; otherwise return it unchanged.

    Object columns can hold non-string elements (NaN, booleans, numbers).
    The `.str` accessor turns every such element into NaN, which would
    silently discard real data, so normalisation is applied per element and
    only to genuine strings.
    """
    return value.strip().lower() if isinstance(value, str) else value


# Numeric columns: replace missing entries with the column median.
for c in num_cols:
    raw_df[c] = raw_df[c].fillna(raw_df[c].median())

# Categorical columns: normalise the text *before* filling so that blank or
# whitespace-only entries are recognised as missing and also become 'unknown'
# (otherwise they end up as '' and are written to the CSV as empty fields).
# NOTE(review): columns later converted to datetime are presumably still object
# dtype at this point, so their gaps also become 'unknown' here and are coerced
# to NaT by the datetime conversion step - confirm this is intended.
for c in cat_cols:
    cleaned = raw_df[c].map(_clean_text)
    raw_df[c] = cleaned.mask(cleaned == '').fillna('unknown')

# Remaining missing values per column, shown as the cell's output.
raw_df.isnull().sum()

## Step 4: Convert date columns to datetime (if present)

In [None]:
# Any column whose name contains "date" (case-insensitive) is parsed to
# datetime; values that cannot be parsed become NaT because of errors='coerce'.
for col in raw_df.columns:
    if 'date' not in col.lower():
        continue
    raw_df[col] = pd.to_datetime(raw_df[col], errors='coerce')

raw_df.head(n=5)

## Step 5: Save cleaned dataset

In [None]:
# Write the cleaned frame to disk without the index column, then confirm.
raw_df.to_csv(path_or_buf='cleaned_customer_data.csv', index=False)
print('Cleaned dataset saved as cleaned_customer_data.csv')

## Step 6: Create a summary report

In [None]:
import json

# Gather the headline figures for the cleaned frame.
n_rows, n_columns = raw_df.shape
n_missing = int(raw_df.isna().sum().sum())   # total missing cells across all columns
n_duplicates = int(raw_df.duplicated().sum())  # rows that repeat an earlier row

# Key order is kept as-is because it determines the layout of the JSON file.
summary = dict(
    final_rows=n_rows,
    final_columns=n_columns,
    missing_values=n_missing,
    duplicates=n_duplicates,
    numeric_columns_filled=list(num_cols),
    categorical_columns_filled=list(cat_cols),
)

# Persist the report next to the cleaned CSV and display it as the cell output.
with open('cleaning_summary.json','w') as out_file:
    out_file.write(json.dumps(summary, indent=2))
summary