In [1]:
# ============================
# Week 2 – ETL: Data Cleaning & Enrichment
# ============================

import pandas as pd
import numpy as np

# ----------------------------
# 1. Extract – Load CSV File
# ----------------------------
df = pd.read_csv(r"C:\Users\user\DataMining_GroupProject\budgetwise_synthethic_dirty.csv")
   
print("Raw Dataset Shape:", df.shape)
df.head()


Raw Dataset Shape: (15836, 9)


Unnamed: 0,transaction_id,user_id,date,transaction_type,category,amount,payment_mode,location,notes
0,T03512,U039,December 22 2021,Expense,Rent,998,Cash,Pune,Paid electricity bill
1,T03261,U179,3/24/2022,Expense,Food,$143,Card,Delhi,Grocery shopping
2,T04316,U143,October 18 2022,Expense,Rent,149,Cash,Bengaluru,
3,T05649,U079,12/12/2021,Expense,Rent,49,UPI,,Paid electricity bill
4,T14750,U020,,Income,Other Income,83802,Bank Transfer,Chennai,Gift via app


In [2]:
# ----------------------------
# 2. Transform – Basic Cleaning
# ----------------------------

# Remove duplicate rows
df = df.drop_duplicates()

# Handle missing values
# - numeric fields → fill with median
# - categorical fields → fill with mode
for col in df.columns:
    if df[col].dtype in ["int64", "float64"]:
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna(df[col].mode()[0])

print("After Cleaning Shape:", df.shape)
df.info()
df.head()

After Cleaning Shape: (15031, 9)
<class 'pandas.core.frame.DataFrame'>
Index: 15031 entries, 0 to 15835
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    15031 non-null  object
 1   user_id           15031 non-null  object
 2   date              15031 non-null  object
 3   transaction_type  15031 non-null  object
 4   category          15031 non-null  object
 5   amount            15031 non-null  object
 6   payment_mode      15031 non-null  object
 7   location          15031 non-null  object
 8   notes             15031 non-null  object
dtypes: object(9)
memory usage: 1.1+ MB


Unnamed: 0,transaction_id,user_id,date,transaction_type,category,amount,payment_mode,location,notes
0,T03512,U039,December 22 2021,Expense,Rent,998,Cash,Pune,Paid electricity bill
1,T03261,U179,3/24/2022,Expense,Food,$143,Card,Delhi,Grocery shopping
2,T04316,U143,October 18 2022,Expense,Rent,149,Cash,Bengaluru,Gift
3,T05649,U079,12/12/2021,Expense,Rent,49,UPI,Hyderabad,Paid electricity bill
4,T14750,U020,12/12/2020,Income,Other Income,83802,Bank Transfer,Chennai,Gift via app


In [3]:
# Standardize column names
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Convert date columns automatically
for col in df.columns:
    if "date" in col:
        df[col] = pd.to_datetime(df[col], errors="coerce")


In [4]:
# ----------------------------
# Handle Outliers
# ----------------------------

from scipy import stats

numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns

# Remove rows where numeric values are too extreme (|z| > 3)
df = df[(np.abs(stats.zscore(df[numeric_cols])) < 3).all(axis=1)]


In [5]:
# ----------------------------
# Add calculated fields
# ----------------------------

# Example: profit margin (profit / revenue)
if {"revenue", "cost"}.issubset(df.columns):
    df["profit"] = df["revenue"] - df["cost"]
    df["profit_margin"] = df["profit"] / df["revenue"]

# Example: purchase frequency (per customer)
if {"customer_id", "transaction_id"}.issubset(df.columns):
    purchase_counts = df.groupby("customer_id")["transaction_id"].count()
    df["purchase_frequency"] = df["customer_id"].map(purchase_counts)

# Example: total spend per customer
if {"customer_id", "amount"}.issubset(df.columns):
    total_spend = df.groupby("customer_id")["amount"].sum()
    df["total_spend"] = df["customer_id"].map(total_spend)


In [6]:
# Remove rows with unreasonable ages, prices, etc.
if "age" in df.columns:
    df = df[df["age"].between(10, 100)]

if "price" in df.columns:
    df = df[df["price"] > 0]


In [7]:
# ----------------------------
#  Load – Export cleaned file
# ----------------------------

df.to_csv("cleaned_dataset.csv", index=False)
print("Cleaned dataset saved as cleaned_dataset.csv")


Cleaned dataset saved as cleaned_dataset.csv
