# Data Cleaning – Loan Default Prediction

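This notebook documents the cleaning process for the raw dataset: it imputes missing values, engineers additional features, and saves the result to `data/processed/train_clean.csv`.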

## Raw data source:
- `data/raw/cs-training.csv`

In [1]:
# --- Imports: standard library, third-party, then project-local ------------
import sys

import numpy as np
import pandas as pd

# Put ../src on the import path ahead of the `data_preprocessing` import below
# (presumably the folder that module lives in).
sys.path.append('../src')

from data_preprocessing import fill_missing_with_median, log1p_transform, cap_outliers

# --- Load the raw training data ---------------------------------------------
# Read the CSV and discard its 'Unnamed: 0' column in a single step
# (presumably a leftover row index written by whatever exported the file).
df = pd.read_csv("../data/raw/cs-training.csv").drop(columns='Unnamed: 0')

In [2]:
# Fill gaps in the two columns that feed the ratio features below. The helper
# comes from data_preprocessing; median imputation is implied by its name.
df = fill_missing_with_median(df, ['MonthlyIncome', 'NumberOfDependents'])

# Shared denominators.
# - Dependents are shifted by 1 so that a count of 0 cannot divide by zero.
# - Age is not shifted; instead a zero age is masked to NaN. pandas does not
#   raise on division by zero, so an unguarded zero age would silently store
#   inf (or NaN for 0/0) in the derived features.
dependents_plus_one = df['NumberOfDependents'] + 1
age_nonzero = df['age'].mask(df['age'].eq(0))

# Per-dependent and per-age ratios.
df['MonthlyIncomePerDependent'] = df['MonthlyIncome'] / dependents_plus_one
df['DebtRatioPerDependent'] = df['DebtRatio'] / dependents_plus_one
df['DebtRatioPerAge'] = df['DebtRatio'] / age_nonzero
df['RevolvingUtilizationOfUnsecuredLinesPerDependent'] = (
    df['RevolvingUtilizationOfUnsecuredLines'] / dependents_plus_one
)
# Income is shifted by 1 for the same zero-division reason as dependents.
df['DebtRatioOverIncome'] = df['DebtRatio'] / (df['MonthlyIncome'] + 1)

# Late-payment score: each past-due counter is weighted by severity
# (30-59 days -> 1, 60-89 days -> 5, 90+ days -> 10) and summed.
df['WeightedLateScore'] = (1 * df['NumberOfTime30-59DaysPastDueNotWorse'] +
                            5 * df['NumberOfTime60-89DaysPastDueNotWorse'] +
                            10 * df['NumberOfTimes90DaysLate'])

df['WeightedLateScorePerDependent'] = df['WeightedLateScore'] / dependents_plus_one
df['WeightedLateScorePerAge'] = df['WeightedLateScore'] / age_nonzero

# Bucket age into left-closed bands ([0, 20), [20, 30), ...) and one-hot
# encode them.
#
# The last edge is open-ended (np.inf) rather than 100: with right=False an
# upper edge of 100 excludes age 100 itself and anything older, so pd.cut
# would return NaN for those rows. After get_dummies they would carry all-zero
# indicators and be indistinguishable from the dropped '0-20' baseline.
# The '90-100' label is kept so the output column names do not change; it now
# effectively means "90 and over". Negative or missing ages still fall outside
# every band and end up with all-zero indicators.
age_bins = [0, 20, 30, 40, 50, 60, 70, 80, 90, np.inf]
age_labels = ['0-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']
df['AgeGroup'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)

# drop_first=True removes the indicator for the first category ('0-20'),
# which therefore acts as the implicit reference level.
df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)

# Income relative to age. A zero age is masked to NaN first: pandas does not
# raise on division by zero, so an unguarded zero age would silently store inf
# (or NaN for 0/0) in the feature.
df['MonthlyIncomePerAge'] = df['MonthlyIncome'] / df['age'].mask(df['age'].eq(0))

# Product of income and debt ratio.
# NOTE(review): reads as an absolute monthly debt amount only if DebtRatio is
# debt payments divided by monthly income — confirm against the data dictionary.
df['TotalMonthlyDebtPayment'] = df['MonthlyIncome'] * df['DebtRatio']

In [3]:
# Write the cleaned frame to the processed-data folder, without the row index.
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists first (e.g. on a fresh checkout where it is not tracked).
import os

os.makedirs("../data/processed", exist_ok=True)
df.to_csv("../data/processed/train_clean.csv", index=False)