# Data Cleaning – Loan Default Prediction

This notebook explores the raw dataset, documents the cleaning process, and saves the cleaned data to `data/processed/train_clean.csv`.

## Raw data source
- `data/raw/cs-training.csv` (path relative to the project root; the notebook reads it as `../data/raw/cs-training.csv`)

In [8]:
from pathlib import Path

import numpy as np
import pandas as pd

import sys

# The cleaning helpers live in ../src, which is not on the import path by
# default, so register that folder before importing from data_preprocessing.
sys.path.append('../src')
from data_preprocessing import (
    cap_outliers,
    fill_missing_with_median,
    log1p_transform,
)

# Load the raw training data and discard the 'Unnamed: 0' column
# (presumably a row index that was saved into the CSV — confirm against the raw file).
raw_path = "../data/raw/cs-training.csv"
df = pd.read_csv(raw_path).drop(columns=['Unnamed: 0'])

In [2]:
# Row-filter thresholds, named so the cut-offs are easy to find and tune.
MIN_LOG1P_MONTHLY_INCOME = 1   # rows at or below this value are dropped
MAX_TOTAL_LATE_PAYMENTS = 50   # rows at or above this value are dropped

# The three past-due counters that are folded into a single TotalLatePayments column.
LATE_PAYMENT_COLS = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]

# Step 1: impute, transform, cap. The helpers come from data_preprocessing.
# NOTE(review): log1p_transform presumably adds 'log1p'-prefixed columns, since
# the steps below reference them — confirm against src/data_preprocessing.py.
df = fill_missing_with_median(df, ['MonthlyIncome', 'NumberOfDependents'])
df = log1p_transform(df, ['MonthlyIncome', 'RevolvingUtilizationOfUnsecuredLines', 'DebtRatio'])
df = cap_outliers(
    df,
    ['log1pDebtRatio', 'log1pRevolvingUtilizationOfUnsecuredLines', 'log1pMonthlyIncome'],
    lower_q=0.01,
    upper_q=0.99,
)

# Step 2: keep only rows whose transformed income exceeds the minimum.
# This runs after cap_outliers, so the rows removed here were still part of
# the frame the capping helper received.
df = df[df['log1pMonthlyIncome'] > MIN_LOG1P_MONTHLY_INCOME]

# Step 3: replace the three past-due counters with their total.
# .assign() builds a new frame instead of writing a column into the filtered
# frame from step 2, which avoids pandas' SettingWithCopyWarning.
# Built-in sum() adds the columns with plain `+`, so a missing value in any
# counter still yields a missing total (unlike DataFrame.sum, which skips NaN).
df = (
    df.assign(TotalLatePayments=sum(df[col] for col in LATE_PAYMENT_COLS))
      .drop(columns=LATE_PAYMENT_COLS)
)

# Step 4: drop rows with very large late-payment totals.
# NOTE(review): presumably these are sentinel-coded or erroneous records;
# the rationale for the cut-off is not documented here — confirm.
df = df[df['TotalLatePayments'] < MAX_TOTAL_LATE_PAYMENTS]

In [7]:
# Persist the cleaned frame. index=False leaves the pandas row index out of
# the file, so only the data columns are written.
output_path = Path("../data/processed/train_clean.csv")

# to_csv does not create missing folders; make sure the target directory
# exists so the cell also works when ../data/processed has not been created yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)