In [142]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import os

df = pd.read_csv('data/raw/UCI_Credit_Card.csv')

In [143]:
df = df.drop(columns=['ID'])

In [144]:
df['EDUCATION'] = df['EDUCATION'].replace({0:4, 5:4, 6:4})
df['MARRIAGE'] = df['MARRIAGE'].replace({0:3})

In [145]:
bill_cols = ['BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6']
pay_cols = ['PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']
pay_delay_cols = ['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']

In [146]:
mask_neg = (df[bill_cols] < 0).any(axis=1)
if mask_neg.sum() > 0:
    df = df.loc[~mask_neg].reset_index(drop=True)

In [147]:
x = df.drop(columns='default.payment.next.month').copy()
y = df['default.payment.next.month'].copy()

In [148]:
x['sum_all_bill'] = x[bill_cols].sum(axis=1)
x['avg_bill'] = x[bill_cols].mean(axis=1)
x['max_bill'] = x[bill_cols].max(axis=1)
x['std_bill'] = x[bill_cols].std(axis=1)
x['sum_all_pay'] = x[pay_cols].sum(axis=1)
x['avg_pay'] = x[pay_cols].mean(axis=1)
x['max_pay'] = x[pay_cols].max(axis=1)
x['std_pay'] = x[pay_cols].std(axis=1)
x['bill_to_limit_ratio'] = x['sum_all_bill'] / x['LIMIT_BAL'].replace(0, np.nan)
x['pay_to_bill_ratio'] = x['sum_all_pay'] / x['sum_all_bill'].replace(0, np.nan)
x['avg_pay_delay'] = x[pay_delay_cols].mean(axis=1)
x['count_delays'] = (x[pay_delay_cols] > 0).sum(axis=1)
x['bill_to_limit_ratio'] = x['bill_to_limit_ratio'].fillna(0)
x['pay_to_bill_ratio'] = x['pay_to_bill_ratio'].fillna(0)

In [149]:
categories_cols = ['EDUCATION', 'MARRIAGE']
x = pd.get_dummies(x, columns=categories_cols, drop_first=True)

In [150]:
def iqr_clip(df_in, cols):
    df = df_in.copy()
    for col in cols:
        if col not in df.columns:
            continue
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df[col] = df[col].clip(lower, upper)
    return df

In [151]:
numeric_cols_for_clip = [
    'LIMIT_BAL', 'AGE',
    'sum_all_bill', 'avg_bill', 'max_bill', 'min_bill', 'std_bill',
    'sum_all_pay', 'avg_pay', 'max_pay', 'std_pay',
    'bill_to_limit_ratio', 'pay_to_bill_ratio', 'avg_pay_delay', 'count_delays'
]

In [152]:
x = iqr_clip(x, numeric_cols_for_clip)

In [153]:
corr_matrix = x.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = upper.columns[upper.gt(0.9).any()]
x = x.drop(columns=to_drop)

In [154]:
x_num = x.select_dtypes(include=[np.number]).fillna(0)
vif = pd.Series(
    [variance_inflation_factor(x_num.values, i) for i in range(x_num.shape[1])],
    index=x_num.columns
)
x.drop(columns=vif[vif > 10].index)

Unnamed: 0,LIMIT_BAL,SEX,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,PAY_AMT1,...,std_bill,sum_all_pay,bill_to_limit_ratio,pay_to_bill_ratio,count_delays,EDUCATION_2,EDUCATION_3,EDUCATION_4,MARRIAGE_2,MARRIAGE_3
0,20000.0,2,2,2,-1,-1,-2,-2,3913.0,0.0,...,1761.633219,689.0,0.385200,0.089434,2.0,True,False,False,False,False
1,120000.0,2,-1,2,0,0,0,2,2682.0,0.0,...,637.967841,5000.0,0.142308,0.292791,2.0,True,False,False,True,False
2,90000.0,2,0,0,0,0,0,0,29239.0,1518.0,...,6064.518593,11018.0,1.129478,0.108388,0.0,True,False,False,True,False
3,50000.0,2,0,0,0,0,0,0,46990.0,2000.0,...,10565.793518,8388.0,4.626680,0.036259,0.0,True,False,False,False,False
4,50000.0,1,-1,0,-1,0,0,0,8617.0,2000.0,...,10668.590074,59049.0,2.186780,0.540054,0.0,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28065,80000.0,1,2,2,2,2,2,2,72557.0,7000.0,...,3510.612335,21500.0,5.886662,0.045654,2.5,True,False,False,True,False
28066,220000.0,1,0,0,0,0,0,0,188948.0,8500.0,...,33619.756248,42550.0,3.297041,0.058661,0.0,False,True,False,False,False
28067,150000.0,1,-1,-1,-1,-1,0,0,1683.0,1837.0,...,3200.534247,14490.0,0.141213,0.684071,0.0,False,True,False,True,False
28068,30000.0,1,4,3,2,-1,0,0,3565.0,0.0,...,9354.149660,31300.0,2.349867,0.443997,2.5,True,False,False,True,False


In [155]:
selector = SelectKBest(score_func=f_classif, k=12)
selector.fit(x, y)
top_features = x.columns[selector.get_support()]
print("Top features:", list(top_features))

Top features: ['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'sum_all_pay', 'bill_to_limit_ratio', 'pay_to_bill_ratio', 'avg_pay_delay', 'count_delays']


In [156]:
x_train, x_test, y_train, y_test = train_test_split(
    x[top_features], y, stratify=y, test_size=0.2, random_state=42
)

In [157]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [158]:
pd.DataFrame(x_train_scaled, columns=top_features).to_csv("data/processed/x_train.csv", index=False)
pd.DataFrame(x_test_scaled, columns=top_features).to_csv("data/processed/x_test.csv", index=False)

In [159]:
y_train.to_csv("data/processed/y_train.csv", index=False)
y_test.to_csv("data/processed/y_test.csv", index=False)