In [21]:
import pandas as pd

# Load the processed credit-card clients table. The path is relative, so it
# resolves against the directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer="../data/processed_credit_card_clients.csv")


In [22]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [23]:
# Recode SEX from the raw codes (1, 2) to a binary 0/1 flag: 1 -> 0, 2 -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# validate first: this catches unexpected codes and an accidental second run
# of this cell (the column would already hold 0/1) instead of silently
# corrupting the column.
sex_codes = {1: 0, 2: 1}
unexpected_sex = set(df["SEX"].unique()) - set(sex_codes)
if unexpected_sex:
    raise ValueError(
        f"SEX contains values outside {sorted(sex_codes)}: {unexpected_sex}; "
        "was this cell already run?"
    )
df["SEX"] = df["SEX"].map(sex_codes)

In [24]:
# One-hot encode the EDUCATION and MARRIAGE columns. drop_first=True omits the
# first level of each column, which then acts as the reference category.
df = pd.get_dummies(
    data=df,
    columns=["EDUCATION", "MARRIAGE"],
    drop_first=True,
)

In [25]:
# Display the DataFrame (pandas renders a truncated view for large frames).
df

Unnamed: 0,LIMIT_BAL,SEX,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_2,EDUCATION_3,EDUCATION_4,MARRIAGE_2,MARRIAGE_3
0,20000,1,24,2,2,-1,-1,-2,-2,3913,...,0,0,0,0,1,True,False,False,False,False
1,120000,1,26,-1,2,0,0,0,2,2682,...,1000,1000,0,2000,1,True,False,False,True,False
2,90000,1,34,0,0,0,0,0,0,29239,...,1000,1000,1000,5000,0,True,False,False,True,False
3,50000,1,37,0,0,0,0,0,0,46990,...,1200,1100,1069,1000,0,True,False,False,False,False
4,50000,0,57,-1,0,-1,0,0,0,8617,...,10000,9000,689,679,0,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,0,39,0,0,0,0,0,0,188948,...,5003,3047,5000,1000,0,False,True,False,False,False
29996,150000,0,43,-1,-1,-1,-1,0,0,1683,...,8998,129,0,0,0,False,True,False,True,False
29997,30000,0,37,4,3,2,-1,0,0,3565,...,22000,4200,2000,3100,1,True,False,False,True,False
29998,80000,0,41,1,-1,0,0,0,-1,-1645,...,1178,1926,52964,1804,1,False,True,False,False,False


### Encoding Strategy
- SEX encoded as binary (0/1)
- EDUCATION and MARRIAGE one-hot encoded (first level dropped as the reference category) to avoid imposing an artificial order on nominal categories
- Ordinal PAY features kept numeric

In [26]:
from sklearn.preprocessing import StandardScaler
# Continuous numerical columns to standardize: LIMIT_BAL, AGE, and the six
# BILL_AMT1..BILL_AMT6 and six PAY_AMT1..PAY_AMT6 columns, in that order.
scale_cols = (
    ["LIMIT_BAL", "AGE"]
    + [f"BILL_AMT{idx}" for idx in range(1, 7)]
    + [f"PAY_AMT{idx}" for idx in range(1, 7)]
)

In [27]:
# Standardizer with its default behaviour spelled out: subtract each column's
# mean and divide by its standard deviation.
scaler = StandardScaler(with_mean=True, with_std=True)

In [28]:
# Fit the scaler on the selected columns and overwrite them with their
# standardized values.
# NOTE(review): the scaler is fit on every row of df. If a train/test split
# happens later in the notebook, fit on the training rows only to avoid
# leaking test statistics -- confirm against the downstream cells.
scaled_values = scaler.fit_transform(df[scale_cols])
df[scale_cols] = scaled_values

In [29]:
# Echo the list of column names selected for standardization.
scale_cols

['LIMIT_BAL',
 'AGE',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

### Feature Scaling
- StandardScaler applied to continuous numerical features
- Binary and ordinal features left unscaled

In [30]:
# Sanity-check the scaling: means should be ~0 and standard deviations ~1.
# describe() reports the sample std (ddof=1) while StandardScaler divides by
# the population std (ddof=0), so std shows as slightly above 1 rather than
# exactly 1.
df.loc[:, scale_cols].describe()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,-6.063298e-17,-1.856885e-16,-1.2316070000000001e-17,-3.7895610000000005e-17,6.252776000000001e-17,5.873820000000001e-17,-2.3684760000000003e-17,1.136868e-17,-4.736952e-18,1.373716e-17,2.0842590000000002e-17,1.800042e-17,-1.9658350000000003e-17,-1.7881990000000003e-17
std,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017
min,-1.213794,-1.571479,-2.944312,-1.671375,-2.945672,-3.315048,-2.000874,-6.355247,-0.3419416,-0.2569895,-0.2968013,-0.3080626,-0.3141361,-0.2933821
25%,-0.9054983,-0.8120745,-0.647312,-0.6490466,-0.6394814,-0.6363293,-0.63406,-0.6316338,-0.2815661,-0.2208358,-0.2746506,-0.289168,-0.2976091,-0.2867584
50%,-0.2118326,-0.1611565,-0.3916884,-0.3931159,-0.3882529,-0.3763451,-0.3652683,-0.3660725,-0.215153,-0.1697952,-0.1945673,-0.2123132,-0.2159561,-0.2090042
75%,0.5589071,0.5982479,0.2154919,0.2083271,0.1896457,0.1747667,0.1624955,0.1733997,-0.03970176,-0.03998021,-0.0409323,-0.05188511,-0.05026084,-0.06837436
max,6.416528,4.720729,12.40296,13.1336,23.3182,13.18669,14.58743,15.49528,52.39921,72.84299,50.59528,39.33218,27.60363,29.4451
