In [47]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error,confusion_matrix,classification_report
from sklearn.decomposition import PCA

In [3]:
df = pd.read_csv('train_data.csv')

In [94]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,Paid1,Paid2,Paid3,Paid4,Paid5,Paid6
0,2873,350000,1,1,2,37,-2,-2,-2,-2,...,466,316,316,0,0,0,-150,0,150,0
1,3598,50000,2,2,1,37,2,2,2,0,...,600,600,600,0,35494,43594,37642,12426,12668,12897
2,27623,50000,2,1,2,23,-1,-1,-1,-1,...,9810,660,2980,0,882,157,-2501,-5010,9150,-2320
3,6874,20000,1,3,1,56,0,0,0,0,...,0,490,658,0,9339,10960,11768,13784,12930,13028
4,6444,110000,2,2,2,32,0,0,0,0,...,4100,4100,4200,0,102759,101412,104364,104729,106457,101882


In [50]:
df['Paid1'] = df['BILL_AMT1']-df['PAY_AMT1']
df['Paid2'] = df['BILL_AMT2']-df['PAY_AMT2']
df['Paid3'] = df['BILL_AMT3']-df['PAY_AMT3']
df['Paid4'] = df['BILL_AMT4']-df['PAY_AMT4']
df['Paid5'] = df['BILL_AMT5']-df['PAY_AMT5']
df['Paid6'] = df['BILL_AMT6']-df['PAY_AMT6']

In [51]:
X = df.drop(columns=['ID','default payment next month'])
y = df['default payment next month']

In [52]:
OHE = OneHotEncoder(categories='auto')
pay0 = pd.DataFrame.sparse.from_spmatrix(OHE.fit_transform(df[['PAY_0']]))
pay2 = pd.DataFrame.sparse.from_spmatrix(OHE.fit_transform(df[['PAY_2']]))
pay3 = pd.DataFrame.sparse.from_spmatrix(OHE.fit_transform(df[['PAY_3']]))
pay4 = pd.DataFrame.sparse.from_spmatrix(OHE.fit_transform(df[['PAY_4']]))
pay5 = pd.DataFrame.sparse.from_spmatrix(OHE.fit_transform(df[['PAY_5']]))
pay6 = pd.DataFrame.sparse.from_spmatrix(OHE.fit_transform(df[['PAY_6']]))

In [76]:
df['default payment next month'].value_counts()

0    18691
1     5308
Name: default payment next month, dtype: int64

In [54]:
Xohe = pd.concat([X,pay0,pay2,pay3,pay4,pay5,pay6],axis=1,sort=False)

In [55]:
Xohe = Xohe.drop(columns=['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6'])

In [56]:
Xohe.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,...,0,1,2,3,4,5,6,7,8,9
0,350000,1,1,2,37,316,316,316,466,466,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50000,2,2,1,37,40994,43594,38222,13026,13268,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50000,2,1,2,23,3430,2478,2299,4800,9810,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20000,1,3,1,56,10847,12176,12884,13784,13420,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110000,2,2,2,32,108159,106812,108464,108829,110557,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
ss = StandardScaler()
Xss = ss.fit_transform(Xohe)

In [105]:
pca = PCA(n_components=0.9,random_state=42)
Xpca = pca.fit_transform(Xss)

In [112]:
X_train, X_test, y_train, y_test = train_test_split(Xss,y,random_state=42)

In [113]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [114]:
print(classification_report(y_test,rfc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      4657
           1       0.63      0.35      0.45      1343

    accuracy                           0.81      6000
   macro avg       0.73      0.64      0.67      6000
weighted avg       0.79      0.81      0.79      6000



In [115]:
sorted(list(zip(rfc.feature_importances_,Xohe.columns)),reverse=True)

[(0.05308612043955644, 'AGE'),
 (0.05004902312293923, 4),
 (0.04992490203590134, 'LIMIT_BAL'),
 (0.041580184221761345, 'BILL_AMT1'),
 (0.041573494498294804, 'Paid1'),
 (0.03900137086316493, 'Paid2'),
 (0.03742608475958823, 'BILL_AMT2'),
 (0.03739060010782542, 'PAY_AMT1'),
 (0.037058396398079764, 'Paid6'),
 (0.03702544744149214, 'Paid3'),
 (0.03603292269555984, 'BILL_AMT3'),
 (0.035656931799536745, 'Paid4'),
 (0.03553309386994839, 'PAY_AMT2'),
 (0.035328516140964474, 'Paid5'),
 (0.034809795907217354, 'BILL_AMT6'),
 (0.03350254803551463, 'BILL_AMT4'),
 (0.033289108073524164, 'BILL_AMT5'),
 (0.03288320637619154, 'PAY_AMT6'),
 (0.03233146153903807, 'PAY_AMT4'),
 (0.03216610442288429, 'PAY_AMT3'),
 (0.03104050457119765, 'PAY_AMT5'),
 (0.02179398264228584, 4),
 (0.01794643134000124, 4),
 (0.01649853256676639, 'EDUCATION'),
 (0.014446398134407186, 2),
 (0.014213378996276452, 4),
 (0.011712139451436626, 'MARRIAGE'),
 (0.011278403766325471, 3),
 (0.009843701850567332, 3),
 (0.009766084481152302

In [116]:
z =pd.DataFrame(y_test)
z['default payment next month'].value_counts()

0    4657
1    1343
Name: default payment next month, dtype: int64

In [117]:
gb = GradientBoostingClassifier(n_estimators=1000)
gb.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [118]:
print(classification_report(y_test,gb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4657
           1       0.65      0.38      0.48      1343

    accuracy                           0.82      6000
   macro avg       0.75      0.66      0.68      6000
weighted avg       0.80      0.82      0.80      6000



In [119]:
sorted(list(zip(gb.feature_importances_,Xohe.columns)),reverse=True)

[(0.2602956158859282, 4),
 (0.06645701518291741, 4),
 (0.03997427793251638, 'PAY_AMT1'),
 (0.03890147296274421, 4),
 (0.03847842832185488, 'LIMIT_BAL'),
 (0.0367285259568316, 'Paid1'),
 (0.03363521958749958, 'PAY_AMT2'),
 (0.031203078554971966, 'BILL_AMT1'),
 (0.03010077586709589, 'Paid2'),
 (0.025501852302315116, 'Paid3'),
 (0.024875306637171756, 'PAY_AMT3'),
 (0.02435030274423746, 'AGE'),
 (0.02418481914950605, 'PAY_AMT4'),
 (0.02288763727450655, 'Paid6'),
 (0.02262133990142766, 'PAY_AMT6'),
 (0.022406396781690277, 'Paid4'),
 (0.022059353825451104, 'BILL_AMT6'),
 (0.02100495331594933, 'BILL_AMT2'),
 (0.02001508314348313, 'PAY_AMT5'),
 (0.018701409715983143, 'BILL_AMT5'),
 (0.018398466501301045, 'BILL_AMT3'),
 (0.017769498870465708, 3),
 (0.016979673325096957, 5),
 (0.01587909872510829, 4),
 (0.015027757009501316, 'Paid5'),
 (0.014475777188337738, 'BILL_AMT4'),
 (0.008930962649517808, 5),
 (0.007574574840378016, 3),
 (0.006370783420100507, 'EDUCATION'),
 (0.005800062227010165, 3),
 (0