In [1]:
import pandas as pd
# Raise pandas' display limits (row count, column count, line width) for
# every frame rendered later in the notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Read every column as a generic object (string) column; numeric features are
# cast explicitly further down where they are needed.
data = pd.read_csv('train_data.csv', dtype=object)

# Target labels are copied out so later changes to `data` do not affect them.
y = data['default payment next month'].copy()

# Everything except the target column is a candidate feature.
X = data.drop(columns='default payment next month')

In [3]:
# Cast the target column of `data` to int.
# NOTE: `y` was copied out of `data` before this cast, so `y` still holds the
# string labels produced by reading the CSV with dtype=object.
data['default payment next month'] = data['default payment next month'].astype(int)

# Keep the preview as the cell's last expression; a notebook only renders the
# final expression of a cell, so a `head()` call placed earlier is discarded.
data.head()

In [4]:
# Columns that get one-hot encoded.
ohe_features = [
    'SEX',
    'EDUCATION',
    'MARRIAGE',
]

# Columns that are cast to float and standardised.
cont_features = [
    'LIMIT_BAL',
    'AGE',
    'BILL_AMT1',
    'PAY_AMT1',
]

In [5]:
# Hold out a test split; the fixed random_state makes the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [6]:
# Remember the original row labels of each split so frames rebuilt from
# transformer output can be re-indexed consistently.
X_train_index, X_test_index = X_train.index, X_test.index

In [7]:
# Categorical columns, left as read from the CSV, for the one-hot encoder.
X_train_ohe, X_test_ohe = X_train[ohe_features], X_test[ohe_features]

# Continuous columns, cast from object to float for the scaler.
X_train_cont, X_test_cont = (
    X_train[cont_features].astype(float),
    X_test[cont_features].astype(float),
)

In [8]:
# One-hot encode the categorical columns: fit on the training split only,
# then apply the fitted encoder to the test split.
ohe = OneHotEncoder()
X_train_encoded = ohe.fit_transform(X_train_ohe)
X_test_encoded = ohe.transform(X_test_ohe)

# Standardise the continuous columns the same way (fit on train, apply to
# test) and wrap the results back into DataFrames that keep the original
# column names and row labels.
ss = StandardScaler()
X_train_scaled = pd.DataFrame(
    ss.fit_transform(X_train_cont),
    columns=X_train_cont.columns,
    index=X_train_index,
)
X_test_scaled = pd.DataFrame(
    ss.transform(X_test_cont),
    columns=X_test_cont.columns,
    index=X_test_index,
)

In [9]:
# Build readable column names for the one-hot output.
# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer `get_feature_names_out` and fall back to the old
# method only on versions that do not provide it.
if hasattr(ohe, 'get_feature_names_out'):
    train_columns = ohe.get_feature_names_out(X_train_ohe.columns)
    test_columns = ohe.get_feature_names_out(X_test_ohe.columns)
else:
    train_columns = ohe.get_feature_names(input_features=X_train_ohe.columns)
    test_columns = ohe.get_feature_names(input_features=X_test_ohe.columns)

# Densify the sparse encoder output and restore the original row labels.
# `.toarray()` yields a plain ndarray, whereas `.todense()` yields np.matrix.
X_train_processed = pd.DataFrame(X_train_encoded.toarray(), columns=train_columns, index=X_train_index)
X_test_processed = pd.DataFrame(X_test_encoded.toarray(), columns=test_columns, index=X_test_index)

In [10]:
# Place the scaled continuous columns and the one-hot columns side by side;
# both pieces carry the same row labels for their split.
X_train_all = pd.concat(
    (X_train_scaled, X_train_processed),
    axis='columns',
)
X_test_all = pd.concat(
    (X_test_scaled, X_test_processed),
    axis='columns',
)

In [11]:
# Display the combined training features (scaled continuous columns followed
# by the one-hot encoded columns).
X_train_all

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,PAY_AMT1,SEX_1,SEX_2,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
1377,-0.680070,0.160092,-0.678161,-0.114127,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8439,-0.911155,-1.352671,-0.691558,-0.324347,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23375,-0.911155,0.592309,-0.662886,0.269720,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5719,-0.140871,-0.596290,-0.578220,0.018552,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7833,1.322669,-0.704344,-0.537582,0.269720,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,-0.294928,-0.812398,-0.644344,-0.315262,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17289,1.014555,1.564799,-0.576818,-0.157752,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5192,-0.911155,-1.136562,-0.424354,-0.173372,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12172,-0.140871,-1.244616,-0.677957,-0.224403,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# Fit a random forest with default hyper-parameters and a fixed seed on the
# processed training features (`fit` returns the estimator itself).
fsm = RandomForestClassifier(random_state=1).fit(X_train_all, y_train)

# Score on the held-out split. The positive class is the *string* '1': the CSV
# was read with dtype=object and `y` was copied before any integer cast.
test_predictions = fsm.predict(X_test_all)
f1_score(y_test, test_predictions, pos_label='1')


0.20689655172413796

In [16]:
# Confusion matrix of the fitted model's test-set predictions against the
# true test labels.
confusion_matrix(y_true=y_test, y_pred=fsm.predict(X_test_all))

array([[4466,  273],
       [1084,  177]], dtype=int64)

In [None]:
# Inspect the held-out target labels (this cell has not been executed).
y_test

In [None]:
# Inspect the combined test features (this cell has not been executed).
X_test_all