In [149]:
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.decomposition import PCA

import seaborn as sns

In [150]:
df = pd.read_csv("./train.csv")

df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [151]:
df["person_home_ownership"].unique()

array(['RENT', 'OWN', 'MORTGAGE', 'OTHER'], dtype=object)

### Ordinal Encoding

In [152]:
df["person_home_ownership"] = df["person_home_ownership"].apply(lambda o:{"OWN":1,"RENT":0,"MORTGAGE":0,"OTHER":0}[o])
df["default"] = df["cb_person_default_on_file"].apply(lambda o:{"N":0,"Y":1}[o])
df["history_length"] = df["cb_person_cred_hist_length"]
df.head(10)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,default,history_length
0,0,37,35000,0,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0,0,14
1,1,22,56000,1,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0,0,2
2,2,29,28800,1,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0,0,10
3,3,30,70000,0,14.0,VENTURE,B,12000,11.11,0.17,N,5,0,0,5
4,4,22,60000,0,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0,0,3
5,5,27,45000,0,2.0,VENTURE,A,9000,8.94,0.2,N,5,0,0,5
6,6,25,45000,0,9.0,EDUCATION,A,12000,6.54,0.27,N,3,0,0,3
7,7,21,20000,0,0.0,PERSONAL,C,2500,13.49,0.13,Y,3,0,1,3
8,8,37,69600,0,11.0,EDUCATION,D,5000,14.84,0.07,Y,11,0,1,11
9,9,35,110000,0,0.0,DEBTCONSOLIDATION,C,15000,12.98,0.14,Y,6,0,1,6


In [153]:
df["loan_grade"].unique()

array(['B', 'C', 'A', 'D', 'E', 'F', 'G'], dtype=object)

In [154]:
df["loan_grade"] = df["loan_grade"].apply(lambda g:dict(zip([chr(i) for i in range(ord("A"),ord("H"))],[i+1 for i in range(7)]))[g])

df.head(10)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,default,history_length
0,0,37,35000,0,0.0,EDUCATION,2,6000,11.49,0.17,N,14,0,0,14
1,1,22,56000,1,6.0,MEDICAL,3,4000,13.35,0.07,N,2,0,0,2
2,2,29,28800,1,8.0,PERSONAL,1,6000,8.9,0.21,N,10,0,0,10
3,3,30,70000,0,14.0,VENTURE,2,12000,11.11,0.17,N,5,0,0,5
4,4,22,60000,0,2.0,MEDICAL,1,6000,6.92,0.1,N,3,0,0,3
5,5,27,45000,0,2.0,VENTURE,1,9000,8.94,0.2,N,5,0,0,5
6,6,25,45000,0,9.0,EDUCATION,1,12000,6.54,0.27,N,3,0,0,3
7,7,21,20000,0,0.0,PERSONAL,3,2500,13.49,0.13,Y,3,0,1,3
8,8,37,69600,0,11.0,EDUCATION,4,5000,14.84,0.07,Y,11,0,1,11
9,9,35,110000,0,0.0,DEBTCONSOLIDATION,3,15000,12.98,0.14,Y,6,0,1,6


In [155]:
df["loan_intent"].unique()

array(['EDUCATION', 'MEDICAL', 'PERSONAL', 'VENTURE', 'DEBTCONSOLIDATION',
       'HOMEIMPROVEMENT'], dtype=object)

In [156]:
oneHotEncoded = pd.get_dummies(df["loan_intent"],dtype=int)

pd.concat([df,oneHotEncoded],axis=1)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,...,cb_person_cred_hist_length,loan_status,default,history_length,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE
0,0,37,35000,0,0.0,EDUCATION,2,6000,11.49,0.17,...,14,0,0,14,0,1,0,0,0,0
1,1,22,56000,1,6.0,MEDICAL,3,4000,13.35,0.07,...,2,0,0,2,0,0,0,1,0,0
2,2,29,28800,1,8.0,PERSONAL,1,6000,8.90,0.21,...,10,0,0,10,0,0,0,0,1,0
3,3,30,70000,0,14.0,VENTURE,2,12000,11.11,0.17,...,5,0,0,5,0,0,0,0,0,1
4,4,22,60000,0,2.0,MEDICAL,1,6000,6.92,0.10,...,3,0,0,3,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58640,58640,34,120000,0,5.0,EDUCATION,4,25000,15.95,0.21,...,10,0,1,10,0,1,0,0,0,0
58641,58641,28,28800,0,0.0,MEDICAL,3,10000,12.73,0.35,...,8,1,0,8,0,0,0,1,0,0
58642,58642,23,44000,0,7.0,EDUCATION,4,6800,16.00,0.15,...,2,1,0,2,0,1,0,0,0,0
58643,58643,22,30000,0,2.0,EDUCATION,1,5000,8.90,0.17,...,3,0,0,3,0,1,0,0,0,0


In [162]:
# Pca components for loan and default
pca_dict = {"loan_intent":PCA(n_components=3),"default":PCA(n_components=1)}

In [163]:
pca_reduced_1 = pd.DataFrame(pca_dict["loan_intent"].fit_transform(oneHotEncoded),columns=["l_i1","l_i2","l_i3"])
pca_reduced_1

Unnamed: 0,l_i1,l_i2,l_i3
0,0.849062,0.150619,-0.000783
1,-0.419232,0.772996,-0.001920
2,-0.206964,-0.420757,-0.706463
3,-0.206394,-0.417247,0.707744
4,-0.419232,0.772996,-0.001920
...,...,...,...
58640,0.849062,0.150619,-0.000783
58641,-0.419232,0.772996,-0.001920
58642,0.849062,0.150619,-0.000783
58643,0.849062,0.150619,-0.000783


In [159]:
pca_reduced_2 = pd.DataFrame(pca_dict["default"].fit_transform(df[["history_length","default"]]),columns=["default_factor"])
pca_reduced_2

Unnamed: 0,default_factor
0,8.186379
1,-3.813620
2,4.186379
3,-0.813621
4,-2.813620
...,...
58640,4.186813
58641,2.186379
58642,-3.813620
58643,-2.813620


In [160]:
df = pd.concat([df,pca_reduced_1,pca_reduced_2],axis=1)

df.head(10)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,default,history_length,l_i1,l_i2,l_i3,default_factor
0,0,37,35000,0,0.0,EDUCATION,2,6000,11.49,0.17,N,14,0,0,14,0.849062,0.150619,-0.000783,8.186379
1,1,22,56000,1,6.0,MEDICAL,3,4000,13.35,0.07,N,2,0,0,2,-0.419232,0.772996,-0.00192,-3.81362
2,2,29,28800,1,8.0,PERSONAL,1,6000,8.9,0.21,N,10,0,0,10,-0.206964,-0.420757,-0.706463,4.186379
3,3,30,70000,0,14.0,VENTURE,2,12000,11.11,0.17,N,5,0,0,5,-0.206394,-0.417247,0.707744,-0.813621
4,4,22,60000,0,2.0,MEDICAL,1,6000,6.92,0.1,N,3,0,0,3,-0.419232,0.772996,-0.00192,-2.81362
5,5,27,45000,0,2.0,VENTURE,1,9000,8.94,0.2,N,5,0,0,5,-0.206394,-0.417247,0.707744,-0.813621
6,6,25,45000,0,9.0,EDUCATION,1,12000,6.54,0.27,N,3,0,0,3,0.849062,0.150619,-0.000783,-2.81362
7,7,21,20000,0,0.0,PERSONAL,3,2500,13.49,0.13,Y,3,0,1,3,-0.206964,-0.420757,-0.706463,-2.813186
8,8,37,69600,0,11.0,EDUCATION,4,5000,14.84,0.07,Y,11,0,1,11,0.849062,0.150619,-0.000783,5.186813
9,9,35,110000,0,0.0,DEBTCONSOLIDATION,3,15000,12.98,0.14,Y,6,0,1,6,-0.13918,-0.169289,0.002008,0.186814


In [176]:
X = ["person_age","person_income","loan_grade","loan_int_rate","loan_percent_income","l_i1","l_i2","l_i3","default_factor"]
Y = "loan_status"

new_df = df[X+[Y]]
new_df.head(10)

Unnamed: 0,person_age,person_income,loan_grade,loan_int_rate,loan_percent_income,l_i1,l_i2,l_i3,default_factor,loan_status
0,37,35000,2,11.49,0.17,0.849062,0.150619,-0.000783,8.186379,0
1,22,56000,3,13.35,0.07,-0.419232,0.772996,-0.00192,-3.81362,0
2,29,28800,1,8.9,0.21,-0.206964,-0.420757,-0.706463,4.186379,0
3,30,70000,2,11.11,0.17,-0.206394,-0.417247,0.707744,-0.813621,0
4,22,60000,1,6.92,0.1,-0.419232,0.772996,-0.00192,-2.81362,0
5,27,45000,1,8.94,0.2,-0.206394,-0.417247,0.707744,-0.813621,0
6,25,45000,1,6.54,0.27,0.849062,0.150619,-0.000783,-2.81362,0
7,21,20000,3,13.49,0.13,-0.206964,-0.420757,-0.706463,-2.813186,0
8,37,69600,4,14.84,0.07,0.849062,0.150619,-0.000783,5.186813,0
9,35,110000,3,12.98,0.14,-0.13918,-0.169289,0.002008,0.186814,0


In [177]:
model_params = {
    "n_estimators":[50,100],
    "criterion":["entropy","gini","log_loss"]
}

gs = GridSearchCV(RandomForestClassifier(),model_params,return_train_score=True,cv=5)
gs.fit(new_df[X],new_df[Y])
gs.cv_results_

{'mean_fit_time': array([2.25243573, 5.84115772, 2.67646556, 6.3392333 , 2.02020969,
        4.53074999]),
 'std_fit_time': array([0.04854235, 1.87910323, 0.18223912, 2.91312142, 0.03065432,
        0.2419789 ]),
 'mean_score_time': array([0.07887468, 0.20093832, 0.101193  , 0.22116594, 0.07051249,
        0.16663284]),
 'std_score_time': array([0.00441525, 0.05399897, 0.02096883, 0.09466494, 0.00134661,
        0.02465114]),
 'param_criterion': masked_array(data=['entropy', 'entropy', 'gini', 'gini', 'log_loss',
                    'log_loss'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_n_estimators': masked_array(data=[50, 100, 50, 100, 50, 100],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'entropy', 'n_estimators': 50},
  {'criterion': 'entropy', 'n_estimators': 100},
  {'criterion': 'gini', 'n_estimators': 50},
  {'criteri

In [178]:
gs.best_params_

{'criterion': 'log_loss', 'n_estimators': 100}

In [179]:
model = RandomForestClassifier(n_estimators=gs.best_params_["n_estimators"],criterion=gs.best_params_["criterion"])
model.fit(new_df[X],new_df[Y])

In [181]:
score = cross_val_score(model,df[X],df[Y],cv=5)

score.mean()

np.float64(0.9215107852331827)

## Dealing with test data

In [194]:
test_df = pd.read_csv("test.csv")
test_df

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.10,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
39093,97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4
39094,97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3
39095,97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25
39096,97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4


In [195]:
def preprocess(df):
    df["person_home_ownership"] = df["person_home_ownership"].apply(lambda o:{"OWN":1,"RENT":0,"MORTGAGE":0,"OTHER":0}[o])
    df["default"] = df["cb_person_default_on_file"].apply(lambda o:{"N":0,"Y":1}[o])
    df["history_length"] = df["cb_person_cred_hist_length"]
    df["loan_grade"] = df["loan_grade"].apply(lambda g:dict(zip([chr(i) for i in range(ord("A"),ord("H"))],[i+1 for i in range(7)]))[g])

    oneHotEncoded = pd.get_dummies(df["loan_intent"],dtype=int)

    pca_reduced_1 = pd.DataFrame(pca_dict["loan_intent"].fit_transform(oneHotEncoded),columns=["l_i1","l_i2","l_i3"])

    pca_reduced_2 = pd.DataFrame(pca_dict["default"].fit_transform(df[["history_length","default"]]),columns=["default_factor"])

    df = pd.concat([df,pca_reduced_1,pca_reduced_2],axis=1)
    return df


test_df = preprocess(test_df)
test_df

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,default,history_length,l_i1,l_i2,l_i3,default_factor
0,58645,23,69000,0,3.0,HOMEIMPROVEMENT,6,25000,15.76,0.36,N,2,0,2,-0.053025,-0.065205,0.025472,-3.830786
1,58646,26,96000,0,6.0,PERSONAL,3,10000,12.68,0.10,Y,4,1,4,-0.186941,-0.530272,-0.656727,-1.830257
2,58647,26,30000,0,5.0,VENTURE,5,4000,17.19,0.13,Y,2,1,2,-0.158744,-0.353132,0.743251,-3.830257
3,58648,33,50000,0,4.0,DEBTCONSOLIDATION,1,7000,8.90,0.14,N,7,0,7,-0.099779,-0.152948,0.079473,1.169213
4,58649,26,102000,0,8.0,HOMEIMPROVEMENT,4,15000,16.32,0.15,Y,4,1,4,-0.053025,-0.065205,0.025472,-1.830257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,97738,22,31200,0,2.0,DEBTCONSOLIDATION,2,3000,10.37,0.10,N,4,0,4,-0.099779,-0.152948,0.079473,-1.830787
39094,97739,22,48000,0,6.0,EDUCATION,1,7000,6.03,0.15,N,3,0,3,0.834774,0.215861,-0.046979,-2.830787
39095,97740,51,60000,0,0.0,PERSONAL,1,15000,7.51,0.25,N,25,0,25,-0.186941,-0.530272,-0.656727,19.169210
39096,97741,22,36000,0,4.0,PERSONAL,4,14000,15.62,0.39,Y,4,1,4,-0.186941,-0.530272,-0.656727,-1.830257


In [209]:
results = model.predict_proba(test_df[X])
# X
result_df = pd.concat([test_df["id"],pd.DataFrame(results[:,1],columns=["loan_status"])],axis=1)
result_df.to_csv("submission.csv",index=False)
result_df

Unnamed: 0,id,loan_status
0,58645,0.70
1,58646,0.03
2,58647,0.95
3,58648,0.01
4,58649,0.60
...,...,...
39093,97738,0.38
39094,97739,0.01
39095,97740,0.01
39096,97741,0.73
