In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from imblearn.under_sampling import NearMiss
from keras.models import Sequential
from keras.layers import Dense
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pandas_profiling import ProfileReport

  import pandas.util.testing as tm
Using TensorFlow backend.


In [2]:
data=pd.read_csv("train_ctrUa4K.csv")
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
for column in ('Gender','Married','Dependents','Self_Employed'):
    data[column].fillna(data[column].mode()[0],inplace=True)

In [4]:
for column in ('LoanAmount','Loan_Amount_Term','Credit_History'):
    data[column].fillna(data[column].mean(),inplace=True)

In [5]:
for variable in ('Gender','Married','Dependents','Education','Self_Employed','Property_Area'):
    data[variable].fillna("Missing",inplace=True)
    dummies=pd.get_dummies(data[variable],prefix=variable)
    data=pd.concat([data,dummies],axis=1)
    data.drop([variable],axis=1,inplace=True)

In [6]:
data['Loan_Status']=data.Loan_Status.map({'Y':0,'N':1})
Y=data['Loan_Status']
data.drop(['Loan_Status'],axis=1,inplace=True)
X=data[data.iloc[:,1:23].columns]

In [7]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=100,test_size=0.2)

In [8]:
scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(X_train)
scaled_X_test=scaler.transform(X_test)

In [9]:
RF_model= RandomForestClassifier(1000,oob_score=True)
RF_model.fit(scaled_X_train,Y_train)
RF_model.oob_score_

0.8065173116089613

In [10]:
RF_model.feature_importances_

array([0.17339808, 0.09650978, 0.16877284, 0.04506961, 0.28581346,
       0.01140937, 0.01183386, 0.01525492, 0.01607625, 0.01879254,
       0.01801167, 0.01273553, 0.00895502, 0.0159464 , 0.01667706,
       0.01384091, 0.01337929, 0.01815747, 0.02258941, 0.01677655])

In [11]:
s=pd.Series(RF_model.feature_importances_, index=X_train.columns)
s.get(s.values>0.02)

ApplicantIncome            0.173398
CoapplicantIncome          0.096510
LoanAmount                 0.168773
Loan_Amount_Term           0.045070
Credit_History             0.285813
Property_Area_Semiurban    0.022589
dtype: float64

In [12]:
def randomforest_param(x, y, nfolds):
    n_estimator_val = [100,150,300,500,1000]
    n_sample_leaf_val = [1,2,3,4,5,6]
    max_feature_val=["auto","sqrt",None,0.9]
    param_grid = {'n_estimators': n_estimator_val, 'min_samples_leaf' : n_sample_leaf_val,'max_features':max_feature_val}
    RF_model=RandomForestClassifier()
    grid_search_RF = GridSearchCV(estimator = RF_model,param_grid=param_grid, cv=nfolds)
    grid_search_RF.fit(x, y)
    return grid_search_RF.best_params_

In [13]:
randomforest_param(scaled_X_train,Y_train,3)

{'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 100}

In [14]:
RF_model=RandomForestClassifier(max_features='auto',min_samples_leaf=5,n_estimators=100,n_jobs=-1,oob_score=True)
RF_model.fit(scaled_X_train,Y_train)
RF_pred=RF_model.predict(scaled_X_test)
print("oob score for random forest model:",RF_model.oob_score_)
print("Recall for random forest model:",metrics.recall_score(Y_test,RF_pred))
print("Precision for random forest model:",metrics.precision_score(Y_test,RF_pred))
print("Accuracy for random forest model:",metrics.accuracy_score(Y_test,RF_pred))
print("F-score for random forest model:",metrics.f1_score(Y_test,RF_pred))
print("Log-loss for random forest model:",metrics.log_loss(Y_test,RF_pred))

oob score for random forest model: 0.8268839103869654
Recall for random forest model: 0.375
Precision for random forest model: 0.8571428571428571
Accuracy for random forest model: 0.7317073170731707
F-score for random forest model: 0.5217391304347825
Log-loss for random forest model: 9.266520486376773
