In [14]:
#Importing Libraries
import pandas as pd
import numpy as np
import pickle

In [15]:
# Read the source CSV into the working DataFrame used by every later cell
Bank = pd.read_csv(filepath_or_buffer='bank_final.csv')

In [16]:
# Columns holding dollar amounts are stored as text containing '$' and ','
# and should be floats.
dollar_cols = ['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']

# Vectorised .str methods are used instead of an element-wise applymap:
#   * applymap is deprecated in recent pandas releases;
#   * a Python-level x.strip() raises AttributeError on a missing value
#     (float NaN), whereas .str methods simply propagate NaN.
# regex=False is required because '$' is a regex anchor, not a literal,
# when the pattern is interpreted as a regular expression.
Bank[dollar_cols] = (
    Bank[dollar_cols]
    .apply(
        lambda col: col.str.strip()
                       .str.replace('$', '', regex=False)
                       .str.replace(',', '', regex=False)
    )
    .astype(float)
)


In [17]:
# Derive a binary FranchiseStatus flag from FranchiseCode:
# codes >= 1 are flagged 1, codes < 1 are flagged 0.
# Rows with a missing FranchiseCode match neither mask and stay unset (NaN).
# NOTE(review): some data dictionaries treat a FranchiseCode of 1 as
# "no franchise" as well — confirm the >= 1 threshold against the data source.
Bank.loc[Bank['FranchiseCode'].ge(1), 'FranchiseStatus'] = 1
Bank.loc[Bank['FranchiseCode'].lt(1), 'FranchiseStatus'] = 0

In [18]:
# Keep only records whose RevLineCr and LowDoc flags are exactly 'Y' or 'N';
# every other value (including missing) is dropped.
valid_flags = ['Y', 'N']
# .copy() makes the filtered frame independent of the original, so the column
# assignments performed on Bank in later cells do not operate on a slice and
# do not raise SettingWithCopyWarning.
Bank = Bank[
    Bank['RevLineCr'].isin(valid_flags) & Bank['LowDoc'].isin(valid_flags)
].copy()

In [19]:
# Binary-encode the two flag columns: 'N' becomes 0, any other value becomes 1
Bank['RevLineCr'] = Bank['RevLineCr'].ne('N').astype(int)
Bank['LowDoc'] = Bank['LowDoc'].ne('N').astype(int)

In [20]:
# Encode the target column: 'P I F' becomes 1, every other value becomes 0.
# Note that a missing MIS_Status compares unequal to 'P I F' and therefore
# also lands in class 0.
Bank['MIS_Status'] = Bank['MIS_Status'].eq('P I F').astype(int)
# Display the resulting class counts
Bank['MIS_Status'].value_counts()

1    87599
0    33813
Name: MIS_Status, dtype: int64

In [21]:
# Build the modelling frame by discarding columns that are not carried forward
df = Bank.drop(
    columns=[
        'Name', 'City', 'State', 'Bank', 'BankState',
        'ApprovalFY', 'ApprovalDate', 'FranchiseCode', 'Zip', 'CCSC',
        'DisbursementDate', 'BalanceGross', 'ChgOffDate',
    ]
)

In [22]:
# Store the FranchiseStatus flag as a 64-bit integer column
df = df.astype({'FranchiseStatus': 'int64'})

In [23]:
# Restrict the frame to the ten predictor columns followed by the target
df = df.loc[
    :,
    [
        'Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'UrbanRural',
        'FranchiseStatus', 'NewExist', 'RevLineCr', 'LowDoc',
        'DisbursementGross', 'MIS_Status',
    ],
]

In [24]:
# Predictor matrix: the ten feature columns
XFinal_Features = df.loc[
    :,
    [
        'Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'UrbanRural',
        'FranchiseStatus', 'NewExist', 'RevLineCr', 'LowDoc',
        'DisbursementGross',
    ],
]
# Target vector: the encoded MIS_Status column
YFinal_Features = df.loc[:, 'MIS_Status']

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split — and therefore every metric
# reported below — is reproducible on a fresh kernel; 25 matches the seed
# already used for the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    XFinal_Features,
    YFinal_Features,
    test_size=0.20,
    random_state=25,
)

In [36]:
from sklearn.ensemble import RandomForestClassifier

# Configure the random-forest classifier; random_state fixes the forest's
# own randomness so repeated fits on the same data give the same model.
rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    max_depth=10,
    max_features=8,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=25,
)

In [37]:
# Train the forest on the training split (the fitted estimator is echoed as cell output)
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=10, max_features=8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=25, verbose=0,
                       warm_start=False)

In [38]:
# Predict class labels for the held-out rows
Pred = rfc.predict(X=X_test)

In [39]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix on the held-out rows: rows are true classes, columns are predictions
print(confusion_matrix(y_true=y_test, y_pred=Pred))

[[ 5997   713]
 [  686 16887]]


In [40]:
# Per-class precision, recall and F1 on the held-out rows
print(classification_report(y_true=y_test, y_pred=Pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      6710
           1       0.96      0.96      0.96     17573

    accuracy                           0.94     24283
   macro avg       0.93      0.93      0.93     24283
weighted avg       0.94      0.94      0.94     24283



In [41]:
# Saving model to disk
# The context manager guarantees the file handle is flushed and closed,
# even if pickle.dump raises part-way through.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [42]:
# Loading model to compare the results
# The context manager closes the file handle once the model is read.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles produced by a trusted source (here, the file written above).
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)