In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

In [3]:
# Load the DataFrame from the pickle file using pandas
df = pd.read_csv("loans_clean.csv", index_col=False)
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Unnamed: 0.1,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status,Loan_Amount_Term_Bin
0,0,0,0,0,0,0,5849,0.0,152.0,1.0,0,0,1
1,1,0,1,1,0,0,4583,1508.0,128.0,1.0,1,1,1
2,2,0,1,0,0,1,3000,0.0,66.0,1.0,0,0,1
3,3,0,1,0,1,0,2583,2358.0,120.0,1.0,0,0,1
4,4,0,0,0,0,0,6000,0.0,141.0,1.0,0,0,1


In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [5]:
grid_params = {
    'n_estimators': [100, 200, 500],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],           # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],# Number of features to consider when looking for the best split
    'bootstrap': [True, False]               # Whether bootstrap samples are used when building trees
}


In [6]:
grid = GridSearchCV(RandomForestClassifier(),grid_params,refit=True,n_jobs=18)

In [7]:
import warnings
warnings.filterwarnings('ignore')
grid.fit(X_train, y_train)

In [8]:
pd.set_option('display.max_colwidth', None)
results_df = pd.DataFrame(grid.cv_results_)
top_results = results_df.sort_values(by='mean_test_score', ascending=False).head(20)
results_df.columns
print(top_results[['rank_test_score', 'mean_test_score', 'params']])

     rank_test_score  mean_test_score  \
238                1         0.820520   
50                 1         0.820520   
134                1         0.820520   
153                4         0.818194   
78                 4         0.818194   
211                4         0.818194   
234                4         0.818194   
126                4         0.818194   
46                 4         0.818194   
45                 4         0.818194   
289                4         0.818194   
291                4         0.818194   
322                4         0.818194   
293                4         0.818194   
161                4         0.818194   
76                 4         0.818194   
317                4         0.818194   
159                4         0.818194   
157                4         0.818194   
214                4         0.818194   

                                                                                                                                  params  

In [16]:
final_hparams = top_results.iloc[0]['params']
print(f"Final hyperparams set {final_hparams}")


model = RandomForestClassifier(**final_hparams)
model.fit(X_train,y_train)

Final hyperparams set {'bootstrap': True, 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}


In [18]:
prediction = model.predict(X_test)

In [20]:
classificationMetrics(y_test, prediction)

{'Accuracy': 0.7837837837837838,
 'Precision': 0.896551724137931,
 'Recall': 0.4126984126984127,
 'f1-score': 0.5652173913043478,
 'Log-loss': 7.793222354403708,
 'AUC': 0.6940541243819932}