In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics 
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

In [3]:
# Load the DataFrame from the pickle file using pandas
df = pd.read_csv("loans_clean.csv", index_col=False)
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Unnamed: 0.1,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status,Loan_Amount_Term_Bin
0,0,0,0,0,0,0,5849,0.0,152.0,1.0,0,0,1
1,1,0,1,1,0,0,4583,1508.0,128.0,1.0,1,1,1
2,2,0,1,0,0,1,3000,0.0,66.0,1.0,0,0,1
3,3,0,1,0,1,0,2583,2358.0,120.0,1.0,0,0,1
4,4,0,0,0,0,0,6000,0.0,141.0,1.0,0,0,1


In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [5]:
models_list = pd.DataFrame()

In [6]:
mod1 = LogisticRegression(
    max_iter=100, 
    C=0.1, 
    class_weight='balanced', 
    fit_intercept=True,
    penalty='l1',
    solver='liblinear'
)
mod1.fit(X,y)

In [7]:
pred1 = mod1.predict(X)
model_dict = {'model': 'Logistic Regression'}

model_metrics = classificationMetrics(y, pred1)
result_dict = {**model_dict, **model_metrics}
models_list = pd.concat([models_list, pd.DataFrame([result_dict])], ignore_index=True)

In [8]:
pd.crosstab(y, pred1)

col_0,0,1
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,398,24
1,100,92


In [9]:
import sklearn.metrics as skmet
cmat=skmet.confusion_matrix(y,pred1)

In [10]:
mod3 = RandomForestClassifier(max_depth=9, max_features=9, criterion='entropy', random_state=42)
mod3.fit(X,y)

In [11]:
pred3 = mod3.predict(X)
model_dict = {'model': "RandomForest"}

model_metrics = classificationMetrics(y, pred3)
result_dict = {**model_dict, **model_metrics}
models_list = pd.concat([models_list, pd.DataFrame([result_dict])], ignore_index=True)

In [12]:
pd.crosstab(y, pred3)

col_0,0,1
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,422,0
1,49,143


In [13]:
mod6 = SVC(C=2, kernel='rbf', gamma=0.001, tol=0.1)
mod6.fit(X,y)

In [14]:
pred6 = mod6.predict(X)
model_dict = {'model': "SVC"}
model_metrics = classificationMetrics(y, pred6)

In [15]:
result_dict = {**model_dict, **model_metrics}
models_list = pd.concat([models_list, pd.DataFrame([result_dict])], ignore_index=True)

In [16]:
mod4 = AdaBoostClassifier(random_state=1)
mod4.fit(X,y)



In [17]:
pred4 = mod4.predict(X)
model_dict = {'model': "ADABoost"}
new_row = pd.DataFrame([{**model_dict, **classificationMetrics(y, pred4)}])
models_list = pd.concat([models_list, new_row], ignore_index=True)

In [18]:
mod5 = GradientBoostingClassifier(random_state=1)
mod5.fit(X,y)

In [19]:
pred5 = mod5.predict(X)
model_dict = {'model': "GBM"}
new_row = pd.DataFrame([{**model_dict, **classificationMetrics(y, pred5)}])
models_list = pd.concat([models_list, new_row], ignore_index=True)

In [20]:
mod7 = xgb.XGBClassifier(learning_rate=0.04,
                        max_depth=8,
                        min_child_weight=1)
mod7.fit(X,y)

In [21]:
pred7 = mod7.predict(X)
model_dict = {'model': "XGB"}
new_row = pd.DataFrame([{**model_dict, **classificationMetrics(y, pred7)}])
models_list = pd.concat([models_list, new_row], ignore_index=True)
models_list.sort_values('Accuracy', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
2,SVC,1.0,1.0,1.0,1.0,2.220446e-16,1.0
5,XGB,0.956026,0.960894,0.895833,0.927224,1.584982,0.939623
1,RandomForest,0.920195,1.0,0.744792,0.853731,2.876448,0.872396
4,GBM,0.876221,0.967742,0.625,0.759494,4.461429,0.807761
3,ADABoost,0.840391,0.891667,0.557292,0.685897,5.752896,0.763243
0,Logistic Regression,0.798046,0.793103,0.479167,0.597403,7.279174,0.711147


Above comparison shows that XGB provides the most accurate prediction without overfittin, we'll proceed with this model to the fine tunning phase