In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics 
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [3]:
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

In [4]:
import pickle
# Load the DataFrame from the pickle file using pandas
df = pd.read_csv("C:\\Users\\diana\\Downloads\\Loans\\LoanApprovals\\loans_clean4.csv", index_col=False)
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,0,0,5849.0,0.0,128.0,360.0,1.0,2,1
1,1,1,1.0,0,0,4583.0,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,0,1,3000.0,0.0,66.0,360.0,1.0,2,1
3,1,1,0.0,1,0,2583.0,2358.0,120.0,360.0,1.0,2,1
4,1,0,0.0,0,0,6000.0,0.0,141.0,360.0,1.0,2,1


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [7]:
models_list = pd.DataFrame()

In [8]:
lr_grid_params = {
    'C':[0.1,0.2,0.4,0.7,1,5,10,100], 
    'penalty':['none', 'l1', 'l2', 'elasticnet'],
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter':[100,1000, 10000],
    'class_weight':['none', 'balanced'],
    'fit_intercept':[True,False]
}

In [9]:
lr_grid = GridSearchCV(LogisticRegression(),lr_grid_params,refit=True,n_jobs=10)

In [10]:
import warnings
warnings.filterwarnings('ignore')
lr_grid.fit(X_train, y_train)

In [11]:
lr_results_df = pd.DataFrame(lr_grid.cv_results_)
top_results = lr_results_df.sort_values(by='mean_test_score', ascending=False).head(20)

print(top_results[['rank_test_score', 'mean_test_score', 'params']])

     rank_test_score  mean_test_score  \
407                1         0.804241   
427                1         0.804241   
387                1         0.804241   
447                1         0.804241   
367                1         0.804241   
467                1         0.804241   
167                7         0.797237   
187                7         0.797237   
147                7         0.797237   
207                7         0.797237   
227                7         0.797237   
127                7         0.797237   
707               13         0.785554   
607               13         0.785554   
627               13         0.785554   
647               13         0.785554   
687               13         0.785554   
667               13         0.785554   
847               19         0.773953   
887               19         0.773953   

                                                params  
407  {'C': 0.2, 'class_weight': 'balanced', 'fit_in...  
427  {'C': 0.2, 'class_w

In [12]:
lr_grid.best_params_

{'C': 0.2,
 'class_weight': 'balanced',
 'fit_intercept': True,
 'max_iter': 100,
 'penalty': 'l1',
 'solver': 'liblinear'}

In [13]:
mod1 = LogisticRegression(
    max_iter=100, 
    C=0.1, 
    class_weight='balanced', 
    fit_intercept=True,
    penalty='l1',
    solver='liblinear'
)
mod1.fit(X,y)

In [14]:
pred1 = mod1.predict(X)
model_dict = {'model': 'Logistic Regression'}

model_metrics = classificationMetrics(y, pred1)
result_dict = {**model_dict, **model_metrics}
models_list = pd.concat([models_list, pd.DataFrame([result_dict])], ignore_index=True)

In [15]:
pd.crosstab(y, pred1)

col_0,0,1
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,91,101
1,23,399


In [16]:
import sklearn.metrics as skmet
cmat=skmet.confusion_matrix(y,pred1)

In [17]:
rf_grid_params = {'max_depth':[6,7,8,9,10],
                  'max_features':['log2','sqrt',5,6,7,8,9,10],
                  'min_samples_split':[2,4,6],
                  'min_samples_leaf': [1,3,5],
                  'max_leaf_nodes':['none',5,7,9],
                  'criterion':['gini','entropy','log_loss']
                 }

In [18]:
rf_grid = GridSearchCV(RandomForestClassifier(),rf_grid_params,refit=True,n_jobs=10)

In [19]:
warnings.filterwarnings('ignore')
rf_grid.fit(X_train, y_train)

In [20]:
rf_results_df = pd.DataFrame(lr_grid.cv_results_)
rf_top_results = rf_results_df.sort_values(by='mean_test_score', ascending=False).head(20)

print(rf_top_results[['rank_test_score', 'mean_test_score', 'params']])

     rank_test_score  mean_test_score  \
407                1         0.804241   
427                1         0.804241   
387                1         0.804241   
447                1         0.804241   
367                1         0.804241   
467                1         0.804241   
167                7         0.797237   
187                7         0.797237   
147                7         0.797237   
207                7         0.797237   
227                7         0.797237   
127                7         0.797237   
707               13         0.785554   
607               13         0.785554   
627               13         0.785554   
647               13         0.785554   
687               13         0.785554   
667               13         0.785554   
847               19         0.773953   
887               19         0.773953   

                                                params  
407  {'C': 0.2, 'class_weight': 'balanced', 'fit_in...  
427  {'C': 0.2, 'class_w

In [None]:
print(rf_top_results)

In [21]:
mod3 = RandomForestClassifier(max_depth=9, max_features=9, criterion='entropy', random_state=42)
mod3.fit(X,y)

In [22]:
pred3 = mod3.predict(X)
model_dict = {'model': "RandomForest"}

model_metrics = classificationMetrics(y, pred3)
result_dict = {**model_dict, **model_metrics}
models_list = pd.concat([models_list, pd.DataFrame([result_dict])], ignore_index=True)

In [23]:
pd.crosstab(y, pred3)

col_0,0,1
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,136,56
1,0,422


In [24]:
svc_grid_params = {'C':[0.1,0.3,0.5,1,2],
                   'kernel':['rbf','linear','poly','sigmoid'],
                   'gamma':['scale','auto',1,0.1,0.01,0.001],
                   'probability':[True,False],
                 }

In [25]:
svc_grid = GridSearchCV(SVC(),svc_grid_params,refit=True,n_jobs=10)

In [None]:
svc_grid.fit(X_train, y_train)

In [None]:
svc_results_df = pd.DataFrame(lr_grid.cv_results_)
svc_top_results = svc_results_df.sort_values(by='mean_test_score', ascending=False).head(20)

print(top_results[['rank_test_score', 'mean_test_score', 'params']])

In [None]:
mod6 = SVC(C=2, kernel='rbf', gamma=0.001, tol=0.1)
mod6.fit(X,y)

In [None]:
pred6 = mod6.predict(X)
model_dict = {'model': "SVC"}
model_metrics = classificationMetrics(y, pred6)

In [None]:
result_dict = {**model_dict, **model_metrics}
models_list = pd.concat([models_list, pd.DataFrame([result_dict])], ignore_index=True)

In [None]:
mod4 = AdaBoostClassifier(random_state=1)
mod4.fit(X,y)

In [None]:
pred4 = mod4.predict(X)
model_dict = {'model': "ADABoost"}
new_row = pd.DataFrame([{**model_dict, **classificationMetrics(y, pred4)}])
models_list = pd.concat([models_list, new_row], ignore_index=True)

In [None]:
mod5 = GradientBoostingClassifier(random_state=1)
mod5.fit(X,y)

In [None]:
pred5 = mod5.predict(X)
model_dict = {'model': "GBM"}
new_row = pd.DataFrame([{**model_dict, **classificationMetrics(y, pred5)}])
models_list = pd.concat([models_list, new_row], ignore_index=True)

In [None]:
mod7 = xgb.XGBClassifier(booster='gbtree',
                        learning_rate=0.04,
                        max_depth=8,
                        min_child_weight=1)
mod7.fit(X,y)

In [None]:
pred7 = mod7.predict(X)
model_dict = {'model': "XGB"}
new_row = pd.DataFrame([{**model_dict, **classificationMetrics(y, pred7)}])
models_list = pd.concat([models_list, new_row], ignore_index=True)
models_list.sort_values('Accuracy', ascending=False)