In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics 
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

In [3]:
# Load the DataFrame from the pickle file using pandas
df = pd.read_csv("loans_clean.csv", index_col=False)
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Unnamed: 0.1,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status,Loan_Amount_Term_Bin
0,0,0,0,0,0,0,5849,0.0,152.0,1.0,0,0,1
1,1,0,1,1,0,0,4583,1508.0,128.0,1.0,1,1,1
2,2,0,1,0,0,1,3000,0.0,66.0,1.0,0,0,1
3,3,0,1,0,1,0,2583,2358.0,120.0,1.0,0,0,1
4,4,0,0,0,0,0,6000,0.0,141.0,1.0,0,0,1


In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [5]:
grid_params = {
    'booster': ['gbtree','dart'],
    'eta':[0.01, 0.04,0.08,0.1,0.2,0.3],
    'max_depth':[7,9,8,11],
    'min_child_weight':[1,4,8,12],
    'gamma':[0,1,4,6,8,12],
    'subsample':[0.5,0.7,1],
    'eval_metric':['auc']
    
}

In [6]:
grid = GridSearchCV(xgb.XGBClassifier(),grid_params,refit=True,n_jobs=10)

In [None]:
import warnings
warnings.filterwarnings('ignore')
grid.fit(X_train, y_train)

In [20]:
pd.set_option('display.max_colwidth', None)
results_df = pd.DataFrame(grid.cv_results_)
top_results = results_df.sort_values(by='mean_score_time', ascending=False).head(20)
#top_results
print(top_results[['rank_test_score', 'mean_score_time', 'params']])

      rank_test_score  mean_score_time  \
2220                1         0.281933   
2106             3193         0.248464   
2483             2299         0.237466   
3272                1         0.232231   
3311             2299         0.232027   
3118             2709         0.228955   
2635             1893         0.224899   
2482             2709         0.220801   
3000             2074         0.219235   
3078             2601         0.206983   
2034             3193         0.200111   
1854             2709         0.196738   
3388                1         0.195059   
3273             3309         0.193970   
3195             2515         0.191487   
3194             2054         0.185385   
3157                1         0.181307   
3156                1         0.178817   
2746             2709         0.173377   
2406             2669         0.171443   

                                                                                                                     

In [11]:
final_hparams = top_results.iloc[2]['params']
print(f"Final hyperparams set {final_hparams}")


model = xgb.XGBClassifier(**final_hparams)
model.fit(X,y)

Final hyperparams set {'booster': 'gbtree', 'eta': 0.1, 'eval_metric': 'auc', 'gamma': 8, 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.7}


In [15]:
prediction = model.predict(X)

In [16]:
classificationMetrics(y, prediction)

{'Accuracy': 0.8094462540716613,
 'Precision': 0.9213483146067416,
 'Recall': 0.4270833333333333,
 'f1-score': 0.5836298932384342,
 'Log-loss': 6.868253170238936,
 'AUC': 0.7052478278041074}