**IMPORTING THE NECESSARY LIBRARIES**

In [1]:
#importing libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning (no category or module filter is given), so
# deprecation notices raised by the libraries used below are not displayed.
warnings.filterwarnings('ignore')

#importing decision tree classifier 
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score

from sklearn.metrics import classification_report

**LOAD THE DATASET**

In [2]:
# load 'Loan data cleaned.csv' (path relative to the working directory) into a DataFrame
csv_path = 'Loan data cleaned.csv'
data = pd.read_csv(csv_path)

In [3]:
#shape of the data as a (rows, columns) tuple
data.shape

(4368, 64)

In [4]:
#first five rows of the data (head() is called with its default row count)
data.head()

Unnamed: 0,loannumber,loanamount,totaldue,termdays,good_bad_flag,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,...,ploan_additional charges_50%_x,ploan_additional charges_75%_x,ploan_additional charges_max_x,ploan_additional charges_mean_y,ploan_additional charges_min_y,ploan_additional charges_25%_y,ploan_additional charges_50%_y,ploan_additional charges_75%_y,ploan_additional charges_max_y,not_repaid_percentage
0,12,30000.0,34500.0,30,Good,Other,3.43201,6.433055,Diamond Bank,Permanent,...,3800.0,4500.0,4500.0,3900.0,3000.0,3400.0,3800.0,4500.0,4500.0,64.0
1,2,15000.0,17250.0,30,Good,Savings,3.885298,7.3207,GT Bank,Permanent,...,2665.558293,3037.525723,3543.851778,2730.359972,2102.661161,2364.738902,2665.558293,3037.525723,3543.851778,63.573297
2,7,20000.0,22250.0,15,Good,Other,11.13935,10.292041,EcoBank,Permanent,...,1500.0,1500.0,3000.0,1750.0,1500.0,1500.0,1500.0,1500.0,3000.0,50.0
3,3,10000.0,11500.0,15,Good,Savings,3.98577,7.491708,First Bank,Permanent,...,2250.0,2625.0,3000.0,2250.0,1500.0,1875.0,2250.0,2625.0,3000.0,0.0
4,9,40000.0,44000.0,30,Good,Other,7.457913,9.076574,GT Bank,Permanent,...,3800.0,5100.0,9000.0,4800.0,3000.0,3000.0,3800.0,5100.0,9000.0,100.0


In [5]:
#checking missing values in the data: number of null entries in each column
data.isnull().sum()

loannumber                        0
loanamount                        0
totaldue                          0
termdays                          0
good_bad_flag                     0
                                 ..
ploan_additional charges_25%_y    0
ploan_additional charges_50%_y    0
ploan_additional charges_75%_y    0
ploan_additional charges_max_y    0
not_repaid_percentage             0
Length: 64, dtype: int64

In [6]:
# one-hot encode the categorical columns; drop_first=True removes the first
# level of each column, leaving k-1 dummy columns per k-level category
categorical_columns = [
    'bank_account_type',
    'bank_name_clients',
    'employment_status_clients',
]
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

### Separating independent and dependent variables

In [7]:
# Features: every column except the target label.
x = data.drop(columns=['good_bad_flag'])

# Target: encode the label as integers (Good -> 0, Bad -> 1).
# Series.map is used instead of Series.replace because replace depends on
# pandas silently downcasting the object-dtype result to int, a behaviour
# that is deprecated; without the downcast the labels stay object dtype,
# which the classifier and metric calls further down are not expected to accept.
label_codes = {'Good': 0, 'Bad': 1}
y = data['good_bad_flag'].map(label_codes)

# map() turns any label missing from label_codes into NaN; fail loudly here
# rather than passing unexpected labels on to the model.
if y.isnull().any():
    raise ValueError("good_bad_flag contains values other than 'Good' or 'Bad'")
y = y.astype('int64')

In [8]:
#importing train_test_split
from sklearn.model_selection import train_test_split

In [9]:
# hold out 15% of the rows as a test set, stratified on the target so both
# splits keep the same class proportions; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    stratify=y,
    random_state=101,
)

In [10]:
#Scaling the train and testing datasets
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse the fitted scaler
# to transform the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_train)
# fit_transform returns a plain array, so the DataFrame is rebuilt with the
# original column names AND the original row index; without index= the frame
# would get a fresh 0..n-1 index and no longer line up with y_train.
X_train = pd.DataFrame(x_train_scaled, columns=X_train.columns, index=X_train.index)

x_test_scaled = scaler.transform(X_test)
# Same reasoning as above: keep X_test index-aligned with y_test.
X_test = pd.DataFrame(x_test_scaled, columns=X_test.columns, index=X_test.index)

## Install XGBoost

Run the following command in a terminal or command prompt:

_**$ pip install xgboost**_

## Building an XGBoost Model

In [11]:
#Importing the XGBoost classifier
from xgboost import XGBClassifier

In [12]:
#creating an extreme Gradient boosting instance; only the random seed is set
#here, no other hyperparameter is passed explicitly
clf = XGBClassifier(random_state=96)

In [13]:
#training the model on the training split (features X_train, labels y_train)
clf.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=96, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
# predicted class labels for the training split and the held-out test split
pred_train, pred_test = [clf.predict(features) for features in (X_train, X_test)]

In [16]:
# ANSI escape codes: switch bold on for the heading, then reset formatting
bold_on, bold_off = '\033[1m', '\033[0m'
print(bold_on, "Classification Report for training data", bold_off)

# per-class precision / recall / f1 on the data the model was fitted on
train_report = classification_report(y_train, pred_train)
print(train_report)

[1m Classification Report for training data [0m
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2903
           1       1.00      0.93      0.97       809

    accuracy                           0.99      3712
   macro avg       0.99      0.97      0.98      3712
weighted avg       0.99      0.99      0.99      3712



In [17]:
# ANSI escape codes: switch bold on for the heading, then reset formatting
bold_on, bold_off = '\033[1m', '\033[0m'
print(bold_on, "Classification Report for test data", bold_off)

# per-class precision / recall / f1 on the held-out test split
test_report = classification_report(y_test, pred_test)
print(test_report)

[1m Classification Report for test data [0m
              precision    recall  f1-score   support

           0       0.81      0.95      0.87       513
           1       0.52      0.20      0.29       143

    accuracy                           0.79       656
   macro avg       0.66      0.58      0.58       656
weighted avg       0.75      0.79      0.75       656



# Hyperparameter Tuning