In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBClassifier

from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("/content/sample_data/Loan_Prediction_Data_Set.csv")

In [3]:
df = df.drop(['Loan_ID'], axis = 1)

In [4]:
df['Gender'].fillna(df['Gender'].mode()[0],inplace=True)
df['Married'].fillna(df['Married'].mode()[0],inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0],inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0],inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0],inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0],inplace=True)

df['LoanAmount'].fillna(df['LoanAmount'].mean(),inplace=True)

In [5]:
df = pd.get_dummies(df)

# Drop columns
df = df.drop(['Gender_Female', 'Married_No', 'Education_Not Graduate', 
              'Self_Employed_No', 'Loan_Status_N'], axis = 1)

# Rename columns name
new = {'Gender_Male': 'Gender', 'Married_Yes': 'Married', 
       'Education_Graduate': 'Education', 'Self_Employed_Yes': 'Self_Employed',
       'Loan_Status_Y': 'Loan_Status'}
       
df.rename(columns=new, inplace=True)

In [6]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [7]:
df.ApplicantIncome = np.sqrt(df.ApplicantIncome)
df.CoapplicantIncome = np.sqrt(df.CoapplicantIncome)
df.LoanAmount = np.sqrt(df.LoanAmount)

In [8]:
X = df.drop(["Loan_Status"], axis=1)
y = df["Loan_Status"]

In [9]:
X, y = SMOTE().fit_resample(X, y)

In [10]:
X = MinMaxScaler().fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [12]:
params_xg = {'max_depth':np.arange(3,10)}
xg_gs = GridSearchCV(XGBClassifier(),params_xg,cv=5)
xg_gs.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=XGBClassifier(),
             param_grid={'max_depth': array([3, 4, 5, 6, 7, 8, 9])})

In [13]:
print(xg_gs.best_estimator_)

XGBClassifier(max_depth=4)


In [14]:
xgboost_model = XGBClassifier(max_depth=6)
xgboost_model.fit(X_train,y_train)

XGBClassifier(max_depth=6)

In [15]:
y_predict = xgboost_model.predict(X_test)

In [16]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89        23
           1       0.90      0.86      0.88        22

    accuracy                           0.89        45
   macro avg       0.89      0.89      0.89        45
weighted avg       0.89      0.89      0.89        45

