Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

Reading Dataset


In [3]:
data=pd.read_csv("/content/Loan_Prediction.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
numerical_features = data.select_dtypes(include = [np.number]).columns
categorical_features = data.select_dtypes(include = [np.object]).columns

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Checking for Null Values

In [5]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Replacing Null Values with Mode and Median

In [6]:
data['Gender'] = data['Gender'].fillna(data['Gender'].mode()[0])
data['Married'] = data['Married'].fillna(data['Married'].mode()[0])
data['Dependents'] = data['Dependents'].str.replace('+','')
data['Dependents'] = data['Dependents'].fillna(data['Dependents'].mode()[0])
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mode()[0])
data['Self_Employed'] = data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0])
data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].mode()[0])

  This is separate from the ipykernel package so we can avoid doing imports until


Verifying

In [7]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Conversion to Numerical Values

In [8]:
data['Gender'].replace({'Male':1,'Female':0},inplace=True)
data['Dependents'].replace({'0':0,'1':1,'2':2,'3':3},inplace=True)
data['Married'].replace({'Yes':1,'No':0},inplace=True)
data['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)
data['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)
data['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)
data['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

In [9]:
data['CoapplicantIncome']=data['CoapplicantIncome'].astype("int64")
data['LoanAmount']=data['LoanAmount'].astype("int64")
data['Loan_Amount_Term']=data['Loan_Amount_Term'].astype("int64")
data['Credit_History']=data['Credit_History'].astype("int64")

Label Encoding

In [10]:
le = LabelEncoder()
data['Loan_ID'] = le.fit_transform(data.Loan_ID)
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


In [11]:
smote = SMOTETomek(0.90)

y = data['Loan_Status']
x = data.drop(columns=['Loan_Status'],axis=1)



In [12]:
x_bal,y_bal = smote.fit_resample(x,y)
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    366
0    323
Name: Loan_Status, dtype: int64


In [13]:
scaler = StandardScaler()
x_bal = scaler.fit_transform(x_bal)
x_bal = pd.DataFrame(x_bal)
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.771963,0.525287,-1.223265,-0.722371,0.628281,-0.317236,0.081937,-0.564088,-0.302804,0.302353,0.614817,1.357676
1,-1.766122,0.525287,0.817484,0.334346,0.628281,-0.317236,-0.124529,-0.006689,-0.206226,0.302353,0.614817,-1.21929
2,-1.760281,0.525287,0.817484,-0.722371,0.628281,3.152223,-0.382693,-0.564088,-0.954702,0.302353,0.614817,1.357676
3,-1.75444,0.525287,0.817484,-0.722371,-1.591645,-0.317236,-0.450699,0.307495,-0.302804,0.302353,0.614817,1.357676
4,-1.748599,0.525287,-1.223265,-0.722371,0.628281,-0.317236,0.106563,-0.564088,-0.049287,0.302353,0.614817,1.357676


Splitting of Data into Train and Test

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state = 30)

Xgboost Model

In [15]:
xg = GradientBoostingClassifier() 
xg.fit(x_train,y_train)

GradientBoostingClassifier()

In [16]:
yPred = xg.predict(x_test)
print(confusion_matrix(y_test ,yPred) )

[[ 76  33]
 [ 14 105]]


In [17]:
print(classification_report (y_test, yPred))

              precision    recall  f1-score   support

           0       0.84      0.70      0.76       109
           1       0.76      0.88      0.82       119

    accuracy                           0.79       228
   macro avg       0.80      0.79      0.79       228
weighted avg       0.80      0.79      0.79       228



In [18]:
y_pred=xg.predict(x_test)
y_pred1=xg.predict(x_train)

In [19]:
print('Testing accuracy: ',accuracy_score(y_test,y_pred))
print('Training accuracy: ',accuracy_score(y_train,y_pred1))

Testing accuracy:  0.793859649122807
Training accuracy:  0.9479392624728851


In [20]:
pd.crosstab(y_test,y_pred)

col_0,0,1
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76,33
1,14,105
