In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
import imblearn
from imblearn.combine import SMOTETomek
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score

In [2]:
# Load the loan-prediction dataset.
# The path is a raw string: it contains backslashes followed by 'd' and 'l',
# which a normal string literal would parse as (invalid) escape sequences.
# TODO: replace this machine-specific absolute path with a configurable data directory.
DATA_PATH = r"D:\data set\loan_prediction.csv"
ds = pd.read_csv(DATA_PATH)
ds.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Print column dtypes and non-null counts to see which columns have missing values.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# (rows, columns) of the loaded frame.
ds.shape

(614, 13)

Checking for Null values

In [5]:
# Number of missing values per column.
ds.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# Fill missing values in every incomplete column with that column's most
# frequent value. mode() returns a Series (there may be ties), so [0] takes
# the first mode.
# NOTE(review): LoanAmount is continuous; a median fill may be more
# appropriate than the mode — confirm the intended strategy.
MODE_IMPUTED_COLUMNS = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
for column in MODE_IMPUTED_COLUMNS:
    ds[column] = ds[column].fillna(ds[column].mode()[0])

Handling Categorical Values

In [7]:
# Names of the object-dtype (string) columns that still need numeric encoding.
ds.select_dtypes(include='object').columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [8]:
# Distinct Gender labels, checked before encoding them.
ds['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [9]:
# Encode Gender as Male -> 1, Female -> 0.
# The result is assigned back to the column: calling replace(inplace=True) on
# the ds['Gender'] selection is a chained in-place operation that leaves ds
# unchanged when pandas Copy-on-Write is active.
ds['Gender'] = ds['Gender'].replace({'Male': 1, 'Female': 0}).astype('int64')

# Distinct Married labels, checked before encoding them.
ds['Married'].unique()

array(['No', 'Yes'], dtype=object)

In [10]:
# Encode Married as Yes -> 1, No -> 0.
# Assigned back rather than replace(inplace=True) on the column selection,
# which does not update ds when pandas Copy-on-Write is active.
ds['Married'] = ds['Married'].replace({'Yes': 1, 'No': 0}).astype('int64')

# Distinct Dependents labels, checked before encoding them.
ds['Dependents'].unique()

array(['0', '1', '2', '3+'], dtype=object)

In [11]:
# Encode Dependents as integers. The data uses the label '3+' for three or
# more dependents, so that label must be mapped explicitly; otherwise the
# string survives and a later integer cast of this column raises.
# '3' is kept in the mapping for files that use the plain label.
# Assigned back rather than replace(inplace=True) on the column selection,
# which does not update ds when pandas Copy-on-Write is active.
DEPENDENTS_CODES = {'0': 0, '1': 1, '2': 2, '3': 3, '3+': 3}
ds['Dependents'] = ds['Dependents'].replace(DEPENDENTS_CODES).astype('int64')

# Distinct Self_Employed labels, checked before encoding them.
ds['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [12]:
# Encode Self_Employed as Yes -> 1, No -> 0.
# Assigned back rather than replace(inplace=True) on the column selection,
# which does not update ds when pandas Copy-on-Write is active.
ds['Self_Employed'] = ds['Self_Employed'].replace({'Yes': 1, 'No': 0}).astype('int64')

# Distinct Property_Area labels, checked before encoding them.
ds['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [13]:
# Encode Property_Area as Rural -> 0, Semiurban -> 1, Urban -> 2.
# Assigned back rather than replace(inplace=True) on the column selection,
# which does not update ds when pandas Copy-on-Write is active.
PROPERTY_AREA_CODES = {'Urban': 2, 'Rural': 0, 'Semiurban': 1}
ds['Property_Area'] = ds['Property_Area'].replace(PROPERTY_AREA_CODES).astype('int64')

# Distinct Loan_Status labels, checked before encoding them.
ds['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

In [14]:
# Encode the target Loan_Status as Y -> 1, N -> 0.
# Assigned back rather than replace(inplace=True) on the column selection,
# which does not update ds when pandas Copy-on-Write is active.
ds['Loan_Status'] = ds['Loan_Status'].replace({'Y': 1, 'N': 0}).astype('int64')

# Distinct Education labels, checked before encoding them.
ds['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [None]:
# Encode Education as Graduate -> 1, Not Graduate -> 0.
# Assigned back rather than replace(inplace=True) on the column selection,
# which does not update ds when pandas Copy-on-Write is active.
ds['Education'] = ds['Education'].replace({'Graduate': 1, 'Not Graduate': 0}).astype('int64')

# Cast the remaining numeric / encoded columns to int64.
# Float columns lose any fractional part in this cast.
# Dependents must already be fully numeric here: a leftover '3+' label
# makes the cast raise.
INT64_COLUMNS = [
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Self_Employed',
    'Dependents',
    'Married',
    'Gender',
]
for column in INT64_COLUMNS:
    ds[column] = ds[column].astype("int64")

Balancing Dataset

In [16]:
# Resampler used to balance the two Loan_Status classes.
# sampling_strategy is passed by keyword because current imbalanced-learn
# releases accept it as a keyword-only argument; 0.90 is the requested
# minority/majority ratio after resampling.
# random_state makes the resampling repeatable across runs.
sm = SMOTETomek(sampling_strategy=0.90, random_state=42)

In [17]:
# Target vector: the encoded loan approval status.
y = ds['Loan_Status']

# Feature matrix: everything except the target and Loan_ID.
# Loan_ID is a string identifier (e.g. 'LP001002'); it carries no predictive
# signal and cannot be consumed by the numeric resampler/scaler that follow.
x = ds.drop(columns=['Loan_ID', 'Loan_Status'])

In [None]:
# Balance the classes: xb / yb are the resampled features and target.
# NOTE(review): resampling is done before the train/test split further down,
# so resampled rows can end up in the test set — confirm this is intended.
xb,yb = sm.fit_resample(x,y)

In [19]:
# Class distribution of the original (unbalanced) target.
print(y.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64


In [None]:
# Class distribution of the resampled target, for comparison with the original.
print(yb.value_counts())

Scaling The Data

In [21]:
# Scaler used to standardize the resampled features.
sc=StandardScaler()

In [None]:
# Fit the scaler on the resampled features and replace xb with the scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows influence the scaling statistics — confirm this is acceptable.
xb=sc.fit_transform(xb)

In [None]:
# Wrap the scaled values in a DataFrame.
# NOTE(review): no column names are passed, so if the scaler returned a plain
# array the original feature names are lost here — confirm that is acceptable.
xb=pd.DataFrame(xb)

In [None]:
# Preview the first rows of the scaled feature frame.
xb.head()

Splitting Data Into Train And Test

In [None]:
# Hold out 33% of the balanced, scaled data for testing.
# The balanced features/target are named xb / yb in this notebook; the
# previously referenced x_bal / y_bal are never defined and raise NameError.
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    xb, yb, test_size=0.33, random_state=42
)

In [None]:
# Shape of the training feature set.
x_train.shape

In [None]:
# Shape of the test feature set.
x_test.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the test target.
y_test.shape