In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("loan_prediction.csv")
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
df.shape

(614, 13)

In [4]:
df=df.drop(['Loan_ID'],axis=1)

# Handle the Missing values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [6]:
#checking the null values
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
df['LoanAmount']=df['LoanAmount'].fillna(df['LoanAmount'].mean())
df['Loan_Amount_Term']=df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())
df['Credit_History']=df['Credit_History'].fillna(df['Credit_History'].mean())

In [8]:
df['Gender']=df['Gender'].fillna(df['Gender'].mode()[0])
df['Married']=df['Married'].fillna(df['Married'].mode()[0])
df['Dependents']=df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed']=df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [9]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Handling Categorical Values

In [10]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [12]:
le=LabelEncoder()

In [13]:
df.Gender=le.fit_transform(df.Gender)
df.Married=le.fit_transform(df.Married)
df.Education=le.fit_transform(df.Education)
df.Self_Employed=le.fit_transform(df.Self_Employed)
df.Property_Area=le.fit_transform(df.Property_Area)
df.Loan_Status=le.fit_transform(df.Loan_Status)
df.Dependents=le.fit_transform(df.Dependents)

In [14]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


# Balancing The Dataset

In [15]:
from imblearn.combine import SMOTETomek

In [16]:
smote = SMOTETomek(0.90)

In [17]:
y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'],axis=1)

In [18]:
x_bal,y_bal = smote.fit_resample(x,y)

In [19]:
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    351
0    308
Name: Loan_Status, dtype: int64


# Scaling The Data

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
sc = StandardScaler()
x_bal = sc.fit_transform(x_bal)

In [22]:
x_bal = pd.DataFrame(x_bal)

In [23]:
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.545201,-1.181678,-0.695275,-0.488592,-0.330798,0.091815,-0.532899,0.005255,0.279869,0.579456,1.347254
1,0.545201,0.846254,-0.695275,-0.488592,3.022989,-0.38468,-0.532899,-0.995365,0.279869,0.579456,1.347254
2,0.545201,0.846254,-0.695275,2.046699,-0.330798,-0.454423,0.259268,-0.323408,0.279869,0.579456,1.347254
3,0.545201,-1.181678,-0.695275,-0.488592,-0.330798,0.11707,-0.532899,-0.062092,0.279869,0.579456,1.347254
4,0.545201,0.846254,1.43088,-0.488592,3.022989,0.019563,0.876741,1.505807,0.279869,0.579456,1.347254


# Splitting Data Into Train And Test

In [24]:
x_train, x_test , y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state=42)

In [25]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(441, 11)
(441,)
(218, 11)
(218,)


In [26]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
203,0.545201,-1.181678,-0.695275,-0.488592,3.022989,0.125432,0.922766,-0.323408,-2.476381,0.579456,1.347254
196,0.545201,0.846254,-0.695275,-0.488592,-0.330798,0.074588,-0.532899,1.356483,0.279869,0.169299,1.347254
374,0.545201,0.846254,-0.695275,-0.488592,-0.330798,-0.167088,-0.532899,-0.348295,0.279869,0.579456,1.347254
93,0.545201,-1.181678,-0.695275,-0.488592,-0.330798,-0.041818,-0.532899,-0.348295,0.279869,0.579456,0.056788
271,-1.834186,0.846254,-0.695275,-0.488592,-0.330798,-0.161736,0.290511,-0.447844,0.279869,0.579456,1.347254


In [27]:
x_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
628,-1.834186,0.846254,-0.695275,-0.488592,-0.330798,1.274774,-0.217652,1.842323,0.279869,-0.858296,0.056788
425,-1.834186,0.846254,1.43088,-0.488592,-0.330798,-0.546745,0.015369,-0.410514,2.117368,0.579456,0.056788
135,0.545201,0.846254,-0.695275,-0.488592,-0.330798,-0.266434,0.530714,0.448098,0.004244,0.579456,-1.233678
321,0.545201,0.846254,-0.695275,-0.488592,-0.330798,-0.496236,0.279089,-0.12431,0.279869,0.579456,1.347254
90,0.545201,-1.181678,-0.695275,-0.488592,-0.330798,0.002671,-0.532899,-0.12431,0.279869,0.579456,1.347254


In [28]:
y_train.head()

203    0
196    0
374    1
93     1
271    0
Name: Loan_Status, dtype: int32

In [29]:
y_test.head()

628    0
425    1
135    1
321    1
90     1
Name: Loan_Status, dtype: int32