**Splitting** **Data** **Into** **Train** **And** **Test**

**Importing the Packages**

In [1]:
import numpy as np 
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

**Load** **the** **dataset**

In [2]:
# Read the loan-application records into a DataFrame and display it.
DATA_PATH = "/content/Loan_Prediction_Data_Set.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
# Size of the loaded frame as (rows, columns).
df.shape

(614, 13)

**Handle** **the** **Missing** **values**

In [4]:
# Count the missing entries in every column.
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

**Treating** **the** **Null** **Value**

We fill the missing values in every column, numeric and categorical alike, with that column's most frequent value (its mode). The `+` suffix in `Dependents` (as in `3+`) is stripped first so the column can later be converted to integers.

In [5]:
# Split the column names by dtype: numeric columns vs. object (string) columns.
# The dtype is given as the string "object" because the `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
numerical_features = df.select_dtypes(include=[np.number]).columns
categorical_features = df.select_dtypes(include=["object"]).columns
numerical_features

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [6]:
# Show the object-dtype column names collected in categorical_features.
categorical_features

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [7]:
# Strip the '+' from Dependents values such as '3+'.
# regex=False treats '+' as a literal character; without it the result depends
# on the pandas version's default, and '+' alone is a regex quantifier.
df['Dependents'] = df['Dependents'].str.replace('+', '', regex=False)

# Fill every column that has missing values with its most frequent value (mode).
# The numeric columns are imputed with the mode as well, not the mean.
columns_to_impute = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
for column in columns_to_impute:
    df[column] = df[column].fillna(df[column].mode()[0])

# Confirm that no missing values remain.
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

**Handling** **Categorical** **Values**

In [8]:
# List the columns that still have object dtype.
df.select_dtypes(include='object').columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [9]:
# Inspect the distinct Gender labels.
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [10]:
# Encode Gender as 1 (Male) / 0 (Female).
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Gender'] acts on a temporary Series and does not update df under pandas
# Copy-on-Write. astype pins the integer dtype instead of relying on replace()
# to downcast the object column.
df['Gender'] = df['Gender'].replace({'Male': 1, 'Female': 0}).astype('int64')

# Inspect the distinct labels of the next column to encode.
df['Married'].unique()

array(['No', 'Yes'], dtype=object)

In [11]:
# Encode Married as 1 (Yes) / 0 (No).
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Married'] acts on a temporary Series and does not update df under pandas
# Copy-on-Write. astype pins the integer dtype explicitly.
df['Married'] = df['Married'].replace({'Yes': 1, 'No': 0}).astype('int64')

# Inspect the distinct labels of the next column to encode.
df['Dependents'].unique()

array(['0', '1', '2', '3'], dtype=object)

In [12]:
# Convert the Dependents labels '0'..'3' to the matching integers.
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Dependents'] acts on a temporary Series and does not update df under
# pandas Copy-on-Write. astype pins the integer dtype and raises if an
# unexpected label (for example an unstripped '3+') is still present.
df['Dependents'] = (
    df['Dependents']
    .replace({'0': 0, '1': 1, '2': 2, '3': 3})
    .astype('int64')
)

# Inspect the distinct labels of the next column to encode.
df['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [13]:
# Encode Self_Employed as 1 (Yes) / 0 (No).
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Self_Employed'] acts on a temporary Series and does not update df under
# pandas Copy-on-Write. astype pins the integer dtype explicitly.
df['Self_Employed'] = df['Self_Employed'].replace({'Yes': 1, 'No': 0}).astype('int64')

# Inspect the distinct labels of the next column to encode.
df['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [14]:
# Encode Property_Area as Rural=0, Semiurban=1, Urban=2.
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Property_Area'] acts on a temporary Series and does not update df under
# pandas Copy-on-Write. astype pins the integer dtype explicitly.
df['Property_Area'] = (
    df['Property_Area']
    .replace({'Urban': 2, 'Rural': 0, 'Semiurban': 1})
    .astype('int64')
)

# Inspect the distinct labels of the next column to encode.
df['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

In [15]:
# Encode the target Loan_Status as 1 (Y) / 0 (N).
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Loan_Status'] acts on a temporary Series and does not update df under
# pandas Copy-on-Write. astype pins the integer dtype explicitly.
df['Loan_Status'] = df['Loan_Status'].replace({'Y': 1, 'N': 0}).astype('int64')

# Inspect the distinct labels of the next column to encode.
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode Education as 1 (Graduate) / 0 (Not Graduate).
# The result is assigned back to the column: replace(..., inplace=True) on
# df['Education'] acts on a temporary Series and does not update df under
# pandas Copy-on-Write.
df['Education'] = (
    df['Education']
    .replace({'Graduate': 1, 'Not Graduate': 0})
    .astype('int64')
)

# Cast the float-valued columns to int64 (any fractional part is truncated).
for column in ['CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    df[column] = df[column].astype('int64')

# LabelEncoder maps each distinct Loan_ID string to one integer code;
# it does not create dummy (one-hot) columns.
# NOTE(review): Loan_ID looks like a per-row identifier, so its code probably
# carries no predictive signal. Confirm whether it should stay in the features.
le = LabelEncoder()
df['Loan_ID'] = le.fit_transform(df['Loan_ID'])
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


**Balancing** **The** **Dataset**

In [18]:
from imblearn.combine import SMOTETomek

# sampling_strategy is passed by keyword because recent imbalanced-learn
# releases reject it as a positional argument. random_state fixes the synthetic
# samples so the class counts and all later results are reproducible.
smote = SMOTETomek(sampling_strategy=0.90, random_state=42)

# Separate the target (y) from the features (x).
y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'])

# Resample to reduce the class imbalance.
# NOTE(review): resampling happens before the train/test split, so synthetic
# rows derived from future test rows can end up in the training set. Consider
# splitting first and resampling only the training portion.
x_bal, y_bal = smote.fit_resample(x, y)

# Class counts before and after balancing.
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    370
0    327
Name: Loan_Status, dtype: int64


**Scaling** **The** **Data**

In [19]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the fitted scaler stays available as `sc`.
sc = StandardScaler()
sc.fit(x_bal)
scaled_values = sc.transform(x_bal)

# Wrapping the scaled array in a new DataFrame replaces the original column
# names with integer positions.
x_bal = pd.DataFrame(scaled_values)
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.745662,0.543635,-1.202337,-0.721234,0.616602,-0.33413,0.087528,-0.510866,-0.3217,0.28587,0.592263,1.369309
1,-1.739935,0.543635,0.831714,0.33044,0.616602,-0.33413,-0.128013,-0.055965,-0.229198,0.28587,0.592263,-1.192862
2,-1.734207,0.543635,0.831714,-0.721234,0.616602,2.992849,-0.397523,-0.510866,-0.946088,0.28587,0.592263,1.369309
3,-1.728479,0.543635,0.831714,-0.721234,-1.621792,-0.33413,-0.468519,0.200444,-0.3217,0.28587,0.592263,1.369309
4,-1.722751,0.543635,-1.202337,-0.721234,0.616602,-0.33413,0.113236,-0.510866,-0.078882,0.28587,0.592263,1.369309



**Splitting Data Into Train And Test**

In [20]:
# Split the balanced, scaled data into training and testing sets.
from sklearn.model_selection import train_test_split

# 33% of the rows are held out for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x_bal, y_bal, test_size=0.33, random_state=42
)

# Every row must land in exactly one of the two partitions.
assert len(x_train) + len(x_test) == len(x_bal)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
x_train

(466, 12)
(466,)
(231, 12)
(231,)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
137,-0.857871,0.543635,0.831714,2.433786,0.616602,-0.334130,5.901675,-0.510866,5.228416,-2.60462,-1.688440,0.088224
483,1.496208,0.543635,-1.202337,-0.721234,0.616602,-0.334130,-0.340829,-0.510866,-0.899837,0.28587,0.592263,1.369309
481,1.479025,0.543635,-1.202337,-0.721234,-1.621792,2.992849,0.079186,-0.510866,-0.182947,0.28587,0.592263,0.088224
33,-1.545193,0.543635,0.831714,-0.721234,0.616602,-0.334130,-0.520957,0.112662,-0.321700,0.28587,0.592263,1.369309
634,-1.344724,0.543635,0.831714,1.382113,-1.621792,-0.334130,-0.090727,-0.510866,-0.541392,0.28587,-1.688440,-1.192862
...,...,...,...,...,...,...,...,...,...,...,...,...
71,-1.287447,-1.839469,-1.202337,-0.721234,0.616602,-0.334130,-0.253489,-0.510866,-0.425764,0.28587,0.592263,0.088224
106,-1.058340,0.543635,0.831714,-0.721234,0.616602,-0.334130,0.056542,0.293657,0.372065,0.28587,0.592263,-1.192862
270,0.024193,0.543635,0.831714,1.382113,-1.621792,-0.334130,0.397048,-0.510866,0.429878,0.28587,0.592263,-1.192862
435,1.181185,0.543635,0.831714,0.330440,0.616602,-0.334130,0.124303,0.093657,1.181457,0.28587,0.592263,0.088224


In [21]:
# Display the held-out feature rows.
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
158,-0.714679,0.543635,0.831714,0.330440,0.616602,-0.334130,0.181337,1.676157,0.372065,0.285870,-1.688440,1.369309
497,1.582123,0.543635,-1.202337,-0.721234,0.616602,-0.334130,-0.595698,9.696347,-0.668582,0.285870,0.592263,1.369309
395,0.917712,0.543635,-1.202337,-0.721234,0.616602,-0.334130,0.113236,-0.510866,-0.090445,0.285870,0.592263,-1.192862
155,-0.731862,0.543635,0.831714,2.433786,0.616602,-0.334130,0.030834,2.897873,4.014328,0.285870,-1.688440,0.088224
321,0.367854,0.543635,0.831714,-0.721234,0.616602,-0.334130,2.450813,1.077667,4.881534,0.285870,0.592263,-1.192862
...,...,...,...,...,...,...,...,...,...,...,...,...
292,0.178840,0.543635,0.831714,0.330440,0.616602,-0.334130,-0.014454,-0.303325,0.140810,0.285870,0.592263,-1.192862
275,0.052831,0.543635,0.831714,-0.721234,0.616602,-0.334130,-0.487247,0.045090,0.129247,0.285870,0.592263,-1.192862
651,1.186913,0.543635,-1.202337,-0.721234,0.616602,-0.334130,0.088720,0.614621,1.782719,0.285870,-1.688440,-1.192862
314,0.316305,-1.839469,-1.202337,0.330440,0.616602,-0.334130,-0.104177,-0.510866,-0.772647,0.285870,0.592263,0.088224


In [22]:
# Display the training labels.
y_train

137    1
483    1
481    1
33     1
634    0
      ..
71     1
106    1
270    1
435    1
102    1
Name: Loan_Status, Length: 466, dtype: int64

In [23]:
# Display the held-out labels.
y_test

158    0
497    0
395    1
155    0
321    0
      ..
292    1
275    0
651    0
314    0
657    0
Name: Loan_Status, Length: 231, dtype: int64