**Splitting Data Into Train And Test**

The dataset is already download in .csv format

**IMPORTING THE PACKAGE**

In [1]:
import numpy as np 
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

**Load the dataset**

In [3]:
df=pd.read_csv("loan_prediction.csv")


In [4]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [5]:
df.shape


(614, 13)

**Handle the Missing values**

In [6]:
#checking the null values
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


**Treating the Null Value**

We will fill the missing values in numeric data type using the mean value of that particular column and categorical data type using the most repeated value

In [7]:
numerical_features = df.select_dtypes(include = [np.number]).columns
categorical_features = df.select_dtypes(include = [np.object]).columns

In [8]:
numerical_features


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [9]:
categorical_features


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [10]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])


In [11]:
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])


In [12]:
#replace + with non value 
df['Dependents'] = df['Dependents'].str.replace('+','')

In [13]:
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])


In [14]:
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])


In [15]:
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mode()[0])


In [16]:
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])


In [17]:
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])


In [18]:
#checking the null values now
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


Now the null value is retreated


**Handling Categorical Values**

In [19]:
df.select_dtypes(include='object').columns


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [20]:
df['Gender'].unique()


array(['Male', 'Female'], dtype=object)

In [21]:
df['Gender'].replace({'Male':1,'Female':0},inplace=True)


In [22]:
df['Married'].unique()


array(['No', 'Yes'], dtype=object)

In [23]:
df['Married'].replace({'Yes':1,'No':0},inplace=True)


In [24]:
df['Dependents'].unique()


array(['0', '1', '2', '3'], dtype=object)

In [25]:
df['Dependents'].replace({'0':0,'1':1,'2':2,'3':3},inplace=True)


In [26]:
df['Self_Employed'].unique()


array(['No', 'Yes'], dtype=object)

In [27]:
df['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)


In [28]:
df['Property_Area'].unique()


array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [29]:
df['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)


In [30]:
df['Loan_Status'].unique()


array(['Y', 'N'], dtype=object)

In [31]:
df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)


In [32]:
df['Education'].unique()


array(['Graduate', 'Not Graduate'], dtype=object)

In [33]:
df['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)


In [34]:
df['CoapplicantIncome']=df['CoapplicantIncome'].astype("int64")
df['LoanAmount']=df['LoanAmount'].astype("int64")
df['Loan_Amount_Term']=df['Loan_Amount_Term'].astype("int64")
df['Credit_History']=df['Credit_History'].astype("int64")

In [35]:
# dummy columns are created for the categories in Loan_ID
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Loan_ID'] = le.fit_transform(df.Loan_ID)

In [36]:
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


**Balancing The Dataset**

In [37]:
from imblearn.combine import SMOTETomek


In [38]:
smote = SMOTETomek(0.90)


In [39]:
#dividing the dataset into dependent and independent y and x respectively

y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'],axis=1)

In [40]:
#creating the new x and y for balance data
x_bal,y_bal = smote.fit_resample(x,y)

In [41]:
#printing the value before and after balancing 
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    357
0    314
Name: Loan_Status, dtype: int64


**Scaling The Data**

In [42]:
from sklearn.preprocessing import StandardScaler


In [43]:
sc = StandardScaler()
x_bal = sc.fit_transform(x_bal)

In [44]:
x_bal = pd.DataFrame(x_bal)

In [45]:
x_bal.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.72473,0.525038,-1.181706,-0.68375,0.605474,-0.327507,0.0934,-0.56558,-0.291435,0.287969,0.587103,1.354012
1,-1.719073,0.525038,0.846234,0.380742,0.605474,-0.327507,-0.120248,-0.029043,-0.190777,0.287969,0.587103,-1.219761
2,-1.713417,0.525038,0.846234,-0.68375,0.605474,3.053371,-0.387393,-0.56558,-0.970875,0.287969,0.587103,1.354012
3,-1.707761,0.525038,0.846234,-0.68375,-1.651599,-0.327507,-0.457765,0.273381,-0.291435,0.287969,0.587103,1.354012
4,-1.702104,0.525038,-1.181706,-0.68375,0.605474,-0.327507,0.118883,-0.56558,-0.027208,0.287969,0.587103,1.354012



We will perform scaling only on the input values


**Splitting Data Into Train And Test**

In [46]:
# splitting the data into training and testing set

from sklearn.model_selection import train_test_split

In [47]:
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state = 42)


In [48]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(449, 12)
(449,)
(222, 12)
(222,)


In [49]:
x_train


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
497,1.691652,-1.904622,0.846234,0.380742,0.605474,-0.327507,1.131434,-0.565580,4.439482,0.287969,0.587103,0.067125
573,-1.159103,0.525038,0.846234,-0.683750,-1.651599,-0.327507,-0.460465,0.123237,-0.241106,-0.694108,-1.703280,0.067125
259,0.034367,0.525038,-1.181706,-0.683750,-1.651599,-0.327507,-0.399712,0.290104,-0.404675,0.287969,0.587103,0.067125
203,-0.338946,0.525038,-1.181706,-0.683750,0.605474,3.053371,0.127321,0.976074,-0.291435,-2.431629,0.587103,1.354012
196,-0.378540,-1.904622,-1.181706,0.380742,0.605474,-0.327507,-0.250361,-0.565580,-0.392093,0.287969,0.587103,-1.219761
...,...,...,...,...,...,...,...,...,...,...,...,...
71,-1.255260,0.525038,0.846234,-0.683750,0.605474,-0.327507,0.118883,0.234956,1.532988,0.287969,0.587103,0.067125
106,-1.017697,-1.904622,-1.181706,-0.683750,0.605474,-0.327507,-0.270612,-0.565580,-0.631155,0.287969,0.587103,-1.219761
270,0.096586,0.525038,0.846234,1.445235,-1.651599,-0.327507,-0.285969,0.000132,-0.291435,0.287969,0.587103,-1.219761
435,1.267432,0.525038,-1.181706,-0.683750,-1.651599,-0.327507,0.251021,-0.565580,-0.165613,0.287969,0.587103,0.067125


In [50]:
x_test


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
361,0.769681,0.525038,0.846234,-0.683750,0.605474,-0.327507,0.439693,0.086590,0.463498,0.287969,0.587103,-1.219761
158,-0.667009,0.525038,0.846234,-0.683750,0.605474,-0.327507,-0.491517,0.195107,-1.071533,0.287969,0.587103,-1.219761
480,1.584183,0.525038,0.846234,0.380742,0.605474,-0.327507,-0.170875,0.501801,0.362841,-3.882081,0.587103,-1.219761
641,0.181430,0.525038,-1.181706,-0.683750,0.605474,-0.327507,0.575375,1.070003,2.514401,0.287969,-1.703280,-1.219761
275,0.147493,0.525038,-1.181706,-0.683750,0.605474,-0.327507,-0.398362,-0.565580,-0.568244,0.287969,0.587103,0.067125
...,...,...,...,...,...,...,...,...,...,...,...,...
590,0.560400,0.525038,0.846234,-0.683750,-1.651599,-0.327507,-0.383849,0.041048,-0.467586,1.557115,-1.703280,0.067125
656,0.153149,-1.904622,-1.181706,-0.683750,0.605474,-0.327507,-0.723897,0.074849,-0.366928,0.287969,-1.703280,-1.219761
645,1.318339,0.525038,0.846234,-0.683750,-1.651599,-0.327507,-0.511768,0.792129,-0.102702,-1.933036,-1.703280,0.067125
84,-1.170416,0.525038,0.846234,-0.683750,-1.651599,-0.327507,-0.186908,-0.565580,-0.354346,-2.431629,0.587103,0.067125


In [51]:
y_train


497    1
573    0
259    1
203    0
196    1
      ..
71     0
106    1
270    1
435    1
102    1
Name: Loan_Status, Length: 449, dtype: int64

In [52]:
y_test


361    1
158    1
480    0
641    0
275    1
      ..
590    0
656    0
645    0
84     1
314    1
Name: Loan_Status, Length: 222, dtype: int64