In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored under "My Drive" (the training CSV read below) are reachable by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

Data Extraction

In [3]:
# Load the training CSV from the mounted Drive folder into `df`.
train_csv_path = "/content/drive/My Drive/Analytics Vidhya/train.csv"
df = pd.read_csv(train_csv_path)

Data Preprocessing

In [4]:
# Keep an independent snapshot of the raw data in train_original.
# A plain `train_original = df` would only create a second name for the same
# DataFrame object, so the column assignments made on `df` further down
# (fillna, label encoding) would show up in train_original as well.
train_original = df.copy()

In [5]:
# Preview the first rows of the loaded data to see the columns and some sample values.
train_original.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
# List the column labels of the frame.
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [7]:
# Show the dtype of every column; the object-typed ones hold text and are
# label-encoded later before modelling.
df.dtypes

Unnamed: 0,0
Loan_ID,object
Gender,object
Married,object
Dependents,object
Education,object
Self_Employed,object
ApplicantIncome,int64
CoapplicantIncome,float64
LoanAmount,float64
Loan_Amount_Term,float64


In [8]:
# Dimensions of the data as a (rows, columns) tuple.
# Note: this is not the same as df.size, which is the total number of cells.
df.shape

(614, 13)

In [9]:
# Count the rows for each Loan_Status value to see the class balance of the
# column that is used as the prediction target below.
df['Loan_Status'].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
Y,422
N,192


In [10]:
# Number of missing (NaN) entries per column, before any imputation.
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [11]:
# Fill missing Gender values with the column's mode (its most frequent label).
# Gender is an object-typed, categorical column, so the mode is the statistic
# that applies; for a numeric column, mean or median would be alternatives too.
k = df['Gender'].mode()[0]
df['Gender'] = df['Gender'].fillna(k)

In [12]:
# Mode-impute the remaining low-cardinality columns the same way as Gender.
# Credit_History shows numeric values (1.0) in the preview, but it is filled
# with its most frequent value here, like the categorical columns.
for column in ('Married', 'Self_Employed', 'Credit_History'):
    df[column] = df[column].fillna(df[column].mode()[0])

In [13]:
# Dependents is object-typed at this point (see the df.dtypes output), so it is
# mode-imputed like the other categorical columns; a mean would only be
# possible after converting the column to a numeric type.
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])

In [14]:
# Numeric loan columns: replace missing entries with that column's own mean.
for column in ('LoanAmount', 'Loan_Amount_Term'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [15]:
# Re-count missing values per column to confirm the imputation above left no gaps.
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


In [16]:
# Import sklearn's preprocessing module and create a LabelEncoder.
# `le` is the name that later decodes the model's integer predictions back
# into the original Loan_Status labels (le.inverse_transform).
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

In [17]:
# Check the dtypes again to see which columns are still object-typed and need label encoding.
df.dtypes

Unnamed: 0,0
Loan_ID,object
Gender,object
Married,object
Dependents,object
Education,object
Self_Employed,object
ApplicantIncome,int64
CoapplicantIncome,float64
LoanAmount,float64
Loan_Amount_Term,float64


In [18]:
# Label-encode every object-typed column that feeds the model, plus the target.
# (Loan_ID is left as-is; it is an identifier and is not used as a feature.)
# Each column gets its own fitted encoder, kept in `encoders`, so any column's
# integer codes can be mapped back to the original labels later on.
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education',
                       'Self_Employed', 'Property_Area', 'Loan_Status']
encoders = {}
for column in categorical_columns:
    encoders[column] = preprocessing.LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# `le` is used further down to decode predictions, so bind it explicitly to the
# target's encoder instead of relying on Loan_Status being the last column fit.
le = encoders['Loan_Status']


In [19]:
# Confirm the encoded columns are now integer-typed (Loan_ID stays object; it was not encoded).
df.dtypes

Unnamed: 0,0
Loan_ID,object
Gender,int64
Married,int64
Dependents,int64
Education,int64
Self_Employed,int64
ApplicantIncome,int64
CoapplicantIncome,float64
LoanAmount,float64
Loan_Amount_Term,float64


In [20]:
#train test split
from sklearn.model_selection import train_test_split

In [21]:
# Target: the label-encoded Loan_Status column.
y = df['Loan_Status']
# Features: every column except the row identifier and the target. Dropping by
# name selects the same 11 columns as the positional slice iloc[:, 1:12] does on
# the 13-column frame listed by df.columns, but it does not silently shift if
# columns are ever added or reordered.
x = df.drop(columns=['Loan_ID', 'Loan_Status'])

In [22]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [23]:
from sklearn.linear_model import LogisticRegression as lr

In [24]:
# Logistic regression (imported above as `lr`) with the liblinear solver and
# C=1.0, fitted on the training split. fit() returns the estimator itself.
logreg = lr(solver='liblinear', C=1.0)
model = logreg.fit(x_train, y_train)

In [25]:
# Predicted (label-encoded) Loan_Status for the held-out rows.
m=model.predict(x_test)

In [26]:
# Score the logistic regression on the held-out split
# (for scikit-learn classifiers, score() reports mean accuracy).
model.score(x_test,y_test)

0.7980295566502463

In [27]:
# Map the integer predictions back to the original 'Y'/'N' labels.
# This relies on `le` holding the Loan_Status classes at this point.
le.inverse_transform(m)

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [28]:
import xgboost as xgb

In [29]:
# Baseline gradient-boosted classifier: XGBoost with its default settings,
# fitted on the same training split as the logistic regression.
xgb_default = xgb.XGBClassifier()
model2 = xgb_default.fit(x_train, y_train)

In [30]:
# Predictions of the default XGBoost model on the held-out rows.
m2=model2.predict(x_test)

In [31]:
# Held-out score of the default XGBoost model, for comparison with the logistic regression above.
model2.score(x_test,y_test)

0.7684729064039408

In [32]:
# XGBoost with hand-picked settings: 100 trees, learning rate 0.05,
# 75% row subsampling per tree, and a maximum tree depth of 7.
model3_params = dict(
    n_estimators=100,
    learning_rate=0.05,
    subsample=0.75,
    max_depth=7,
)
model3 = xgb.XGBClassifier(**model3_params).fit(x_train, y_train)

In [33]:
# Predictions of the hand-tuned XGBoost model (model3) on the held-out rows.
m3=model3.predict(x_test)

In [34]:
# Held-out score of model3.
model3.score(x_test,y_test)

0.7931034482758621

In [42]:
# Same configuration as model3 except that subsample is lowered to 0.5.
# NOTE(review): the hyperparameter variants are being compared on the test
# split itself, so the best of these scores is an optimistic estimate; a
# separate validation split or cross-validation would avoid that.
model4_params = dict(
    n_estimators=100,
    learning_rate=0.05,
    subsample=0.5,
    max_depth=7,
)
model4 = xgb.XGBClassifier(**model4_params).fit(x_train, y_train)
m4 = model4.predict(x_test)
model4.score(x_test, y_test)

0.8029556650246306