In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn. model_selection import cross_val_score
from sklearn.impute import KNNImputer

In [2]:
# Load the train and test CSV files; the first column of each file is used
# as the DataFrame index (index_col=0).
Train, Test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

In [3]:
# Integer-encode the categorical columns.
# For every column the code of a value is its position in
# Train[column].unique(). NaN is left out of the mapping, so missing entries
# remain NaN after .map(). The mapping learned from Train is re-used for Test;
# a Test value that never occurs in Train is therefore mapped to NaN as well.
catg_map = {}
for catg in ['Gender', 'Married', 'Dependents', 'Education',
             'Self_Employed', 'Credit_History', 'Property_Area']:
    codes = {}
    for position, level in enumerate(Train[catg].unique()):
        if level != level:  # only NaN is unequal to itself -> not encoded
            continue
        codes[level] = position
    catg_map[catg] = codes

    # Apply the same value -> code mapping to both frames.
    for frame in (Train, Test):
        frame[catg] = frame[catg].map(codes)

# Base Model

In [4]:
# Base-model data: keep only the training rows without a missing value in any
# column. The test frame is copied unchanged (its missing values are kept).
test = Test.copy()
train = Train.dropna(axis=0, how='any').copy()

In [5]:
# Separate the target column (yTrain) from the predictor columns (xTrain).
yTrain = train['Loan_Status']
xTrain = train.drop(columns='Loan_Status')
xTest = test  # same object as `test`, not a copy

In [6]:
# Logistic regression with the liblinear solver and a fixed random seed.
estimator = LogisticRegression(solver='liblinear', tol=1e-4, random_state=1)

# Score the estimator with 10-fold cross validation and report the mean of the
# fold scores as the base-model performance estimate.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy for the Base Model:', round(cv.mean(), 4))

Cross Validation Accuracy for the Base Model: 0.8021


# Filling missing values using mean, median & mode

In [7]:
# Start from fresh copies of the encoded frames so the fills below do not
# modify Train / Test themselves.
train, test = Train.copy(), Test.copy()

In [8]:
# Number of missing entries per column of the training frame.
print('Missing values in Train data:', train.isnull().sum(), sep='\n')

Missing values in Train data:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [9]:
# Target column -> yTrain, all remaining columns -> xTrain.
yTrain = train['Loan_Status']
xTrain = train.drop(columns='Loan_Status')
# xTest is an alias of `test` (no copy): filling xTest also fills `test`.
xTest = test

In [10]:
# Fill missing values of real-valued features with the training-set mean.
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place call operates
# on an intermediate Series, is deprecated in pandas 2.x, and leaves the frame
# unchanged once Copy-on-Write is active.
for attr in ['LoanAmount']:
    fill = xTrain[attr].mean()  # statistic is taken from the training data only
    xTrain[attr] = xTrain[attr].fillna(fill)
    xTest[attr] = xTest[attr].fillna(fill)

In [11]:
# Fill the income columns with the training-set median.
# The median is used as the measure of central tendency because these columns
# are assumed to have a skewed distribution (NOTE(review): not verified in this
# notebook).
# Assign the filled column back rather than using the chained
# `frame[col].fillna(..., inplace=True)`, which is deprecated in pandas 2.x and
# has no effect on the frame under Copy-on-Write.
for attr in ['ApplicantIncome', 'CoapplicantIncome']:
    fill = xTrain[attr].median()  # statistic is taken from the training data only
    xTrain[attr] = xTrain[attr].fillna(fill)
    xTest[attr] = xTest[attr].fillna(fill)

In [12]:
# Fill the encoded categorical features with the training-set mode.
# Loan_Amount_Term (not one of the encoded columns) is filled with its mode
# here as well.
# Assign the filled column back rather than using the chained
# `frame[col].fillna(..., inplace=True)`, which is deprecated in pandas 2.x and
# has no effect on the frame under Copy-on-Write.
for attr in ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
             'Credit_History', 'Property_Area', 'Loan_Amount_Term']:
    # mode() returns the most frequent value(s) in sorted order; take the first.
    fill = xTrain[attr].mode().iloc[0]
    xTrain[attr] = xTrain[attr].fillna(fill)
    xTest[attr] = xTest[attr].fillna(fill)

In [13]:
# Re-count the missing entries per feature column after the fills.
print('Missing values in Train data:', xTrain.isnull().sum(), sep='\n')

Missing values in Train data:
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64


In [14]:
# Same classifier configuration as before: liblinear solver, fixed random seed.
estimator = LogisticRegression(solver='liblinear', tol=1e-4, random_state=1)

# 10-fold cross validation on the mean/median/mode-filled features; the mean
# of the fold scores is reported.
# NOTE(review): the fill values were computed on the full training set before
# the folds were formed, so each validation fold influenced its own imputation.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy:', round(cv.mean(), 4))

Cross Validation Accuracy: 0.8079


# Filling missing values using KNN Imputer

In [15]:
# Reset to fresh copies of the encoded frames (missing values still present).
train, test = Train.copy(), Test.copy()

In [16]:
# Number of missing entries per column of the training frame.
print('Missing values in Train data:', train.isnull().sum(), sep='\n')

Missing values in Train data:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [17]:
# Target column -> yTrain, all remaining columns -> xTrain.
yTrain = train['Loan_Status']
xTrain = train.drop(columns='Loan_Status')
xTest = test  # alias of `test`, not a copy

In [18]:
# Fit a KNNImputer (default settings) on the training features, then use the
# fitted imputer to fill both the training and the test features.
# After this cell xTrain / xTest hold the imputer's transform() output rather
# than the original DataFrames.
# NOTE(review): encoded categorical columns are imputed like numeric ones, so
# filled entries may not be valid integer codes — confirm this is intended.
impute = KNNImputer().fit(xTrain)

xTrain, xTest = impute.transform(xTrain), impute.transform(xTest)

In [19]:
# Same classifier configuration as before: liblinear solver, fixed random seed.
estimator = LogisticRegression(solver='liblinear', tol=1e-4, random_state=1)

# 10-fold cross validation on the KNN-imputed features; the mean of the fold
# scores is reported.
# NOTE(review): the imputer was fitted on the full training set before the
# folds were formed, so each validation fold influenced its own imputation.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy:', round(cv.mean(), 4))

Cross Validation Accuracy: 0.8047
