In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn. model_selection import cross_val_score
from sklearn.impute import KNNImputer

In [2]:
# Reading the dataset
Train = pd.read_csv('./dataset/train.csv', index_col=0)
Test  = pd.read_csv('./dataset/test.csv',  index_col=0)

In [3]:
# Features 'Name' and 'Ticket' have specific values for each example(passenger)
# We will need to do some feature engineering to utlise them. Also 'Cabin' has
# lot of missing values and will need special attension as well.
# So, Dropping these features for the initial models
features = ['Name', 'Ticket', 'Cabin']
Train = Train.drop(features, axis=1)
Test  = Test.drop(features, axis=1)

In [4]:
#Encoding categorical features values to integers
catg_map = {}
for catg in ['Sex', 'Embarked']:
    unq = Train[catg].unique()
    catg_map[catg] = {key:val for val, key in enumerate(unq)}

    Train[catg] = Train[catg].map(catg_map[catg])
    Test[catg]  = Test[catg].map(catg_map[catg])

# Base Model

In [5]:
# dropping the examples with missing values for any of the features
train = Train.dropna()
test  = Test.copy()

In [6]:
# splitting into features (xTrain) and labels (yTrain)
xTrain = train.drop('Survived', axis=1)
yTrain = train['Survived']
xTest  = test

In [7]:
# instantiating RandomForestClassifier
estimator = LogisticRegression(tol=1e-4, solver='liblinear', random_state=1)

# Computing the cross validation accuracy as base model performance estimate.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy for the Base Model:', round(np.mean(cv), 4))

Cross Validation Accuracy for the Base Model: 0.7871


# Filling missing values using mean, median & mode

In [8]:
# creating a copy of test and train
train = Train.copy()
test  = Test.copy()

In [9]:
print('Missing values in Train data:')
print(train.isna().sum())
print('\nMissing values in Train data:')
print(test.isna().sum())

Missing values in Train data:
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

Missing values in Train data:
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [10]:
# splitting into features (xTrain) and labels (yTrain)
xTrain = train.drop('Survived', axis=1)
yTrain = train['Survived']
xTest  = test

In [11]:
for attr in ['Age']: #fillna for real valued features with mean
    fill = xTrain[attr].mean()
    xTrain[attr].fillna(fill, inplace=True)
    xTest[attr].fillna(fill, inplace=True)

In [12]:
# as Fare has skewed distribution using median as central tendancy
for attr in ['Fare']: #fillna for real valued features with median
    fill = xTrain[attr].median()
    xTrain[attr].fillna(fill, inplace=True)
    xTest[attr].fillna(fill, inplace=True)

In [13]:
for attr in ['Embarked']: #fillna for categorical features with mode
    fill = xTrain[attr].mode()[0]
    xTrain[attr].fillna(fill, inplace=True)
    xTest[attr].fillna(fill, inplace=True)   

In [14]:
# instantiating RandomForestClassifier
estimator = LogisticRegression(tol=1e-4, solver='liblinear', random_state=1)

# Computing the cross validation accuracy as base model performance estimate.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy:', round(np.mean(cv), 4))

Cross Validation Accuracy: 0.7958


## KNN Imputer

In [15]:
# creating a copy of test and train
train = Train.copy()
test  = Test.copy()

In [16]:
# splitting into features (xTrain) and labels (yTrain)
xTrain = train.drop('Survived', axis=1)
yTrain = train['Survived']
xTest  = test

In [17]:
impute = KNNImputer()
impute.fit(xTrain)

xTrain = impute.transform(xTrain)
xTest  = impute.transform(xTest)

In [18]:
# instantiating RandomForestClassifier
estimator = LogisticRegression(tol=1e-4, solver='liblinear', random_state=1)

# Computing the cross validation accuracy as base model performance estimate.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy:', round(np.mean(cv), 4))

Cross Validation Accuracy: 0.8025
