In [1]:
import numpy as np
import pandas as pd
from sklearn. model_selection import cross_val_score
from sklearn.impute import KNNImputer

In [2]:
# Load the train and test splits; the first CSV column is used as the row index.
Train, Test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

In [3]:
# 'Name' and 'Ticket' hold passenger-specific values and need feature
# engineering before they can be utilised; 'Cabin' has a lot of missing values
# and needs special attention as well.
# All three are therefore dropped for the initial models.
features = ['Name', 'Ticket', 'Cabin']
Train, Test = (frame.drop(columns=features) for frame in (Train, Test))

# Keep the test index: it is reused as the 'PassengerId' column of the
# submission tables.
Test_id = Test.index

In [4]:
# Integer-encode the categorical columns. Codes follow the order in which each
# value first appears in Train, and the same mapping is then applied to Test.
# NOTE(review): unique() also returns NaN when the Train column has missing
# entries, so NaN becomes a key in the mapping — confirm that is intended given
# the KNN imputation that runs later.
catg_map = {}
for catg in ('Sex', 'Embarked'):
    unq = Train[catg].unique()
    catg_map[catg] = dict(zip(unq, range(len(unq))))

    Train[catg], Test[catg] = (
        frame[catg].map(catg_map[catg]) for frame in (Train, Test)
    )

In [5]:
# Independent working copies of the encoded train and test frames.
train, test = Train.copy(), Test.copy()

In [6]:
# Separate the label column (yTrain) from the remaining feature columns
# (xTrain); the test frame is used as-is.
yTrain = train.loc[:, 'Survived']
xTrain = train.drop(columns=['Survived'])
xTest = test

In [7]:
# Fill missing values with a KNN imputer (default settings). It is fitted on
# the training features only and then applied to both splits.
impute = KNNImputer().fit(xTrain)

xTrain, xTest = impute.transform(xTrain), impute.transform(xTest)

## Base Model

In [8]:
# Baseline classifier: logistic regression with the liblinear solver.
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression(solver='liblinear', tol=1e-4, random_state=1)

# 10-fold cross-validation; the mean fold score is the baseline estimate.
cv = cross_val_score(estimator, X=xTrain, y=yTrain, cv=10)
print('Cross Validation Accuracy for the Base Model:', round(cv.mean(), 4))

Cross Validation Accuracy for the Base Model: 0.8025


# Different Classifiers and Parameter Tuning 

In [9]:
def parameterTune(estimator, param_grid, cv=5, n_jobs=11, X=None, y=None):
    """Grid-search ``param_grid`` for ``estimator`` with cross-validation.

    Parameters
    ----------
    estimator : estimator instance
        Model passed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Search space passed to ``GridSearchCV``.
    cv : int, default 5
        Value passed to ``GridSearchCV`` as ``cv``.
    n_jobs : int, default 11
        Value passed to ``GridSearchCV`` as ``n_jobs``.
    X, y : optional
        Features and labels to search on. When omitted, the notebook-level
        ``xTrain`` / ``yTrain`` are used.

    Returns
    -------
    tuple
        ``(best_score_, best_params_)`` of the fitted search.
    """
    from sklearn.model_selection import GridSearchCV

    # Fall back to the notebook's training data so existing two-argument
    # calls keep working unchanged.
    if X is None:
        X = xTrain
    if y is None:
        y = yTrain

    grid = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        n_jobs=n_jobs,
        cv=cv,
    )
    grid.fit(X, y)

    return grid.best_score_, grid.best_params_

In [10]:
# Helper that builds the submission table for one classifier.
def test_eval(estimator, params):
    """Fit ``estimator(**params)`` on the training data and predict the test set.

    Parameters
    ----------
    estimator : callable
        Classifier class (not an instance); it is called as
        ``estimator(**params)``.
    params : dict
        Keyword arguments for the classifier's constructor.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'PassengerId' (taken from ``Test_id``) and 'Survived'
        (the predictions for ``xTest``).
    """
    model = estimator(**params)
    model.fit(xTrain, yTrain)

    return pd.DataFrame({
        'PassengerId': Test_id,
        'Survived': model.predict(xTest),
    })

## Gaussian Naive Bayes

In [11]:
# Gaussian Naive Bayes, scored directly with its default settings.
from sklearn.naive_bayes import GaussianNB
estimator = GaussianNB()

# Mean 10-fold cross-validation score for this classifier. The printed label
# names the model being scored so it is not confused with the baseline.
cv = cross_val_score(estimator, xTrain, yTrain, cv=10)
print('Cross Validation Accuracy for Gaussian Naive Bayes:', round(np.mean(cv), 4))

Cross Validation Accuracy for the Base Model: 0.789


## KNN

In [12]:
# K-nearest-neighbours classifier and the neighbour counts to search over.
from sklearn.neighbors import KNeighborsClassifier

estimator = KNeighborsClassifier()
param_grid = dict(n_neighbors=[3, 5, 7, 10])

In [13]:
# Grid-search the KNN settings and report the best cross-validated result.
best_score_, best_params_ = parameterTune(
    estimator=estimator,
    param_grid=param_grid,
)
print('best_score_:', best_score_, '\nbest_params_:', best_params_)

best_score_: 0.7093716653066349 
best_params_: {'n_neighbors': 3}


## Support Vector Classifier

In [14]:
# Support Vector Classifier and its search space.
from sklearn.svm import SVC
estimator = SVC()

# One sub-grid per kernel: the linear kernel does not use 'gamma', so crossing
# the two would only refit identical linear models once per gamma value.
param_grid = [
    {
        'kernel' : ['linear'],
        'C'      : [1, 10, 100, 1000],
    },
    {
        'kernel' : ['rbf'],
        'C'      : [1, 10, 100, 1000],
        'gamma'  : ['scale', 0.1, 0.01, 1e-3, 1e-4],
    },
]

In [15]:
# Tune the SVC, then refit it with the winning parameters to build its
# submission table.
best_score_, best_params_ = parameterTune(
    estimator=estimator,
    param_grid=param_grid,
)
svc_df = test_eval(estimator=SVC, params=best_params_)

print('best_score_:', best_score_, '\nbest_params_:', best_params_)

best_score_: 0.8069675475488042 
best_params_: {'C': 1000, 'gamma': 'scale', 'kernel': 'rbf'}


## Random Forest Classifier

In [16]:
# Random forest classifier and its search space.
from sklearn.ensemble import RandomForestClassifier

# Seeded so that repeated runs of the grid search score the same forests
# (same seed value as the logistic-regression baseline).
estimator = RandomForestClassifier(random_state=1)

param_grid = {
    'n_estimators' : [50, 100, 250, 500, 750],
    'criterion'    : ["gini", "entropy"],
    'max_features' : [2,3,4,5,6],
}

In [17]:
# Tune the random forest, then refit it with the winning parameters to build
# its submission table.
best_score_, best_params_ = parameterTune(estimator, param_grid)

# The refit builds a brand-new forest from the parameters alone, so it is
# seeded explicitly; otherwise the submitted predictions change between runs.
rfc_df = test_eval(RandomForestClassifier, {**best_params_, 'random_state': 1})

print('best_score_:',best_score_,'\nbest_params_:',best_params_)

best_score_: 0.8283284162952734 
best_params_: {'criterion': 'gini', 'max_features': 5, 'n_estimators': 100}


# Submission File

In [18]:
# Write one submission CSV per tuned model.
import os

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs('./results', exist_ok=True)

# index=False: only the 'PassengerId' and 'Survived' columns are written.
svc_df.to_csv('./results/01_svc.csv', index=False)
rfc_df.to_csv('./results/02_rfc.csv', index=False)