In [1]:
import re
import json
import numpy as np
import pandas as pd

from sklearn. model_selection import cross_val_score
from sklearn.impute import KNNImputer , SimpleImputer

In [2]:
# Reading the dataset
Train = pd.read_csv('./dataset/train.csv', index_col=0)
Test  = pd.read_csv('./dataset/test.csv',  index_col=0)

In [3]:
Train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Pre-Processing

In [4]:
features = ['Ticket', 'Cabin']
Train = Train.drop(features, axis=1)
Test  = Test.drop(features, axis=1)
Test_id = Test.index

In [5]:
Train['Name'] = Train.Name.map(lambda x:re.findall('([A-Za-z]+\.)' ,x)[0])
Test['Name']  = Test.Name.map(lambda x:re.findall('([A-Za-z]+\.)' ,x)[0])

In [6]:
def group_titles(titles):
    for i, each in enumerate(titles):
        if  any(each == ele for ele in ['Mr.', 'Miss.', 'Mrs.', 'Master.']):
            continue
        elif  any(each == ele for ele in ['Sir.', 'Ms.', 'Mme.', 'Mlle.', 'Lady.', 'Countess.']):
            titles[i] = 'grp1'
        else:
            titles[i] = 'grp2'

In [7]:
group_titles(Train.Name.values)
group_titles(Test.Name.values)

In [8]:
for attr in ['Age']: #fillna for real valued features with mean
    fill = Train[attr].mean()
    Train[attr].fillna(fill, inplace=True)
    Test[attr].fillna(fill, inplace=True)
    
# as Fare has skewed distribution using median as central tendancy
for attr in ['Fare']: #fillna for real valued features with median
    fill = Train[attr].median()
    Train[attr].fillna(fill, inplace=True)
    Test[attr].fillna(fill, inplace=True)
    
for attr in ['Embarked']: #fillna for categorical features with mode
    fill = Train[attr].mode()[0]
    Train[attr].fillna(fill, inplace=True)
    Test[attr].fillna(fill, inplace=True)   

In [9]:
train = pd.get_dummies(Train)
test  = pd.get_dummies(Test)

In [10]:
# splitting into features (xTrain) and labels (yTrain)
xTrain = train.drop('Survived', axis=1)
yTrain = train['Survived']
xTest  = test

In [11]:
from sklearn.preprocessing import StandardScaler
scaller = StandardScaler()
scaller.fit(xTrain[['Age', 'Fare']])

StandardScaler()

In [12]:
xTrain[['Age', 'Fare']] = scaller.transform(xTrain[['Age', 'Fare']])
xTest[['Age', 'Fare']]  = scaller.transform(xTest[['Age', 'Fare']])

In [13]:
xTrain.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Name_Master.,Name_Miss.,Name_Mr.,Name_Mrs.,Name_grp1,Name_grp2,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,3,-0.592481,1,0,-0.502445,0,0,1,0,0,0,0,1,0,0,1
2,1,0.638789,1,0,0.786845,0,0,0,1,0,0,1,0,1,0,0
3,3,-0.284663,0,0,-0.488854,0,1,0,0,0,0,1,0,0,0,1
4,1,0.407926,1,0,0.42073,0,0,0,1,0,0,1,0,0,0,1
5,3,0.407926,0,0,-0.486337,0,0,1,0,0,0,0,1,0,0,1


# Different Classifiers and Parameter Tuning 

In [14]:
def parameterTune(estimator, param_grid):
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(
            estimator  = estimator,
            param_grid = param_grid,
            n_jobs     = 11,
            cv         = 5,

    )
    grid.fit(xTrain, yTrain)
    
    return grid.best_score_, grid.best_params_

In [15]:
# function to generate submission file
def test_eval(estimator, params):
    clf = estimator(**params)
    clf.fit(xTrain, yTrain)
    yPred = clf.predict(xTest)
    
    df = pd.DataFrame({'PassengerId':Test_id, 'Survived':yPred})
    return df

## Gaussian Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
estimator = GaussianNB()

param_grid = {}

gnb_best_score_, gnb_best_params_ = parameterTune(estimator, param_grid)
gnb_df = test_eval(GaussianNB, gnb_best_params_)

In [17]:
print('best_score_:',gnb_best_score_,'\nbest_params_:',gnb_best_params_)

best_score_: 0.7677044755508129 
best_params_: {}


## Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression(tol=1e-4, solver='liblinear', random_state=1)

param_grid = {
    'max_iter' : [1000, 2000, 3000],
    'penalty'  : ['l1', 'l2'],
    'solver'   : ['liblinear']
}

lrc_best_score_, lrc_best_params_ = parameterTune(estimator, param_grid)
lrc_df = test_eval(LogisticRegression, lrc_best_params_)

In [19]:
print('best_score_:',lrc_best_score_,'\nbest_params_:',lrc_best_params_)

best_score_: 0.8260247316552632 
best_params_: {'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}


## KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier
estimator = KNeighborsClassifier()

param_grid = {
    'n_neighbors' : [3, 5, 7, 10],
    'weights'     : ['uniform', 'distance'],
    'p'           : [1, 2]
}

knn_best_score_, knn_best_params_ = parameterTune(estimator, param_grid)
knn_df = test_eval(KNeighborsClassifier, knn_best_params_)

In [21]:
print('best_score_:',knn_best_score_,'\nbest_params_:',knn_best_params_)

best_score_: 0.8282907538760906 
best_params_: {'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}


## Support Vector Classifier

In [22]:
# instantiating Support Vector Classifier
from sklearn.svm import SVC
estimator = SVC()

param_grid = [
    {   'kernel' : ['linear'],
        'C'      : [0.1, 1, 10, 100]},
    
    {   'kernel' : ['rbf'],
        'C'      : [0.1, 1, 10, 100],
        'gamma'  : ['scale', 'auto'],},
]

svc_best_score_, svc_best_params_ = parameterTune(estimator, param_grid)
svc_df = test_eval(SVC, svc_best_params_)

In [23]:
print('best_score_:',svc_best_score_,'\nbest_params_:',svc_best_params_)

best_score_: 0.8338773460548616 
best_params_: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


## Random Forest Classifier

In [24]:
# instantiating RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier()

param_grid = {
    'n_estimators' : [50, 100, 250, 500, 750, 1000],
    'criterion'    : ["gini", "entropy"],
    'max_features' : ["auto", 2, 5, 7, 10],
}

rfc_best_score_, rfc_best_params_ = parameterTune(estimator, param_grid)
rfc_df = test_eval(RandomForestClassifier, rfc_best_params_)

In [25]:
print('best_score_:',rfc_best_score_,'\nbest_params_:',rfc_best_params_)

best_score_: 0.8226790534178645 
best_params_: {'criterion': 'gini', 'max_features': 10, 'n_estimators': 250}


# Submission File

In [26]:
pd.DataFrame({
    'GaussianNB'      : gnb_best_score_,
    'LogisticRegression'      : lrc_best_score_,
    'KNeighborsClassifier'    : knn_best_score_,
    'SVC' : svc_best_score_,
    'RandomForestClassifier'  : rfc_best_score_
}, index=['Accuracy'])

Unnamed: 0,GaussianNB,LogisticRegression,KNeighborsClassifier,SVC,RandomForestClassifier
Accuracy,0.767704,0.826025,0.828291,0.833877,0.822679


In [27]:
best_params = {
    'GaussianNB'              : gnb_best_params_,
    'LogisticRegression'      : lrc_best_params_,
    'KNeighborsClassifier'    : knn_best_params_,
    'SVC'                     : svc_best_params_,
    'RandomForestClassifier'  : rfc_best_params_
}

with open("./results/04_.json", 'w') as file:
    json.dump(best_params, file)

In [28]:
svc_df.to_csv('./results/04_01_svc.csv', index=None) #0.77990
rfc_df.to_csv('./results/04_02_rfc.csv', index=None) #0.74401