In [104]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn import tree
import graphviz

from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

In [105]:
### Importing Data and Creating Dataframe
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')
# train_df.info()
# print(train_df.describe())
# train_df.head(8)
combine = [train_df, test_df]

In [106]:
###Checking for columns with null values in Training Data

### Checking for null values in features "Pclass"
# bool_series = pd.isnull(train_df["Pclass"]) 
# train_df[bool_series]

### Checking for null values in features "Sex"
# bool_series = pd.isnull(train_df["Sex"]) 
# train_df[bool_series] 

### Checking for null values in features "Age"
# bool_series = pd.isnull(train_df["Age"])     #has empty values
# train_df[bool_series] 

### Checking for null values in features "SibSp"
# bool_series = pd.isnull(train_df["SibSp"]) 
# train_df[bool_series] 

### Checking for null values in features "Parch"
# bool_series = pd.isnull(train_df["Parch"]) 
# train_df[bool_series] 

### Checking for null values in features "Fare"
# bool_series = pd.isnull(train_df["Fare"]) 
# train_df[bool_series] 

### Checking for null values in features "Embarked"
# bool_series = pd.isnull(train_df["Embarked"])     #has empty values
# train_df[bool_series]


In [107]:
###Checking for columns with null values in Testing Data

### Checking for null values in features "Pclass"
# bool_series = pd.isnull(test_df["Pclass"]) 
# test_df[bool_series]

### Checking for null values in features "Sex"
# bool_series = pd.isnull(test_df["Sex"]) 
# test_df[bool_series] 

### Checking for null values in features "Age"
# bool_series = pd.isnull(test_df["Age"])     #has empty values
# test_df[bool_series] 

### Checking for null values in features "SibSp"
# bool_series = pd.isnull(test_df["SibSp"]) 
# test_df[bool_series] 

### Checking for null values in features "Parch"
# bool_series = pd.isnull(test_df["Parch"]) 
# test_df[bool_series] 

### Checking for null values in features "Fare"
# bool_series = pd.isnull(test_df["Fare"])    #has empty values
# test_df[bool_series] 

### Checking for null values in features "Embarked"
# bool_series = pd.isnull(test_df["Embarked"])    
# test_df[bool_series]

In [108]:
### Converting Categorial Feature "Sex" to Numeric value [male-> 0, female -> 1]
gender_map = {'female':1, 'male':0}
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(gender_map).astype('int32')

print(dataset['Sex'])


0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Sex, Length: 418, dtype: int32


In [109]:
### Handling Feature "Age"

### Filling up missing values in "Age" using KNN
print(train_df['Age'].isnull().sum())
print(test_df['Age'].isnull().sum())

imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

for dataset in combine:    
    Age_val = dataset['Age'].values
    Age_val = Age_val.reshape(-1,1)
    imputer.fit(Age_val)
    Xtrans = imputer.transform(Age_val)
    dataset['Age'] = Xtrans
    
# print(train_df['Age'].isnull().sum())
# print(test_df['Age'].isnull().sum())

### Converting Continuous Feature "Age" to Ordinal Value

for dataset in combine:
    for index, row in dataset.iterrows():
        if(row['Age'] >= 0 and row['Age'] <= 12):
            dataset.loc[index, 'Age'] = 0
        elif(row['Age'] > 12 and row['Age'] <= 18):
            dataset.loc[index, 'Age'] = 1
        elif(row['Age'] > 18 and row['Age'] <= 24):
            dataset.loc[index, 'Age'] = 2
        elif(row['Age'] > 24 and row['Age'] <= 30):
            dataset.loc[index, 'Age'] = 3
        elif(row['Age'] > 30 and row['Age'] <= 40):
            dataset.loc[index, 'Age'] = 4
        elif(row['Age'] > 40 and row['Age'] <= 64):
            dataset.loc[index, 'Age'] = 5
        elif(row['Age'] > 64):
            dataset.loc[index, 'Age'] = 6
    dataset['Age'] = train_df['Age'].astype(int) 
       

177
86


In [110]:
### Handling Feature "Cabin"

deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "X": 8}
data = [train_df, test_df]

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("X0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)


In [111]:
###Handling feature "Embarked"

### Filling up missing values in "Embarked" using Highest Frquency
emb_high_freq_train = train_df.Embarked.dropna().mode()[0]
emb_high_freq_train_count = train_df["Embarked"].value_counts().nlargest(n=1).values[0]
emb_high_freq_test = test_df.Embarked.dropna().mode()[0]
emb_high_freq_test_count = test_df["Embarked"].value_counts().nlargest(n=1).values[0]

if emb_high_freq_train_count >= emb_high_freq_test_count:
    emb_high_freq = emb_high_freq_train
else:
    emb_high_freq = emb_high_freq_test
    
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(emb_high_freq)


### Converting Categorial Feature "Embarked" to Numeric value [S-> 0, C -> 1, Q -> 2] 

emb_map = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(emb_map).astype('int32')

In [112]:
###Handling feature "Fare"

### Filling up missing values in "Embarked" using Highest Frquency
fare_med = test_df.Fare.dropna().median()
dataset['Fare'] = dataset['Fare'].fillna(fare_med) 

### Converting Continuous Feature "Fare" to Ordinal Value

for dataset in combine:
    for index, row in dataset.iterrows():
        if(row['Fare'] > -0.001 and row['Fare'] <= 7.91):
            dataset.loc[index, 'Fare'] = 0
        elif(row['Fare'] > 7.91 and row['Fare'] <= 14.454):
            dataset.loc[index, 'Fare'] = 1
        elif(row['Fare'] > 14.454 and row['Fare'] <= 31):
            dataset.loc[index, 'Fare'] = 2
        elif(row['Fare'] > 31 and row['Fare'] <= 512.329):
            dataset.loc[index, 'Fare'] = 3
    
train_df['Fare'] = train_df['Fare'].astype(int)        

In [113]:
###Handling feature "SibSp" and "Parch"
for dataset in combine:
    dataset['Family'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['Family'] > 0, 'Alone'] = 0
    dataset.loc[dataset['Family'] == 0, 'Alone'] = 1
    dataset['Alone'] = dataset['Alone'].astype(int)

In [114]:
### Drop Columns 
train_df = train_df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['PassengerId','Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)

print(train_df.head())
print(test_df.head())
combine = [train_df, test_df]

   Survived  Pclass  Sex  Age  Fare  Embarked  Deck  Family  Alone
0         0       3    0    2     0         0     8       1      0
1         1       1    1    4     3         1     3       1      0
2         1       3    1    3     1         0     8       0      1
3         1       1    1    4     3         0     3       1      0
4         0       3    0    4     1         0     8       0      1
   Pclass  Sex  Age  Fare  Embarked  Deck  Family  Alone
0       3    0    2   0.0         2     8       0      1
1       3    1    4   0.0         0     8       1      0
2       2    0    3   1.0         2     8       0      1
3       3    0    4   1.0         0     8       0      1
4       3    1    4   1.0         0     8       2      0


In [115]:
### Hyperparameter tuning on decision tree

# X_train = train_df.drop("Survived", axis=1)
# Y_train = train_df["Survived"]
# X_test  = test_df.copy()

# decision_tree = DecisionTreeClassifier(criterion='gini')
# decision_tree.fit(X_train, Y_train)
# Y_pred = decision_tree.predict(X_test)
# acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
# print('DT', acc_decision_tree)

# print(decision_tree.get_params())

# param_dict = {
#     "criterion":['gini'],
#     "max_depth":range(1,50),
#     "min_samples_split":range(1,50),
#     "min_samples_leaf":range(1,50)
# }

# grid = GridSearchCV(decision_tree,
#                    param_grid = param_dict,
#                    cv=10,
#                    verbose=1,
#                    n_jobs=-1)

# grid.fit(X_train, Y_train)

# print(grid.best_params_)
# print(grid.best_estimator_)
# print(grid.best_score_)

In [116]:
### Decision Tree
# X_train = train_df.drop("Survived", axis=1)
# Y_train = train_df["Survived"]
# X_test  = test_df.copy()

# decision_tree = DecisionTreeClassifier(criterion='gini', max_depth = 15, min_samples_leaf = 1, min_samples_split = 2)
# decision_tree.fit(X_train, Y_train)
# Y_pred = decision_tree.predict(X_test)
# acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
# print('Decision Tree Accuracy:', acc_decision_tree)

# # print(decision_tree.get_params())

# ### Drawing Decision Tree
# feature_names = ['Pclass', 'Sex', 'Age', 'Fare', 'Family', 'Embarked', 'Deck', 'Alone']
# class_names = ['0', '1']
# dot_data = tree.export_graphviz(decision_tree, out_file=None,
#                                 feature_names=feature_names,
#                                 class_names=class_names,
#                                 filled=True)
# graph = graphviz.Source(dot_data, format="png")
# graph.render("Decision_tree_plot")

In [117]:
### 5-fold cross-validation on the decision tree
# scores = cross_val_score(decision_tree, X_train, Y_train, cv=5, scoring = "accuracy")
# print("Scores:", scores)
# print("Mean accuracy after 5-fold cross-validation:", scores.mean()*100)
# print("Standard Deviation:", scores.std()*100)

In [118]:
### Random forest
# random_forest = RandomForestClassifier(n_estimators=100)
# random_forest.fit(X_train, Y_train)

# Y_prediction = random_forest.predict(X_test)

# random_forest.score(X_train, Y_train)
# acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
# print('Random Forest Accuracy', acc_random_forest)

In [119]:
### 5-fold cross-validation on the random forest
# scores = cross_val_score(random_forest, X_train, Y_train, cv=5, scoring = "accuracy")
# print("Scores:", scores)
# print("Mean accuracy after 5-fold cross-validation:", scores.mean()*100)
# print("Standard Deviation:", scores.std()*100)

In [120]:
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

In [121]:
### SVM
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()
svc = svm.SVC()
svc.fit(X_train, Y_train)

scores = cross_validate(svc, X_train, Y_train, cv=5, scoring=scoring)

print('Mean Accuracy:', scores['test_accuracy'].mean() * 100)
print('Mean Precision', scores['test_precision'].mean() * 100)
print('Mean Recall', scores['test_recall'].mean() * 100)
print('Mean f1_score', scores['test_f1_score'].mean() * 100)

Mean Accuracy: 68.69248634737303
Mean Precision 67.90742484621836
Mean Recall 34.51832907075874
Mean f1_score 45.56274898483441


In [122]:
### SVM with linear kernel
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()
svc = svm.SVC(kernel = 'linear')
svc.fit(X_train, Y_train)

scores = cross_validate(svc, X_train, Y_train, cv=5, scoring=scoring)

print('Mean Accuracy:', scores['test_accuracy'].mean() * 100)
print('Mean Precision', scores['test_precision'].mean() * 100)
print('Mean Recall', scores['test_recall'].mean() * 100)
print('Mean f1_score', scores['test_f1_score'].mean() * 100)

Mean Accuracy: 79.1224656330425
Mean Precision 74.93187061947673
Mean Recall 68.41005967604433
Mean f1_score 71.48742726340538


In [123]:
### SVM with quadratic kernel
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()
svc = svm.SVC(kernel='poly', degree=2)
svc.fit(X_train, Y_train)

scores = cross_validate(svc, X_train, Y_train, cv=5, scoring=scoring)

print('Mean Accuracy:', scores['test_accuracy'].mean() * 100)
print('Mean Precision', scores['test_precision'].mean() * 100)
print('Mean Recall', scores['test_recall'].mean() * 100)
print('Mean f1_score', scores['test_f1_score'].mean() * 100)

Mean Accuracy: 61.95342414161069
Mean Precision 60.0
Mean Recall 0.8780903665814151
Mean f1_score 1.730848861283644


In [124]:
### SVM with RBF kernel
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()
svc = svm.SVC(kernel='rbf')
svc.fit(X_train, Y_train)

scores = cross_validate(svc, X_train, Y_train, cv=5, scoring=scoring)

print('Mean Accuracy:', scores['test_accuracy'].mean() * 100)
print('Mean Precision', scores['test_precision'].mean() * 100)
print('Mean Recall', scores['test_recall'].mean() * 100)
print('Mean f1_score', scores['test_f1_score'].mean() * 100)

Mean Accuracy: 68.69248634737303
Mean Precision 67.90742484621836
Mean Recall 34.51832907075874
Mean f1_score 45.56274898483441
