# Predict whether a passenger survives the Titanic disaster

* ### Import all required modules

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import coremltools

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

* ### Set the random seed and examine the data

In [2]:
# Shared seed, passed as `random_state` to the tree, the train/test split and KFold below.
random_state = 42

In [3]:
# Load the training set; the first row of the CSV is used as the column names.
data = pd.read_csv('train.csv', header = 0)

In [4]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


* ### Let's process the data for the prediction algorithm

* **Delete useless columns and add new columns**

In [31]:
# Columns that are excluded from the feature set.
useless_labels = ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Fare']

# `drop` returns a new frame, so the raw `data` is left untouched.
data_without_useless_labels = data.drop(columns=useless_labels)

In [32]:
# Merge the two relative-count columns into a single 'Alliance' feature
# (SibSp + Parch) and remove the originals. 'Alliance' ends up as the last
# column, exactly as when it is appended before the drop.
family_size = data_without_useless_labels['SibSp'] + data_without_useless_labels['Parch']
data_with_new_columns = (
    data_without_useless_labels
    .drop(columns=['SibSp', 'Parch'])
    .assign(Alliance=family_size)
)

* **Fill `NaN` in the dataset**

In [33]:
# Work on a copy so the frame from the previous step is left untouched.
data_without_nan = data_with_new_columns.copy()
# Count the missing values in each column.
data_without_nan.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Embarked      2
Alliance      0
dtype: int64

In [34]:
# Fill missing values in every numeric column with that column's mean.
#
# The filled column is assigned back to the frame instead of calling
# `data_without_nan[col].fillna(..., inplace=True)`: an in-place fillna on a
# column selection is a chained operation that emits a FutureWarning in recent
# pandas and does not update the parent frame under Copy-on-Write, which would
# silently leave the NaNs in place.
numbers_type = data_without_nan.select_dtypes(include=[np.number]).isna().any()
numbers_type = numbers_type[numbers_type]  # keep only the columns that contain NaN
for index in numbers_type.index:
    data_without_nan[index] = data_without_nan[index].fillna(data_without_nan[index].mean())

In [35]:
# Fill missing values in every object (string) column with that column's most
# frequent value.
#
# `np.object` was a deprecated alias of the builtin `object` and has been
# removed from NumPy (AttributeError on current releases), so the builtin is
# used directly. As with the numeric columns, the filled column is assigned
# back rather than relying on a chained `fillna(inplace=True)`, which does not
# update the parent frame under pandas Copy-on-Write.
object_type = data_without_nan.select_dtypes(include=[object]).isna().any()
object_type = object_type[object_type]  # keep only the columns that contain NaN
for index in object_type.index:
    data_without_nan[index] = data_without_nan[index].fillna(data_without_nan[index].mode()[0])

In [36]:
# Confirm that no column still contains missing values.
data_without_nan.isna().any()

Survived    False
Pclass      False
Sex         False
Age         False
Embarked    False
Alliance    False
dtype: bool

* **Encode categorical features**

In [37]:
# Copy of the NaN-free frame; the categorical columns are encoded on this copy.
data_without_cat_columns = data_without_nan.copy()

In [38]:
# Integer codes for the two string-valued columns.
embarked_codes = {'S': 3, 'C': 2, 'Q': 1}
sex_codes = {'male': 1, 'female': 2}
encoded_columns = {'Embarked': embarked_codes, 'Sex': sex_codes}

# Per-column replacement: each inner mapping is applied only to its own column.
data_without_cat_columns.replace(to_replace=encoded_columns, inplace=True)

* **Normalize data**

In [39]:
# Copy of the encoded frame; 'Age' is scaled on this copy.
normalize_data = data_without_cat_columns.copy()

In [40]:
# Standardize 'Age' only; the other columns keep their integer codes.
# NOTE(review): the scaler is fit on the full frame, before the train/test
# split performed later in the notebook.
scaler = StandardScaler()
age_scaled = scaler.fit_transform(normalize_data.loc[:, ['Age']])
normalize_data.loc[:, ['Age']] = age_scaled

* ### Let's see the importance of the selected features in the classification

In [51]:
# Features whose importance is estimated with a single decision tree.
labels = ['Pclass', 'Age', 'Sex', 'Embarked', 'Alliance']

X_for_tree = normalize_data.loc[:, labels]
# Target taken from the raw frame, selected by the index labels of the feature rows.
y_for_tree = data['Survived']
y_for_tree = y_for_tree[X_for_tree.index.values]

dec_tree_clf = DecisionTreeClassifier(random_state=random_state)
dec_tree_clf.fit(np.array(X_for_tree), np.array(y_for_tree))

# Label each importance with its feature name and show the largest first.
importances = dec_tree_clf.feature_importances_
importances_data = pd.Series(data=importances, index=labels)
importances_data.sort_values(ascending=False)

Sex         0.365439
Age         0.324613
Alliance    0.158856
Pclass      0.130904
Embarked    0.020187
dtype: float64

* ### Split data into train and test

In [42]:
# Target column and feature matrix (every column except the target).
y = normalize_data['Survived']
X = normalize_data.drop(columns='Survived')

# Hold out 25% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=random_state,
)

* ### Perform cross-validation to tune the model (SVC)

In [43]:
# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# Hyper-parameter grid explored for the SVC.
params = {
    'C': np.arange(1, 10),
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'class_weight': ['balanced', None],
}
grid_search = GridSearchCV(estimator=SVC(), param_grid=params, cv=kf, scoring='accuracy')

In [44]:
# Run the search on the training split only, then pull out the winning configuration.
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_

In [45]:
# Report the cross-validated score together with the selected parameters and estimator.
for template, value in (
    ("Best score: {}", best_score),
    ("Best parameters: {}", best_params),
    ("Best model: {}", best_model),
):
    print(template.format(value))

Best score: 0.8293413173652695
Best parameters: {'C': 2, 'class_weight': None, 'kernel': 'poly'}
Best model: SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


* ### Let's see the quality of the model

In [46]:
def get_scores(estimator, X, y=y_test):
    """Score a fitted binary classifier on a labelled data set.

    Parameters
    ----------
    estimator : object
        Fitted estimator exposing ``predict``.
    X : array-like
        Feature rows passed to ``estimator.predict``.
    y : array-like, optional
        True labels for the rows of ``X``. Defaults to the ``y_test`` object
        that exists when this cell is executed (the default is bound once, at
        definition time).

    Returns
    -------
    results : pandas.DataFrame
        One row with the columns ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``AUC-ROC``.
    conf_matrix : pandas.DataFrame
        2x2 confusion matrix, rows = actual class, columns = predicted class.

    Notes
    -----
    ``AUC-ROC`` is computed from the hard class predictions returned by
    ``predict``, not from decision scores or probabilities.
    """
    y_pred = estimator.predict(X)

    # Score against the `y` argument. Previously the global `y_test` was used
    # here, so the parameter was ignored and scoring any other data set
    # produced wrong numbers or a length-mismatch error.
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred)
    # Pin the class order so the matrix is always 2x2 and matches the
    # hard-coded row/column names below, even if a class is absent from `y`.
    conf_matrix = confusion_matrix(y, y_pred, labels=[0, 1])

    columns = ['accuracy', 'precision', 'recall', 'f1', 'AUC-ROC']
    results = pd.DataFrame([accuracy, precision, recall, f1, roc_auc], index=columns).T
    conf_matrix = pd.DataFrame(conf_matrix, columns=['Predicted: 0', 'Predicted: 1'], index=['Actual: 0', 'Actual: 1'])

    return results, conf_matrix

In [47]:
# `best_model` is the estimator selected by the grid search; it is fit on the
# training split here and then scored on the held-out test split.
best_model.fit(X_train, y_train)
results, conf_matrix = get_scores(estimator=best_model, X=X_test)
results

Unnamed: 0,accuracy,precision,recall,f1,AUC-ROC
0,0.820628,0.802469,0.730337,0.764706,0.805467


In [48]:
# Confusion matrix on the test split (rows: actual class, columns: predicted class).
conf_matrix

Unnamed: 0,Predicted: 0,Predicted: 1
Actual: 0,118,16
Actual: 1,24,65


* ### Convert the model into MLModel format

In [49]:
# Input names for the exported model, listed in the same order as the columns
# of the training frame (Pclass, Sex, Age, Embarked, Alliance).
# NOTE(review): presumably the converter maps these names to the feature
# columns by position - confirm against the coremltools documentation.
input_features = [
    'Pclass',
    'Sex',
    'Age',
    'Embarked',
    'Alliance'
]
coreml_model = coremltools.converters.sklearn.convert(best_model,
                                                     input_features=input_features,
                                                     output_feature_names ='Survived')

coreml_model.author = 'Alex Milogradsky'
coreml_model.short_description = 'Simple SVC to predict whether you fortune in a Titanic Disaster'

# The SVC was trained on preprocessed values: 'Sex' and 'Embarked' are integer
# codes and 'Age' is standardized with `scaler`. Only the SVC is exported, so
# the descriptions state the encoding the caller has to apply; feeding raw
# values would produce meaningless predictions.
age_mean = float(scaler.mean_[0])
age_std = float(scaler.scale_[0])
input_descriptions = {
    'Pclass': 'Passenger Ticket class',
    'Sex': 'Passenger Sex (1 = male, 2 = female)',
    'Age': 'Passenger Age, standardized as (age - {:.4f}) / {:.4f}'.format(age_mean, age_std),
    'Embarked': 'Port of Embarkation (3 = S, 2 = C, 1 = Q)',
    'Alliance': 'Number of family members on the ship',
}
for feature_name, description in input_descriptions.items():
    coreml_model.input_description[feature_name] = description

coreml_model.output_description['Survived'] = 'Survived: yes or no'
coreml_model.save('TitanicFortune2.mlmodel')