### Libraries Used
* Pandas, Numpy - Data Loading / Transformation / Analysis
* Sklearn - ML Algorithms / Preprocessing ( PCA, TfidF Vectorizer )

In [1]:
# Import all dependencies required for the problem.
from __future__ import print_function
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [3]:
# Set a Seed for random number generation for reproducible results
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [4]:
# Load the titanic dataset using Pandas library
df = pd.read_excel('../../data/titanic_dataset.xlsx').dropna(subset=['Age'])

In [5]:
# Preview the Titanic Dataset
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [6]:
# Split the dataset into dependent features (passenger details used for prediction)
# and target features (prediction if the passenger survived)
x = df.loc[:,:'Embarked']
y = df['Survived']

In [7]:
# Convert categorical data (strings) to numerical for running ML Algorithms
x['Sex'] = x['Sex'].map(lambda x: 0 if x == 'male' else 1)

In [10]:
# x.Embarked = x.Embarked.map({'S': 1, 'Q': 2, 'N': 3}).fillna(4)

In [11]:
# Split the dataset into train and test, for learning from one dataset and test it on the other.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [175]:
# Filter only required columns for training
X_train_scaled = preprocessing.scale(X_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']])
X_test_scaled = preprocessing.scale(X_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']])

In [179]:
# Create a Decision Tree, with a max depth of 3 levels.
clf = tree.DecisionTreeClassifier(random_state=42 max_depth=7, min_samples_leaf=4)

In [180]:
# Train the Decision Tree classifier with the training dataset
clf.fit(X_train_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=49, splitter='best')

In [181]:
from sklearn.metrics import accuracy_score
print("Accuracy of Decision Tree: {:.2f}".format(
    accuracy_score(y_test, clf.predict(X_test_scaled)) * 100.0
))
print("Accuracy of Smart Classifier: {:.2f}".format(
    accuracy_score(y_test, [z[1] >
                            
                            0 for z in X_test_scaled]) * 100.0
))

Accuracy of Decision Tree: 81.82
Accuracy of Smart Classifier: 73.43


In [182]:
# Plot the features the decision tree found important
pd.DataFrame(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], clf.feature_importances_)

Unnamed: 0,0
0.165183,Pclass
0.517587,Sex
0.172668,Age
0.035574,SibSp
0.019375,Parch
0.089613,Fare


In [127]:
# Export the Tree, to manually explore its decision criterias
tree.export_graphviz(clf, 'tree.dot')

In [92]:
# MultiLevel Survival Rate
df.pivot_table(index=['Sex', 'Pclass'], values=['Survived'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Sex,Pclass,Unnamed: 2_level_1
female,1,0.964706
female,2,0.918919
female,3,0.460784
male,1,0.39604
male,2,0.151515
male,3,0.150198
