### Libraries Used
* Pandas, Numpy - Data Loading / Transformation / Analysis
* Sklearn - ML Algorithms / Preprocessing ( PCA, TfidF Vectorizer )

In [14]:
# Import all dependencies required for the problem.
from __future__ import print_function
import numpy as np
import pandas as pd

from sklearn import ensemble
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [15]:
# Set a Seed for random number generation for reproducible results
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [16]:
# Load the titanic dataset using Pandas library
df = pd.read_excel('../../data/titanic_dataset.xlsx').dropna(subset=['Age'])

In [17]:
# Preview the Titanic Dataset
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [18]:
# Split the dataset into dependent features (passenger details used for prediction)
# and target features (prediction if the passenger survived)
x = df.loc[:,:'Embarked']
y = df['Survived']

In [19]:
# Convert categorical data (strings) to numerical for running ML Algorithms
x['Sex'] = x['Sex'].map(lambda x: 0 if x == 'male' else 1)

In [20]:
# x.Embarked = x.Embarked.map({'S': 1, 'Q': 2, 'N': 3}).fillna(4)

In [23]:
# Split the dataset into train and test, for learning from one dataset and test it on the other.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [202]:
# Create a Decision Tree, with a max depth of 3 levels.
clf = ensemble.RandomForestClassifier(random_state=42, n_estimators=115)

In [203]:
# Filter only required columns for training
X_train_scaled = preprocessing.scale(X_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']])
X_test_scaled = preprocessing.scale(X_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']])

# X_train_scaled = preprocessing.scale(X_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']])
# X_test_scaled = preprocessing.scale(X_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']])

In [204]:
# Train the Decision Tree classifier with the training dataset
clf = clf.fit(X_train_scaled, y_train)

In [205]:
from sklearn.metrics import accuracy_score
print("Accuracy of Decision Tree: {:.2f}".format(
    accuracy_score(y_test, clf.predict(X_test_scaled)) * 100.0
))
print("Accuracy of Smart Classifier: {:.2f}".format(
    accuracy_score(y_test, [z[1] > 0 for z in X_test_scaled]) * 100.0
))

Accuracy of Decision Tree: 82.52
Accuracy of Smart Classifier: 73.43
