In [1]:
import os 
import time
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import norm  

%matplotlib inline


In [2]:
# Load the Titanic training set from CSV and preview its first rows.
data = pd.read_csv('../data/external/train.csv', sep=',')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


VARIABLE DESCRIPTIONS:
* survival  :      Survival
                (0 = No; 1 = Yes)
* pclass    :      Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
* name      :      Name
* sex       :      Sex
* age       :      Age
* sibsp     :      Number of Siblings/Spouses Aboard
* parch     :      Number of Parents/Children Aboard
* ticket    :      Ticket Number
* fare      :      Passenger Fare
* cabin     :      Cabin
* embarked  :      Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)

In [3]:
 # transform data
# Encode the categorical columns Sex and Embarked as integer codes.
# NOTE: the '' keys never match anything here: read_csv parses empty CSV fields
# as NaN, so missing values (Embarked contains 'S', 'C', 'Q' and NaN) stay NaN.
dict_gender = {'female':0, 'male':1, '':2}
dict_embarked = {'S':0, 'C':1,'Q':2,'':3}

# Assign the encoded column back to the frame instead of calling
# replace(..., inplace=True) on a selected column: that chained in-place update
# is deprecated and does not modify `data` under pandas Copy-on-Write.
# dict.get(value, value) leaves unmapped values (NaN, already-encoded numbers)
# untouched, so re-running this cell is harmless.
data['Sex'] = data['Sex'].map(lambda value: dict_gender.get(value, value))
data['Embarked'] = data['Embarked'].map(lambda value: dict_embarked.get(value, value))

In [4]:
# Choose features based on their correlation with the target (y=Survived).
# Only numeric columns are correlated: the frame still contains text columns
# (Name, Ticket, Cabin), and DataFrame.corr() raises on those since pandas 2.0.
data.select_dtypes(include='number').corr() # 'Pclass','Sex','Fare' > |0.20|


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.036847,-0.057527,-0.001652,0.012658,-0.030555
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,0.108669
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.043835
Sex,0.042939,-0.543351,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,-0.118593
Age,0.036847,-0.077221,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.012186
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.060606
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.07932
Fare,0.012658,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,0.063462
Embarked,-0.030555,0.108669,0.043835,-0.118593,0.012186,-0.060606,-0.07932,0.063462,1.0


In [5]:
# Split the frame into the model inputs (X_train) and the target column (Y_train).
X_train = data.loc[:, ['Pclass', 'Sex', 'Fare']]
Y_train = data.loc[:, ['Survived']]

In [6]:
from sklearn import tree

# Decision tree limited to depth 6 and at most 20 leaves.
# random_state is fixed so that fitting gives the same tree on every run;
# without it, ties between equally good splits are broken randomly.
clf = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=20, random_state=0)


In [7]:
# Fit the classifier on plain NumPy copies of the training frames.
X, y = np.array(X_train), np.array(Y_train)
clf = clf.fit(X, y)

In [8]:
# Write the fitted tree to disk in Graphviz DOT format.
# The return value of export_graphviz is deliberately not bound to a name:
# the original rebound the file handle to it inside the `with` block, and it
# is None when out_file is given.
with open("../models/decisionTree_titanic.dot", "w") as dot_file:
    tree.export_graphviz(
        clf,
        out_file=dot_file,
        feature_names=X_train.columns.tolist(),
    )


In [9]:
# Export tree model


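##  Rendering the .dot file to PNG fails because the Graphviz `dot` executable is not found:
##  the pydot / graphviz Python packages are only wrappers, so the Graphviz binaries
##  must also be installed on the system and be on PATH.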
# import pydot
# (graph,) = pydot.graph_from_dot_file('../models/decisionTree_titanic.dot')
# graph.write_png('../reports/decisionTree.png')


# from subprocess import check_call
# check_call(['dot','-Tpng','../models/decisionTree_titanic.dot','-o','../reports/decisionTree.png'])

In [10]:
# Load the test set from CSV and preview its first rows.
test_data = pd.read_csv('../data/external/test.csv', sep=',')
test_data.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
 # transform test data
# Encode Sex and Embarked with the same mappings used for the training data.
# The encoded column is assigned back to the frame instead of calling
# replace(..., inplace=True) on a selected column: that chained in-place update
# is deprecated and does not modify `test_data` under pandas Copy-on-Write.
# dict.get(value, value) leaves unmapped values untouched, so re-running this
# cell is harmless.
test_data['Sex'] = test_data['Sex'].map(lambda value: dict_gender.get(value, value))
test_data['Embarked'] = test_data['Embarked'].map(lambda value: dict_embarked.get(value, value))

# Sanity check: both columns should now contain only the integer codes.
print(test_data.Sex.value_counts())
print(test_data.Embarked.value_counts())

1    266
0    152
Name: Sex, dtype: int64
0    270
1    102
2     46
Name: Embarked, dtype: int64


In [12]:
# Handle NA values
# One test row has no Fare: index 152, PassengerId 1044 (Storey, Mr. Thomas).
# To inspect it, run: test_data[test_data['Fare'].isnull()]
# The filled column is assigned back rather than using fillna(inplace=True) on a
# selected column, which does not modify the frame under pandas Copy-on-Write.
test_data['Fare'] = test_data['Fare'].fillna(0.0) # very rudimentary fix. just set NaN to 0.0

In [13]:
# Test matrix: PassengerId in the first column, followed by the model features.
X_test = np.array(test_data.loc[:, ['PassengerId', 'Pclass', 'Sex', 'Fare']])


In [14]:
# Predict from every column except the leading PassengerId.
prediction = clf.predict(X_test[:, 1:])

In [15]:

# Pair each PassengerId (first column of X_test) with its predicted label,
# cast both to int, and write the two-column table to CSV.
id_and_label = np.column_stack((X_test[:, 0], prediction)).astype(int)
result = pd.DataFrame(id_and_label, columns=["PassengerId", "Survived"])

result.to_csv(path_or_buf="../reports/result_test_1.csv", sep=",", quotechar="\"", index=False)