# Homework 2 - Using a Decision Tree Regressor to determine who survives the Titanic Disaster

In [75]:
# Copy-pasted cell to get everything set up nicely

# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

In [76]:
# import the datasets

df_train = pd.read_csv('datasets/titanic/train.csv')
df_test = pd.read_csv('datasets/titanic/test.csv')

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [77]:
# some setup that will be used for all attempts

from sklearn.tree import DecisionTreeRegressor

det_reg = DecisionTreeRegressor()

# prediction target: whether they survived
y = df_train.Survived

In [78]:
# describing the training data, to get a general idea of what's all there
df_train.describe()
# this function only covers numerical entries though

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Here are a couple of observations of the data:
1. about 38% of passengers survived.
2. over half of the passengers had 3rd class tickets.
3. the average age of the passengers was about 30. The youngest was 5 months, the eldest was 80.
4. A majority of passengers came with none of their siblings or spouses. A vast majority came with none of their parents or children.
5. about 75% of the passengers paid equal or less than the average fare, which means the price of high-end tickets were far enough above the regular ticket price to skew the data. At least one passenger didn't pay; they might have been the crew.

## First Attempt: decision tree where features are sex and ticket class

In [79]:
features = ['Sex','Pclass']
X = df_train[features]

# modify the gender data of the training set to be a binary number
X['Sex'] = [1 if x == 'female' else 0 for x in X['Sex']]

det_reg.fit(X,y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [80]:
# modify the gender data of the test set as well
df_test['Sex'] = [1 if x == 'female' else 0 for x in df_test['Sex']]

predictions = det_reg.predict(df_test[features])
predictions = np.around(predictions) # predictions give us a decimal, so we round it to make it consistent
df_test['Survived'] = predictions
df_test['Survived'] = df_test.Survived.apply(lambda x: int(x)) # convert the rounded numbers into ints

In [81]:
df_test[['PassengerId','Survived']].to_csv('datasets/titanic/class_gender_only.csv', index=False)
df_test[['PassengerId','Survived']].head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


### This attempt got a score of .75598, or about 75% correct--at the time of submission, this placed me at 8166th on the leaderboard.

## Second attempt: features are sex, ticket class, number of parents/children, whether or not they are a child, and whether or not they were crewmembers

In [82]:
# changing the training set's sex column to be a binary number for all future attempts
df_train['Sex'] = [1 if x == 'female' else 0 for x in df_train['Sex']]

# creating a new column that determines whether or not the passenger is a child
# they are a child if their age is 16 or less
df_train['Child?'] = [1 if x <= 16 else 0 for x in df_train['Age']]

df_train[868:877]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child?
868,869,0,3,"van Melkebeke, Mr. Philemon",0,,0,0,345777,9.5,,S,0
869,870,1,3,"Johnson, Master. Harold Theodor",0,4.0,1,1,347742,11.1333,,S,1
870,871,0,3,"Balkic, Mr. Cerin",0,26.0,0,0,349248,7.8958,,S,0
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",1,47.0,1,1,11751,52.5542,D35,S,0
872,873,0,1,"Carlsson, Mr. Frans Olof",0,33.0,0,0,695,5.0,B51 B53 B55,S,0
873,874,0,3,"Vander Cruyssen, Mr. Victor",0,47.0,0,0,345765,9.0,,S,0
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",1,28.0,1,0,P/PP 3381,24.0,,C,0
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",1,15.0,0,0,2667,7.225,,C,1
876,877,0,3,"Gustafsson, Mr. Alfred Ossian",0,20.0,0,0,7534,9.8458,,S,0


In [83]:
print("Everyone who paid no fare:")
print(df_train[df_train['Fare'].isin([0.0])])

print('\n' + '\n' + "Everyone with a ticket number 'LINE':")
print(df_train[df_train['Ticket'].isin(['LINE'])])

Everyone who paid no fare:
     PassengerId  Survived  Pclass                              Name  Sex  \
179          180         0       3               Leonard, Mr. Lionel    0   
263          264         0       1             Harrison, Mr. William    0   
271          272         1       3      Tornquist, Mr. William Henry    0   
277          278         0       2       Parkes, Mr. Francis "Frank"    0   
302          303         0       3   Johnson, Mr. William Cahoone Jr    0   
413          414         0       2    Cunningham, Mr. Alfred Fleming    0   
466          467         0       2             Campbell, Mr. William    0   
481          482         0       2  Frost, Mr. Anthony Wood "Archie"    0   
597          598         0       3               Johnson, Mr. Alfred    0   
633          634         0       1     Parr, Mr. William Henry Marsh    0   
674          675         0       2        Watson, Mr. Ennis Hastings    0   
732          733         0       2              K

##### It appears that ticket fare is a good way to determine who were crew members. 
I'm not sure what ticket number 'LINE' means, but all passengers with those tickets also paid no fare, so I assume it's a crew member thing.

If you look closely at the data for the people who paid no fare, you find more evidence that they were all crew; they are all male and they all embarked from the same place: Southampton, which is where the Titanic first embarked from. I'm not sure why they have different ticket classes though.

Either way, it seems this is a good indicator for whether they survived or not--out of the 15 who we assume were crewmembers in this instance, only one survived.

Now I'm curious if fare is a good indicator over all passengers

In [84]:
print(df_train[['Survived','Fare', 'Pclass']].sort_values(by='Fare'))

     Survived      Fare  Pclass
271         1    0.0000       3
597         0    0.0000       3
302         0    0.0000       3
633         0    0.0000       1
277         0    0.0000       2
413         0    0.0000       2
674         0    0.0000       2
263         0    0.0000       1
466         0    0.0000       2
732         0    0.0000       2
179         0    0.0000       3
806         0    0.0000       1
481         0    0.0000       2
822         0    0.0000       1
815         0    0.0000       1
378         0    4.0125       3
872         0    5.0000       1
326         0    6.2375       3
843         0    6.4375       3
818         0    6.4500       3
371         0    6.4958       3
202         0    6.4958       3
654         0    6.7500       3
143         0    6.7500       3
411         0    6.8583       3
825         0    6.9500       3
129         0    6.9750       3
804         1    6.9750       3
477         0    7.0458       3
611         0    7.0500       3
..      

After looking at the data, it seems to me like people who paid more were more likely to survive. This already seems to be encapsulated in ticket class, except for the people who paid no fare. Since those people were unlikely to survive, it makes sense to add that into the decision tree.

In [85]:
# a passenger is considered crew if they paid no fare
df_train['Crew?'] = [1 if x == 0.0 else 0 for x in df_train['Fare']]

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child?,Crew?
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0


In [86]:
features = ['Sex','Pclass','Parch','Child?','Crew?']
X = df_train[features]

det_reg.fit(X,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [87]:
# modify the test set to add the new features
df_test['Child?'] = [1 if x <= 16 else 0 for x in df_test['Age']]
df_test['Crew?'] = [1 if x == 0.0 else 0 for x in df_test['Fare']]

predictions = det_reg.predict(df_test[features])
predictions = np.around(predictions) # predictions give us a decimal, so we round it to make it consistent
df_test['Survived'] = predictions
df_test['Survived'] = df_test.Survived.apply(lambda x: int(x)) # convert the rounded numbers into ints

In [88]:
df_test[['PassengerId','Survived']].to_csv('datasets/titanic/attempt_2.csv', index=False)
df_test[['PassengerId','Survived']].head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


### This attempt got a score of .77033, or about 77% correct--although it doesn't seem like much of an improvement over the last attempt, I shot up to 5992nd on the leaderboard.