# Reading in the Data

In [1]:
import csv as csv
import numpy as np
import pandas as pd


train_df = pd.read_csv('csv/train.csv',header=0)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Identifying columns that aren't Float type or with NaN's

In [2]:
def floatOrNan (train_df):
    for col in train_df:
        if (train_df[col].dtypes != "float64"):
            print col, train_df[col].dtypes
        if ((len(train_df[train_df[col].isnull() == True])) > 0):
            print col, "Contains NaN's"
floatOrNan(train_df)

PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age Contains NaN's
SibSp int64
Parch int64
Ticket object
Cabin object
Cabin Contains NaN's
Embarked object
Embarked Contains NaN's


# Dealing with NaN's in Age

I will replace the NaN's in Age with the median age of each passenger based off their respective gender and class.

In [3]:
median_ages = np.zeros((2,3))
median_ages

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

Map passenger's gender to 0 or 1 in order to map gender and class later on.

In [4]:
train_df['Sex'] = train_df['Sex'].map({'female':0, 'male':1}).astype(int)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


Calculate median age by gender and class.

In [5]:
for i in range(0,2):
    for j in range(0,3):
        median_ages[i,j] = train_df[(train_df['Sex'] == i) & \
                              (train_df['Pclass'] == j+1)]['Age'].dropna().median()
median_ages

array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])

Replace NaN's with median age for passenger's gender and class

In [6]:
for i in range(0,2):
    for j in range(0,3):
        train_df.loc[(train_df.Age.isnull()) & (train_df.Sex == i) & (train_df.Pclass == j+1), \
                'Age'] = median_ages[i,j]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


Confirm Age column is no longer empty

In [7]:
# Checks for empty columns in train_df
for column in train_df:
    if (len(train_df[train_df[column].isnull() == True])) > 0:
        print column

Cabin
Embarked


# Converting int64 col's to float

In [8]:
for col in train_df:
    if (train_df[col].dtypes == "int64"):
        train_df[col] = train_df[col].astype(float)

Confirm int's are converted.

In [9]:
for col in train_df:
    if (train_df[col].dtypes != "float64"):
        print col, train_df[col].dtypes
    if ((len(train_df[train_df[col].isnull() == True])) > 0):
        print col, "Contains NaN's"

Name object
Ticket object
Cabin object
Cabin Contains NaN's
Embarked object
Embarked Contains NaN's


In [10]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",1.0,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",0.0,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1.0,0.0,113803,53.1,C123,S
4,5.0,0.0,3.0,"Allen, Mr. William Henry",1.0,35.0,0.0,0.0,373450,8.05,,S


# Converting object col's to float

### Name

In [11]:
# Name: Nothing to do with name and can't be converted.
train_df = train_df.drop('Name',axis=1)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,1.0,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2.0,1.0,1.0,0.0,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3.0,1.0,3.0,0.0,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4.0,1.0,1.0,0.0,35.0,1.0,0.0,113803,53.1,C123,S
4,5.0,0.0,3.0,1.0,35.0,0.0,0.0,373450,8.05,,S


### Embarked

In [12]:
# Embarked: Only three ports, so map to 0 1 2.
train_df['Embarked'] = train_df['Embarked'].map({'S':0, 'C':1, 'Q':2}).astype(float)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,1.0,22.0,1.0,0.0,A/5 21171,7.25,,0.0
1,2.0,1.0,1.0,0.0,38.0,1.0,0.0,PC 17599,71.2833,C85,1.0
2,3.0,1.0,3.0,0.0,26.0,0.0,0.0,STON/O2. 3101282,7.925,,0.0
3,4.0,1.0,1.0,0.0,35.0,1.0,0.0,113803,53.1,C123,0.0
4,5.0,0.0,3.0,1.0,35.0,0.0,0.0,373450,8.05,,0.0


In [13]:
median_embarked = np.zeros((2,3))
median_embarked

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [14]:
for i in range(0,2):
    for j in range(0,3):
        median_embarked[i,j] = train_df[(train_df['Sex'] == i) & \
                              (train_df['Pclass'] == j+1)]['Embarked'].dropna().mean()
median_embarked

array([[ 0.48913043,  0.14473684,  0.61805556],
       [ 0.36065574,  0.11111111,  0.34870317]])

In [15]:
# Take the average port embarked by passengers based on their class and sex
for i in range(0,2):
    for j in range(0,3):
        median_embarked[i,j] = round(median_embarked[i,j] * 3)
median_embarked        

array([[ 1.,  0.,  2.],
       [ 1.,  0.,  1.]])

In [16]:
for i in range(0,2):
    for j in range(0,3):
        train_df.loc[(train_df.Embarked.isnull()) & (train_df.Sex == i) & (train_df.Pclass == j+1), \
                'Embarked'] = median_embarked[i,j]

Confirm embarked is all float and no more NaN's.

In [17]:
for col in train_df:
    if (train_df[col].dtypes != "float64"):
        print col, train_df[col].dtypes
    if ((len(train_df[train_df[col].isnull() == True])) > 0):
        print col, "Contains NaN's"

Ticket object
Cabin object
Cabin Contains NaN's


### Ticket

In [18]:
import re
# Strip all the prefixes
train_df['Ticket'] = train_df['Ticket'].map(lambda x: (re.sub(".*? (.+)", "\\1", x)))

# Define function that returns 0 if it can't convert to type
def tryconvert(value, default, *types):
    for t in types:
        try:
            return t(value)
        except ValueError, TypeError:
            continue
    return default

train_df['Ticket'] = train_df['Ticket'].map(lambda x: tryconvert(x,0,float))

Confirm Ticket is a float.

In [19]:
for col in train_df:
    if (train_df[col].dtypes != "float64"):
        print col, train_df[col].dtypes
    if ((len(train_df[train_df[col].isnull() == True])) > 0):
        print col, "Contains NaN's"

Cabin object
Cabin Contains NaN's


### Cabin

In [20]:
# Drop cabin because not much can be done with it right now.
train_df = train_df.drop('Cabin', axis=1)

Confirm everything is a float

In [21]:
for col in train_df:
    if (train_df[col].dtypes != "float64"):
        print col, train_df[col].dtypes
    if ((len(train_df[train_df[col].isnull() == True])) > 0):
        print col, "Contains NaN's"

In [22]:
#train_df = train_df.drop('Ticket',axis=1)
train_df = train_df.drop('PassengerId', axis=1)

# Random Forests! Finally!

## Read in test data

In [23]:
test_train_df = pd.read_csv('csv/test.csv',header=0)
test_train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Which ones need cleaning?

In [24]:
floatOrNan(test_train_df)

PassengerId int64
Pclass int64
Name object
Sex object
Age Contains NaN's
SibSp int64
Parch int64
Ticket object
Fare Contains NaN's
Cabin object
Cabin Contains NaN's
Embarked object


## Drop unneeded columns

In [25]:
# Collect all passengerIds before throwing them out
ids = test_train_df['PassengerId'].values
test_train_df = test_train_df.drop('PassengerId', axis=1)
test_train_df = test_train_df.drop('Name', axis=1)
test_train_df = test_train_df.drop('Cabin', axis=1)
#test_train_df = test_train_df.drop('Ticket', axis=1)


In [26]:
floatOrNan(test_train_df)

Pclass int64
Sex object
Age Contains NaN's
SibSp int64
Parch int64
Ticket object
Fare Contains NaN's
Embarked object


### Convert int's to floats

In [27]:
for col in test_train_df:
    if (test_train_df[col].dtypes == "int64"):
        test_train_df[col] = test_train_df[col].astype(float)

In [28]:
test_train_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3.0,male,34.5,0.0,0.0,330911,7.8292,Q
1,3.0,female,47.0,1.0,0.0,363272,7.0,S
2,2.0,male,62.0,0.0,0.0,240276,9.6875,Q
3,3.0,male,27.0,0.0,0.0,315154,8.6625,S
4,3.0,female,22.0,1.0,1.0,3101298,12.2875,S


In [29]:
test_train_df['Embarked'] = test_train_df['Embarked'].map({'S':0, 'C':1, 'Q':2}).astype(float)
test_train_df['Sex'] = test_train_df['Sex'].map({'female':0, 'male':1}).astype(float)


In [30]:
test_train_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3.0,1.0,34.5,0.0,0.0,330911,7.8292,2.0
1,3.0,0.0,47.0,1.0,0.0,363272,7.0,0.0
2,2.0,1.0,62.0,0.0,0.0,240276,9.6875,2.0
3,3.0,1.0,27.0,0.0,0.0,315154,8.6625,0.0
4,3.0,0.0,22.0,1.0,1.0,3101298,12.2875,0.0


In [31]:
import re
# Strip all the prefixes
test_train_df['Ticket'] = test_train_df['Ticket'].map(lambda x: (re.sub(".*? (.+)", "\\1", x)))

# Define function that returns 0 if it can't convert to type
def tryconvert(value, default, *types):
    for t in types:
        try:
            return t(value)
        except ValueError, TypeError:
            continue
    return default

test_train_df['Ticket'] = test_train_df['Ticket'].map(lambda x: tryconvert(x,0,float))

In [32]:
for i in range(0,2):
    for j in range(0,3):
        median_ages[i,j] = test_train_df[(test_train_df['Sex'] == i) & \
                              (test_train_df['Pclass'] == j+1)]['Age'].dropna().median()
median_ages

array([[ 41.,  24.,  22.],
       [ 42.,  28.,  24.]])

In [33]:
for i in range(0,2):
    for j in range(0,3):
        test_train_df.loc[(test_train_df.Age.isnull()) & (test_train_df.Sex == i) & (test_train_df.Pclass == j+1), \
                'Age'] = median_ages[i,j]

In [42]:
test_train_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3.0,1.0,34.5,0.0,0.0,330911.0,7.8292,2.0
1,3.0,0.0,47.0,1.0,0.0,363272.0,7.0,0.0
2,2.0,1.0,62.0,0.0,0.0,240276.0,9.6875,2.0
3,3.0,1.0,27.0,0.0,0.0,315154.0,8.6625,0.0
4,3.0,0.0,22.0,1.0,1.0,3101298.0,12.2875,0.0


In [35]:
floatOrNan(test_train_df)

Fare Contains NaN's


In [36]:
### Add median fare for all missing fares 

if len(test_train_df.Fare[ test_train_df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                        
        median_fare[f] = test_train_df[ test_train_df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                    
        test_train_df.loc[ (test_train_df.Fare.isnull()) & (test_train_df.Pclass == f+1 ), 'Fare'] = median_fare[f]

In [37]:
floatOrNan(test_train_df)

In [38]:
from sklearn.ensemble import RandomForestClassifier 

# Convert back to a numpy array
train_data = train_df.values
test_data = test_train_df.values

In [39]:
print 'Training...'
forest = RandomForestClassifier(n_estimators=1000)
forest = forest.fit( train_data[0::,1::], train_data[0::,0] )

Training...


In [40]:
print 'Predicting...'
output = forest.predict(test_data).astype(int)

Predicting...


In [41]:
predictions_file = open("csv/ourfirstforest.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print 'Done.'

Done.
