# Kaggle: The "Titanic" Challenge

## Loading training data

Loading packages

In [1]:
import numpy as np
import pandas as pd
import pylab as P
import string
import math as maths
import csv
from sklearn.ensemble import RandomForestClassifier

Loading in the data

In [2]:
# Read the Kaggle training set; row 0 of the CSV supplies the column names.
train = pd.read_csv(
    "C:/Users/Keith/Documents/Kaggle/Titanic/train.csv",
    header=0,
)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


## Data cleaning

In [3]:
# Numeric encoding of sex: female -> 0, male -> 1.
train['Gender'] = train['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Port of embarkation, with missing values labelled 'Unknown'.
train['Port'] = train['Embarked'].fillna('Unknown')

# Helper features used to group passengers when estimating unknown ages.
train['Family_Size'] = train['SibSp'] + train['Parch'] + 1
train['Alone'] = train['Family_Size'].map(lambda size: 0 if size > 1 else 1)

# Median known age for every (Gender, Alone, Pclass) combination.
# Pclass takes the values 1..3, so the last axis is indexed by Pclass - 1.
median_ages = np.zeros((2, 2, 3))

for gender, alone, class_idx in np.ndindex(*median_ages.shape):
    in_group = (
        (train['Gender'] == gender)
        & (train['Alone'] == alone)
        & (train['Pclass'] == class_idx + 1)
    )
    median_ages[gender, alone, class_idx] = train.loc[in_group, 'Age'].dropna().median()

# AgeFill holds the recorded age where present, otherwise the group median.
train['AgeFill'] = train['Age']
age_missing = train['Age'].isnull()

for gender, alone, class_idx in np.ndindex(*median_ages.shape):
    in_group = (
        (train['Gender'] == gender)
        & (train['Alone'] == alone)
        & (train['Pclass'] == class_idx + 1)
    )
    train.loc[age_missing & in_group, 'AgeFill'] = median_ages[gender, alone, class_idx]

## Feature engineering

Function to search for strings

In [4]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Candidates are tried in the order given, so earlier entries take
    priority when several of them occur in ``big_string``.

    Parameters
    ----------
    big_string : str
        Text to search in.
    substrings : iterable of str
        Candidate substrings, in priority order.

    Returns
    -------
    str or float
        The first matching candidate, or ``np.nan`` when none matches.
    """
    for substring in substrings:
        # The `in` operator replaces the Python 2-only `string.find` helper.
        if substring in big_string:
            return substring
    # Nothing matched: echo the unmatched value so it can be inspected.
    print(big_string)
    return np.nan

Deck from first letter of cabin

In [5]:
# Label missing cabins, then pick the first entry of cabin_list found in each cabin string.
train['Cabin'] = train['Cabin'].fillna('Unknown')
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train['Deck'] = train['Cabin'].map(lambda cabin: substrings_in_string(cabin, cabin_list))

Extracting a person's title

In [6]:
# Honorifics to look for in each name; earlier entries win when several match.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
    'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
    'Don', 'Jonkheer',
]

train['Title'] = train['Name'].map(lambda name: substrings_in_string(name, title_list))

def replace_titles(x):
    """Collapse a passenger's title into one of 'Mr', 'Mrs', 'Miss' or 'Master'.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x['Title']`` and ``x['Sex']``; ``Sex`` is expected to
        be the lowercase ``'male'`` / ``'female'`` used in the data set.

    Returns
    -------
    The replacement title. Titles not listed below are returned unchanged.
    """
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        # 'Dr' is gender-neutral, so fall back on the Sex column. The data
        # stores 'male' in lowercase; comparing with 'Male' never matched.
        if x['Sex'] == 'male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# Apply replace_titles row by row (axis=1) and overwrite Title with the result.
train['Title']=train.apply(replace_titles, axis=1)


Class crossed with age, to see whether younger, more affluent passengers were even more likely to be saved

In [7]:
# Interaction feature: filled-in age multiplied by passenger class.
train['Age_x_Class'] = train['AgeFill'].mul(train['Pclass'])

Indicator of whether the age was unknown, in case those passengers (or their families) were more reckless

In [8]:
# Placeholder value; the column is overwritten by the apply(age_type) call later in this cell.
train['Age_un'] = 2

def age_type(y):
    """Flag rows whose age is missing or ends in .5.

    Returns 1 when ``y['Age']`` is null, or when its fractional part is
    exactly 0.5 and the age is not 0.5 itself; returns 0 otherwise.
    NOTE(review): presumably ages of the form xx.5 mark estimated ages in
    this data set - confirm against the data description.
    """
    orig_age = y['Age']
    if pd.isnull(orig_age):
        return 1
    if orig_age == 0.5:
        # A recorded age of exactly 0.5 is not flagged.
        return 0
    fractional_part = orig_age - maths.floor(orig_age)
    return 1 if fractional_part == 0.5 else 0

# Apply age_type row by row (axis=1) to build the 0/1 Age_un column.
train['Age_un']=train.apply(age_type, axis=1)


Converting to fares per person for families. This will fail for groups, but on the test data this will also fail if the passenger groups by ticket are split, whereas the SibSp and Parch data crosses the divides.

In [9]:
# Split each fare evenly across the passenger's family (Family_Size is at least 1).
train['Fare_Per_Person'] = train['Fare'].div(train['Family_Size'])
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 22 columns):
PassengerId        891 non-null int64
Survived           891 non-null int64
Pclass             891 non-null int64
Name               891 non-null object
Sex                891 non-null object
Age                714 non-null float64
SibSp              891 non-null int64
Parch              891 non-null int64
Ticket             891 non-null object
Fare               891 non-null float64
Cabin              891 non-null object
Embarked           889 non-null object
Gender             891 non-null int32
Port               891 non-null object
Family_Size        891 non-null int64
Alone              891 non-null int64
AgeFill            891 non-null float64
Deck               891 non-null object
Title              891 non-null object
Age_x_Class        891 non-null float64
Age_un             891 non-null int64
Fare_Per_Person    891 non-null float64
dtypes: float64(5), int32(1), int64(8), object(8)

## Final Data clean

In [10]:
# Drop the raw columns that have been replaced by engineered features.
# `axis` is passed by keyword: the positional form is deprecated and was
# removed in later pandas releases.
train = train.drop(
    ['PassengerId', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
     'Ticket', 'Fare', 'Cabin', 'Embarked', 'Alone'],
    axis=1,
)

In [11]:
# Encode the remaining categorical columns as floats.
train['Emb'] = train['Port'].map({'S': 0, 'C': 1, 'Q': 2, 'Unknown': 3}).astype(float)
train['OnDeck'] = train['Deck'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4,
                                     'F': 5, 'T': 6, 'G': 7, 'Unknown': 8}).astype(float)
train['Address'] = train['Title'].map({'Mr': 0, 'Master': 1, 'Mrs': 2, 'Miss': 3}).astype(float)

# The string versions are no longer needed. `axis` is passed by keyword
# because the positional form is deprecated/removed in later pandas releases.
train = train.drop(['Port', 'Deck', 'Title'], axis=1)

# Put the label first, then leave out the features not used by the model.
train = train[['Survived', 'Gender', 'AgeFill', 'Pclass', 'Age_x_Class',
               'Family_Size', 'Fare_Per_Person', 'Address', 'OnDeck', 'Age_un', 'Emb']]
train = train.drop(['Age_x_Class', 'Age_un', 'Emb'], axis=1)
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived           891 non-null int64
Gender             891 non-null int32
AgeFill            891 non-null float64
Pclass             891 non-null int64
Family_Size        891 non-null int64
Fare_Per_Person    891 non-null float64
Address            891 non-null float64
OnDeck             891 non-null float64
dtypes: float64(4), int32(1), int64(3)
memory usage: 59.2 KB


In [12]:
# Array of the frame's values for scikit-learn; column 0 is Survived, the rest are features.
train_data = train.values

## Creating test file

In hindsight, this should have been written as a function so that it could be easily repeated

In [13]:
test = pd.read_csv("C:/Users/Keith/Documents/Kaggle/Titanic/test.csv", header=0)
test.info()

# Collect the test data's PassengerIds before dropping it later
ids = test['PassengerId'].values

# Same basic features as built for the training set
test['Port'] = test['Embarked']
test['Port'] = test['Port'].fillna('Unknown')
test['Family_Size'] = test['SibSp'] + test['Parch'] + 1
test['Alone'] = test['Family_Size'].map(lambda x: 0 if x > 1 else 1)
test['Gender'] = test['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Fill unknown ages with the per-group medians computed on the training set
test['AgeFill'] = test['Age']

for i in range(0, 2):
    for j in range(0, 2):
        for k in range(0, 3):
            test.loc[(test.Age.isnull()) & (test.Gender == i) & (test.Alone == j) & (test.Pclass == k+1),
                     'AgeFill'] = median_ages[i, j, k]

test.Cabin = test.Cabin.fillna('Unknown')
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
test['Deck'] = test['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

test['Title'] = test['Name'].map(lambda x: substrings_in_string(x, title_list))
test['Title'] = test.apply(replace_titles, axis=1)

test['Age_x_Class'] = test['AgeFill'] * test['Pclass']

# Placeholder value, immediately overwritten by the apply below
test['Age_un'] = 2
test['Age_un'] = test.apply(age_type, axis=1)

# Kept under a temporary name because Fare has missing values in the test set
test['Fare_Per_Person2'] = test['Fare'] / (test['Family_Size'])

# `axis` is passed by keyword: the positional form is deprecated/removed in later pandas
test = test.drop(['PassengerId', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
                  'Ticket', 'Fare', 'Cabin', 'Embarked', 'Alone'], axis=1)

test['Emb'] = test['Port'].map({'S': 0, 'C': 1, 'Q': 2, 'Unknown': 3}).astype(float)
test['OnDeck'] = test['Deck'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4,
                                   'F': 5, 'T': 6, 'G': 7, 'Unknown': 8}).astype(float)
test['Address'] = test['Title'].map({'Mr': 0, 'Master': 1, 'Mrs': 2, 'Miss': 3}).astype(float)

#Test has empty fare information
median_fare = np.zeros((3))

for i in range(0, 3):
    median_fare[i] = test[(test['Pclass'] == i+1)]['Fare_Per_Person2'].dropna().median()

print(median_fare)
#Creating a filled in column of actual and estimated fares
test['Fare_Per_Person'] = test['Fare_Per_Person2']

for i in range(0, 3):
    # The class mask must come from `test` itself; using train.Pclass here
    # selected rows by the class of an unrelated training passenger.
    test.loc[(test.Fare_Per_Person2.isnull()) & (test.Pclass == i+1),
             'Fare_Per_Person'] = median_fare[i]

test = test.drop(['Port', 'Deck', 'Title', 'Fare_Per_Person2'], axis=1)
test = test[['Gender', 'AgeFill', 'Pclass', 'Age_x_Class',
             'Family_Size', 'Fare_Per_Person', 'Address', 'OnDeck', 'Age_un', 'Emb']]
test = test.drop(['Age_x_Class', 'Age_un', 'Emb'], axis=1)
test.info()
test.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB
[ 31.67915  13.        7.75   ]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
Gender             418 non-null int32
AgeFill            418 non-null float64
Pclass             418 non-null int64
Family_Size        418 non-null int64
Fare_Per_Person    418 non-null float64
Address            418 non-null float64
OnDeck             418 non-null float64
dtypes: float64(4), int32(1), int64(2)
m

Unnamed: 0,Gender,AgeFill,Pclass,Family_Size,Fare_Per_Person,Address,OnDeck
0,1,34.5,3,1,7.8292,0,8
1,0,47.0,3,2,3.5,2,8
2,1,62.0,2,1,9.6875,0,8
3,1,27.0,3,1,8.6625,0,8
4,0,22.0,3,3,4.095833,2,8


In [14]:
# Array of the test frame's values for scikit-learn; features only, no label column.
test_data = test.values

## Machine learning

A bit more thought behind the forest, from the adjustments above

In [15]:
# Fit the forest: column 0 of train_data is the Survived label, the
# remaining columns are the features. A fixed random_state makes the
# predictions reproducible from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

output = forest.predict(test_data).astype(int)

# Write the submission with pandas: the file is opened and closed for us,
# and this works on both Python 2 and 3 (csv.writer on a "wb" handle does
# not work on Python 3).
submission = pd.DataFrame(
    {"PassengerId": ids, "Survived": output},
    columns=["PassengerId", "Survived"],
)
submission.to_csv("C:/Users/Keith/Documents/Kaggle/Titanic/random_forrest2.csv", index=False)
