# Kaggle: The "Titanic" Challenge

## Loading training data

Loading packages

In [1]:
import csv as csv 
import numpy as np
import string

Loading in the data

In [2]:
# Read the Kaggle training CSV: `header` gets the column names and `train`
# becomes a NumPy array with one row per passenger, every field kept as text.
# The `with` block guarantees the file is closed even if parsing fails part-way.
# ('rb' is the Python 2 convention for files handed to the csv module.)
with open("C:/Users/Keith/Documents/Kaggle/Titanic/train.csv", 'rb') as train_file:
    train_content = csv.reader(train_file)
    header = next(train_content)            # first row holds the column names
    train = np.array(list(train_content))   # all remaining rows
print(header)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## Background stats

In [3]:
# Overall survival rate in the training data (column 1 is 'Survived').
# The built-in float is used for the conversion: np.float was only an alias
# for it and no longer exists in recent NumPy releases.
survived_flags = train[0::,1].astype(float)    # convert the text column once
number_passengers = np.size(survived_flags)
number_survived = np.sum(survived_flags)
proportion_survivors = number_survived / number_passengers
print('Proportion who survived is %s' % proportion_survivors)

Proportion who survived is 0.383838383838


Assuming everyone died would give a low score on Kaggle unless the test data were biased and only contained those who died. Not going to test this one.

In [4]:
# Boolean row masks by sex (column 4 is 'Sex'); every passenger not recorded
# as "female" is counted as male.
women_only_stats = train[0::,4] == "female"
men_only_stats = train[0::,4] != "female"

# Survival flags (column 1) of each group, converted from text to numbers.
# Built-in float replaces the np.float alias, which recent NumPy releases removed.
women_onboard = train[women_only_stats,1].astype(float)
men_onboard = train[men_only_stats,1].astype(float)

# The mean of the 0/1 flags is the proportion of each group that survived
proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard)
proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard)

# and then print it out
print('Proportion of women who survived is %s' % proportion_women_survived)
print('Proportion of men who survived is %s' % proportion_men_survived)

Proportion of women who survived is 0.742038216561
Proportion of men who survived is 0.188908145581


A model in which all women survive and no men do should score better, but it is likely to be fairly blunt. Trying to re-code the next step in the "gentle" tutorial first.

Next step is to have a gender-class model...

In [5]:
#Setting an upper limit on fares to split them 0-10, 10-20, 20-30, 30+
fare_ceiling = 40
fare_bracket_size = 10
# Floor division keeps the bracket count an integer, which np.zeros and range require
number_of_price_brackets = fare_ceiling // fare_bracket_size
number_of_classes = len(np.unique(train[0::,2]))

# Any fare at or above the ceiling is pulled down so it lands in the top bracket.
# Note this overwrites the 'Fare' column (column 9) of `train` in place.
train[ train[0::,9].astype(float) >= fare_ceiling, 9 ] = fare_ceiling - 1.0

# Parse the columns used below once, instead of on every loop iteration
passenger_is_female = train[0::,4] == "female"    # column 4 is 'Sex'
passenger_class = train[0::,2].astype(float)      # column 2 is 'Pclass'
passenger_fare = train[0::,9].astype(float)       # column 9 is 'Fare' (already capped)
passenger_survived = train[0::,1].astype(float)   # column 1 is 'Survived'

#Table of zeros, female row 0, male row 1
survival_table = np.zeros([2, number_of_classes, number_of_price_brackets], float)

#Filling in the tables
for i in range(number_of_classes):
    for j in range(number_of_price_brackets):
        # passengers in class i+1 whose fare falls in bracket j
        in_class_and_bracket = (passenger_class == i+1) \
                               & (passenger_fare >= j*fare_bracket_size) \
                               & (passenger_fare < (j+1)*fare_bracket_size)

        women_only_stats = passenger_survived[in_class_and_bracket & passenger_is_female]
        men_only_stats = passenger_survived[in_class_and_bracket & ~passenger_is_female]

        # A combination with no passengers keeps its initial 0 rather than
        # taking the mean of an empty selection (which gives NaN plus a warning)
        if women_only_stats.size:
            survival_table[0,i,j] = np.mean(women_only_stats)  # Female stats
        if men_only_stats.size:
            survival_table[1,i,j] = np.mean(men_only_stats)    # Male stats

print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]


  ret = ret.dtype.type(ret / rcount)


Looking by sex, fare band and class refines the "all women survive and all men die" model. Unfortunately, for some reason it looks like women in third class who paid a lot for their tickets did not have good odds of survival. The men didn't have good odds.

In [6]:
# Turn the survival proportions into hard predictions:
# below 0.5 -> 0 (did not survive), 0.5 or above -> 1 (survived)
predicted_to_die = survival_table < 0.5
predicted_to_live = survival_table >= 0.5
survival_table[predicted_to_die] = 0
survival_table[predicted_to_live] = 1

# Manual override for women (row 0) in the higher classes with lower fares:
# those cells had no data, and survival looks class driven.
# Each pair is (class index, fare bracket index).
for class_index, bracket_index in [(0, 0), (1, 0), (0, 1)]:
    survival_table[0, class_index, bracket_index] = 1.

print(survival_table)


[[[ 1.  1.  1.  1.]
  [ 1.  1.  1.  1.]
  [ 1.  1.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


## Test file

In [7]:
#Opening test file
# The rows are read into a list inside a `with` block, so the file is closed
# straight away and `test_content` can be iterated more than once (a bare
# csv.reader is exhausted after a single pass). `test_file` stays bound to the
# closed file object, so a later test_file.close() is a harmless no-op.
with open("C:/Users/Keith/Documents/Kaggle/Titanic/test.csv", 'rb') as test_file:
    test_reader = csv.reader(test_file)
    header2 = next(test_reader)        # first row holds the column names
    test_content = list(test_reader)   # one list of text fields per passenger
print(header2)

['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [8]:
#Creating prediction file
# The `with` block flushes and closes the output file even if a row fails to process.
with open("C:/Users/Keith/Documents/Kaggle/Titanic/gender_class.csv", "wb") as prediction_file:
    prediction_file_object = csv.writer(prediction_file)
    prediction_file_object.writerow(["PassengerId", "Survived"])

    #Running and writing prediction
    # Test columns used: 0 'PassengerId', 1 'Pclass', 3 'Sex', 8 'Fare'
    for row in test_content:
        # NumPy needs integer indices, so the class is converted to int up front
        passenger_class = int(float(row[1]))

        try:
            fare = float(row[8])
        except ValueError:
            #if no fare then use class to estimate fare (1st -> bracket 2, 2nd -> 1, 3rd -> 0)
            bin_fare = 3 - passenger_class
        else:
            if fare >= fare_ceiling:
                # ">=" matches how the training fares were capped; with ">" a fare
                # exactly equal to the ceiling fell into no bracket at all and
                # silently reused the previous row's bracket
                bin_fare = int(number_of_price_brackets) - 1
            else:
                # bracket j covers fares in [j*size, (j+1)*size); never go below bracket 0
                bin_fare = max(int(fare // fare_bracket_size), 0)

        # female is row 0 of the table, everyone else row 1
        sex_index = 0 if row[3] == 'female' else 1
        prediction = int(survival_table[sex_index, passenger_class - 1, bin_fare])
        prediction_file_object.writerow([row[0], "%d" % prediction])

test_file.close()

