In [2]:
import numpy as np
import csv

In [3]:
# Open the training CSV in binary mode and wrap the handle in a row reader.
# The handle is not bound to a name of its own; the reader keeps it alive
# for the cells below that pull rows from it.
csv_file_object = csv.reader(
    open('data/train.csv', 'rb')
)

In [4]:
# Consume the first line of the file (the column names) so that the
# reader is positioned on the first data row.
header = next(csv_file_object)

In [5]:
# Accumulator for the data rows; filled from the reader in the next cell.
data = list()

In [6]:
# Drain the reader: every remaining row (a list of strings) is appended
# to `data` in file order.
data.extend(csv_file_object)

In [7]:
# Turn the list of rows into a 2-D NumPy array so columns can be sliced
# and masked; the printed rows below show every cell is kept as a string.
# NOTE: this rebinds `data` from a list to an array.
data = np.array(data)

In [8]:
# Show the first data row as a sanity check of the load.
print(data[0])

['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
 '7.25' '' 'S']


In [9]:
# Show the last data row to confirm the whole file was read.
print(data[-1])

['891' '0' '3' 'Dooley, Mr. Patrick' 'male' '32' '0' '0' '370376' '7.75' ''
 'Q']


In [10]:
# Column 1 is treated as the survival flag; it is stored as text, so it is
# converted to floats once here. The builtin `float` is used instead of the
# deprecated `np.float` alias (they are the same type).
survived_flags = data[0::, 1].astype(float)

number_passengers = np.size(survived_flags)   # one flag per passenger row
number_survived = np.sum(survived_flags)      # flags sum to the survivor count
proportion_survivors = number_survived / number_passengers

In [11]:
# Boolean mask over all rows: True where the gender column (index 4)
# reads "female".
women_only_stats = (data[:, 4] == "female")

# Complementary mask: True for every row whose gender column is anything
# other than "female" (i.e. male).
men_only_stats = (data[:, 4] != "female")

In [12]:
# Select the survival flags (column 1) for women and men separately using
# the boolean masks built above, converting the text values to floats.
# The builtin `float` replaces the deprecated `np.float` alias.
women_onboard = data[women_only_stats, 1].astype(float)
men_onboard = data[men_only_stats, 1].astype(float)

# Proportion that survived = sum of the flags / number of people in the group.
proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard)
proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard)

# Report both proportions.
print('Proportion of women who survived is %s' % proportion_women_survived)
print('Proportion of men who survived is %s' % proportion_men_survived)

Proportion of women who survived is 0.742038216561
Proportion of men who survived is 0.188908145581


In [13]:
# Open the test set and wrap it in a row reader; the handle is kept in
# `test_file` so it can be closed once the rows have been consumed.
test_file = open('data/test.csv', 'rb')
test_file_object = csv.reader(test_file)

# Skip past the column-name line.
header = next(test_file_object)

In [14]:
# Output file for the gender-only predictions, opened for binary writing
# and wrapped in a csv writer.
prediction_file = open(
    "data/genderbasedmodel.csv",
    "wb",
)
prediction_file_object = csv.writer(prediction_file)

In [15]:
# Header line of the prediction file.
prediction_file_object.writerow(["PassengerId", "Survived"])

# Gender-only model: for each row of test.csv, predict 1 when column 3
# reads 'female' and 0 otherwise, keyed by the id in column 0.
for row in test_file_object:
    predicted = '1' if row[3] == 'female' else '0'
    prediction_file_object.writerow([row[0], predicted])

# Both files are finished with; release the handles.
test_file.close()
prediction_file.close()

In [16]:
# Cap the fare column (index 9) so that every fare falls into one of a
# fixed number of brackets.
fare_ceiling = 40

# Any fare at or above the ceiling is overwritten with (ceiling - 1), i.e. 39,
# which places it in the top bracket. The column is text, so it is parsed
# with the builtin `float` (the deprecated `np.float` alias is the same type)
# for the comparison; the replacement value is stored back into the same
# array cell.
data[data[0::, 9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0

In [17]:
# Width of each fare bracket.
fare_bracket_size = 10

# Number of brackets below the ceiling. Floor division keeps this an int
# (it is used as an array dimension and a loop bound) regardless of whether
# `/` means integer or true division in the running interpreter.
number_of_price_brackets = fare_ceiling // fare_bracket_size

In [18]:
# Number of passenger classes, derived from the data rather than
# hard-coded: the count of distinct values in column index 2.
number_of_classes = len(np.unique(data[0::, 2]))

# Survival table indexed as [gender, class, fare bracket], initialised to
# zeros. Axis 0 has two entries (filled later as 0 = female, 1 = male).
survival_table = np.zeros((2, number_of_classes, number_of_price_brackets))

In [19]:
# Parse the text columns used for binning once, outside the loops, instead
# of re-converting them for every (class, bracket) pair. The builtin
# `float` replaces the deprecated `np.float` alias.
is_female = data[0::, 4] == "female"
class_of_passenger = data[0::, 2].astype(float)
fare_paid = data[0::, 9].astype(float)

for i in xrange(number_of_classes):
    # Classes are matched by number: table index i corresponds to class i + 1.
    in_class = class_of_passenger == i + 1

    for j in xrange(number_of_price_brackets):
        # Bracket j covers fares in [j * size, (j + 1) * size).
        in_bracket = (fare_paid >= j * fare_bracket_size) & \
                     (fare_paid < (j + 1) * fare_bracket_size)

        # Survival flags (column 1, still text) of the women / men in this
        # class and bracket. NOTE: these names overwrite the boolean masks
        # of the same name built in an earlier cell.
        women_only_stats = data[is_female & in_class & in_bracket, 1]
        men_only_stats = data[~is_female & in_class & in_bracket, 1]

        # Mean of the flags = survival rate of the group. An empty group has
        # no mean; store NaN explicitly (the value np.mean would produce)
        # without triggering its empty-slice RuntimeWarning.
        survival_table[0, i, j] = (np.mean(women_only_stats.astype(float))
                                   if women_only_stats.size else np.nan)
        survival_table[1, i, j] = (np.mean(men_only_stats.astype(float))
                                   if men_only_stats.size else np.nan)

  ret = ret.dtype.type(ret / rcount)


In [21]:
# Cells for which no passengers existed hold NaN (the mean of nothing);
# treat those as a survival rate of zero.
survival_table[np.isnan(survival_table)] = 0.

In [22]:
# Display the survival rates, laid out as [gender][class][fare bracket].
print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]


In [23]:
# Turn the survival rates into hard predictions, in place:
# a rate of one half or more becomes 1 (survived) ...
survival_table[survival_table >= 0.5] = 1
# ... and anything below one half becomes 0 (did not survive).
survival_table[survival_table < 0.5] = 0

In [24]:
# Re-open the test set from the start (the earlier reader has been
# consumed and its file closed) and skip the column-name line.
test_file = open('data/test.csv', 'rb')
test_file_object = csv.reader(test_file)
header = next(test_file_object)

# Output file for the gender/class/fare model, with its header line.
predictions_file = open("data/genderclassmodel.csv", "wb")
p = csv.writer(predictions_file)
p.writerow(["PassengerId", "Survived"])

In [25]:
# Write one prediction per row of test.csv by looking the passenger up in
# the thresholded survival table: [gender, class - 1, fare bracket].
#
# Columns read from each test row: 0 = passenger id, 1 = class,
# 3 = gender, 8 = fare.
try:
    for row in test_file_object:
        # Class is used as an array index, so it must be an int.
        passenger_class = int(float(row[1]))

        try:
            fare = float(row[8])
        except ValueError:
            # Fare is missing or not a number: choose the bracket from the
            # class instead (1st -> 2, 2nd -> 1, 3rd -> 0).
            bin_fare = 3 - passenger_class
        else:
            if fare >= fare_ceiling:
                # At or above the ceiling: top bracket, matching how the
                # training fares were capped.
                bin_fare = number_of_price_brackets - 1
            else:
                # Bracket j covers [j * size, (j + 1) * size).
                bin_fare = int(fare // fare_bracket_size)

        # Axis 0 of the table: 0 = female, 1 = male.
        gender_index = 0 if row[3] == 'female' else 1

        # Exactly one output line per passenger, written after the bracket
        # has been decided.
        p.writerow([row[0],
                    "%d" % int(survival_table[gender_index,
                                              passenger_class - 1,
                                              bin_fare])])
finally:
    # Release both handles even if a malformed row raised above.
    test_file.close()
    predictions_file.close()