# Team YSI - Titanic: Machine Learning from Disaster

## Version 1

In [1]:
#########################################################################
#
# Titanic: Machine Learning from Disaster
#
# Python script for generation of a model predicting the survivals.
#
# Amendment date             Amended by            Description
# 22/11/2016                 Ivaylo Shalev         Initial version.
#
#
#########################################################################
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Reading of input data (train and test)
main_train_df = pd.read_csv('input/train.csv', header=0)      # Load the train file into a dataframe
main_test_df = pd.read_csv('input/test.csv', header=0)        # Load the test file into a dataframe

# The test data doesn't contain the target (Survived), but it can still be used during data preparation.
# That's why a third dataframe is created which stacks the training rows on top of the test rows.
# The two sets are split apart again before modelling.
# ignore_index=True gives every row of the union its own index label; without it the test rows
# would reuse labels already taken by training rows, which makes any label-aligned assignment
# on the union ambiguous.
main_all_df = pd.concat([main_train_df, main_test_df], ignore_index=True)

# Show some stats
print("Training data - number of rows: %s" % len(main_train_df))
print("Testing data - number of rows:  %s" % len(main_test_df))
print("Total data - number of rows:    %s" % len(main_all_df))
print("")

# Union of training and test data
print("ALL DATA")
# show first row
print(main_all_df.iloc[0])
print("")
# show last row
print(main_all_df.iloc[-1])
print("")


Training data - number of rows: 891
Testing data - number of rows:  418
Total data - number of rows:    1309

ALL DATA
Age                                 22
Cabin                              NaN
Embarked                             S
Fare                              7.25
Name           Braund, Mr. Owen Harris
Parch                                0
PassengerId                          1
Pclass                               3
Sex                               male
SibSp                                1
Survived                             0
Ticket                       A/5 21171
Name: 0, dtype: object

Age                                 NaN
Cabin                               NaN
Embarked                              C
Fare                            22.3583
Name           Peter, Master. Michael J
Parch                                 1
PassengerId                        1309
Pclass                                3
Sex                                male
SibSp                        

In [2]:
# Data Preparation

# PassengerId - do nothing (as it is - int), but it will not be used as a feature
# Pclass - do nothing (as it is - int 1,2,3)
# SibSp - do nothing (as it is - non-negative int count)
# Parch - do nothing (as it is - non-negative int count, 0 is a valid value)

# Survived - convert to int.
# Rows coming from the test file have no label (NaN) and a plain int column cannot hold NaN,
# so those rows are marked with -1 (= unknown). The column is converted in place instead of
# assigning a filtered Series back to the frame: such an assignment is matched to rows by index
# label, which is only safe when every row of main_all_df has its own label.
main_all_df['Survived'] = main_all_df['Survived'].fillna(-1).astype(int)

# Sex - convert it to ID (int): 0 - female, 1 - male (any value other than 'female' becomes 1)
main_all_df['GenderId'] = (main_all_df['Sex'] != 'female').astype(int)

# Cabin - extract Deck letter (first character) and convert it to ID (int).
# 0 means the deck is unknown (no cabin recorded or a letter that is not in the table).
deck_ids = {
    'T': 1,  # Boat Deck - most top
    'A': 2,  # higher
    'B': 3,
    'C': 4,
    'D': 5,
    'E': 6,
    'F': 7,
    'G': 8,  # lowest deck
}
main_all_df['DeckId'] = main_all_df['Cabin'].str[:1].map(deck_ids).fillna(0).astype(int)

# Avg Fare per Deck (DeckId -> average fare); the fare ranges below appear to be built
# around these averages:
# 1     35.500000
# 2     41.244314
# 3    122.383078
# 4    107.926598
# 5     53.007339
# 6     54.564634
# 7     18.079367
# 8     14.205000

# Guess the deck of passengers without a known deck from the fare they paid.
# Each rule is (deck id, lower fare bound, upper fare bound); both bounds are exclusive.
# Rules are applied in this order and only to rows whose DeckId is still 0, so where two
# ranges overlap (decks 7 and 8 between 15 and 16) the earlier rule wins.
deck_fare_rules = [
    (1, 30, 38),
    # (2, 37, 45),   # disabled
    (3, 110, 130),
    (4, 95, 110),
    (5, 52, 60),
    # (6, 45, 53),   # disabled
    (7, 15, 25),
    (8, 10, 16),
]
for deck_id, fare_low, fare_high in deck_fare_rules:
    fare_in_range = (main_all_df['Fare'] > fare_low) & (main_all_df['Fare'] < fare_high)
    main_all_df.loc[(main_all_df['DeckId'] == 0) & fare_in_range, 'DeckId'] = deck_id
#print main_all_df.groupby('DeckId').count()['PassengerId']


# Name - extract family name and title
# Name - Surname
#main_all_df['Surname'] = main_all_df['Name'].replace("(\\,..*)", "", regex=True)

# Name - Title - group common titles and factor them all
# The regex strips everything up to ", " and everything from the first "." onwards,
# leaving only the title part of the name.
main_all_df['Title'] = main_all_df['Name'].replace("(.*, )|(\\..*)", "", regex=True)

# [title, id] pairs; several titles may share one id.
# "Mlle" was previously misspelled as "Mile", so it never matched and fell into 'Other'.
# NOTE(review): "Mme" shares id 2 with "Mr" while "Mrs" has no entry and therefore maps to
# 'Other' - left unchanged here, confirm that this grouping is intended.
common_titles = [['Other', 0], ["Miss", 1], ["Mr", 2], ["Master", 3], ["Mlle", 1], ["Ms", 1], ["Mme", 2]]
common_titles_dict = dict(common_titles)

# Titles without an entry in the dictionary come back as NaN from map() and get the 'Other' id.
main_all_df['TitleId'] = (main_all_df['Title']
                          .map(common_titles_dict)
                          .fillna(common_titles_dict['Other'])
                          .astype(int))


# Embarked - decode the port letter to an ID (int); 0 is used when no known letter is present
embarked_codes = main_all_df['Embarked'].map({
    'C': 1,  # Cherbourg
    'Q': 2,  # Queenstown
    'S': 3,  # Southampton
})
main_all_df['EmbarkedId'] = embarked_codes.fillna(0).astype(int)

# Age - get median per gender and apply it where Age is null
is_female = main_all_df['GenderId'] == 0
is_male = main_all_df['GenderId'] == 1
age_missing = main_all_df['Age'].isnull()   # taken once, before any value is filled in

# median() skips missing values, so the medians are based on the known ages only
median_age_f = main_all_df.loc[is_female, 'Age'].astype(float).median()
median_age_m = main_all_df.loc[is_male, 'Age'].astype(float).median()
main_all_df.loc[is_female & age_missing, 'Age'] = median_age_f
main_all_df.loc[is_male & age_missing, 'Age'] = median_age_m

#print main_all_df.groupby(['Sex', 'DeckId']).median()['Age']

# Exploratory snippet (not executed): histogram of Age for GenderId == 1,
# with missing ages shown at -10.
# import matplotlib.pyplot as plt
# plt.figure(figsize=(8,4))
# plt.title('Age')
# plt.xlabel('Value')
# main_all_df.loc[main_all_df.GenderId == 1, 'Age'].fillna(-10).plot.hist(bins=20)
#
# plt.show()

# Family Size - the passenger (1) plus SibSp plus Parch
main_all_df['FamSize'] = 1 + main_all_df['SibSp'] + main_all_df['Parch']

# Child - 1 when Age is below 18, otherwise 0
is_child = main_all_df['Age'] < 18
main_all_df['Child'] = is_child.astype(int)

# Mother - 1 for a female aged 18 or more with Parch > 0 whose title is not "Miss", otherwise 0
is_mother = (
    (main_all_df['Age'] >= 18)
    & (main_all_df['Parch'] > 0)
    & (main_all_df['GenderId'] == 0)
    & (main_all_df['Title'] != "Miss")
)
main_all_df['Mother'] = is_mother.astype(int)


In [3]:
# Classification

# Split into Train and Test DF
# get only the good features, ID and Target
feature_columns = [
     'TitleId'
    ,'GenderId'
    ,'DeckId'
    ,'EmbarkedId'
    ,'Pclass'
    #,'FamSize'
    #,'Child'
    #,'Mother'
    #,'Age'
    #,'Fare'
    #,'SibSp'
    #,'Parch'
]
all_good_df = main_all_df[['PassengerId', 'Survived'] + feature_columns]

# Split rows into original sets.
# A row belongs to the training set when its PassengerId occurs in the training file,
# so the split does not depend on a hard-coded row count.
is_train_row = all_good_df['PassengerId'].isin(main_train_df['PassengerId'])
train_df = all_good_df.loc[is_train_row]
test_df = all_good_df.loc[~is_train_row]

# Get ID and Target
test_ids = test_df['PassengerId'].values
# Cast explicitly so the classifier gets integer classes even if the column is stored as float.
target_df = train_df['Survived'].astype(int)

# Remove ID and Target columns from the datasets
train_df = train_df.drop(['PassengerId', 'Survived'], axis=1)
test_df = test_df.drop(['PassengerId', 'Survived'], axis=1)

# RandomForest
print('Training...')
# Fixed seed so that the accuracy estimate and the submission file are reproducible between runs.
forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross validation (10 folds) on the training rows only
# NOTE(review): sklearn.cross_validation was superseded by sklearn.model_selection in
# scikit-learn 0.18 - presumably the import has to change on newer versions; confirm against
# the installed version.
scores = cross_validation.cross_val_score(forest_model, train_df, target_df, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


# Predict
print('Predicting...')
# Refit on all training rows, then predict the test rows.
forest_model = forest_model.fit(train_df, target_df)
predict_output = forest_model.predict(test_df).astype(int)
results_df = pd.DataFrame({'PassengerId': test_ids, 'Survived': predict_output})
# Save to CSV file
results_df.to_csv(path_or_buf="output/ysi_titanic_prediction.csv", index=False)
print('Done.')

Training...
Accuracy: 0.83 (+/- 0.06)
Predicting...
Done.
