Data Dictionary
Variable	Definition	Key
survival 	Survival 	0 = No, 1 = Yes
pclass 	Ticket class 	1 = 1st, 2 = 2nd, 3 = 3rd
sex 	Sex 	
Age 	Age in years 	
sibsp 	# of siblings / spouses aboard the Titanic 	
parch 	# of parents / children aboard the Titanic 	
ticket 	Ticket number 	
fare 	Passenger fare 	
cabin 	Cabin number 	
embarked 	Port of Embarkation 	C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes

pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import scipy.sparse
import pickle



In [2]:
ports = {'C':1, 'Q':2, 'S':3, 'X':0}
sexes = {'male':1, 'female':2}
ticket_class = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'X':0}
columns = ['Name', 'Ticket', 'PassengerId']

In [3]:
def get_class(y):
    ticket = y.split(' ', 1)
    return ticket_class[ticket[0][0]]

In [4]:
data = pd.read_csv('data/train.csv').drop(columns, 1).fillna('X')
data['Cabin'] = data['Cabin'].map(lambda x: get_class(x), 1)
#-1 to 0 change is cause of libsvm converter's strange behavior
data['Age'] = data['Age'].map(lambda x: 0 if x=='X' else x) 
data['Sex'].replace(sexes, inplace = True)
data['Embarked'].replace(ports, inplace = True)
print(len(data))
matrix = data.as_matrix()
data[:10]

891


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,0,3
1,1,1,2,38.0,1,0,71.2833,3,1
2,1,3,2,26.0,0,0,7.925,0,3
3,1,1,2,35.0,1,0,53.1,3,3
4,0,3,1,35.0,0,0,8.05,0,3
5,0,3,1,0.0,0,0,8.4583,0,2
6,0,1,1,54.0,0,0,51.8625,5,3
7,0,3,1,2.0,3,1,21.075,0,3
8,1,3,2,27.0,0,2,11.1333,0,3
9,1,2,2,14.0,1,0,30.0708,0,1


In [5]:
submission_format = pd.read_csv('data/gender_submission.csv')
submission_format.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


# Split training data to new train-test sets 0.8 - 0.2

In [6]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(data, test_size = 0.2)

# Creating libSVM version

In [7]:
from sklearn.datasets import dump_svmlight_file

In [8]:
target = data['Survived']
data = data.drop('Survived', 1)
dummy = pd.get_dummies(data)
mat = dummy.as_matrix()
dump_svmlight_file(mat, target, 'data.libsvm')

In [9]:
target = train['Survived']
target.head()

633    0
684    0
399    1
390    1
98     1
Name: Survived, dtype: int64

In [10]:
train = train.drop('Survived', 1)
train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
633,1,1,0.0,0,0,0.0,0,3
684,2,1,60.0,1,1,39.0,0,3
399,2,2,28.0,0,0,12.65,0,3
390,1,1,36.0,1,2,120.0,2,3
98,2,2,34.0,0,1,23.0,0,3


In [11]:
dump_svmlight_file(train.as_matrix(), target.as_matrix(), 'train.libsvm')

In [12]:
target = test['Survived']
test = test.drop('Survived', 1)
#dummy = pd.get_dummies(test)
#mat = dummy.as_matrix()
dump_svmlight_file(test.as_matrix(), target.as_matrix(), 'test.libsvm')


# Loading libSVM-files

In [13]:
xg_train = xgb.DMatrix('train.libsvm')
xg_test = xgb.DMatrix('test.libsvm')

In [14]:
print(xg_train.num_col(), xg_test.num_col())

(8L, 8L)


# XGBoost binary logistic

In [56]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
#param['objective'] = 'multi:softmax'
param['objective'] = 'binary:logistic'
# scale weight of positive examples
param['eta'] = 0.1
param['lambda'] = 0.08
param['alpha'] = 0.08
param['gamma'] = 0.08
param['max_depth'] = 3
#param['max_delta_step'] = 1
param['subsample'] = 0.5
param['min_child_weight'] = 1
param['silent'] = 1
param['nthread'] = 8
#param['num_class'] = 2

watchlist = [ (xg_train,'train'), (xg_test, 'test') ]

In [57]:
num_round = 500
bst = xgb.train(param, xg_train, num_round, watchlist );

[0]	train-error:0.167135	test-error:0.206704
[1]	train-error:0.162921	test-error:0.206704
[2]	train-error:0.155899	test-error:0.189944
[3]	train-error:0.158708	test-error:0.195531
[4]	train-error:0.154494	test-error:0.217877
[5]	train-error:0.158708	test-error:0.195531
[6]	train-error:0.150281	test-error:0.201117
[7]	train-error:0.150281	test-error:0.201117
[8]	train-error:0.15309	test-error:0.206704
[9]	train-error:0.146067	test-error:0.184358
[10]	train-error:0.144663	test-error:0.184358
[11]	train-error:0.141854	test-error:0.184358
[12]	train-error:0.148876	test-error:0.184358
[13]	train-error:0.143258	test-error:0.189944
[14]	train-error:0.141854	test-error:0.189944
[15]	train-error:0.13764	test-error:0.189944
[16]	train-error:0.133427	test-error:0.189944
[17]	train-error:0.139045	test-error:0.189944
[18]	train-error:0.13764	test-error:0.189944
[19]	train-error:0.139045	test-error:0.189944
[20]	train-error:0.13764	test-error:0.189944
[21]	train-error:0.134831	test-error:0.189944
[2