# Titanic: Machine Learning From Disaster

## [Kaggle competition page](https://www.kaggle.com/c/titanic/)

In [1]:
%matplotlib inline
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin
import numpy as np

In [2]:
# Load the train and test splits; row 0 of each CSV holds the column names.
train_path = 'data/titanic/train.csv'
test_path = 'data/titanic/test.csv'
train_df, test_df = (
    pd.read_csv(csv_path, header=0) for csv_path in (train_path, test_path)
)

In [3]:
# Preview the first five training rows.
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [4]:
# Per-column flag: does the training set contain any missing value?
train_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [5]:
# Preview the first five test rows.
test_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Per-column flag: does the test set contain any missing value?
test_df.isnull().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [7]:
# Columns fed to the model; every other column is ignored.
feature_columns = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Parch',
]

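Let's fill in the missing feature values. The train and test features are stacked into one frame first, so the fill values (the most frequent value for text columns, the median for numeric columns) are computed over both sets.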

In [8]:
# Stack the train and test feature columns into one frame, train rows first.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# result and is available in old and new pandas versions alike.
all_data = pd.concat([
    train_df[feature_columns],
    test_df[feature_columns],
])


In [9]:
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for a pandas DataFrame.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median.
    """

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X`` and store them in ``self.fill``.

        ``y`` is unused. Returns ``self`` so calls can be chained.
        """
        fill_values = []
        for c in X:
            if X[c].dtype == np.dtype('O'):
                counts = X[c].value_counts()
                # value_counts() drops NaN, so a column with no non-null
                # values gives an empty result; use NaN (i.e. leave the
                # column unfilled) instead of raising IndexError.
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(X[c].median())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the fitted per-column fill values.

        ``y`` is unused. ``fit`` must have been called first.
        """
        return X.fillna(self.fill)

In [10]:
# Fit the imputer on the combined frame and fill its missing values in one step.
imputer = DataFrameImputer()
all_data_df = imputer.fit_transform(all_data)

In [11]:
# Preview the first five rows after imputation.
all_data_df.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,Parch
0,3,male,22,7.25,0
1,1,female,38,71.2833,0
2,3,female,26,7.925,0
3,1,female,35,53.1,0
4,3,male,35,8.05,0


XGBoost cannot consume string-valued categorical columns directly, so we encode them as integers.

In [12]:
# Columns holding string categories that must be converted to integer codes.
nonnumeric_columns = ['Sex']

le = LabelEncoder()
for feature in nonnumeric_columns:
    # fit_transform refits the encoder on every column, so each column gets
    # its own mapping; after the loop `le` holds the mapping of the last one.
    all_data_df[feature] = le.fit_transform(all_data_df[feature])


In [13]:
# Preview the first five rows after label encoding.
all_data_df.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,Parch
0,3,1,22,7.25,0
1,1,0,38,71.2833,0
2,3,0,26,7.925,0
3,1,0,35,53.1,0
4,3,1,35,8.05,0


In [14]:
# Split the combined frame back into its train and test parts by position:
# the first n_train rows came from train_df, the remaining rows from test_df.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array and works on old and new pandas versions.
n_train = train_df.shape[0]
x_train = all_data_df.iloc[:n_train].values
x_test = all_data_df.iloc[n_train:].values
y_train = train_df['Survived']

In [15]:
# Configure the gradient-boosted classifier, fit it on the training matrix,
# then predict a label for every test row.
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
gbm.fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [16]:
# Preview the first five predicted labels.
predictions_preview = pd.DataFrame(predictions)
predictions_preview.head(5)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1


In [17]:
# Pair each test passenger id with its predicted label and write the result
# to CSV without the index column.
submission_columns = dict(
    PassengerId=test_df['PassengerId'],
    Survived=predictions,
)
submission = pd.DataFrame(submission_columns)
submission.to_csv("titanic_submission.csv", index=False)

This submission scored `0.75120` out of a possible 1.0 on Kaggle.