[View in Colaboratory](https://colab.research.google.com/github/JacksonIsaac/colab_notebooks/blob/master/kaggle_titanic.ipynb)

# Kaggle Notebook
For *Titanic* competition:
https://www.kaggle.com/c/titanic

In [0]:
# Install the Kaggle command-line client that the later `!kaggle ...` cells call.
!pip install kaggle

In [2]:
## Load Kaggle config JSON
# Fetches the `kaggle.json` API token from the user's Google Drive and installs
# it where the Kaggle CLI looks for it (/content/.kaggle and ~/.kaggle).
import io, os
import shutil

from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

# Authorise this Colab session so the Drive API calls below are allowed.
auth.authenticate_user()

# Look the token file up in Drive by its exact name; only the file id is needed.
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a readable message instead of an IndexError on kaggle_api_key[0].
    raise FileNotFoundError(
        "No file named 'kaggle.json' was found in Google Drive; "
        "upload your Kaggle API token there first.")

filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Stream the file to disk chunk by chunk; the `with` block guarantees the
# handle is flushed and closed before the file is chmod-ed and copied.
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# Permission bits must be written in octal: decimal 600 is octal 1130, which
# would not even leave the file readable by its owner.
os.chmod(filename, 0o600)

# Mirror the token into ~/.kaggle. makedirs(exist_ok=True) keeps the cell
# re-runnable (a plain `mkdir` fails once the directory exists), and the copy
# gets the same owner-only permissions as the original.
kaggle_home_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_home_dir, exist_ok=True)
kaggle_home_config = os.path.join(kaggle_home_dir, "kaggle.json")
shutil.copyfile(filename, kaggle_home_config)
os.chmod(kaggle_home_config, 0o600)

Download 100%.


In [19]:
# Download the Titanic competition files into the local data/ directory.
!kaggle competitions download -c titanic -p data/

Downloading train.csv to data
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 29.2MB/s]
Downloading test.csv to data
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 22.8MB/s]
Downloading gender_submission.csv to data
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 3.55MB/s]


In [20]:
# List data/ to confirm which files the download produced.
!ls data/

gender_submission.csv  test.csv  train.csv


In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [21]:
# Read both competition files; row 0 of each CSV supplies the column names.
train = pd.read_csv('./data/train.csv', header=0)
test = pd.read_csv('./data/test.csv', header=0)

# Report (rows, columns) of each frame, training set first, one per line.
print(train.shape, test.shape, sep='\n')

(891, 12)
(418, 11)


In [22]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Fill missing data with the median (numeric columns) or the most frequent value (categorical columns)
Reference: http://stackoverflow.com/a/25562948 (https://www.kaggle.com/datacanary/xgboost-example-python)

In [0]:
from sklearn.base import TransformerMixin

In [0]:
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for DataFrames that mix text and numeric columns.

    Text-like columns (object, string or categorical dtype) are filled with
    their most frequent value; every other column is filled with its median.
    """

    @staticmethod
    def _fill_value(column):
        """Return the value used to replace missing entries of ``column``."""
        dtype = column.dtype
        # Object dtype is the original rule; categorical and string extension
        # dtypes are treated the same way because they have no median.
        is_text_like = (
            dtype == np.dtype('O')
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_text_like:
            return column.median()  # Can use .mean() as well
        counts = column.value_counts()
        # An all-missing column has no most frequent value; NaN makes the
        # later fillna a no-op for that column instead of raising IndexError.
        return counts.index[0] if len(counts) else np.nan

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns provide the fill statistics.
        y : ignored
            Accepted so the signature matches the fit/transform convention.

        Returns
        -------
        DataFrameImputer
            ``self``, with the fill values stored in ``self.fill`` (a Series
            indexed by column name).
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the fitted fill values.

        ``fit`` must have been called first so that ``self.fill`` exists.
        """
        return X.fillna(self.fill)

In [0]:
# Feature columns fed to the model, in the order they appear in the matrices.
columns_to_use = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Parch',
    'SibSp',
    'Embarked',
]

# Subset of the features that hold text labels and need integer encoding.
categorical_columns = [
    'Sex',
    'Embarked',
]

In [0]:
# Stack the train and test feature columns (train rows first) so the fill
# values are computed over every available row.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# with its defaults, keeps each frame's original row labels just as append did.
X = pd.concat([train[columns_to_use], test[columns_to_use]])
X_imputed = DataFrameImputer().fit_transform(X)

# Label Encoding
Converting categorical values to integer values since XGBoost doesn't support categorical values yet.

In [31]:
# Preview the imputed features before the categorical columns are encoded.
X_imputed.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Parch,SibSp,Embarked
0,3,male,22.0,7.25,0,1,S
1,1,female,38.0,71.2833,0,1,C
2,3,female,26.0,7.925,0,0,S
3,1,female,35.0,53.1,0,1,S
4,3,male,35.0,8.05,0,0,S


In [0]:
# A single encoder is re-fitted for each categorical column, so after the loop
# `le` only remembers the classes of the last column it processed.
le = LabelEncoder()

for feature in categorical_columns:
    # Replace the text labels of this column with their integer codes.
    encoded_values = le.fit_transform(X_imputed[feature])
    X_imputed[feature] = encoded_values

In [33]:
# Check that the categorical columns now hold integer codes.
X_imputed.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Parch,SibSp,Embarked
0,3,1,22.0,7.25,0,1,2
1,1,0,38.0,71.2833,0,1,0
2,3,0,26.0,7.925,0,0,2
3,1,0,35.0,53.1,0,1,2
4,3,1,35.0,8.05,0,0,2


# Create train and test set

In [0]:
# Split the combined feature frame back apart by position: the first
# train.shape[0] rows came from the training file, the rest from the test file.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the documented
# replacement. .iloc makes the positional slicing explicit, which matters here
# because the combined frame carries duplicate row labels.
n_train_rows = train.shape[0]
X_train = X_imputed.iloc[:n_train_rows].to_numpy()
X_test = X_imputed.iloc[n_train_rows:].to_numpy()
Y_train = train['Survived']

In [35]:
# Sanity check: one row per training passenger, one column per feature.
X_train.shape

(891, 7)

In [36]:
# Sanity check: one row per test passenger, same number of columns as X_train.
X_test.shape

(418, 7)

# Train using XGBoost

In [0]:
# Gradient-boosted tree classifier: 300 trees of depth 3 with a learning rate
# of 0.01, fitted on the training matrix and the Survived labels.
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.01,
).fit(X_train, Y_train)

In [39]:
# Predict a class label for every row of the test matrix.
predictions = gbm.predict(X_test)

  if diff:


In [41]:
# Look at the first ten predicted labels.
predictions[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [47]:
# First five rows of the encoded test matrix; columns follow columns_to_use order.
X_test[:5]

array([[ 3.    ,  1.    , 34.5   ,  7.8292,  0.    ,  0.    ,  1.    ],
       [ 3.    ,  0.    , 47.    ,  7.    ,  0.    ,  1.    ,  2.    ],
       [ 2.    ,  1.    , 62.    ,  9.6875,  0.    ,  0.    ,  1.    ],
       [ 3.    ,  1.    , 27.    ,  8.6625,  0.    ,  0.    ,  2.    ],
       [ 3.    ,  0.    , 22.    , 12.2875,  1.    ,  1.    ,  2.    ]])

In [48]:
# Raw test rows, for comparison with the encoded matrix shown by X_test[:5].
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Create Submission file

In [0]:
# Two-column frame for the submission file: each test passenger's id paired
# with the label predicted for that passenger.
submission = pd.DataFrame(
    data={'PassengerId': test['PassengerId'], 'Survived': predictions},
)

In [0]:
# Write the submission without the DataFrame index, so only the two data columns are saved.
submission.to_csv('submission.csv', index=False)

In [52]:
# Upload submission.csv to the Titanic competition with a short description message.
!kaggle competitions submit -c titanic -f submission.csv -m "Initial Submission using XGBoost example"

Successfully submitted to Titanic: Machine Learning from Disaster