Like many others, I entered Titanic as my first Kaggle competition. In this notebook you will find how I use model stacking, a common ensemble technique on Kaggle, to submit my very first result to the leaderboard (LB)!

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import re
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Titanic training and test CSV files from the input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Both frames are kept in one list so later feature-engineering loops can update them together.
fullset = [train, test] 
# Bare expression for a quick look at the column names.
train.columns
# Preview the first rows of the training frame.
train.head()

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')


The following EDA and feature engineering are inspired by Sina: https://www.kaggle.com/sinakhorami/titanic-best-working-classifier


In [None]:
# Feature engineering, one column at a time.
#   PassengerId: left untouched at this stage.
#   Survived:    the prediction target.
#   Pclass:      missing-value check below (the author's note: no nulls).
train['Pclass'].isnull().value_counts()

# Sex: encode the two string labels as a binary integer column.
sex_codes = {'male': 0, 'female': 1}
for frame in fullset:
    encoded_sex = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded_sex.astype(int)
    


In [None]:
# Age has missing values, so impute them before binning.
# A dedicated, seeded generator keeps the random imputation reproducible across runs.
age_rng = np.random.RandomState(0)
for dataset in fullset:
    age_mean = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    # Draw integer ages from [mean - std, mean + std). The bounds are truncated to
    # ints explicitly instead of relying on randint to coerce floats.
    age_imp = age_rng.randint(int(age_mean - age_std), int(age_mean + age_std),
                              size=int(age_missing.sum()))

    # One .loc assignment instead of chained indexing (dataset['Age'][mask] = ...),
    # which may write to a temporary copy and leave the frame unchanged.
    dataset.loc[age_missing, 'Age'] = age_imp
    dataset['Age'] = dataset['Age'].astype(int)
    dataset['Categorical_Age'] = pd.qcut(dataset['Age'], 4)

print(train[['Survived','Categorical_Age']].groupby(['Categorical_Age'],as_index = False).mean())

# Replace the raw age with an ordinal bin code 0..3 (cut points 21 / 28 / 38).
# The oldest bin is coded 3 so the codes are contiguous, matching the 0..3 Fare codes.
for dataset in fullset:
    dataset.loc[dataset['Age']<=21.0,'Age'] = 0
    dataset.loc[(dataset['Age']>21.0)&(dataset['Age']<=28.0),'Age'] = 1
    dataset.loc[(dataset['Age']>28.0)&(dataset['Age']<=38.0),'Age'] = 2
    dataset.loc[dataset['Age']>38.0,'Age'] = 3
    dataset['Age'] = dataset['Age'].astype(int)

# Bin counts for the test frame, named explicitly instead of relying on the
# loop variable that leaks out of the for-loop above.
print(test['Age'].value_counts())

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
for frame in fullset:
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

# Mean survival rate for each family size in the training set.
family_survival = train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()
print (family_survival)

In [None]:
# Summary statistics of the Fare column in the training set.
train['Fare'].describe()

In [None]:
# Fare: fill the gap in the test set, then turn the fare into an ordinal bin code.

# Missing-value check on the test fares (the author's note: a single value is missing).
test['Fare'].isnull().value_counts()
# Mean imputation using the test set's own mean fare.
test_fare_mean = test['Fare'].mean()
test['Fare'] = test['Fare'].fillna(test_fare_mean)

# Quartile intervals, kept in a separate column for inspection.
for frame in fullset:
    frame['Categorical_Fare'] = pd.qcut(frame['Fare'], 4)

fare_survival = train[['Survived','Categorical_Fare']].groupby('Categorical_Fare',as_index = False).mean()
print(fare_survival)
for frame in (train, test):
    print(frame['Categorical_Fare'].value_counts())

# Overwrite Fare with a code 0..3 using the cut points 7.91 / 14.454 / 31.0.
# All masks are computed from the raw fares before any value is overwritten.
for frame in fullset:
    raw_fare = frame['Fare'].copy()
    in_bin0 = raw_fare <= 7.91
    in_bin1 = (raw_fare > 7.91) & (raw_fare <= 14.454)
    in_bin2 = (raw_fare > 14.454) & (raw_fare <= 31.0)
    in_bin3 = raw_fare > 31.0
    for code, mask in enumerate((in_bin0, in_bin1, in_bin2, in_bin3)):
        frame.loc[mask, 'Fare'] = code
    frame['Fare'] = frame['Fare'].astype(int)

print(train['Fare'].value_counts())
    

In [None]:
# Cabin Number
# Count missing versus present Cabin entries in the training set.
train.Cabin.isnull().value_counts()
# perhaps no need for using this feature??

In [None]:
# Embarked: check for gaps, fill them in the training set, then integer-encode.
# (Author's notes: train has 2 missing values, test has none.)
train['Embarked'].isnull().value_counts()
test['Embarked'].isnull().value_counts()

# Fill missing ports with 'S' (presumably the most frequent port - confirm).
train['Embarked'] = train['Embarked'].fillna('S')

# Port letter -> integer code.
embarked_mapping = {'C':0,'Q':1,'S':2}
for frame in fullset:
    port_codes = frame['Embarked'].map(embarked_mapping)
    frame['Embarked'] = port_codes.astype(int)

embarked_survival = train[['Survived','Embarked']].groupby(['Embarked'],as_index = False).mean()
print(embarked_survival)

In [None]:
# credit to Sina 
def get_title(name):
    """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The matched title without the trailing period, or ``""`` when the
        name contains no such pattern.
    """
    # Raw string: in a normal string literal "\." is an invalid escape sequence,
    # which newer Python versions warn about.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Derive a Title column from every passenger name in both frames.
for frame in fullset:
    frame['Title'] = frame['Name'].map(get_title)

# Cross-tabulate titles against the (already encoded) Sex column of the training set.
title_by_sex = pd.crosstab(train['Title'], train['Sex'])
print(title_by_sex)

In [None]:
# Collapse the raw titles into a handful of hand-picked groups
# (a manual grouping that differs from the referenced kernel).
title_groups = {
    'Military': ['Capt','Major','Col'],
    # NOTE(review): 'Master' is grouped with the nobility titles here - confirm this is intended.
    'Nobility': ['Countess','Don','Dona','Jonkheer','Lady','Master','Sir'],
    'Educated': ['Dr','Rev'],
    # Spelling variants folded into their common form.
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
}
# Flatten to a raw-title -> group lookup; no group name is itself a raw key,
# so one replace call gives the same result as replacing group by group.
title_lookup = {raw: group for group, raws in title_groups.items() for raw in raws}
for frame in fullset:
    frame['Title'] = frame['Title'].replace(title_lookup)

title_survival = train[['Survived','Title']].groupby(['Title'],as_index = False).mean()
print(title_survival)
print(test['Title'].value_counts())

In [None]:
# Integer code for each grouped title.
title_mapping = {'Educated': 0, 'Military':1,'Miss':2,'Mr':3,'Mrs':4,'Nobility':5}
for frame in fullset:
    title_codes = frame['Title'].map(title_mapping)
    # astype(int) raises if any title was not in the mapping (it would be NaN here).
    frame['Title'] = title_codes.astype(int)

print(train['Title'].value_counts())

In [None]:
# Build the model inputs: drop every column that is not a feature and keep
# only the underlying numpy arrays.
unused_columns = ['Name','Ticket','Categorical_Age','Categorical_Fare',
                  'PassengerId','SibSp','Parch','Cabin']

# Target vector.
train_y = train['Survived'].values
# Training features: everything except the target and the unused columns.
train_X = train.drop(['Survived'] + unused_columns, axis = 1).values
# Test features: same columns (the test frame has no 'Survived').
test_X = test.drop(unused_columns, axis = 1).values



Investigate different classifiers:
* Logistic Regression
* SVM
* Decision Tree
* Random Forest
* AdaBoost
* Gradient Boosting Classifier
* Multilayer perceptron
* Gaussian Naive Bayes
* Linear Discriminant Analysis
* K-Nearest Neighbor




In [None]:
# import model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
SEED = 0
# Ten random stratified 90/10 splits of the training data.
sss = StratifiedShuffleSplit(n_splits = 10,test_size= 0.1,random_state = SEED)
# Candidate models, all with library-default hyper-parameters.
classifiers = [SVC(),
               RandomForestClassifier(),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               LogisticRegression(),
               GaussianNB(),
               KNeighborsClassifier(),
               LinearDiscriminantAnalysis(),
               MLPClassifier(),
               DecisionTreeClassifier()]

# classifier class name -> accuracy summed over all splits (averaged below)
acc_table = {}
for train_index, test_index in sss.split(train_X,train_y):
    train_X_cv, test_X_cv = train_X[train_index],train_X[test_index]
    train_y_cv, test_y_cv = train_y[train_index],train_y[test_index]
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(train_X_cv,train_y_cv)
        predict_y = clf.predict(test_X_cv)
        acc = accuracy_score(test_y_cv,predict_y)
        acc_table[name] = acc_table.get(name, 0.0) + acc

# Each classifier was scored once per split, so the mean is the sum divided by
# the number of splits - not by the number of classifiers, which only gave the
# right answer because both happened to be 10.
n_splits = sss.get_n_splits()
for name in acc_table:
    acc_table[name] = acc_table[name] / n_splits
print(acc_table)

In [None]:
# Turn the name -> mean accuracy dict into a two-column frame, best model first.
accuracy_rows = list(acc_table.items())
acc_df = pd.DataFrame(accuracy_rows, columns = ['Classifier','Accuracy'])
acc_df = acc_df.sort_values(by = 'Accuracy', ascending = False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of mean accuracy per classifier.
sns.set_color_codes("muted")  # makes the short code "b" resolve to the muted blue
fig, ax = plt.subplots()
ax.set_xlabel('Accuracy')
ax.set_title('Classifier Accuracy')
sns.barplot(x='Accuracy', y='Classifier', data=acc_df, color="b", ax=ax)

Here, we can use the top five classifiers as our first stacking layer.
Alternatively, we could use all of them and see how things go.
We repeat the fit-and-predict process, but this time we wrap each model in a small helper class to simplify the code.


In [None]:
# from sklearn.model_selection import KFold
# SEED = 0
# kf = KFold(n_splits=5,random_state=SEED)
# kf.get_n_splits(train_X)
# # train_idx, test_idx = kf.split(train_X,train_y)
# type(kf)
# for i, (trainindex, testindex) in enumerate(kf.split(train_X)): 
#     print("%s %s" % (trainindex, testindex))

In [None]:
# Number of resampling rounds used when building the stacking features.
NFOLD = 10
# Seeded stratified shuffle splitter: NFOLD random splits, each holding out 10% of the rows.
sss = StratifiedShuffleSplit(
    n_splits=NFOLD,
    test_size=0.1,
    random_state=SEED,
)
class SklearnHelper(object):
    """Uniform wrapper around an estimator class with a fit/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class (not an instance); it is called as ``clf(**params)``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``. It overrides any
        ``random_state`` entry already present in ``params``.
    params : dict or None, default None
        Keyword arguments for the estimator constructor. The dict is copied,
        so the caller's object is never modified; ``None`` means no extra
        arguments.
    """

    def __init__(self,clf,seed=0,params=None):
        # Work on a copy: the original code wrote 'random_state' into the
        # caller's dict and raised TypeError for the default params=None.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, train_X, train_y):
        """Fit the wrapped estimator; returns None."""
        self.clf.fit(train_X, train_y)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self,x,y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x,y)

    def feature_importances(self,x,y):
        """Refit on ``(x, y)`` and print the estimator's ``feature_importances_``."""
        print(self.clf.fit(x,y).feature_importances_)
    
def get_oof(clf, x_train, y_train, x_test):
    """Build out-of-fold stacking features for one base model.

    The training rows are split into ``NFOLD`` stratified, non-overlapping
    folds. For each fold the model is trained on the remaining folds, then
    predicts the held-out fold and the full test set.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.
    x_train : numpy.ndarray
        Training feature matrix, indexed by row.
    y_train : numpy.ndarray
        Training labels, aligned with ``x_train``.
    x_test : numpy.ndarray
        Test feature matrix.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors: one out-of-fold
        prediction per training row, and the per-fold test predictions
        averaged over the folds.
    """
    # Function-scope import so this definition does not depend on another cell.
    from sklearn.model_selection import StratifiedKFold

    # Sizes come from the arguments, not from notebook globals set in a later cell.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    # K-fold puts every training row in exactly one held-out fold. The shuffle
    # splitter used before draws overlapping random test sets, so many rows
    # never received a prediction and kept the initial 0.
    folds = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=SEED)

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((NFOLD, n_test))

    for i, (train_index, test_index) in enumerate(folds.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the NFOLD test predictions into a single column.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Hyper-parameters for each first-level model. 'random_state' is not listed
# here because SklearnHelper sets it from the seed argument.

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start must be off: the same estimator is refit on several folds, and
    # with warm_start=True a refit that does not raise n_estimators trains no
    # new trees, so later folds would reuse the forest from the first fit.
    'warm_start': False,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}
# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters
svc_params = {
    'C' : 0.025
    }
# Logistic regression, multilayer perceptron and decision tree parameters
logreg_params = {'max_iter' : 100}
mlp_params = {'verbose' : 0}
dt_params = {'min_samples_split' : 2}

# One seeded wrapper per base model.
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
rf = SklearnHelper(clf = RandomForestClassifier, seed=SEED,params=rf_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
logreg = SklearnHelper(clf=LogisticRegression, seed=SEED, params=logreg_params)
mlp = SklearnHelper(clf=MLPClassifier, seed=SEED, params=mlp_params)
dt = SklearnHelper(clf=DecisionTreeClassifier, seed=SEED, params=dt_params)

In [None]:
# First-level inputs: the engineered feature arrays.
x_train, y_train, x_test = train_X, train_y, test_X

# Row counts of the train and test matrices.
ntrain, ntest = x_train.shape[0], x_test.shape[0]

# Out-of-fold train predictions and fold-averaged test predictions per base model.
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test)              # gradient boosting
svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test)           # support vector classifier
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test)              # random forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)           # AdaBoost
logreg_oof_train, logreg_oof_test = get_oof(logreg, x_train, y_train, x_test)  # logistic regression
mlp_oof_train, mlp_oof_test = get_oof(mlp, x_train, y_train, x_test)           # multilayer perceptron
dt_oof_train, dt_oof_test = get_oof(dt, x_train, y_train, x_test)              # decision tree
print("Training is complete")

In [None]:
# Collect every model's out-of-fold training predictions, one column per model.
oof_by_model = {
    'RandomForest': rf_oof_train,
    'AdaBoost': ada_oof_train,
    'GradientBoost': gb_oof_train,
    'DecisionTrees': dt_oof_train,
    'MLP': mlp_oof_train,
    'LogReg': logreg_oof_train,
    'SVC': svc_oof_train,
}
base_predictions_train = pd.DataFrame(
    {label: oof.ravel() for label, oof in oof_by_model.items()}
)
base_predictions_train.head()

# Heatmap of the pairwise correlation between the base models' predictions.
py.init_notebook_mode(connected=True)
model_labels = base_predictions_train.columns.values
prediction_corr = base_predictions_train.astype(float).corr().values
heatmap = go.Heatmap(
    z=prediction_corr,
    x=model_labels,
    y=model_labels,
    colorscale='Viridis',
    showscale=True,
    reversescale=True,
)
data = [heatmap]
py.iplot(data, filename='labelled-heatmap')

In [None]:
# Second-level feature matrices: one column per base model.
# The column order must be identical for train and test
# (rf, ada, gb, dt, mlp, logreg, svc). The test matrix previously contained the
# MLP column twice and no logistic-regression column.
x_train = np.concatenate((rf_oof_train, ada_oof_train, gb_oof_train, dt_oof_train,mlp_oof_train,logreg_oof_train,
                          svc_oof_train), axis=1)
x_test = np.concatenate((rf_oof_test, ada_oof_test, gb_oof_test, dt_oof_test,mlp_oof_test,logreg_oof_test,
                         svc_oof_test), axis=1)

In [None]:
# Second-level model: an XGBoost classifier trained on the stacked predictions.
gbm = xgb.XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
# fit() returns the estimator itself, so fitting in a separate statement leaves
# `gbm` bound to the same fitted classifier.
gbm.fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [None]:
# Pair each test passenger id with its predicted label and write the submission file.
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
}
SubmissionCL = pd.DataFrame(submission_columns)
SubmissionCL.to_csv("SubmissionCL.csv", index=False)
