In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('bmh')
%matplotlib inline
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold

#### Details of the data manipulation code: please see the file in the same folder named 'Ensemble method 1 - (soft) Voting Classifier'.
#### To focus on the Stacking method and implement it quickly, all the data cleaning code is put together in a single cell below.

In [44]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
combine = [train, test]  # holds the same objects as `train`/`test`, so column edits below reach both names

# Bin edges for the categorical versions of 'Fare' and 'Age'. pd.cut uses
# right-closed intervals, so a value equal to an edge falls in the lower bin
# (e.g. Fare == 10.5 -> 0, Age == 16 -> 0); the bin index is the category code.
fare_edges = [-np.inf, 10.5, 21.679, 39.688, 512.329, np.inf]
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]

# Dedicated seeded generator so the random 'Age' imputation is repeatable on
# Restart & Run All (2017 is the same value the notebook later uses as SEED).
age_rng = np.random.RandomState(2017)

for df in combine:
    # 'FamilySize' counts siblings/spouses + parents/children + the passenger;
    # 'Alone' is 1 when that count is exactly 1, else 0.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['Alone'] = (df['FamilySize'] == 1).astype(int)

    # Fill missing 'Embarked' with the most frequent value of this frame.
    # Assign the result back rather than calling fillna(inplace=True) on a
    # column selection, which is not guaranteed to update the parent frame.
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Fill missing 'Fare' with this frame's median, then replace it by its bin index.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Fare'] = pd.cut(df['Fare'], bins=fare_edges, labels=False)

    # Fill missing 'Age' with random integers drawn from [mean - std, mean + std)
    # of this frame's observed ages, then replace it by its bin index.
    avg = df['Age'].mean()
    std = df['Age'].std()
    missing_age = df['Age'].isnull()
    df.loc[missing_age, 'Age'] = age_rng.randint(int(avg - std), int(avg + std), missing_age.sum())
    df['Age'] = pd.cut(df['Age'].astype(int), bins=age_edges, labels=False)
    
#for df in combine:
#    df['Age*Pclass'] = df['Age'] * df['Pclass']
    
#for df in combine:
#    df['Age*Fare'] = df['Age'] * df['Fare']
    
import re
def only_title(name):
    """Extract the title (e.g. 'Mr', 'Miss') from a passenger name string.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    name : str
        Full name text to search.

    Returns
    -------
    str or None
        The matched title without the surrounding space and period, or None
        when the name contains no such pattern.
    """
    # Raw string: '\.' in a plain string literal is an invalid escape sequence.
    match = re.search(r' ([A-Za-z]+)\.', name)
    return match.group(1) if match else None
    
for df in combine:
    # Derive 'Title' from 'Name', fold the uncommon titles into 'Rare', and
    # map the alternative spellings onto 'Miss' / 'Mrs'.
    df['Title'] = (
        df['Name']
        .apply(only_title)
        .replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                  'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        .replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    )

############ Encoding features, make them ready for classifiers
# Columns that are not fed to the classifiers.
feature_drop = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'FamilySize']
for df in combine:
    # Dropped in place so that the `train` and `test` names see the change too.
    df.drop(columns=feature_drop, inplace=True)

def encode_features(train, test):
    """Label-encode the categorical columns of both frames with shared encoders.

    For each of 'Sex', 'Embarked', 'Fare', 'Age' and 'Title', one
    LabelEncoder is fitted on the values of train and test stacked together
    and then used to overwrite that column in both frames.

    Both input frames are modified in place and also returned as
    ``(train, test)``.
    """
    features = ['Sex', 'Embarked', 'Fare', 'Age', 'Title']
    pooled = pd.concat([train[features], test[features]])

    for column in features:
        encoder = preprocessing.LabelEncoder().fit(pooled[column])
        for frame in (train, test):
            frame[column] = encoder.transform(frame[column])
    return train, test

train, test = encode_features(train, test)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Alone,Title
0,0,3,1,1,0,2,0,2
1,1,1,0,2,3,0,0,3
2,1,3,0,1,0,2,1,1
3,1,1,0,2,3,2,0,3
4,0,3,1,2,0,2,1,2


## Ensembling & Stacking models

In [45]:
# Create NumPy arrays of train, test and target (Survived) dataframes to feed into our models
# (to_numpy() replaces Series.ravel(), which recent pandas versions deprecate).
y_train = train['Survived'].to_numpy()
train = train.drop(['Survived'], axis=1)
X_train = train.values  # Creates an array of the train data
X_test = test.values  # Creates an array of the test data

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(891, 7)
(891,)
(418, 7)


In [46]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Wrap an estimator class behind a small, uniform train/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)`` with
        ``random_state`` added to the keyword arguments.
    seed : int, default 2017
        Value passed to the estimator as ``random_state``.
    params : dict or None, default None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is left unchanged; None means no extra arguments.
    """

    def __init__(self, clf, seed=2017, params=None):
        # Work on a copy: avoids mutating the caller's dict and makes the
        # params=None default usable (item assignment on None would fail).
        settings = dict(params) if params is not None else {}
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, X_train, y_train):
        """Fit the wrapped estimator; returns None."""
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit on ``(x, y)``, print the feature importances and return them.

        Note that this refits the wrapped estimator, replacing any earlier fit.
        """
        importances = self.clf.fit(x, y).feature_importances_
        print(importances)
        return importances

### Define Out-of-Fold Predictions

In [47]:
##### KFold output example:
#kf = KFold(n_splits=3)
#for i, (j, k) in enumerate(kf.split(train[:13])):
#    print(i, (j, k))

# Out-of-Fold Predictions 
ntrain = train.shape[0]  # number of training rows
ntest = test.shape[0]    # number of test rows
SEED = 2017
# No random_state here: without shuffle=True the splits are the same
# consecutive blocks on every run, and current scikit-learn raises a
# ValueError when random_state is given while shuffle is False.
kf = KFold(n_splits=5)

def get_oof(clf, X_train, y_train, X_test, cv=None):
    """Compute out-of-fold predictions for one first-level model.

    For every fold the model is trained on the in-fold rows of ``X_train`` and
    then asked to predict (a) the held-out rows and (b) all of ``X_test``.

    Parameters
    ----------
    clf : object
        Model exposing ``train(X, y)`` and ``predict(X)``.
    X_train, y_train : numpy.ndarray
        Training features and targets, indexed by the fold indices.
    X_test : numpy.ndarray
        Test features, predicted once per fold.
    cv : splitter or None, default None
        Object with ``split(X)`` and ``get_n_splits(X)``. When None, the
        module-level ``kf`` is used.

    Returns
    -------
    (oof_train, oof_test) : tuple of numpy.ndarray
        ``oof_train`` has shape ``(len(X_train), 1)`` and holds each row's
        prediction from the fold in which it was held out. ``oof_test`` has
        shape ``(len(X_test), 1)`` and holds the mean of the per-fold
        predictions for each test row.
    """
    splitter = kf if cv is None else cv
    # Take sizes from the arguments and the splitter rather than from globals
    # or a literal fold count, so the buffers always match what is iterated.
    n_folds = splitter.get_n_splits(X_train)
    oof_train = np.zeros((X_train.shape[0],))
    oof_test_skf = np.empty((n_folds, X_test.shape[0]))

    for i, (train_index, test_index) in enumerate(splitter.split(X_train)):
        clf.train(X_train[train_index], y_train[train_index])

        oof_train[test_index] = clf.predict(X_train[test_index])  # fills only this fold's held-out rows
        oof_test_skf[i, :] = clf.predict(X_test)                  # one row of test predictions per fold

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

## First Level Base Models

##### Random Forest classifier + Extra Trees classifier + AdaBoost classifier + Gradient Boosting classifier

In [48]:
# Keyword arguments for each first-level estimator.

# Random Forest
rf_params = dict(n_jobs=-1, n_estimators=20, criterion='entropy')

# Extra Trees
et_params = dict(n_jobs=-1, n_estimators=200, max_depth=8, min_samples_leaf=2)

# AdaBoost
ada_params = dict(n_estimators=90, learning_rate=0.9)

# Gradient Boosting
gb_params = dict(n_estimators=20, max_depth=5, min_samples_leaf=2)

In [49]:
# Wrap each of the 4 estimator classes in a SklearnHelper, all with the same seed.
rf, et, ada, gb = (
    SklearnHelper(clf=estimator, seed=SEED, params=settings)
    for estimator, settings in (
        (RandomForestClassifier, rf_params),
        (ExtraTreesClassifier, et_params),
        (AdaBoostClassifier, ada_params),
        (GradientBoostingClassifier, gb_params),
    )
)

In [50]:
# Out-of-fold train/test predictions for each first-level model, computed in
# the order Extra Trees, Random Forest, AdaBoost, Gradient Boost.
(
    (et_oof_train, et_oof_test),    # Extra Trees
    (rf_oof_train, rf_oof_test),    # Random Forest
    (ada_oof_train, ada_oof_test),  # AdaBoost
    (gb_oof_train, gb_oof_test),    # Gradient Boost
) = [get_oof(model, X_train, y_train, X_test) for model in (et, rf, ada, gb)]

### Feature importances  from 4 classifiers

In [51]:
# Each call refits the wrapped estimator on the full training matrix and
# prints its feature importances, in the order RF, ET, AdaBoost, GB.
rf_feature, et_feature, ada_feature, gb_feature = [
    model.feature_importances(X_train, y_train) for model in (rf, et, ada, gb)
]

[ 0.17696528  0.23800994  0.12226612  0.16721264  0.07909706  0.0448232
  0.17162576]
[ 0.17966485  0.4463214   0.04482821  0.10127096  0.04270606  0.03838342
  0.1468251 ]
[ 0.14444444  0.11111111  0.06666667  0.13333333  0.12222222  0.08888889
  0.33333333]
[ 0.19752195  0.49965435  0.05772726  0.11152833  0.04082558  0.01437443
  0.07836811]


In [52]:
# Read the importances straight from the fitted estimators instead of pasting
# numbers copied from printed output, which silently go stale whenever the
# data, the parameters or the library version change. The estimators were
# last fitted on the full training matrix by the feature_importances(...)
# calls in the preceding cell.
rf_features = rf.clf.feature_importances_.tolist()
et_features = et.clf.feature_importances_.tolist()
ada_features = ada.clf.feature_importances_.tolist()
gb_features = gb.clf.feature_importances_.tolist()

In [53]:
cols = train.columns.values
# One row per input feature, one column per model's importance scores.
importance_columns = {
    'features': cols,
    'Random Forest feature importances': rf_features,
    'Extra Trees  feature importances': et_features,
    'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features,
}
feature_dataframe = pd.DataFrame(importance_columns)
feature_dataframe

Unnamed: 0,AdaBoost feature importances,Extra Trees feature importances,Gradient Boost feature importances,Random Forest feature importances,features
0,0.144444,0.179665,0.197522,0.176965,Pclass
1,0.111111,0.446321,0.499654,0.23801,Sex
2,0.066667,0.044828,0.057727,0.122266,Age
3,0.133333,0.101271,0.111528,0.167213,Fare
4,0.122222,0.042706,0.040826,0.079097,Embarked
5,0.088889,0.038383,0.014374,0.044823,Alone
6,0.333333,0.146825,0.078368,0.171626,Title


In [54]:
# Row-wise mean of the four importance columns. numeric_only=True skips the
# string 'features' column, which recent pandas versions otherwise reject
# with a TypeError; dropping any existing 'mean' column keeps the cell safe
# to re-run.
feature_dataframe['mean'] = (
    feature_dataframe
    .drop(columns=['mean'], errors='ignore')
    .mean(axis=1, numeric_only=True)
)
feature_dataframe[['features', 'mean']]

Unnamed: 0,features,mean
0,Pclass,0.174649
1,Sex,0.323774
2,Age,0.072872
3,Fare,0.128336
4,Embarked,0.071213
5,Alone,0.046617
6,Title,0.182538


#### Second-Level Predictions from the First-level Output

In [55]:
# Side-by-side view of each model's out-of-fold training predictions.
base_predictions_train = pd.DataFrame({
    label: oof.ravel()
    for label, oof in (
        ('RandomForest', rf_oof_train),
        ('ExtraTrees', et_oof_train),
        ('AdaBoost', ada_oof_train),
        ('GradientBoost', gb_oof_train),
    )
})
base_predictions_train.head()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest
0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0
2,1.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0


In [32]:
# Second-level inputs: one column per first-level model (ET, RF, AdaBoost, GB).
# NOTE: this rebinds X_train / X_test from the raw features to the stacked predictions.
X_train = np.hstack((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train))
X_test = np.hstack((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test))

# Output

In [41]:
# Second-level model trained on the stacked first-level predictions.
# Seeded like the first-level models so repeated runs give the same fit.
gb_clf = GradientBoostingClassifier(random_state=SEED)
gb_clf.fit(X_train, y_train)
pred = gb_clf.predict(X_test)

In [36]:
# 'PassengerId' was dropped from `test` during preparation, so read it again from the file.
test_id = pd.read_csv('test.csv').loc[:, 'PassengerId']
output = pd.DataFrame({'PassengerId': test_id, 'Survived': pred})

output.to_csv('Predictions.csv', index=False)
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
