In [5]:
from utils import css_from_file
# Apply the notebook's custom stylesheet. `css_from_file` comes from the local
# `utils` module; presumably it returns or injects the CSS so that it is
# rendered as this cell's output — confirm in utils.py.
css_from_file('style/style.css')

In [6]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

def load(path):
    """Read a CSV and split it into a feature frame and the "Activity" target.

    Parameters
    ----------
    path : anything accepted by ``pd.read_csv``
        Location of the CSV file.

    Returns
    -------
    (DataFrame, Series)
        Every column except "Activity", and the "Activity" column itself.
        When the file has no "Activity" column the returned target is all NaN.
    """
    target_name = "Activity"
    frame = pd.read_csv(path)
    has_target = target_name in frame.columns
    if not has_target:
        # Unlabeled file: add a NaN placeholder so callers always get a target.
        frame[target_name] = np.nan
    features = frame.drop(target_name, axis=1)
    target = frame[target_name]
    return features, target
    
# Load both splits as (features, target) pairs. `load` fills the target with
# NaN when a file has no "Activity" column (presumably the case for the test
# split — confirm against data/boehringer/test.csv).
X_tr, y_tr = load("data/boehringer/train.csv")
X_te, y_te = load("data/boehringer/test.csv")

# Sanity check: both splits should report the same number of feature columns.
print("training data shape", X_tr.shape)
print("testing data shape", X_te.shape)

training data shape (3751, 1776)
testing data shape (2501, 1776)


Exercise
----------------

Using your knowledge from 2.1, try to combine several models into a single solution.
The simplest way to do this is to use `sklearn.ensemble.VotingClassifier`.

1. Read the documentation http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier

2. Which method is better for voting (`hard` or `soft`)? Why?

3. Use more than 3 different algorithms. Compare the performance of each one on its own with the performance of the combined `VotingClassifier`.

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from cross_validation import cross_val_apply
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC


# Candidate models for the ensemble, as (name, estimator) pairs.
# Names must be unique: VotingClassifier rejects duplicate estimator names (they
# are the keys used for `named_estimators` / `set_params`), and distinct labels
# keep the scores printed below distinguishable.
clfs = [
    ('rf_1',RandomForestClassifier(n_estimators=100,n_jobs=1)),
    ('rf_2',RandomForestClassifier(n_estimators=100,n_jobs=1))
]

# Soft voting combines the predicted class probabilities of the base models,
# which is what the log-loss evaluation below requires.
voting_clf = ('voting',VotingClassifier(estimators=clfs,voting='soft'))

# Score every base model on its own first, then the ensemble.
for clf_name, clf in clfs + [voting_clf]:
    print(clf_name)
    # `cross_val_apply` is a project helper; presumably it returns out-of-fold
    # `predict_proba` outputs aligned with y_tr — confirm in cross_validation.py.
    oof_predictions = cross_val_apply(clf, X_tr, y_tr, cv=4,
                                      n_jobs=-1, decision_func="predict_proba")

    err = log_loss(y_tr, oof_predictions)
    print("Your error is", err)

rf
Your error is 0.469467792644
rf
Your error is 0.469467792644
voting
Your error is 0.459418713915


Double click to see the answers

<div class="spoiler">
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from cross_validation import cross_val_apply
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

clfs = [  # reference answer: six different model families, each with a unique name
    ('rf',RandomForestClassifier(n_estimators=100,n_jobs=1)),
    ('gbm',GradientBoostingClassifier()),
    ('et',ExtraTreesClassifier(n_estimators=100,n_jobs=1)),
    ('xgb', XGBClassifier()),
    ('bag', BaggingClassifier(n_estimators=100)),
    ('svm', make_pipeline(StandardScaler(), SVC(probability=True)))  # probability=True so SVC exposes predict_proba for soft voting
]

voting_clf = ('voting',VotingClassifier(estimators=clfs,voting='soft'))  # soft voting works on predicted probabilities

for clf_name, clf in clfs + [voting_clf]:  # each base model on its own, then the ensemble
    print(clf_name)
    oof_predictions = cross_val_apply(clf, X_tr, y_tr, cv=4,
                                      n_jobs=-1, decision_func="predict_proba")

    err = log_loss(y_tr, oof_predictions)  # log loss: lower is better
    print("Your error is", err)
</div>

In [10]:
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA


def _pca_xgb_pipeline(n_components):
    """Build a scale -> impute -> PCA(n_components) -> XGBoost pipeline."""
    return make_pipeline(
        StandardScaler(),
        Imputer(),
        PCA(n_components=n_components),
        XGBClassifier(),
    )


# Two variants of the same pipeline that differ only in the PCA dimensionality.
pipe1 = _pca_xgb_pipeline(300)
pipe2 = _pca_xgb_pipeline(100)

# Three random-forest variants plus the two PCA + XGBoost pipelines.
clfs = [
    ("rf_100", RandomForestClassifier(n_estimators=100, n_jobs=1)),
    ("rf_100_depth_20", RandomForestClassifier(n_estimators=100, n_jobs=1, max_depth=20)),
    ("rf_300", RandomForestClassifier(n_estimators=300, n_jobs=1)),
    ("pipe1", pipe1),
    ("pipe2", pipe2),
]

voting_clf = ("voting", VotingClassifier(estimators=clfs, voting="soft"))

# Report the log loss of each base model, then of the soft-voting ensemble.
for clf_name, clf in clfs + [voting_clf]:
    print(clf_name)
    oof_predictions = cross_val_apply(
        clf, X_tr, y_tr, cv=4, n_jobs=-1, decision_func="predict_proba"
    )
    err = log_loss(y_tr, oof_predictions)
    print("Your error is", err)

rf_100
Your error is 0.469467792644
rf_100_depth_20
Your error is 0.461622351538
rf_300
Your error is 0.458681784559
pipe1
Your error is 0.50965683573
pipe2
Your error is 0.512340810285
voting
Your error is 0.468530743728


In [14]:
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA




# Single-model experiment: one large bagging ensemble, scored on its own and
# again wrapped in a (one-member) soft-voting classifier.
clfs = [
    # Disabled entries; pipe3 / pipe4 are not defined in the visible cells.
    #('pipe3', pipe3),
    #('pipe4', pipe4),
    ("bc", BaggingClassifier(n_estimators=300)),
]

voting_clf = ("voting", VotingClassifier(estimators=clfs, voting="soft"))

# Report the log loss of the base model, then of the voting wrapper.
for clf_name, clf in clfs + [voting_clf]:
    print(clf_name)
    oof_predictions = cross_val_apply(
        clf, X_tr, y_tr, cv=4, n_jobs=-1, decision_func="predict_proba"
    )
    err = log_loss(y_tr, oof_predictions)
    print("Your error is", err)

bc
Your error is 0.454333831858
sgd


AttributeError: predict_proba not implemented in estimator

In [15]:
# Two random forests and two bagging ensembles, as (name, estimator) pairs.
# Each entry needs its own name: VotingClassifier rejects duplicate estimator
# names, and unique labels make the printed scores attributable to a model.
clfs = [
    ('rf_100',RandomForestClassifier(n_estimators=100,n_jobs=1)),
    ('rf_300_entropy',RandomForestClassifier(n_estimators=300,n_jobs=1, criterion = 'entropy', max_depth = 50,
                                 max_features = 350, random_state = 123)),
    ('bc_300', BaggingClassifier(n_estimators = 300)),
    ('bc_100', BaggingClassifier(n_estimators = 100))
]

# Soft voting combines the predicted class probabilities of the base models.
voting_clf = ('voting',VotingClassifier(estimators=clfs,voting='soft'))

# Score every base model on its own first, then the ensemble.
for clf_name, clf in clfs + [voting_clf]:
    print(clf_name)
    # `cross_val_apply` is a project helper; presumably it returns out-of-fold
    # `predict_proba` outputs aligned with y_tr — confirm in cross_validation.py.
    oof_predictions = cross_val_apply(clf, X_tr, y_tr, cv=4,
                                      n_jobs=-1, decision_func="predict_proba")

    err = log_loss(y_tr, oof_predictions)
    print("Your error is", err)
    
    
# Stand-alone random-forest configuration. It rebinds `clf` (the name the loop
# above uses as its loop variable) but is not fitted or scored anywhere in the
# visible cells; the same hyper-parameters reappear as `belal` in the last cell.
# NOTE(review): looks like a leftover experiment — confirm whether it is needed.
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, criterion='entropy', max_depth=20, 
                                 min_samples_split=2, min_samples_leaf=1, max_features=250, 
                                 max_leaf_nodes=300, bootstrap=True, 
                                 oob_score=False, random_state=123, 
                                 verbose=0, warm_start=False, class_weight=None)

rf
Your error is 0.469467792644
rf
Your error is 0.442813586654
bc
Your error is 0.454333831858
bc
Your error is 0.483269833631
voting
Your error is 0.445699477614


In [18]:
# Three XGBoost models that differ only in tree depth, as (name, estimator)
# pairs. The depth is encoded in the name so that every estimator name is
# unique (VotingClassifier rejects duplicates) and each printed score can be
# matched to its configuration.
clfs = [
    ('xgbc_depth_50', XGBClassifier(n_estimators = 300, max_depth = 50, seed = 123)),
    ('xgbc_depth_30', XGBClassifier(n_estimators = 300, max_depth = 30, seed = 123)),
    ('xgbc_depth_10', XGBClassifier(n_estimators = 300, max_depth = 10, seed = 123))
]

# Soft voting combines the predicted class probabilities of the base models.
voting_clf = ('voting',VotingClassifier(estimators=clfs,voting='soft'))

# Score every base model on its own first, then the ensemble.
for clf_name, clf in clfs + [voting_clf]:
    print(clf_name)
    # `cross_val_apply` is a project helper; presumably it returns out-of-fold
    # `predict_proba` outputs aligned with y_tr — confirm in cross_validation.py.
    oof_predictions = cross_val_apply(clf, X_tr, y_tr, cv=4,
                                      n_jobs=-1, decision_func="predict_proba")

    err = log_loss(y_tr, oof_predictions)
    print("Your error is", err)

xgbc
Your error is 0.586488637215
xgbc
Your error is 0.589734950514
xgbc
Your error is 0.57011577935
voting
Your error is 0.571336008015


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [None]:
### 0.4882 score (originally labelled "accuracy"; presumably a CV log loss like the
### other scores in this notebook — confirm), but can be useful in the ensemble
# NOTE(review): make_union, RandomTreesEmbedding, LazyTransformer, VarianceThreshold
# and MLPClassifier are not imported or defined in the visible cells, and this cell
# has never been executed (In [None]); on a fresh kernel it raises NameError unless
# those names are provided elsewhere.
# Bagging ensemble of 10 copies of the pipeline below, each fitted on a 75% draw of
# the samples and a 75% draw of the features.
nn_forest = BaggingClassifier(make_pipeline(
                        # Feature union: random-tree embedding alongside LazyTransformer's
                        # output (presumably a pass-through of the raw features — confirm).
                        make_union(RandomTreesEmbedding(n_estimators=10), 
                                   LazyTransformer()),
                        # with_mean=False: scale without centering (presumably because the
                        # tree embedding is sparse — confirm).
                        StandardScaler(with_mean=False), 
                        # Drop features whose variance does not exceed 0.001.
                        VarianceThreshold(0.001),
                        # Small MLP: one hidden layer of 25 units, alpha=10.0.
                        MLPClassifier((25,), alpha=10.0, verbose=False)), 
                        max_samples=0.75,
                        max_features=0.75,
                        n_estimators=10)

# Hand-tuned random forests, each named (presumably) after its author. The number
# above each one is the score its author noted; karthick's matches the 4-fold CV
# log loss printed earlier in this notebook for the same configuration.

# 0.4678
greg = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=4,
    min_samples_leaf=2,
)

# 0.4428
karthick = RandomForestClassifier(
    n_estimators=300,
    n_jobs=1,
    criterion="entropy",
    max_depth=50,
    max_features=350,
    random_state=123,
)

# 0.447
belal = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    criterion="entropy",
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=250,
    max_leaf_nodes=300,
    bootstrap=True,
    oob_score=False,
    random_state=123,
    verbose=0,
    warm_start=False,
    class_weight=None,
)


