In [4]:
from utils import css_from_file
# Hand the notebook's stylesheet to the project helper from utils.
# NOTE(review): presumably this injects the CSS into the rendered notebook — confirm in utils.css_from_file.
css_from_file('style/style.css')

In [5]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

def load(path):
    """Read a CSV file and split it into features and the ``Activity`` target.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The frame without the ``Activity`` column, and the ``Activity``
        column itself. When the file has no ``Activity`` column, the
        returned series is filled with NaN (one entry per row).
    """
    frame = pd.read_csv(path)
    has_target = "Activity" in frame.columns
    if not has_target:
        # Unlabelled file: add a NaN target so callers always get a (features, target) pair.
        frame = frame.assign(Activity=np.nan)
    target = frame["Activity"]
    features = frame.drop(columns=["Activity"])
    return features, target
    
# Load both splits; each load() call returns (feature frame, Activity series).
X_tr, y_tr = load("data/boehringer/train.csv")
# If test.csv has no "Activity" column, load() returns an all-NaN y_te.
X_te, y_te = load("data/boehringer/test.csv")

# Print both shapes so the row counts and feature counts can be compared at a glance.
print("training data shape", X_tr.shape)
print("testing data shape", X_te.shape)

training data shape (3751, 1776)
testing data shape (2501, 1776)


Exercise
----------------

Using your knowledge from 2.1, try to combine many models into a single solution.
The simplest way to do this is to use `sklearn.ensemble.VotingClassifier`.

1. Read the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier

2. Which method is better for voting (`hard` or `soft`)? Why?

3. Use more than 3 different algorithms. Check their performance both individually and combined in a `VotingClassifier`.

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from cross_validation import cross_val_apply
#from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC


# One shared seed for every stochastic estimator, so the scores printed below
# are repeatable after a kernel restart. Previously only the random forest was
# seeded, so the other models (and therefore the ensemble) changed on each run.
# Scores will differ slightly from earlier, unseeded runs.
# NOTE(review): if cross_val_apply shuffles its folds internally, that split
# needs a seed too — confirm in cross_validation.cross_val_apply.
SEED = 123

# (name, estimator) pairs, the format VotingClassifier expects for `estimators`.
clfs = [
    ('rf', RandomForestClassifier(n_estimators=300, n_jobs=1, criterion='entropy',
                                  max_depth=50, max_features=350, random_state=SEED)),
    ('gbm', GradientBoostingClassifier(random_state=SEED)),
    ('et', ExtraTreesClassifier(n_estimators=100, n_jobs=1, random_state=SEED)),
    ('bag', BaggingClassifier(n_estimators=100, random_state=SEED)),
    # probability=True makes SVC expose predict_proba, which soft voting and
    # log_loss both need; features are standardised before reaching the SVM.
    ('svm', make_pipeline(StandardScaler(),
                          SVC(probability=True, random_state=SEED))),
]

# Soft voting averages the members' predicted class probabilities.
voting_clf = ('voting', VotingClassifier(estimators=clfs, voting='soft'))

# Score every individual model, then the ensemble, with the same procedure.
for clf_name, clf in clfs + [voting_clf]:
    print(clf_name)
    # NOTE(review): presumably returns out-of-fold predict_proba outputs for
    # every training row (4 folds) — confirm in cross_validation.cross_val_apply.
    oof_predictions = cross_val_apply(clf, X_tr, y_tr, cv=4,
                                      n_jobs=-1, decision_func="predict_proba")

    err = log_loss(y_tr, oof_predictions)
    print("Your error is", err)

rf
Your error is 0.442813586654
gbm
Your error is 0.478147807518
et
Your error is 0.519231127932
bag
Your error is 0.482395310289
svm
Your error is 0.475816755304
voting
Your error is 0.446862440071


Double-click to see the answers.

<div class="spoiler">
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from cross_validation import cross_val_apply
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

clfs = [  # (name, estimator) pairs, the format VotingClassifier takes as `estimators`
    ('rf',RandomForestClassifier(n_estimators=100,n_jobs=1)),
    ('gbm',GradientBoostingClassifier()),
    ('et',ExtraTreesClassifier(n_estimators=100,n_jobs=1)),
    ('xgb', XGBClassifier()),  # needs the optional xgboost package imported above
    ('bag', BaggingClassifier(n_estimators=100)),
    ('svm', make_pipeline(StandardScaler(), SVC(probability=True)))  # probability=True so the SVM exposes predict_proba
]  # NOTE(review): no random_state is set on any estimator, so scores will vary between runs

voting_clf = ('voting',VotingClassifier(estimators=clfs,voting='soft'))  # soft voting averages predicted probabilities

for clf_name, clf in clfs + [voting_clf]:  # each model on its own, then the ensemble
    print(clf_name)
    oof_predictions = cross_val_apply(clf, X_tr, y_tr, cv=4,  # presumably out-of-fold probabilities — confirm in cross_validation
                                      n_jobs=-1, decision_func="predict_proba")

    err = log_loss(y_tr, oof_predictions)  # lower is better
    print("Your error is", err)
</div>

In [None]:
# scikit-learn names this cell uses that no earlier cell imports; without them
# the cell fails with NameError on a fresh kernel. BaggingClassifier,
# make_pipeline and StandardScaler come from the imports in the ensemble cell above.
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.feature_selection import VarianceThreshold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_union

# NOTE(review): LazyTransformer is not defined or imported anywhere in the
# visible notebook. Presumably it is a project-local pass-through transformer
# that keeps the raw features alongside the tree embedding — import it from
# its module before running this cell.

# A bagged ensemble of small neural networks. Each bag member is a pipeline:
#   1. make_union concatenates the RandomTreesEmbedding output with the
#      LazyTransformer output,
#   2. StandardScaler(with_mean=False) scales without centring (the setting
#      that also works on sparse input),
#   3. VarianceThreshold(0.001) drops features whose variance is at or below 0.001,
#   4. MLPClassifier with one hidden layer of 25 units and L2 penalty alpha=10.0.
nn_forest = BaggingClassifier(
    make_pipeline(
        make_union(RandomTreesEmbedding(n_estimators=10),
                   LazyTransformer()),
        StandardScaler(with_mean=False),
        VarianceThreshold(0.001),
        MLPClassifier((25,), alpha=10.0, verbose=False),
    ),
    max_samples=0.75,   # each member trains on 75% of the rows
    max_features=0.75,  # ...and 75% of the columns
    n_estimators=10,
)