In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

Read data from the file __data/boehringer/(train|test).csv__ from the data folder. 

The first column is a binary variable that you want to predict. The rest are numericals

In [3]:
def load(path):
    df = pd.read_csv(path)
    if "Activity" not in df.columns:
        df["Activity"] = np.nan
    return df.drop("Activity",axis=1), df.Activity
    
X_tr, y_tr = load("data/boehringer/train.csv")
X_te, y_te = load("data/boehringer/test.csv")

print("training data shape", X_tr.shape)
print("testing data shape", X_te.shape)

training data shape (3751, 1776)
testing data shape (2501, 1776)


In [4]:
X_tr.head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,0.243144,...,0,0,0,0,0,0,0,0,0,0
1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,0.10648,...,1,1,1,1,0,1,0,0,1,0
2,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,0.352308,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,0.208989,...,0,0,0,0,0,0,0,0,0,0
4,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,0.125177,...,0,0,0,0,0,0,0,0,0,0


Exercise
---------------------

Using the starter code below try to improve the solution

1. What kind of models you can use?
2. Try changing model parameters to get the best cross validation error.
3. Use pipeline to transform features before modeling:
   - use some feature selection mechanism
   - use dimension reduction method (pca, svd, etc)
   
Tip: It is ok to loop over models and datasets like this.

```python
for data in [pipeline_1, pipeline_2, pipeline_3]:
    for model in [model_1, model_2, model_3]:
        # do stuff
```

In [14]:
from sklearn.ensemble import RandomForestClassifier
from cross_validation import cross_val_apply
from sklearn.model_selection import GridSearchCV

params = {'n_estimators' : [100,150,200],
          'min_samples_split' : [2,3,4],
          'min_samples_leaf': [1,2,3]
}

clf = RandomForestClassifier(n_jobs=-1)
gs = GridSearchCV(clf,params)
gs.fit(X_tr,y_tr)

gs.best_params_

{'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 200}

In [16]:
clf = RandomForestClassifier(**gs.best_params_,n_jobs=-1)
%time oof_predictions = cross_val_apply(clf, X_tr, y_tr, decision_func="predict_proba")

err = log_loss(y_tr, oof_predictions)
print("Your error is", err)
if err > 0.5:
    print("You can still improve :)")

CPU times: user 11.3 s, sys: 496 ms, total: 11.8 s
Wall time: 3.07 s
Your error is 0.467801621363


In [19]:
from sklearn.linear_model import LogisticRegression

params = {'penalty' : ['l1','l2'],
          'C' : [.01, .1, 1, 10]    
}

lr = LogisticRegression()
gs = GridSearchCV(lr,params)
gs.fit(X_tr,y_tr)

gs.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [21]:
lr = LogisticRegression(**gs.best_params_,n_jobs=-1)
%time oof_predictions = cross_val_apply(lr, X_tr, y_tr, decision_func="predict_proba")

err = log_loss(y_tr, oof_predictions)
print("Your error is", err)
if err > 0.5:
    print("You can still improve :)")

CPU times: user 2.79 s, sys: 52 ms, total: 2.84 s
Wall time: 962 ms
Your error is 0.523339196322
You can still improve :)


In [41]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

pipeline = make_pipeline(StandardScaler(),
                         PCA(n_components=100),
                         LogisticRegression(n_jobs=-1))
pipeline.fit(X_tr,y_tr)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_in...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [42]:
%time oof_predictions = cross_val_apply(pipeline, X_tr, y_tr, decision_func="predict_proba",n_jobs=-1)

err = log_loss(y_tr, oof_predictions)
print("Your error is", err)
if err > 0.5:
    print("You can still improve :)")

CPU times: user 352 ms, sys: 128 ms, total: 480 ms
Wall time: 2.48 s
Your error is 0.546206436954
You can still improve :)


In [53]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


pipeline = make_pipeline(StandardScaler(),
                         PCA(n_components=200),
                         LinearDiscriminantAnalysis())
pipeline.fit(X_tr,y_tr)

%time oof_predictions = cross_val_apply(pipeline, X_tr, y_tr, decision_func="predict_proba",n_jobs=-1)

err = log_loss(y_tr, oof_predictions)
print("Your error is", err)
if err > 0.5:
    print("You can still improve :)")

CPU times: user 1.1 s, sys: 224 ms, total: 1.33 s
Wall time: 3.24 s
Your error is 0.523272866372
You can still improve :)
