In [2]:
import os

import h5py as h5
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import roc_auc_score, make_scorer

train_file = 'features/train_stats_features.h5'
test_file = 'features/test_stats_features.h5'
out_dir = 'models/grad_boost'

n_ch = 16

with h5.File(train_file, 'r') as train:
    for i in range(n_ch):
        ch = 'ch%d' % i

        train_x = train[ch][:]
        train_y = [int(name.split('.')[-2].split('_')[-1]) for name in train['names'][:]]

        scaler = StandardScaler().fit(train_x)
        train_x = scaler.transform(train_x)

        roc = make_scorer(roc_auc_score)

        model = GradientBoostingClassifier(random_state = 1, n_estimators = 100)

        searchParams = dict(
            learning_rate = np.logspace(-2, 2, 5),
            subsample = np.linspace(0.01, 1.0, 5)
        )

        search = GridSearchCV(model, param_grid = searchParams, scoring = roc, cv = 2)
        search.fit(train_x, train_y)

        print(search.best_score_)
        print(search.best_params_)

        model.set_params(**(search.best_params_))
        model.fit(train_x, train_y)
        
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
            
        joblib.dump(model, "%s/%s.pkl" % (out_dir, ch), compress = 1)

0.544750327919
{'subsample': 0.505, 'learning_rate': 1.0}


  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))
  sample_weight[~sample_mask]))


0.602293715725
{'subsample': 0.01, 'learning_rate': 1.0}
0.530336040985
{'subsample': 0.01, 'learning_rate': 100.0}
0.559521879151
{'subsample': 0.25750000000000001, 'learning_rate': 10.0}
0.560631249531
{'subsample': 0.25750000000000001, 'learning_rate': 100.0}
0.545775220335
{'subsample': 0.75249999999999995, 'learning_rate': 1.0}
0.526264410438
{'subsample': 1.0, 'learning_rate': 100.0}
0.551438132005
{'subsample': 0.01, 'learning_rate': 1.0}
0.575506634359
{'subsample': 0.505, 'learning_rate': 100.0}
0.601931396043
{'subsample': 1.0, 'learning_rate': 10.0}
0.556102013008
{'subsample': 0.505, 'learning_rate': 1.0}
0.579363182145
{'subsample': 0.01, 'learning_rate': 1.0}
0.532695489802
{'subsample': 0.75249999999999995, 'learning_rate': 100.0}
0.555055505591
{'subsample': 0.25750000000000001, 'learning_rate': 1.0}
0.534592346412
{'subsample': 1.0, 'learning_rate': 100.0}
0.548838156944
{'subsample': 1.0, 'learning_rate': 10.0}
