In [152]:
import pandas as pd
import scipy as sc
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
%pylab inline
pylab.rcParams['figure.figsize'] = (15, 10)

Populating the interactive namespace from numpy and matplotlib


In [153]:
train = pd.read_csv("train.tsv")
test = pd.read_csv("test.tsv")
sample_submission = pd.read_csv("sample_submission.tsv")
train = train.sample(frac=0.1, random_state=42)

In [154]:
ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(train['item_id'].values.reshape(-1, 1))
dummy_features = pd.DataFrame(dummy_features, columns=['good = ' + str(x) for x in range(dummy_features.shape[1])])
train.index = dummy_features.index
new_train = pd.concat([train, dummy_features], axis=1).drop(['item_id', 'Num'], axis=1)

In [155]:
dummy_features = ohe.fit_transform(new_train['week'].values.reshape(-1, 1))
dummy_features = pd.DataFrame(dummy_features, columns=['week = ' + str(x) for x in range(dummy_features.shape[1])])
new_train.index = dummy_features.index
new_train = pd.concat([new_train, dummy_features], axis=1).drop(['week'], axis=1)

In [156]:
dummy_features = ohe.fit_transform(new_train['year'].values.reshape(-1, 1))
dummy_features = pd.DataFrame(dummy_features, columns=['year = ' + str(x) for x in range(dummy_features.shape[1])])
new_train.index = dummy_features.index
new_train = pd.concat([new_train, dummy_features], axis=1).drop(['year'], axis=1)

In [157]:
def SMAPE(target, prediction):
    return 200*np.mean(np.abs(target-prediction)/(np.abs(target) + np.abs(prediction)))

In [164]:
def test_estimator(estimator):
    def do_stuff(X, y):
        tscv = TimeSeriesSplit(n_splits=3)
        score = []
        for train_index, test_index in tscv.split(X):
            estimator.fit(X[train_index], y[train_index])
            prediction = estimator.predict(X[test_index])
            score.append(SMAPE(y[test_index], prediction))
        print score
        return np.mean(score)
    return do_stuff(new_train.drop(['y'], axis=1).as_matrix(), new_train['y'].values), \
            do_stuff(train.drop(['y'], axis=1).as_matrix(), new_train['y'].values)    

In [171]:
def SMAPE_scoring(estimator, X_test, y_test):
    tscv = TimeSeriesSplit(n_splits=3)
    score = []
    for train_index, test_index in tscv.split(X):
        estimator.fit(X[train_index], y[train_index])
        prediction = estimator.predict(X[test_index])
        score.append(SMAPE(y[test_index], prediction))
    ret = -np.mean(score)
    print -ret
    return ret

In [165]:
X, y = new_train.drop(['y'], axis=1).as_matrix(), new_train['y'].values

In [198]:
class sum_regressor:
    def __init__(self, weights):
        
        self.estimators = [RandomForestRegressor(n_estimators=weights[0]), 
                          GradientBoostingRegressor(n_estimators=weights[1], max_depth=5, loss='lad'), 
                          ExtraTreesRegressor(n_estimators=weights[2])]
        '''
        self.estimators = [GradientBoostingRegressor(loss='lad'),
                           RandomForestRegressor(),
                          ExtraTreesRegressor()]
        '''
    def fit(self, X, y):
        X_mine = np.copy(X)
        for est in self.estimators:
            est.fit(X_mine, y)
            if est!=self.estimators[-1]:
                np.c_[X_mine, est.predict(X_mine)]

    def predict(self, X):
        X_mine = np.copy(X)
        for est in self.estimators[:-1]:
            np.c_[X_mine, est.predict(X_mine)]
        return self.estimators[-1].predict(X_mine)
    

In [199]:
true_weights = None
score = 200
for w1 in range(10, 70, 10):
    for w2 in range(10, 70, 10):
        for w3 in range(10, 70, 10):
            print '\n', [w1, w2, w3]
            clf = sum_regressor(weights=[w1, w2, w3])
            new_score = test_estimator_onm(clf)
            if new_score < score:
                score = new_score
                true_weights = [w1, w2, w3]
            print new_score


[10, 10, 10]
[28.719211308345443, 27.817681194801608, 26.683307810215574]
27.7400667711

[10, 10, 20]
[28.080849072696758, 26.977913021887357, 26.644713415849708]
27.2344918368

[10, 10, 30]
[28.049054645393916, 27.096063934608438, 26.27095379866784]
27.1386907929

[10, 10, 40]
[27.956607779923893, 26.643029386924709, 25.988416677569887]
26.8626846148

[10, 10, 50]
[28.107354896778002, 26.757534976120162, 26.553047310170331]
27.1393123944

[10, 10, 60]
[27.747783690299794, 26.823195608130156, 26.236724914274255]
26.9359014042

[10, 20, 10]
[29.265019657199471, 27.683493628037827, 27.27868062838844]
28.0757313045

[10, 20, 20]
[28.094297163474785, 27.181613415104717, 26.567540090031244]
27.2811502229

[10, 20, 30]
[27.720343285587717, 26.801496808634678, 26.379825229915539]
26.9672217747

[10, 20, 40]
[27.956222963891953, 26.66715038648378, 26.346602927792713]
26.9899920927

[10, 20, 50]
[27.88472069670684, 26.612375841005864, 26.091160398771979]
26.8627523122

[10, 20, 60]
[27.8439100

KeyboardInterrupt: 

In [196]:
test_estimator_onm(sum_regressor())

[27.799928501491177, 26.517960228790638, 26.130408874123624]


26.816099201468479