In [27]:
import functools
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import model_util
import utils

In [28]:
# Regression metrics evaluated by the `score` helper and passed to `score_est`.
metrics = [mean_absolute_error, mean_squared_error, r2_score]

# Convenience wrappers around model_util helpers with the training split
# pre-bound as X / y.
# NOTE(review): X_train / y_train are assigned in a later cell of this
# notebook, so this cell only runs if that cell has been executed first.
grid_cv = functools.partial(
    model_util.grid_search_cv,
    X=X_train,
    y=y_train,
)
plot_resid = functools.partial(
    model_util.plot_residuals,
    X=X_train,
    y=y_train,
)
score_est = functools.partial(
    model_util.score_estimator,
    metrics=metrics,
    X=X_train,
    y=y_train,
)

In [22]:
def score(fit_estimator, X, y):
    """Evaluate a fitted estimator with every function in the global ``metrics``.

    Parameters
    ----------
    fit_estimator : object
        Already-fitted model exposing ``predict(X)``.
    X : array-like
        Inputs handed to ``predict``.
    y : array-like
        Reference targets compared against the predictions.

    Returns
    -------
    dict
        Maps each metric function's ``__name__`` to ``metric(y, predictions)``.
    """
    predictions = fit_estimator.predict(X)
    results = {}
    for metric in metrics:
        results[metric.__name__] = metric(y, predictions)
    return results


def save_model(model, name):
    """Pickle ``model`` to ``<models>/<name>.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (in this notebook: a fitted estimator).
    name : str
        File stem; the ``.pkl`` extension is appended.

    The destination directory is created when it does not exist yet, so the
    first save on a fresh checkout does not fail with ``FileNotFoundError``.
    """
    file = models / f'{name}.pkl'
    # Create data/models (and any parents) on demand; no-op when present.
    file.parent.mkdir(parents=True, exist_ok=True)
    with open(file, 'wb') as fp:
        pickle.dump(model, fp)

In [15]:
# Directory holding the pickled estimators, resolved relative to the
# current working directory: <cwd>/data/models.
models = pathlib.Path.cwd().joinpath('data', 'models')

In [86]:
def load_model(name):
    """Load the pickled object stored as ``<models>/<name>.pkl``.

    Counterpart of ``save_model``: ``name`` is the file stem without the
    ``.pkl`` extension. Returns whatever object the file contains.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # produced by this project (e.g. via save_model), never untrusted ones.
    with open(models / f'{name}.pkl', 'rb') as fp:
        return pickle.load(fp)


# Previously saved estimators, one per pickle file in the models directory.
rand_forest = load_model('rand_forest')
extra_trees = load_model('extra_trees')
gradient_boost = load_model('gradient_boost')
svr = load_model('svr')

In [87]:
# Merged feature tables for the two splits (utils.merge_features is a
# project helper defined outside this notebook).
train_df, valid_df = (utils.merge_features(split) for split in ('train', 'valid'))

# Inputs are every column except the two domain score columns;
# the regression target is domain1_percent.
X_train, y_train = (
    train_df.drop(columns=['domain1_percent', 'domain2_percent']),
    train_df['domain1_percent'],
)
X_valid, y_valid = (
    valid_df.drop(columns=['domain1_percent', 'domain2_percent']),
    valid_df['domain1_percent'],
)

In [114]:
def pred_estimators(X, estimators):
    """Collect each estimator's predictions on ``X`` into one DataFrame.

    Parameters
    ----------
    X : array-like
        Inputs passed unchanged to every estimator's ``predict``.
    estimators : iterable
        Fitted models exposing ``predict(X)``.

    Returns
    -------
    pandas.DataFrame
        One column per estimator, in the order given. A column is named
        after the estimator's class; when the same class occurs more than
        once, later occurrences get a numeric suffix (``Name_2``,
        ``Name_3``, ...) so that no estimator's predictions are silently
        overwritten by another of the same type.
    """
    columns = {}
    for est in estimators:
        base = type(est).__name__
        name = base
        occurrence = 1
        # Keying by class name alone would let a second estimator of the
        # same class replace the first one's column.
        while name in columns:
            occurrence += 1
            name = f'{base}_{occurrence}'
        columns[name] = est.predict(X)
    return pd.DataFrame(columns)

# Base models whose predictions become the stacker's input columns.
estimators = [rand_forest, gradient_boost]

# First-level predictions for the training and validation splits.
X_pred_train, X_pred_valid = (
    pred_estimators(features, estimators) for features in (X_train, X_valid)
)

# Linear blend of the base-model predictions (fit returns the estimator).
# NOTE(review): the base models are loaded pre-fitted; if they were trained on
# X_train, the stacker is fit on in-sample predictions. Consider out-of-fold
# predictions instead. TODO confirm how the pickles were produced.
stacker = LinearRegression().fit(X_pred_train, y_train)

### Train Set

In [115]:
# Stacked LinearRegression evaluated on the training-split predictions.
score(stacker, X_pred_train, y_train)

{'mean_absolute_error': 4.789088501153577,
 'mean_squared_error': 43.86568785809041,
 'r2_score': 0.9200481852425139}

In [116]:
# Random forest on the training split.
score(rand_forest, X_train, y_train)

{'mean_absolute_error': 5.360344739005344,
 'mean_squared_error': 55.95773607685984,
 'r2_score': 0.8980086083788551}

In [117]:
# Extra trees on the training split.
# NOTE(review): the near-perfect fit recorded here, compared with the much
# lower validation r2 for the same model, points to overfitting.
score(extra_trees, X_train, y_train)

{'mean_absolute_error': 0.007706535141803097,
 'mean_squared_error': 0.21407042060556242,
 'r2_score': 0.99960982445622}

In [118]:
# Gradient boosting on the training split.
score(gradient_boost, X_train, y_train)

{'mean_absolute_error': 12.042495190869928,
 'mean_squared_error': 248.57934014684102,
 'r2_score': 0.5469267592416693}

In [119]:
# SVR on the training split.
score(svr, X_train, y_train)

{'mean_absolute_error': 14.735326940552477,
 'mean_squared_error': 384.5142465932981,
 'r2_score': 0.2991649438007917}

### Validation Set

In [120]:
# Stacked LinearRegression evaluated on the validation-split predictions.
score(stacker, X_pred_valid, y_valid)

{'mean_absolute_error': 12.830181860973484,
 'mean_squared_error': 279.29355680022945,
 'r2_score': 0.39995372300994725}

In [121]:
# Random forest on the validation split.
score(rand_forest, X_valid, y_valid)

{'mean_absolute_error': 12.184302723776408,
 'mean_squared_error': 252.68090971088048,
 'r2_score': 0.45712947740171794}

In [125]:
# RMSE of the random forest on the validation split, computed from the data
# rather than from a number pasted out of the previous cell's output (which
# would go stale whenever the model or the data changes).
score(rand_forest, X_valid, y_valid)['mean_squared_error'] ** 0.5

15.895940038603584

In [122]:
# Extra trees on the validation split.
score(extra_trees, X_valid, y_valid)

{'mean_absolute_error': 12.059190769717087,
 'mean_squared_error': 249.75721774405986,
 'r2_score': 0.463410862836654}

In [123]:
# Gradient boosting on the validation split.
score(gradient_boost, X_valid, y_valid)

{'mean_absolute_error': 11.485501396856137,
 'mean_squared_error': 227.16956316873797,
 'r2_score': 0.5119391503816251}

In [112]:
# SVR on the validation split.
score(svr, X_valid, y_valid)

{'mean_absolute_error': 13.478348235007521,
 'mean_squared_error': 329.4043923498372,
 'r2_score': 0.292293450954655}

In [124]:
# Number of rows in the training and validation feature tables.
len(X_train), len(X_valid)

(12976, 4218)