In [1]:
import os
# Set before xgboost is imported on the next line. Presumably a workaround for
# the "duplicate OpenMP runtime" abort that can occur when xgboost is loaded
# next to another OpenMP-linked library — TODO confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

In [2]:
import os
import sys
# Put the parent of the working directory on the import path (once), so that
# packages living one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
import functools
import pathlib
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.exceptions import DataConversionWarning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

import src.utils.model as model
import src.utils.misc as misc

# Suppress every DataConversionWarning for the remainder of the session.
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [4]:
def score(fit_estimator, X, y, metric_fns=None):
    """Evaluate an already-fitted estimator with a collection of metric functions.

    Parameters
    ----------
    fit_estimator : object
        Any object exposing ``predict(X)``.
    X : array-like
        Features handed to ``fit_estimator.predict``.
    y : array-like
        True targets; passed as the first argument to every metric.
    metric_fns : iterable of callables, optional
        Each callable is invoked as ``fn(y, pred)`` and must have a
        ``__name__``. When omitted, the notebook-level ``metrics`` list is
        used; it is looked up at call time, so it only has to exist before
        the first call, not before this definition.

    Returns
    -------
    dict
        Maps each metric's ``__name__`` to the value it returned.

    Raises
    ------
    NameError
        If ``metric_fns`` is omitted and no global ``metrics`` is defined.
    """
    if metric_fns is None:
        # Backward-compatible default: the list defined in a later notebook cell.
        metric_fns = metrics
    pred = fit_estimator.predict(X)
    return {fn.__name__: fn(y, pred) for fn in metric_fns}


def save_model(model, name, directory=None):
    """Pickle ``model`` to ``<directory>/<name>.pkl``.

    Parameters
    ----------
    model : object
        The object to serialise. Note that inside this function the parameter
        shadows the ``src.utils.model`` module imported under the same alias.
    name : str
        File stem; ``.pkl`` is appended.
    directory : path-like, optional
        Destination folder. When omitted, the notebook-level ``models`` path
        is used; it is looked up at call time, so it only has to exist before
        the first call.

    Raises
    ------
    NameError
        If ``directory`` is omitted and no global ``models`` is defined.
    OSError
        If the destination cannot be opened for writing (the folder is not
        created automatically).
    """
    target_dir = models if directory is None else pathlib.Path(directory)
    file = target_dir / f'{name}.pkl'
    with open(file, 'wb') as fp:
        pickle.dump(model, fp)

In [5]:
# Directory layout, relative to the notebook's working directory.
data = pathlib.Path('..') / 'data'
models = data / 'models'
pkls = data / 'pkls'


def _load_model(name):
    """Return the object unpickled from ``models / '<name>.pkl'``."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load model files that were produced by this project.
    with open(models / f'{name}.pkl', 'rb') as fp:
        return pickle.load(fp)


# One variable per persisted model; the names are referenced by later cells.
catboost = _load_model('catboost')
extra_trees = _load_model('extra_trees')
gradient_boost = _load_model('gradient_boost')
rand_forest = _load_model('rand_forest')
svr = _load_model('svr')
xgboost = _load_model('xgboost')

In [6]:
# Both score columns are removed from the feature matrix; only
# 'domain1_percent' is used as the regression target.
score_columns = ['domain1_percent', 'domain2_percent']
target_column = 'domain1_percent'

train_df, valid_df = (
    misc.merge_features(pkls, split) for split in ('train', 'valid')
)

y_train = train_df[target_column]
y_valid = valid_df[target_column]

# The scaler is fitted on the training features only and then re-applied,
# unchanged, to the validation features.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df.drop(columns=score_columns))
X_valid = scaler.transform(valid_df.drop(columns=score_columns))

In [7]:
# Metric callables shared by the scoring helpers in this notebook.
metrics = [mean_absolute_error, mean_squared_error, r2_score]

# Each convenience wrapper below is pre-bound to the training split.
train_split = dict(X=X_train, y=y_train)
grid_cv = functools.partial(model.grid_search_cv, **train_split)
score_est = functools.partial(model.score_estimator, metrics=metrics, **train_split)
plot_resid = functools.partial(model.plot_residuals, **train_split)

In [23]:
def pred_estimators(X, estimators):
    """Collect each estimator's predictions for ``X`` into one DataFrame.

    Parameters
    ----------
    X : array-like
        Features passed unchanged to every estimator's ``predict``.
    estimators : iterable
        Fitted objects exposing ``predict(X)``.

    Returns
    -------
    pandas.DataFrame
        One column per estimator, in input order. A column is labelled with
        the estimator's class name; when a class name repeats, the second and
        later occurrences get a ``_2``, ``_3``, ... suffix so that no
        estimator's predictions overwrite another's.
    """
    occurrences = {}
    columns = {}
    for est in estimators:
        base = type(est).__name__
        occurrences[base] = occurrences.get(base, 0) + 1
        # First occurrence keeps the bare class name, so the output is
        # unchanged whenever all class names are distinct.
        if occurrences[base] == 1:
            label = base
        else:
            label = f'{base}_{occurrences[base]}'
        columns[label] = est.predict(X)
    return pd.DataFrame(columns)

# extra_trees is deliberately left out of the stack: its ~0.97 training
# r-squared suggests it is overfitting.
estimators = [catboost, gradient_boost, rand_forest, svr, xgboost]

# Base-model predictions become the stacker's input features.
X_pred_train, X_pred_valid = (
    pred_estimators(features, estimators) for features in (X_train, X_valid)
)

# NOTE(review): if the base models were fitted on this same training split,
# the stacker is learning from in-sample predictions; out-of-fold predictions
# are the usual remedy — confirm how the pickled models were trained.
stacker = LinearRegression().fit(X_pred_train, y_train)

### Train Set

In [24]:
# Stacked ensemble on the training split (the data the stacker was fitted on).
score(stacker, X_pred_train, y_train)

{'mean_absolute_error': 9.131611662164794,
 'mean_squared_error': 143.65934201634565,
 'r2_score': 0.7381592387601231}

In [10]:
# Training-split metrics for the `catboost` base model.
score(catboost, X_train, y_train)

{'mean_absolute_error': 9.7861010170049,
 'mean_squared_error': 168.89167847601186,
 'r2_score': 0.6921695099076273}

In [11]:
# Training-split metrics for `extra_trees`, the model left out of the stack.
score(extra_trees, X_train, y_train)

{'mean_absolute_error': 2.4028077837620265,
 'mean_squared_error': 12.575471700993393,
 'r2_score': 0.9770793111194677}

In [12]:
# Training-split metrics for the `gradient_boost` base model.
score(gradient_boost, X_train, y_train)

{'mean_absolute_error': 10.820734817246063,
 'mean_squared_error': 201.95194881447347,
 'r2_score': 0.6319121940190862}

In [13]:
# Training-split metrics for the `rand_forest` base model.
score(rand_forest, X_train, y_train)

{'mean_absolute_error': 10.501461336695913,
 'mean_squared_error': 191.22502440274397,
 'r2_score': 0.6514636273913085}

In [14]:
# Training-split metrics for the `svr` base model.
score(svr, X_train, y_train)

{'mean_absolute_error': 12.334358186225979,
 'mean_squared_error': 273.4511543780627,
 'r2_score': 0.5015941363832207}

In [15]:
# Training-split metrics for the `xgboost` base model.
score(xgboost, X_train, y_train)

{'mean_absolute_error': 10.836947873813523,
 'mean_squared_error': 202.94300046037134,
 'r2_score': 0.63010585331233}

### Validation Set

In [25]:
# Stacked ensemble on the validation split; compare with its training-split score.
score(stacker, X_pred_valid, y_valid)

{'mean_absolute_error': 10.134197339301112,
 'mean_squared_error': 177.32279742339395,
 'r2_score': 0.6190320835239496}

In [17]:
# Validation-split metrics for the `catboost` base model.
score(catboost, X_valid, y_valid)

{'mean_absolute_error': 9.5911674900411,
 'mean_squared_error': 160.5404959339303,
 'r2_score': 0.6550879010782377}

In [18]:
# Validation-split metrics for `extra_trees`, the model left out of the stack.
score(extra_trees, X_valid, y_valid)

{'mean_absolute_error': 9.946190616732732,
 'mean_squared_error': 174.66536981121456,
 'r2_score': 0.6247414151796001}

In [19]:
# Validation-split metrics for the `gradient_boost` base model.
score(gradient_boost, X_valid, y_valid)

{'mean_absolute_error': 9.926562417509873,
 'mean_squared_error': 169.43600510969043,
 'r2_score': 0.6359764069786311}

In [20]:
# Validation-split metrics for the `rand_forest` base model.
score(rand_forest, X_valid, y_valid)

{'mean_absolute_error': 10.552453943164833,
 'mean_squared_error': 196.25524808599724,
 'r2_score': 0.5783567931071403}

In [21]:
# Validation-split metrics for the `svr` base model.
score(svr, X_valid, y_valid)

{'mean_absolute_error': 11.619603726958935,
 'mean_squared_error': 241.9746501613391,
 'r2_score': 0.48013126540138384}

In [22]:
# Validation-split metrics for the `xgboost` base model.
score(xgboost, X_valid, y_valid)

{'mean_absolute_error': 9.926696587196737,
 'mean_squared_error': 168.612737857514,
 'r2_score': 0.6377451497140378}