In [5]:
import os
import time
import numpy as np

from model.model import model_load
from model.model import model_predict

from model.cslib import fetch_ts, engineer_features

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [2]:
data_dir = os.path.join("data","cs-train")

ts_all = fetch_ts(data_dir,clean=False)

... loading ts data from files


In [6]:
metrics = {
    'r2': r2_score,
    'mae': mean_absolute_error,
    'mape': mean_absolute_percentage_error,
    'mse': mean_squared_error
}

In [3]:
X,y,dates = engineer_features(ts_all['all'])
        
## Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

In [14]:
model_metadata = dict()

param_grid = {
    'model__criterion': ['mse','mae'],
    'model__n_estimators': [10,15,20,25,50,100]
    }

time_start = time.time()
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', RandomForestRegressor())])

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
model_metadata['fit_time'] = time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start))
model_metadata['best_param'] = grid.best_params_
model_metadata['model'] = str(pipeline['model'])

y_pred = grid.predict(X_test)
model_metadata['metrics'] = dict()
for _metric in metrics.keys():
    model_metadata['metrics'][_metric] = metrics[_metric](y_test, y_pred)

model_metadata

{'fit_time': '00:00:01',
 'best_param': {'model__criterion': 'mse', 'model__n_estimators': 50},
 'model': 'RandomForestRegressor()',
 'metrics': {'r2': 0.9617302584168695,
  'mae': 11318.618091304337,
  'mape': 0.06431629529186617,
  'mse': 251075873.50874883}}

In [15]:
model_metadata = dict()

param_grid = {
    'model__criterion': ['friedman_mse','mae'],
    'model__n_estimators': [10,25,70,150],
    'model__learning_rate': [0.01, 0.05],
    'model__loss': ['lad', 'ls']
    }

time_start = time.time()
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', GradientBoostingRegressor())])

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
model_metadata['fit_time'] = time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start))
model_metadata['best_param'] = grid.best_params_
model_metadata['model'] = str(pipeline['model'])

y_pred = grid.predict(X_test)
model_metadata['metrics'] = dict()
for _metric in metrics.keys():
    model_metadata['metrics'][_metric] = metrics[_metric](y_test, y_pred)

model_metadata

{'fit_time': '00:00:05',
 'best_param': {'model__criterion': 'friedman_mse',
  'model__learning_rate': 0.05,
  'model__loss': 'ls',
  'model__n_estimators': 150},
 'model': 'GradientBoostingRegressor()',
 'metrics': {'r2': 0.9180209272573785,
  'mae': 17618.50374574912,
  'mape': 0.10293284456508923,
  'mse': 537839202.6395093}}

In [16]:
model_metadata = dict()

param_grid = {
    'model__criterion': ['mse','mae'],
    'model__max_depth': [5,10,20,50],
    'model__min_samples_leaf': [1,2,3,4,5]
    }

time_start = time.time()
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', DecisionTreeRegressor())])

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
model_metadata['fit_time'] = time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start))
model_metadata['best_param'] = grid.best_params_
model_metadata['model'] = str(pipeline['model'])

y_pred = grid.predict(X_test)
model_metadata['metrics'] = dict()
for _metric in metrics.keys():
    model_metadata['metrics'][_metric] = metrics[_metric](y_test, y_pred)

model_metadata

{'fit_time': '00:00:00',
 'best_param': {'model__criterion': 'mse',
  'model__max_depth': 20,
  'model__min_samples_leaf': 2},
 'model': 'DecisionTreeRegressor()',
 'metrics': {'r2': 0.9357708759896458,
  'mae': 11778.131530193237,
  'mape': 0.06243777231006749,
  'mse': 421387308.8369653}}

Decided to continue with RF model.