# Non deep learning models - multistep

The multistep model is composed of a separate regressor for each output variable, so its performance will be the same as that of the single-step models.


https://stackoverflow.com/questions/39540123/muti-output-regression-in-xgboost

In [1]:
# Local imports
# Put `<parent of the current working directory>/src` at the front of sys.path
# before importing the project modules below (presumably that is where `paths`
# and `data` live -- confirm against the repository layout).
import pathlib
import sys
sys.path.insert(0, str(pathlib.Path.cwd().parent / 'src'))

from paths import PATHS
from data.base import make_splits, get_data, single_X_y

In [8]:
import matplotlib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

In [3]:
# `multistep` value shared by get_data and make_splits; kept in one place so
# the two calls cannot drift apart.
MULTISTEP = 7

# Load the data, build the train/val/test splits (norm=False) and also
# extract a single X / y pair from those splits.
df = get_data(trend=True, multistep=MULTISTEP)
splits = make_splits(df, norm=False, multistep=MULTISTEP)
X, y = single_X_y(splits)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [4]:
# # Format y for this specific notebook
# y = y.loc[:, 'incidence 7 (t+7)']
# for i in ['train', 'val', 'test']:
#     splits[i]['y'] = splits[i]['y'].loc[:, 'incidence 7 (t+7)']

In [28]:
def model_summary(model, data_splits=None):
    """Print and return MAE / MAPE of `model` on the train, val and test splits.

    Only the last target column (and the matching last column of the
    prediction matrix) is scored, so the numbers can be compared with the
    single-step models.

    Parameters
    ----------
    model : object
        Fitted estimator whose `predict(X)` returns a 2-D array.
    data_splits : dict, optional
        Mapping with the keys 'train', 'val' and 'test'; each value is a dict
        holding the features under 'X' and a target DataFrame under 'y'.
        When omitted, the notebook-level `splits` variable is used.

    Returns
    -------
    dict
        `{'mae': {split: value}, 'mape': {split: value}}` with one entry per
        split name.
    """
    if data_splits is None:
        # Backward-compatible default: use the splits built earlier in the notebook.
        data_splits = splits

    print('\n# Metrics')
    metrics = {'mae': {}, 'mape': {}}

    for name in ['train', 'val', 'test']:
        x_t = data_splits[name]['X']
        y_t = data_splits[name]['y']

        y_p = model.predict(x_t)

        # We just compare results for the last output to be able to compare
        # with single step
        y_t = y_t[y_t.columns[-1]]
        y_p = y_p[:, -1]

        abs_err = np.abs(y_p - y_t)
        pct_err = np.abs((y_p - y_t) / y_t)
        # A zero target gives an infinite percentage error; turn it into NaN
        # so that `.mean()` skips it instead of returning inf.
        pct_err = pct_err.replace([np.inf, -np.inf], np.nan)

        mae = abs_err.mean()
        mape = pct_err.mean()
        metrics['mae'][name] = mae
        metrics['mape'][name] = mape

        print(f' - {name.capitalize()}')
        print(f'   Mean Absolute Error (in incidence): {mae:.4f}')
        print(f'   Mean Absolute Percentage Error (in incidence): {mape:.4f}')

    return metrics


def save_metrics(metrics, modelname, out_dir='single_step'):
    """Insert or replace the row `modelname` in the per-metric CSV tables.

    For every metric name `m` in `metrics`, the file `<out_dir>/<m>.csv` is a
    table with one row per model and one column per key of `metrics[m]`.
    An existing row with the same model name is dropped and the new row is
    appended at the end. A missing file (or directory) is created instead of
    raising, so the function also works on the very first run.

    Parameters
    ----------
    metrics : dict
        `{metric_name: {column_name: value}}`, e.g. the dict returned by
        `model_summary`.
    modelname : str
        Row label under which the values are stored.
    out_dir : str or path-like, optional
        Directory that holds the CSV tables. Defaults to 'single_step'.
    """
    out_dir = pathlib.Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for m, values in metrics.items():
        csv_path = out_dir / f'{m}.csv'
        # Single-row frame: index = [modelname], columns = keys of `values`.
        row = pd.DataFrame.from_dict(values, orient='index', columns=[modelname]).T

        if csv_path.exists():
            table = pd.read_csv(csv_path, index_col=0)
            # Drop any stale row for this model before appending the new one.
            table = pd.concat([table[~table.index.isin(row.index)], row])
        else:
            table = row

        table.to_csv(csv_path)

In [5]:
# Sample weights (recent samples have higher weight)
# Level 0 of the training index is parsed as dates.
dates = splits['train']['X'].index.get_level_values(0)
dates = pd.to_datetime(dates)
# Age of every sample in days, measured from the most recent training date.
# `dates.max()` (rather than `dates[-1]`) keeps the ages non-negative even if
# the index is not sorted chronologically.
n = (dates.max() - dates).days
w = 0.99 ** n  # geometric discount (works better than linear)

In [9]:
# MultiOutputRegressor fits one independent gradient-boosting regressor per
# target column.
model = MultiOutputRegressor(
    GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=0,
        # `loss` is left at its default (least squares). The former explicit
        # value 'ls' was deprecated in scikit-learn 1.0 and removed in 1.2,
        # whereas the default means the same thing in every version.
    )
)
# Recency weights `w` are forwarded to every per-target regressor.
model = model.fit(splits['train']['X'],
                  splits['train']['y'],
                  sample_weight=w)

In [29]:
# Score the fitted model on the train/val/test splits (last target column
# only) and keep the resulting {'mae': ..., 'mape': ...} dict.
metrics = model_summary(model)


# Metrics
 - Train
   Mean Absolute Error (in incidence): 13.2096
   Mean Absolute Percentage Error (in incidence): 0.4572
 - Val
   Mean Absolute Error (in incidence): 39.6169
   Mean Absolute Percentage Error (in incidence): 0.2230
 - Test
   Mean Absolute Error (in incidence): 16.8738
   Mean Absolute Percentage Error (in incidence): 0.3705


In [11]:
# save_metrics(metrics, 'Gradient Boost')

# y_pred = model.predict(X)
# y_pred = np.round(y_pred).astype(np.int)
# dfp = pd.DataFrame(y_pred, index=X.index, columns=['Gradient Boost'])
# dfp.to_csv("single_step/pred_gradboost.csv")