In [9]:
! pip install yfinance

Looking in indexes: https://jinlei:****@jfrog.ngridtools.com/artifactory/api/pypi/pypi-remote/simple




# VECM
- Using a VECM to predict FANG stocks
- See the [VECM documentation](https://scalecast.readthedocs.io/en/latest/Forecaster/Auxmodels.html#vecm)

In [10]:
import pandas as pd
import numpy as np
from pandas_datareader import data as pdr
import yfinance as yf
from scalecast.Forecaster import Forecaster
from scalecast.MVForecaster import MVForecaster
from scalecast.Pipeline import Transformer, Reverter, MVPipeline
from scalecast.util import (
    find_optimal_lag_order, 
    find_optimal_coint_rank,
    Forecaster_with_missing_vals,
)
from scalecast.auxmodels import vecm
from scalecast.multiseries import export_model_summaries
from scalecast import GridGenerator
import matplotlib.pyplot as plt

In [11]:
yf.pdr_override()

## Download data using a public API

In [12]:
FANG = [
    'META',
    'AMZN',
    'NFLX',
    'GOOG',
]

fs = []
for sym in FANG:
    df = pdr.get_data_yahoo(sym)
    # since the api doesn't send the data exactly in Business-day frequency
    # we can correct it using this function
    f = Forecaster_with_missing_vals(
        y=df['Close'],
        current_dates = df.index,
        future_dates = 65,
        end = '2022-09-30',
        desired_frequency = 'B',
        fill_strategy = 'linear_interp',
        add_noise = True,
        noise_lookback = 5,
    )
    fs.append(f)
    
mvf = MVForecaster(*fs,names=FANG,test_length=65)
mvf.set_validation_metric('rmse')
mvf.add_sklearn_estimator(vecm,'vecm')

mvf

[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['META']: OperationalError('unable to open database file')





ValueError: Neither `start` nor `end` can be NaT

In [5]:
mvf.plot()
plt.show()

NameError: name 'mvf' is not defined

## Augmented Dickey Fuller Tests to Confirm Unit-1 Roots

In [None]:
for stock, f in zip(FANG,fs):
    adf_result = f.adf_test(full_res=True)
    print('the stock {} is {}stationary at level'.format(
        stock,
        'not ' if adf_result[1] > 0.05 else ''
    )
    )

In [None]:
for stock, f in zip(FANG,fs):
    adf_result = f.adf_test(diffy=True,full_res=True)
    print('the stock {} is {}stationary at its first difference'.format(
        stock,
        'not ' if adf_result[1] > 0.05 else ''
    )
    )

## Measure IC to Find Optimal Lag Order
- this is used to run the cointegration test

In [None]:
lag_test = find_optimal_lag_order(mvf,train_only=True)
pd.DataFrame(
    {
        'aic':lag_test.aic,
        'bic':lag_test.bic,
        'hqic':lag_test.hqic,
        'fpe':lag_test.fpe,
    },
    index = ['optimal lag order'],
).T

## Johansen cointegration test

In [None]:
coint_res = find_optimal_coint_rank(
    mvf,
    det_order=1,
    k_ar_diff=10,
    train_only=True,
)
print(coint_res)
coint_res.rank

We found a cointegration rank of 1.

## Run VECM
- Now, we can specify a grid that will try more lags, deterministic terms, seasonal fluctuations, and cointegration ranks of 0 and 1

In [None]:
vecm_grid = dict(
    lags = [0], # required to set this to 0 for the vecm model in scalecast
    freq = ['B'], # only necessary to suppress a warning
    k_ar_diff = range(1,66), # lag orders to try
    coint_rank = [0,1],
    deterministic = ["n","co","lo","li","cili","colo"],
    seasons = [0,5,30,65,260],
)

mvf.set_estimator('vecm')
mvf.ingest_grid(vecm_grid)
mvf.limit_grid_size(100,random_seed=20)
mvf.cross_validate(k=3,verbose=True)
mvf.auto_forecast()

results = mvf.export('model_summaries')
results[[
    'ModelNickname',
    'Series',
    'TestSetRMSE',
    'TestSetMAE',
]]

## View VECM Results

In [None]:
results['TestSetRMSE'].mean()

In [None]:
mvf.export_validation_grid('vecm').sample(15)

In [None]:
mvf.plot_test_set(
    series='AMZN',
    models='vecm',
    include_train=130,
    figsize=(16,8)
)
plt.show()

## Re-weight Evaluation Metrics and Rerun VECM

In [None]:
weights = results['TestSetRMSE'] / results['TestSetRMSE'].sum()
weights

In [None]:
mvf.set_optimize_on(
    lambda x: (
        x[0]*weights[0] + 
        x[1]*weights[1] + 
        x[2]*weights[2] + 
        x[3]*weights[3]
    )
)
mvf.ingest_grid(vecm_grid)
mvf.limit_grid_size(100,random_seed=20)
mvf.cross_validate(k=3,verbose=True)
mvf.auto_forecast(call_me='vecm_weighted')

results = mvf.export('model_summaries')
results[[
    'ModelNickname',
    'Series',
    'TestSetRMSE',
    'TestSetMAE',
]]

In [None]:
results.loc[results['ModelNickname'] == 'vecm_weighted','TestSetRMSE'].mean()

An improvement by weighting the optimizer!

In [None]:
mvf.plot_test_set(
    series='META',
    models='all',
    include_train=130,
    figsize=(16,8)
)
plt.show()

In [None]:
mvf.plot(
    series='all',
    models='all',
    figsize=(16,8)
)
plt.show()

## Try Other MV Models

In [None]:
GridGenerator.get_mv_grids()
# open MVGrids.py and manually change all lags arguments to range(1,66)

In [None]:
transformers = []
reverters = []
for stock, f in zip(FANG,fs):
    transformer = Transformer(
        transformers = [('DiffTransform',)]
    )
    reverter = Reverter(
        reverters = [('DiffRevert',)],
        base_transformer = transformer,
    )
    transformers.append(transformer)
    reverters.append(reverter)

In [None]:
def Xvar_select(f):
    f.set_validation_length(65)
    f.auto_Xvar_select(
        estimator='gbt',
        max_depth=2,
        max_ar=0, # in mv modeling, lags are a hyperparameter, not a regressor in the MVForecaster object
    )

def mvforecaster(mvf):
    models = (
        'mlr',
        'elasticnet',
        'gbt',
        'xgboost',
        'lightgbm',
        'knn',
    )
    mvf.set_test_length(65)
    mvf.tune_test_forecast(
        models,
        limit_grid_size=10,
        cross_validate=True,
        k=3,
    )

In [None]:
pipeline = MVPipeline(
    steps = [
        ('Transform',transformers),
        ('Xvar Select',[Xvar_select]*4),
        ('Forecast',mvforecaster),
        ('Revert',reverters),
    ],
    names = FANG,
)

In [None]:
fs = pipeline.fit_predict(*fs)

In [None]:
results = export_model_summaries(dict(zip(FANG,fs)))

## View Results

In [None]:
model_rmses = results.groupby('ModelNickname')['TestSetRMSE'].mean().sort_values().reset_index()
model_rmses

The above table is the mean mape performance from each model over all series.

In [None]:
series_rmses = results.groupby('Series')['TestSetRMSE'].min().reset_index()
series_rmses['Model'] = [
    results.loc[
        results['TestSetRMSE'] == i,
        'ModelNickname'
    ].values[0] for i in series_rmses['TestSetRMSE']
]
series_rmses

In [None]:
series_rmses['TestSetRMSE'].mean()

The above table shows the best model for each series and its derived RMSE. The average RMSE of all these models applied to the individual series is 27.9, but being so dependent on the test set to choose the model probably leads to overfitting.

In [None]:
fs[1].plot_test_set(
    models='xgboost',
    include_train=130,
)
plt.title('AMZN Best Model Test Predictions')
plt.show()

In [None]:
fs[1].plot(
    models='xgboost',
)
plt.title('AMZN Best Model Forecast')
plt.show()