In [120]:
import pandas as pd
import numpy as np
from statsmodels.tsa.filters.hp_filter import hpfilter
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime as dt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model._ridge import Ridge
from sklearn.metrics import r2_score
from keras.metrics import mean_absolute_percentage_error, RootMeanSquaredError
from sklearn.model_selection import GridSearchCV, train_test_split
import warnings
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')


In this notebook, I will explore the difference between applying regression model (Kernel ridge regression) using different sets of data : a raw data (closing price of stocks daily), and a preprocessed smoothed data (closing price of stocks w/ HP-Filter applied). For the sake of comparison, I will be using two different assets, which are TESLA and US Treasury bond. 

In [59]:
days = 250 # This parameter sets different number of days we are going to use for our time lag

We apply timelag to our data in order to generate more data from a single source. This technique is commonly used in time series analysis.

In [60]:

def produceHP(ticker):
    ticker = "\\Data\\{}".format(ticker) + ".csv"
    dir = os.getcwd() + ticker
    data = pd.read_csv(dir)
    data = data.set_index('Date')
    data['preprice'] = data['Close'].T.shift(1)
    data = data.dropna()
    data['Close'] = (data['Close'] - data['preprice']) / data['Close']
    cycle, trend = sm.tsa.filters.hpfilter(data['Close'], 10000000)
    x = data.index
    x = [dt.datetime.strptime(d, '%Y-%m-%d') for d in x]
    y = trend
    # plt.plot(x, y)
    # y = data['Close']
    # plt.plot(x, y)
    # plt.show()
    return (data, trend, cycle)
produceHP("TSLA")
def reshape_data(df):
    data_reshape = pd.concat([df['Close'].T.shift(i).to_frame().stack(dropna=False) for i in range(days) ], 1).dropna()
    data_reshape.columns = pd.Index(range(days), name='timeLag')
    return data_reshape

First, we apply HP-filter to a data and see what it does in order to understand it. Then, we apply time lag to the raw data.

In [61]:
data, trend, cycle = produceHP("TSLA")
data_reshape = reshape_data(data)

We define a split method in order to set some of the data as test set, and some as training set.

In [125]:
def split(data):
    X = data[np.arange(1, days)]
    y = data[0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25)
    return (X_train, X_test, X_valid, y_train, y_test, y_valid)

Then, we apply time lag onto HP-Filter too.

In [63]:
def get_HP_lag(data_reshape):
    trend_data = pd.DataFrame()
    # trend_data.columns = pd.Index(range(50), name='timeLag')
    rows = []
    for i in range(len(data_reshape)):
        row = data_reshape.iloc[i]
        cycle, trend = sm.tsa.filters.hpfilter(row[1:], 100000)
        trend = pd.concat([cycle, trend], axis = 0)
        trend.index = np.arange(1, len(cycle) * 2 + 1)
        rows.append(pd.concat([pd.Series(row[0]), trend], axis = 0))
    trend_data = pd.DataFrame(rows)
    trend_data.columns = pd.Index(range(days * 2 - 1), name='timeLag')
    return trend_data

Since we now have all data that we need for comparison, we run kernel-ridge regression to test how the model performs. We decided to use this model since it supports non-linearity of the data. Hyperparameter, also known as a regularization factor can be adjusted accordingly. If regularization term is 0, it becomes a traditional least-square problem, while higher regularization factor underfits the model. Thus, there are two hyperparameters in our model : number of days used for prediction, and regularization factor alpha.

In [157]:
def test(data_reshape, reg):
    sum = 0.0
    for i in range(10):
        X_train, X_test, X_valid, y_train, y_test, y_valid = split(data_reshape)
        model = XGBRegressor()
        model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], early_stopping_rounds=5, verbose = 0)
        prediction = model.predict(X_test)
        m = RootMeanSquaredError()
        m.update_state(y_test, prediction)
        sum += m.result()
    return sum.numpy()

In [65]:
ticker = "TSLA"
days = 250

In [137]:
def pipeline(t, d, a):
    ticker = t
    days = d
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    print(trend_reshape)
    return (test(data_reshape, a), test(trend_reshape, a))

In [184]:
pipeline("TSLA", 10, 1)

timeLag       0         1         2         3         4         5         6    \
0        0.003586 -0.001257  0.017668 -0.031366  0.030052 -0.024811 -0.003755   
1        0.017303  0.006218 -0.001750  0.017199 -0.031812  0.029628 -0.025212   
2        0.027286  0.018401  0.004756 -0.003149  0.015862 -0.033087  0.028415   
3       -0.024550  0.026145  0.016321  0.002762 -0.005058  0.014038 -0.034827   
4        0.024296 -0.023869  0.028037  0.018124  0.004477 -0.003429  0.015580   
...           ...       ...       ...       ...       ...       ...       ...   
1091     0.002464 -0.042675 -0.037198 -0.021627  0.003262  0.022499  0.002306   
1092     0.024493  0.007690 -0.043296 -0.037806 -0.022221  0.002681  0.021931   
1093     0.016921  0.027776  0.005490 -0.045392 -0.039799 -0.024112  0.000891   
1094    -0.073077  0.018896  0.026272  0.004046 -0.046776 -0.041124 -0.025377   
1095    -0.011159 -0.065483  0.024118  0.031293  0.008868 -0.042151 -0.036693   

timeLag       7         8  

(0.47286835, 0.4684743)

As you can see, RMSE for model using raw data was higher compared to its counterpart using smoothed data. 

In [185]:
pipeline("^TNX", 10, 1)

timeLag       0         1         2         3         4         5         6    \
0       -0.017059 -0.007492 -0.020807 -0.000211 -0.000912 -0.014002  0.004148   
1       -0.058887 -0.015812 -0.006232 -0.019597  0.000950  0.000200 -0.012938   
2        0.026038 -0.053139 -0.011584 -0.002178 -0.015716  0.004660  0.003741   
3       -0.007087  0.029607 -0.055495 -0.013843 -0.004341 -0.017783  0.002687   
4        0.025216 -0.003086  0.029853 -0.055258 -0.013615 -0.004122 -0.017574   
...           ...       ...       ...       ...       ...       ...       ...   
1091     0.046674 -0.014358  0.042307 -0.028178  0.012175  0.001818 -0.013115   
1092     0.021695  0.032320 -0.016928  0.039846 -0.030531  0.009930 -0.000320   
1093    -0.069906  0.006412  0.031804 -0.017432  0.039354 -0.031011  0.009462   
1094     0.011209 -0.079051  0.012698  0.037826 -0.011672  0.044854 -0.025768   
1095     0.014984  0.001768 -0.079191  0.012566  0.037702 -0.011789  0.044745   

timeLag       7         8  

(0.4784996, 0.49320853)

In [176]:
d = [10, 50, 100]
a = [0.1, 0.01, 1, 10]
rawMin = (0, 0)
trendMin = (0, 0)
rawScore = 999999
trendScore = 999999
for day in d:
    days = day
    ticker = "TSLA"
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    for alpha in a:
        raw = test(data_reshape, alpha)
        raw = raw
        trend = test(trend_reshape, alpha)
        trend = trend
        if raw < rawScore:
            rawScore = raw
            rawMin = (day, alpha)
        if trend < trendScore:
            trendScore = trend
            trendMin = (day, alpha)
print(rawMin)
print(trendMin)

(10, 0.01)
(10, 0.01)


In [177]:
print(rawScore)
print(trendScore)

0.43850455
0.4281397


In [178]:
rawMin = (0, 0)
trendMin = (0, 0)
rawScore = 999999
trendScore = 999999
for day in d:
    days = day
    ticker = "NVDA"
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    for alpha in a:
        raw = test(data_reshape, alpha)
        raw = raw
        trend = test(trend_reshape, alpha)
        trend = trend
        if raw < rawScore:
            rawScore = raw
            rawMin = (day, alpha)
        if trend < trendScore:
            trendScore = trend
            trendMin = (day, alpha)
print(rawMin)
print(trendMin)

(10, 10)
(50, 1)


In [167]:
print(rawScore)
print(trendScore)

0.33981633
0.31923437


In [179]:
rawMin = (0, 0)
trendMin = (0, 0)
rawScore = 999999
trendScore = 999999
for day in d:
    days = day
    ticker = "^TNX"
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    for alpha in a:
        raw = test(data_reshape, alpha)
        raw = raw
        trend = test(trend_reshape, alpha)
        trend = trend
        if raw < rawScore:
            rawScore = raw
            rawMin = (day, alpha)
        if trend < trendScore:
            trendScore = trend
            trendMin = (day, alpha)
print(rawMin)
print(trendMin)

(50, 1)
(100, 0.01)


In [180]:
print(rawScore)
print(trendScore)

0.44377202
0.43739983


In [181]:
rawMin = (0, 0)
trendMin = (0, 0)
rawScore = 999999
trendScore = 999999
for day in d:
    days = day
    ticker = "TWTR"
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    for alpha in a:
        raw = test(data_reshape, alpha)
        raw = raw
        trend = test(trend_reshape, alpha)
        trend = trend
        if raw < rawScore:
            rawScore = raw
            rawMin = (day, alpha)
        if trend < trendScore:
            trendScore = trend
            trendMin = (day, alpha)
print(rawMin)
print(trendMin)
print(rawScore)
print(trendScore)

(10, 0.01)
(10, 0.1)
0.3573674
0.3548293


In [182]:
rawMin = (0, 0)
trendMin = (0, 0)
rawScore = 999999
trendScore = 999999
for day in d:
    days = day
    ticker = "AXON"
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    for alpha in a:
        raw = test(data_reshape, alpha)
        raw = raw
        trend = test(trend_reshape, alpha)
        trend = trend
        if raw < rawScore:
            rawScore = raw
            rawMin = (day, alpha)
        if trend < trendScore:
            trendScore = trend
            trendMin = (day, alpha)
print(rawMin)
print(trendMin)
print(rawScore) 
print(trendScore)

(10, 0.1)
(100, 0.1)
0.35752606
0.35081267


In [183]:
rawMin = (0, 0)
trendMin = (0, 0)
rawScore = 999999
trendScore = 999999
for day in d:
    days = day
    ticker = "FDP"
    data, trend, cycle = produceHP(ticker)
    data_reshape = reshape_data(data)
    trend_reshape = get_HP_lag(data_reshape)
    for alpha in a:
        raw = test(data_reshape, alpha)
        raw = raw
        trend = test(trend_reshape, alpha)
        trend = trend
        if raw < rawScore:
            rawScore = raw
            rawMin = (day, alpha)
        if trend < trendScore:
            trendScore = trend
            trendMin = (day, alpha)
print(rawMin)
print(trendMin)
print(rawScore) 
print(trendScore)

(10, 10)
(10, 0.01)
0.2807043
0.2716002
