In [1002]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from numpy.linalg import lstsq
from itertools import combinations
from statsmodels.api import OLS
from sklearn.linear_model import LassoCV, RidgeCV
import statsmodels.api as sm

In [1004]:
# Import data
## Very slow cell to run ##

# momentum
momentum = pdr.get_data_famafrench("F-F_Momentum_Factor_daily", start="1920", end="2020-12-31")[0]
momentum = momentum.resample("M").last().squeeze()

#momentum = pd.read_csv("data/F-F_Momentum_Factor.csv",index_col="Date")
#momentum.index = pd.to_datetime(momentum.index,format="%Y%m")
#momentum = momentum.resample("M").last().squeeze()


# Value weighted market
vwm = pd.read_csv("data/VWM.csv",index_col="Date")
vwm.index = pd.to_datetime(vwm.index, format="%Y%m")
vwm = vwm.resample("M").last().squeeze()

# 1, 5 and 10 year constant maturities
const_mat_10 = pdr.get_data_fred("DGS10", "1920-01-01", "2020-12-31")
const_mat_10 = const_mat_10.resample("M").last().squeeze()
const_mat_5 = pdr.get_data_fred("DGS5", "1920-01-01", "2020-12-31")
const_mat_5 = const_mat_5.resample("M").last().squeeze()
const_mat_1 = pdr.get_data_fred("DGS1", "1920-01-01", "2020-12-31")
const_mat_1 = const_mat_1.resample("M").last().squeeze()

# AAA and BAA (Moody's)
aaa = pd.read_csv("data/AAA.csv",parse_dates=True,index_col="Date")
aaa = aaa.resample("M").last().squeeze()
baa = pdr.get_data_fred("DBAA", "1920-01-01", "2020-12-31")
baa = baa.resample("M").last().squeeze()

# Unemployment rate (US)
unrate = pdr.get_data_fred("UNRATE", "1920-01-01", "2020-12-31")
unrate = unrate.resample("M").last().squeeze()

# core CPi US - consumer price index for all urban consumers
core_cpi = pdr.get_data_fred("CPIAUCSL", "1920-01-01", "2020-12-31")
core_cpi = core_cpi.resample("M").last().squeeze()

# Industrial Productivity
ind_prod = pdr.get_data_fred("INDPRO", "1920-01-01", "2020-12-31")
ind_prod = ind_prod.resample("M").last().squeeze()

# Risk free rate
#ff_factors = pdr.get_data_famafrench("F-F_Research_Data_Factors_daily", start="1920", end="2020-12-31")[0]
#rf = ff_factors["RF"].resample("M").last()
rf = (( 1 + const_mat_1["1986-02-28":]/100) **(1/12)) -1

In [1006]:
rf_mean = (rf[208:]).mean()

In [1008]:
rf.mean()

0.002794169009671416

In [1010]:
# Variable construction:

term = const_mat_10 - const_mat_1
curve = const_mat_10 - 2 * const_mat_5 + const_mat_1
default = aaa - baa

inflation = pd.Series.copy(core_cpi)
for i in range(12):
    inflation[i] = np.nan
for i in range(12,887):
    inflation[i] = np.log(core_cpi[i]) - np.log(core_cpi[i-12])
inflation = inflation.dropna()

variables = pd.DataFrame([term,curve,default,inflation]).T
variables.columns = ["Term", "Curve", "Default", "Inflation"]
common_variables = variables.dropna()
common_variables

Unnamed: 0,Term,Curve,Default,Inflation
1986-01-31,1.51,-0.41,-1.32,0.038966
1986-02-28,0.70,-0.26,-0.95,0.031484
1986-03-31,0.67,-0.27,-1.31,0.021307
1986-04-30,0.84,-0.42,-1.37,0.015763
1986-05-31,1.17,-0.71,-1.32,0.016652
...,...,...,...,...
2020-08-31,0.60,0.28,-1.14,0.013129
2020-09-30,0.57,0.25,-1.13,0.013994
2020-10-31,0.75,0.25,-1.14,0.011964
2020-11-30,0.73,0.23,-0.83,0.011542


In [1012]:
np.log(core_cpi[14]) - np.log(core_cpi[2])

0.06595796779179741

In [1014]:
data = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod]).T
data.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod"]
common_data = data.dropna()
common_data

Unnamed: 0_level_0,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1986-01-31,0.34,1.21,9.08,8.53,7.57,10.05,11.37,6.7,109.900,57.3104
1986-02-28,0.49,7.66,8.13,7.91,7.43,9.67,10.62,7.2,109.700,56.9344
1986-03-31,0.76,5.48,7.39,7.19,6.72,9.00,10.31,7.2,109.100,56.5420
1986-04-30,-0.36,-0.79,7.38,7.17,6.54,8.79,10.16,7.1,108.700,56.5599
1986-05-31,-0.29,5.11,8.05,7.82,6.88,9.09,10.41,7.2,109.000,56.6823
...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.15,2.47,0.66,0.29,0.16,2.44,3.59,11.1,257.214,97.8019
2020-07-31,1.17,5.78,0.55,0.21,0.11,2.14,3.15,10.2,258.723,101.8924
2020-08-31,1.75,7.64,0.72,0.28,0.12,2.25,3.39,8.4,259.681,102.6619
2020-09-30,0.15,-3.62,0.69,0.28,0.12,2.31,3.44,7.8,260.209,102.6008


In [1016]:
import statsmodels.api as sm
common_sample = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod,term,curve,default,inflation]).T
common_sample.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod","Term", "Curve", "Default", "Inflation"]
common_sample = common_sample.dropna()
common_sample = sm.add_constant(common_sample)
common_sample

Unnamed: 0_level_0,const,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod,Term,Curve,Default,Inflation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1986-01-31,1.0,0.34,1.21,9.08,8.53,7.57,10.05,11.37,6.7,109.900,57.3104,1.51,-0.41,-1.32,0.038966
1986-02-28,1.0,0.49,7.66,8.13,7.91,7.43,9.67,10.62,7.2,109.700,56.9344,0.70,-0.26,-0.95,0.031484
1986-03-31,1.0,0.76,5.48,7.39,7.19,6.72,9.00,10.31,7.2,109.100,56.5420,0.67,-0.27,-1.31,0.021307
1986-04-30,1.0,-0.36,-0.79,7.38,7.17,6.54,8.79,10.16,7.1,108.700,56.5599,0.84,-0.42,-1.37,0.015763
1986-05-31,1.0,-0.29,5.11,8.05,7.82,6.88,9.09,10.41,7.2,109.000,56.6823,1.17,-0.71,-1.32,0.016652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.0,1.15,2.47,0.66,0.29,0.16,2.44,3.59,11.1,257.214,97.8019,0.50,0.24,-1.15,0.007070
2020-07-31,1.0,1.17,5.78,0.55,0.21,0.11,2.14,3.15,10.2,258.723,101.8924,0.44,0.24,-1.01,0.010241
2020-08-31,1.0,1.75,7.64,0.72,0.28,0.12,2.25,3.39,8.4,259.681,102.6619,0.60,0.28,-1.14,0.013129
2020-09-30,1.0,0.15,-3.62,0.69,0.28,0.12,2.31,3.44,7.8,260.209,102.6008,0.57,0.25,-1.13,0.013994


Importing useful functions

In [1018]:
## Cross Validation function

def xval_5fold(y, x , random=False, seed = 20201231,fold=5):
    # Use numpy arrays for simplicity
    y = np.asarray(y)
    x = np.asarray(x)
    n = y.shape[0]
    if random:
        # If randomisation is needed, use either the default or provided seed
        rg = np.random.default_rng(seed)
        # Generate a set of index values to use to randomly reorder the data
        # After randomisation, we can use the data as if it is is inorder!
        ind = rg.permutation(np.arange(n))
        y = y[ind]
        x = x[ind]
    # Compute the block size
    block = n / fold
    sse = 0.0
    for i in range(int(fold)):
        # Start and end of each block need to be integers or we lose an observation
        # Rounding ensures that we get all observations since int rounds down
        st = int(np.round(i * block))
        en = int(np.round((i + 1) * block))
        # Construct the indicies of the observations that we leave out
        leave_out = np.r_[st:en]
        # The included are the one that we don't leave out
        include = np.setdiff1d(np.arange(n), leave_out)
        # Compute the regression coefficients
        beta = lstsq(x[include], y[include], rcond=None)[0]
        # Compute the residuals and add to the sse
        resid = y[st:en]- x[st:en] @ beta
        sse += resid @ resid
    return sse

# Randomisation is better when we suspect the model may not be totally stable

In [1020]:
# forward stepwise function: 
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def forward_stepwise(x, y, p,random=False,fold=5):
    included = []

    for i in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
            included.append(next_var)
    
    fsr_sse_sv = {}
    for i in range(1,p+1):
        fsr_sse_sv[i] = xval_5fold(y,x[included[:i]],random=random, fold=fold)
    fsr_sse_sv = pd.Series(fsr_sse_sv)
    forward_step_model = included[:fsr_sse_sv.idxmin()]
    return forward_step_model, pd.Series(fsr_sse_sv), included

In [1231]:
### Best subset regression function

def best_subset(x, y, p, random=False, fold=5):

    """for i in range(1, p+1):
        count = 0
        for comb in combinations(x.columns, i):
            count += 1
            if count > 1:
                break"""

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]
            beta = lstsq(reg, y , rcond=None)[0]
            resid = y - reg @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv

#print(bsr_model)
#bsr_sse_xv

In [1024]:
"""def best_subset(x, y, p, random=False, fold=5):

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]

            sse = xval_5fold(y,reg,random=random,fold=fold)



            #beta = lstsq(reg, y , rcond=None)[0]
            #resid = y - reg @ beta
            #sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv"""

Problem 1, predicting momentum

In [1026]:
# Selecting a model to predict momentum returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y = common_sample["Mom"]["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

y_in = common_sample["Mom"][1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y_oos = common_sample["Mom"][209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

x_scale = x.std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale

In [1028]:
# Best subset model selection, momentum in sample
### Cell takes a minute to run ###

best_subset(x, y, p, random=True, fold=5)

(['const', 'Mom', 'unempl', 'CPI', 'Ind_Prod', 'Term', 'Curve'],
 1     182.073380
 2     182.530588
 3     181.676988
 4     179.827722
 5     179.505054
 6     177.736596
 7     176.909019
 8     177.363586
 9     178.823218
 10    178.823218
 11    179.579983
 12    180.953259
 13    186.216708
 14    187.801429
 15    190.939450
 dtype: float64)

In [1030]:
### Forward stepwise model selection, in sample
forward_stepwise(x, y, p, random=True, fold=5)

(['const', 'Mom'],
 1     186.348963
 2     183.065823
 3     187.853283
 4     188.676536
 5     187.012071
 6     186.617412
 7     189.146196
 8     194.480457
 9     193.659730
 10    194.200241
 11    190.168725
 12    190.168725
 13    190.168725
 14    190.168725
 15    190.939450
 dtype: float64,
 ['const',
  'Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Inflation',
  'Term',
  'Term',
  'Term',
  'Curve',
  'Curve',
  'Default'])

In [1032]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x[['Mom', '5yr', '1yr', 'unempl', 'Curve']].std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale
std_x = std_x[['Mom', '5yr', '1yr', 'unempl', 'Curve']]
#std_x = sm.add_constant(std_x)

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
print(lasso_beta)
print(lasso_cv.intercept_)

#Do we need LASSO if it doesn't fit the oos data well?

Optimal Alpha = 0.0001486989299354116
Mom       0.116958
5yr      -0.485016
1yr       0.459065
unempl    0.056672
Curve    -0.545939
dtype: float64
0.0


In [1034]:
#Lasso evaluation

pred_returns = x_oos[['Mom', '5yr', '1yr', 'unempl', 'Curve']] @ lasso_beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

6.118908762801545

In [1036]:
## Evaluating the model #######

beta = lstsq(x[['const', 'Mom', 'VWM', '10yr', '5yr']],y,rcond=0)[0]

pred_returns = x_oos[['const', 'Mom', 'VWM', '10yr', '5yr']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

14.752098261033542

In [1038]:
beta = lstsq(x[['const', 'Mom', 'VWM', '10yr', '5yr']],y,rcond=0)[0]

pred_returns = x_oos[['const', 'Mom', 'VWM', '10yr', '5yr']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 3
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 1
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 0.5
    else:
        portfolio_weight = -1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

17.52102438881802

In [1040]:
y_in.quantile(.25)

-0.2475

In [1042]:
((1+y_in/100).cumprod()-1)*100

Date
1986-02-28    0.490000
1986-03-31    1.253724
1986-04-30    0.889211
1986-05-31    0.596632
1986-06-30    0.938660
                ...   
2003-01-31    8.958297
2003-02-28    8.446193
2003-03-31    9.335452
2003-04-30    8.985579
2003-05-31    8.506042
Freq: M, Name: Mom, Length: 208, dtype: float64

In [1044]:
y_in

Date
1986-02-28    0.49
1986-03-31    0.76
1986-04-30   -0.36
1986-05-31   -0.29
1986-06-30    0.34
              ... 
2003-01-31    0.82
2003-02-28   -0.47
2003-03-31    0.82
2003-04-30   -0.32
2003-05-31   -0.44
Freq: M, Name: Mom, Length: 208, dtype: float64

In [1046]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -113.1%.

The in sample R2 for the out of sample period is 11.2%.


In [1048]:
### Sharpe ratio
print(strat_returns.std())
print()
print(((strat_returns.mean())-rf.mean())/ strat_returns.std() * np.sqrt(12))
print()
print((y_oos.mean()-rf.mean()) / y_oos.std() * np.sqrt(12))

1.1335711179573302

0.14414484701476496

-0.023147350695825655


Amazing - but now to attempt a replication out of sample

In [1050]:
### Training the data on the first half (208 data points in y)
best_subset(x_in, y_in, p_in, random=True,fold=5)

(['const', 'VWM', '5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Inflation'],
 1     54.401696
 2     53.343351
 3     52.968047
 4     52.742403
 5     51.584008
 6     51.123876
 7     50.985594
 8     50.970142
 9     51.249985
 10    51.414770
 11    51.727225
 12    52.237133
 13    52.632262
 14    53.825214
 15    54.178277
 dtype: float64)

In [1232]:
forward_stepwise(x_in, y_in, p_in, random=True,fold=5)

(['const', 'Mom'],
 1     4751.695148
 2     4630.072718
 3     4662.736970
 4     4699.788618
 5     4772.386611
 6     4798.127880
 7     4788.041046
 8     4839.206875
 9     4852.614280
 10    4909.439909
 11    4919.974820
 12    4919.974820
 13    4919.974820
 14    4919.974820
 15    4940.278993
 dtype: float64,
 ['const',
  'Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Inflation',
  'Term',
  'Term',
  'Term',
  'Curve',
  'Curve',
  'Default'])

In [1052]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x_in[['Mom', 'VWM', 'Ind_Prod', 'Term', 'Curve']].std(ddof=0)
y_scale = y_in.std(ddof=0)
std_x = x_in / x_scale
std_y = y_in / y_scale
std_x = std_x[['Mom', 'VWM', 'Ind_Prod', 'Term', 'Curve']]

lasso_cv = LassoCV(fit_intercept=True)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
print(lasso_beta)
print(lasso_cv.intercept_)

Optimal Alpha = 0.00013919859426949894
Mom         0.084098
VWM        -0.008096
Ind_Prod    0.003999
Term       -0.192899
Curve      -0.339957
dtype: float64
-0.3924503874038243


In [1053]:
### Evaluating the model out of sample with LASSO


pred_returns = x_oos[['Mom', 'VWM', 'Ind_Prod', 'Term', 'Curve']] @ lasso_beta + lasso_cv.intercept_

strat_returns = np.ones(209)
for i in range(209):
    
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 5
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 1
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 0.5
    else:
        portfolio_weight = -3
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

52.61601818905177

In [1054]:
### Evaluating the model out of sample without LASSO

beta = lstsq(x_in[['const', 'VWM', '5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Inflation']],y_in,rcond=0)[0]
pred_returns = x_oos[['const', 'VWM', '5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Inflation']] @ beta
print(beta)
strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 20
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 0.5
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 0.5
    else:
        portfolio_weight = -10
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

[-4.13483697 -0.0103932   0.36545169 -0.36919745  0.51451252 -0.03987749
  0.10421347  8.99584358]


7287.8647157983

In [1055]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -120.2%.

The in sample R2 for the out of sample period is 11.2%.


In [1056]:
### Sharpe ratio
print()
print()
print(((strat_returns.mean())-rf.mean())/ strat_returns.std() * np.sqrt(12))
print()
print((y_oos.mean()-rf.mean()) / y.std() * np.sqrt(12))



0.9239454406754106

-0.027423218821812242


------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------


Problem 1, in sample, predicting Value Weighted Market

In [1233]:
# Selecting a model to predict Value weighted market returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y = common_sample["VWM"]["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

y_in = common_sample["VWM"][1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y_oos = common_sample["VWM"][209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

In [1276]:
# Best subset model selection,
### Cell takes a minute to run ###

best_subset(x,y,p,random=True)

(['const', 'Ind_Prod', 'Curve', 'Inflation'],
 1     8327.472263
 2     8257.845899
 3     8228.621257
 4     8124.477852
 5     8211.041703
 6     8232.513209
 7     8222.062818
 8     8270.586424
 9     8262.895462
 10    8329.127862
 11    8417.319234
 12    8493.365914
 13    8493.365914
 14    8493.365914
 15    8493.365914
 dtype: float64)

In [1230]:
### Forward stepwise model selection
forward_stepwise(x,y,p,random=True)

(['const'],
 1     8390.796395
 2     8395.928270
 3     8454.959129
 4     8503.435443
 5     8512.657727
 6     8522.381187
 7     8556.411433
 8     8757.883165
 9     8586.995943
 10    8545.084258
 11    8547.646298
 12    8547.646298
 13    8547.646298
 14    8547.646298
 15    8493.365914
 dtype: float64,
 ['const',
  'Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Inflation',
  'Term',
  'Curve',
  'Curve',
  'Default'])

In [1277]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x[['Ind_Prod', 'Curve', 'Inflation']].std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale
std_x = std_x[['Ind_Prod', 'Curve', 'Inflation']]

lasso_cv = LassoCV(fit_intercept=True)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

Optimal Alpha = 0.00010554896363647818


Ind_Prod     -0.059635
Curve         1.412042
Inflation   -69.119829
dtype: float64

In [1278]:
#  LASSO evaluation

pred_returns = x_oos[['Ind_Prod', 'Curve', 'Inflation']] @ lasso_beta + lasso_cv.intercept_ 

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

422.4388819372857

In [1268]:
beta = lstsq(x[['const', 'BAA', 'Ind_Prod', 'Curve', 'Inflation']],y,rcond=0)[0]
pred_returns = x_oos[['const', 'BAA', 'Ind_Prod', 'Curve', 'Inflation']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

1327.3971859918458

In [1263]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 12.3%.

The in sample R2 for the out of sample period is 14.4%.


In [1264]:
### Sharpe ratio

print(   (strat_returns.mean()-rf_mean)   / strat_returns.std() * np.sqrt(12)  )
print()
print(   (y_oos.mean()-rf_mean) / y_oos.std() * np.sqrt(12))

0.8862296107589942

0.7254891133486322


---------------------------------------------------------------------------------------------------------------------------
Now to replicate out of sample

In [1065]:
### Training the data on the first half (208 data points in y)
best_subset(x_in,y_in,p_in, random=True,fold=5)

(['Mom', 'unempl', 'Curve'],
 1     4723.290414
 2     4595.401003
 3     4544.949513
 4     4574.479508
 5     4570.871466
 6     4592.310360
 7     4624.707877
 8     4675.351018
 9     4706.860946
 10    4735.931800
 11    4776.837788
 12    4803.018063
 13    4839.705283
 14    4869.355120
 15    4940.278993
 dtype: float64)

In [1206]:
forward_stepwise(x_in,y_in,p_in,random=True,fold=5)

(['const', 'Mom'],
 1     4751.695148
 2     4630.072718
 3     4662.736970
 4     4699.788618
 5     4772.386611
 6     4798.127880
 7     4788.041046
 8     4839.206875
 9     4852.614280
 10    4909.439909
 11    4919.974820
 12    4919.974820
 13    4919.974820
 14    4919.974820
 15    4940.278993
 dtype: float64,
 ['const',
  'Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Inflation',
  'Term',
  'Term',
  'Term',
  'Curve',
  'Curve',
  'Default'])

In [1269]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x_in[['Mom', 'unempl', 'Curve']].std(ddof=0)
y_scale = y_in.std(ddof=0)
std_x = x_in / x_scale
std_y = y_in / y_scale
std_x = std_x[['Mom', 'unempl', 'Curve']]

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

Optimal Alpha = 0.0053641784068227845


Mom      -1.697401
unempl    0.350081
Curve     1.548181
dtype: float64

In [1270]:
#  LASSO evaluation

pred_returns = x_oos[['Mom', 'unempl', 'Curve']] @ lasso_beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

1019.5481641539521

In [1272]:
### Evaluating the model out of sample

beta = lstsq(x_in[['Mom', 'unempl', 'Curve']],y_in,rcond=0)[0]
pred_returns = x_oos[['Mom', 'unempl', 'Curve']] @ beta

#['Mom', 'unempl', 'Curve']

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 0.5
    
    """if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5"""

    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

1035.115850299596

In [1072]:
((1+(y_oos/100)).cumprod()[-1]-1)*100

422.4388819372857

In [1275]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -15.0%.

The in sample R2 for the out of sample period is 14.4%.


In [1274]:
### Sharpe ratio

print(   (strat_returns.mean()-rf_mean)   / strat_returns.std() * np.sqrt(12)  )
print()
print(   (y_oos.mean()-rf_mean) / y_oos.std() * np.sqrt(12))

0.6754921074359403

0.7254891133486322
