In [517]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from numpy.linalg import lstsq
from itertools import combinations
from statsmodels.api import OLS
from sklearn.linear_model import LassoCV, RidgeCV
import statsmodels.api as sm

In [546]:
# Import data
## Very slow cell to run ##

# momentum
momentum = pdr.get_data_famafrench("F-F_Momentum_Factor_daily", start="1920", end="2020-12-31")[0]
momentum = momentum.resample("M").last().squeeze()

#momentum = pd.read_csv("data/F-F_Momentum_Factor.csv",index_col="Date")
#momentum.index = pd.to_datetime(momentum.index,format="%Y%m")
#momentum = momentum.resample("M").last().squeeze()


# Value weighted market
vwm = pd.read_csv("data/VWM.csv",index_col="Date")
vwm.index = pd.to_datetime(vwm.index, format="%Y%m")
vwm = vwm.resample("M").last().squeeze()

# 1, 5 and 10 year constant maturities
const_mat_10 = pdr.get_data_fred("DGS10", "1920-01-01", "2020-12-31")
const_mat_10 = const_mat_10.resample("M").last().squeeze()
const_mat_5 = pdr.get_data_fred("DGS5", "1920-01-01", "2020-12-31")
const_mat_5 = const_mat_5.resample("M").last().squeeze()
const_mat_1 = pdr.get_data_fred("DGS1", "1920-01-01", "2020-12-31")
const_mat_1 = const_mat_1.resample("M").last().squeeze()

# AAA and BAA (Moody's)
aaa = pd.read_csv("data/AAA.csv",parse_dates=True,index_col="Date")
aaa = aaa.resample("M").last().squeeze()
baa = pdr.get_data_fred("DBAA", "1920-01-01", "2020-12-31")
baa = baa.resample("M").last().squeeze()

# Unemployment rate (US)
unrate = pdr.get_data_fred("UNRATE", "1920-01-01", "2020-12-31")
unrate = unrate.resample("M").last().squeeze()

# core CPi US - consumer price index for all urban consumers
core_cpi = pdr.get_data_fred("CPIAUCSL", "1920-01-01", "2020-12-31")
core_cpi = core_cpi.resample("M").last().squeeze()

# Industrial Productivity
ind_prod = pdr.get_data_fred("INDPRO", "1920-01-01", "2020-12-31")
ind_prod = ind_prod.resample("M").last().squeeze()

# Risk free rate
ff_factors = pdr.get_data_famafrench("F-F_Research_Data_Factors_daily", start="1920", end="2020-12-31")[0]
rf = ff_factors["RF"].resample("M").last()
rf = rf["1986-02-28":]

In [547]:
momentum

Date
1926-11-30    0.16
1926-12-31   -0.69
1927-01-31    0.46
1927-02-28    0.29
1927-03-31   -0.15
              ... 
2020-07-31    1.17
2020-08-31    1.75
2020-09-30    0.15
2020-10-31   -2.36
2020-11-30    3.05
Freq: M, Name: Mom   , Length: 1129, dtype: float64

In [520]:
# Variable construction:

term = const_mat_10 - const_mat_1
curve = const_mat_10 - 2 * const_mat_5 + const_mat_1
default = aaa - baa

inflation = pd.Series.copy(core_cpi)
for i in range(12):
    inflation[i] = np.nan
for i in range(12,887):
    inflation[i] = np.log(core_cpi[i]) - np.log(core_cpi[i-12])
inflation = inflation.dropna()

variables = pd.DataFrame([term,curve,default,inflation]).T
variables.columns = ["Term", "Curve", "Default", "Inflation"]
common_variables = variables.dropna()
common_variables

Unnamed: 0,Term,Curve,Default,Inflation
1986-01-31,1.51,-0.41,-1.32,0.038966
1986-02-28,0.70,-0.26,-0.95,0.031484
1986-03-31,0.67,-0.27,-1.31,0.021307
1986-04-30,0.84,-0.42,-1.37,0.015763
1986-05-31,1.17,-0.71,-1.32,0.016652
...,...,...,...,...
2020-08-31,0.60,0.28,-1.14,0.013129
2020-09-30,0.57,0.25,-1.13,0.013994
2020-10-31,0.75,0.25,-1.14,0.011964
2020-11-30,0.73,0.23,-0.83,0.011542


In [521]:
np.log(core_cpi[14]) - np.log(core_cpi[2])

0.06595796779179741

In [522]:
data = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod]).T
data.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod"]
common_data = data.dropna()
common_data

Unnamed: 0_level_0,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1986-01-31,2.96,1.21,9.08,8.53,7.57,10.05,11.37,6.7,109.900,57.3104
1986-02-28,2.78,7.66,8.13,7.91,7.43,9.67,10.62,7.2,109.700,56.9344
1986-03-31,2.45,5.48,7.39,7.19,6.72,9.00,10.31,7.2,109.100,56.5420
1986-04-30,-0.50,-0.79,7.38,7.17,6.54,8.79,10.16,7.1,108.700,56.5599
1986-05-31,2.02,5.11,8.05,7.82,6.88,9.09,10.41,7.2,109.000,56.6823
...,...,...,...,...,...,...,...,...,...,...
2020-06-30,-0.75,2.47,0.66,0.29,0.16,2.44,3.59,11.1,257.214,97.8019
2020-07-31,7.61,5.78,0.55,0.21,0.11,2.14,3.15,10.2,258.723,101.8924
2020-08-31,0.51,7.64,0.72,0.28,0.12,2.25,3.39,8.4,259.681,102.6619
2020-09-30,3.05,-3.62,0.69,0.28,0.12,2.31,3.44,7.8,260.209,102.6008


In [548]:
import statsmodels.api as sm
common_sample = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod,term,curve,default,inflation]).T
common_sample.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod","Term", "Curve", "Default", "Inflation"]
common_sample = common_sample.dropna()
common_sample = sm.add_constant(common_sample)
common_sample

Unnamed: 0_level_0,const,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod,Term,Curve,Default,Inflation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1986-01-31,1.0,0.34,1.21,9.08,8.53,7.57,10.05,11.37,6.7,109.900,57.3104,1.51,-0.41,-1.32,0.038966
1986-02-28,1.0,0.49,7.66,8.13,7.91,7.43,9.67,10.62,7.2,109.700,56.9344,0.70,-0.26,-0.95,0.031484
1986-03-31,1.0,0.76,5.48,7.39,7.19,6.72,9.00,10.31,7.2,109.100,56.5420,0.67,-0.27,-1.31,0.021307
1986-04-30,1.0,-0.36,-0.79,7.38,7.17,6.54,8.79,10.16,7.1,108.700,56.5599,0.84,-0.42,-1.37,0.015763
1986-05-31,1.0,-0.29,5.11,8.05,7.82,6.88,9.09,10.41,7.2,109.000,56.6823,1.17,-0.71,-1.32,0.016652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.0,1.15,2.47,0.66,0.29,0.16,2.44,3.59,11.1,257.214,97.8019,0.50,0.24,-1.15,0.007070
2020-07-31,1.0,1.17,5.78,0.55,0.21,0.11,2.14,3.15,10.2,258.723,101.8924,0.44,0.24,-1.01,0.010241
2020-08-31,1.0,1.75,7.64,0.72,0.28,0.12,2.25,3.39,8.4,259.681,102.6619,0.60,0.28,-1.14,0.013129
2020-09-30,1.0,0.15,-3.62,0.69,0.28,0.12,2.31,3.44,7.8,260.209,102.6008,0.57,0.25,-1.13,0.013994


Importing useful functions

In [524]:
## Cross Validation function

def xval_5fold(y, x , random=False, seed = 20201231,fold=5):
    # Use numpy arrays for simplicity
    y = np.asarray(y)
    x = np.asarray(x)
    n = y.shape[0]
    if random:
        # If randomisation is needed, use either the default or provided seed
        rg = np.random.default_rng(seed)
        # Generate a set of index values to use to randomly reorder the data
        # After randomisation, we can use the data as if it is is inorder!
        ind = rg.permutation(np.arange(n))
        y = y[ind]
        x = x[ind]
    # Compute the block size
    block = n / fold
    sse = 0.0
    for i in range(int(fold)):
        # Start and end of each block need to be integers or we lose an observation
        # Rounding ensures that we get all observations since int rounds down
        st = int(np.round(i * block))
        en = int(np.round((i + 1) * block))
        # Construct the indicies of the observations that we leave out
        leave_out = np.r_[st:en]
        # The included are the one that we don't leave out
        include = np.setdiff1d(np.arange(n), leave_out)
        # Compute the regression coefficients
        beta = lstsq(x[include], y[include], rcond=None)[0]
        # Compute the residuals and add to the sse
        resid = y[st:en]- x[st:en] @ beta
        sse += resid @ resid
    return sse

# Randomisation is better when we suspect the model may not be totally stable

In [525]:
# forward stepwise function: 
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def forward_stepwise(x, y, p,random=False,fold=5):
    included = []

    for i in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
            included.append(next_var)
    
    fsr_sse_sv = {}
    for i in range(1,p+1):
        fsr_sse_sv[i] = xval_5fold(y,x[included[:i]],random=random, fold=fold)
    fsr_sse_sv = pd.Series(fsr_sse_sv)
    forward_step_model = included[:fsr_sse_sv.idxmin()]
    return forward_step_model, pd.Series(fsr_sse_sv), included

In [526]:
### Best subset regression function

def best_subset(x, y, p, random=False, fold=5):

    """for i in range(1, p+1):
        count = 0
        for comb in combinations(x.columns, i):
            count += 1
            if count > 1:
                break"""

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]
            beta = lstsq(reg, y , rcond=None)[0]
            resid = y - reg @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv, best_models

#print(bsr_model)
#bsr_sse_xv

In [527]:
def best_subset(x, y, p, random=False, fold=5):

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]

            sse = xval_5fold(y,reg,random=random,fold=fold)



            #beta = lstsq(reg, y , rcond=None)[0]
            #resid = y - reg @ beta
            #sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv

Problem 1, predicting momentum

In [550]:
# Selecting a model to predict momentum returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y = common_sample["Mom"]["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

y_in = common_sample["Mom"][1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y_oos = common_sample["Mom"][209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

x_scale = x.std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale

In [553]:
# Best subset model selection, momentum in sample
### Cell takes a minute to run ###

best_subset(x, y, p, random=False, fold=5)

(['Mom', '5yr', '1yr', 'unempl', 'Curve'],
 1     182.510349
 2     183.106176
 3     180.947733
 4     179.647622
 5     178.438102
 6     178.438102
 7     178.438102
 8     178.856467
 9     179.632685
 10    180.414190
 11    184.609070
 12    185.451034
 13    188.895037
 14    195.316071
 15    216.566010
 dtype: float64)

In [552]:
### Forward stepwise model selection, in sample
forward_stepwise(x, y, p, random=False, fold=5)

(['const', 'Mom', 'VWM', '10yr', '5yr'],
 1     187.397205
 2     183.777992
 3     185.468864
 4     187.059188
 5     183.310086
 6     185.238623
 7     187.236003
 8     187.676343
 9     190.765506
 10    193.271479
 11    215.554859
 12    215.554859
 13    215.554859
 14    215.554859
 15    216.566010
 dtype: float64,
 ['const',
  'Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Inflation',
  'Term',
  'Term',
  'Term',
  'Curve',
  'Curve',
  'Default'])

In [531]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x[['10yr', '5yr', 'unempl', 'Default']].std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale
std_x = std_x[['10yr', '5yr', 'unempl', 'Default']]
#std_x = sm.add_constant(std_x)

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

#Do we need LASSO if it doesn't fit the oos data well?

Optimal Alpha = 0.0029348317308275106


10yr       0.080408
5yr        0.153856
unempl     0.153849
Default    1.605512
dtype: float64

In [532]:
#Lasso evaluation

pred_returns = x_oos[['10yr', '5yr', 'unempl', 'Default']] @ lasso_beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

13.978510210129835

In [533]:
## Evaluating the model #######

beta = lstsq(x[['5yr', 'BAA', 'Ind_Prod', 'Term']],y,rcond=0)[0]

pred_returns = x_oos[['5yr', 'BAA', 'Ind_Prod', 'Term']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

63.92728815079478

In [534]:
beta = lstsq(x[['10yr', '5yr', 'unempl', 'Default']],y,rcond=0)[0]

pred_returns = x_oos[['10yr', '5yr', 'unempl', 'Default']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

13.978510210129835

In [535]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 4.3%.

The in sample R2 for the out of sample period is 14.9%.


In [536]:
### Sharpe ratio
print(strat_returns.std())
print()
print(((strat_returns.mean())-rf.mean())/ strat_returns.std())
print()
print((y_oos.mean()-rf.mean()) / y_oos.std())

2.160565093780198

0.03465002585626911

0.03329865891513081


Amazing - but now to attempt a replication out of sample

In [554]:
### Training the data on the first half (208 data points in y)
best_subset(x_in, y_in, p_in, random=False,fold=5)

(['const', 'Mom', 'VWM', 'Ind_Prod', 'Term', 'Curve'],
 1     54.756031
 2     54.468678
 3     53.950066
 4     53.739105
 5     53.601216
 6     53.587857
 7     53.781846
 8     53.781846
 9     54.075157
 10    54.492795
 11    55.002823
 12    55.822320
 13    59.370418
 14    64.738893
 15    71.169348
 dtype: float64)

In [555]:
### Training the data on the first half (208 data points in y)
best_subset(x_in, y_in, p_in, random=True,fold=5)

(['const', 'VWM', '5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Inflation'],
 1     54.401696
 2     53.343351
 3     52.968047
 4     52.742403
 5     51.584008
 6     51.123876
 7     50.985594
 8     50.970142
 9     51.249985
 10    51.414770
 11    51.727225
 12    52.237133
 13    52.632262
 14    53.825214
 15    54.178277
 dtype: float64)

In [556]:
forward_stepwise(x_in, y_in, p_in, random=False,fold=5)

(['const', 'Mom', 'VWM', '10yr', '5yr', '1yr'],
 1     55.435265
 2     54.776108
 3     54.764475
 4     55.034073
 5     54.493515
 6     53.995281
 7     55.687199
 8     56.030801
 9     56.084036
 10    59.370418
 11    72.970732
 12    72.970732
 13    72.970732
 14    72.970732
 15    71.169348
 dtype: float64,
 ['const',
  'Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Ind_Prod',
  'Inflation',
  'Term',
  'Curve',
  'Curve',
  'Default'])

In [540]:
"""### LASSO estimation
# initialise the Lasso CV model

x_scale = x_in[['const', '1yr', 'unempl', 'CPI', 'Ind_Prod', 'Default']].std(ddof=0)
y_scale = y_in.std(ddof=0)
std_x = x_in / x_scale
std_y = y_in / y_scale
std_x = std_x[['const', '1yr', 'unempl', 'CPI', 'Ind_Prod', 'Default']]

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta"""

'### LASSO estimation\n# initialise the Lasso CV model\n\nx_scale = x_in[[\'const\', \'1yr\', \'unempl\', \'CPI\', \'Ind_Prod\', \'Default\']].std(ddof=0)\ny_scale = y_in.std(ddof=0)\nstd_x = x_in / x_scale\nstd_y = y_in / y_scale\nstd_x = std_x[[\'const\', \'1yr\', \'unempl\', \'CPI\', \'Ind_Prod\', \'Default\']]\n\nlasso_cv = LassoCV(fit_intercept=False)\nlasso_cv = lasso_cv.fit(std_x,std_y)\nprint(f"Optimal Alpha = {lasso_cv.alpha_}")\nlasso_beta = lasso_cv.coef_  * (y_scale / x_scale)\nlasso_beta'

In [542]:
### Evaluating the model out of sample with LASSO
"""
beta = lstsq(x_in[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']],y_in,rcond=0)[0]
pred_returns = x_oos[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']] @ lasso_beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]"""

"\nbeta = lstsq(x_in[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']],y_in,rcond=0)[0]\npred_returns = x_oos[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']] @ lasso_beta\n\nstrat_returns = np.ones(209)\nfor i in range(209):\n    if pred_returns[i] > y_in.mean():\n        portfolio_weight = 1.5\n    else:\n        portfolio_weight = 0.5\n    \n    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]\n\n\ncum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100\ncum_strat_returns[-1]"

In [557]:
### Evaluating the model out of sample without LASSO

beta = lstsq(x_in[['const', 'VWM', '5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Inflation']],y_in,rcond=0)[0]
pred_returns = x_oos[['const', 'VWM', '5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Inflation']] @ beta
print(beta)
strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

[-4.13483697 -0.0103932   0.36545169 -0.36919745  0.51451252 -0.03987749
  0.10421347  8.99584358]


12.198549686559957

In [443]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -108.4%.

The in sample R2 for the out of sample period is 11.2%.


In [444]:
### Sharpe ratio

print(((strat_returns.mean())-rf.mean())/ strat_returns.std())
print()
print((y_oos.mean()-rf.mean()) / y.std())

0.0610941576293837

-0.021784813659725295


------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------


Problem 1, in sample, predicting Value Weighted Market

In [307]:
# Selecting a model to predict Value weighted market returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y = common_sample["VWM"]["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

y_in = common_sample["VWM"][1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y_oos = common_sample["VWM"][209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

In [308]:
# Best subset model selection,
### Cell takes a minute to run ###

best_subset(x,y,p)

(['1yr', 'unempl', 'Inflation'],
 1     8357.986250
 2     8299.554460
 3     8293.245607
 4     8364.744233
 5     8353.987520
 6     8600.669364
 7     8744.606539
 8     8838.360279
 9     8915.064758
 10    9093.463106
 11    9885.708087
 12    9885.708087
 13    9885.708087
 14    9885.708087
 dtype: float64)

In [309]:
### Forward stepwise model selection
forward_stepwise(x,y,p)

(['Mom', 'VWM', '10yr'],
 1      8846.154957
 2      8799.955431
 3      8642.002909
 4      8647.027271
 5      8819.968826
 6      8985.431033
 7      9259.600811
 8      9086.398355
 9      9247.723995
 10    10547.469697
 11    10547.469697
 12     9885.708087
 13     9885.708087
 14     9885.708087
 dtype: float64,
 ['Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Term',
  'Inflation',
  'Curve',
  'Default'])

In [310]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x[['1yr', 'unempl', 'Inflation']].std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale
std_x = std_x[['1yr', 'unempl', 'Inflation']]

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

Optimal Alpha = 0.0008438441678212342


1yr           0.275359
unempl        0.274736
Inflation   -64.704897
dtype: float64

In [311]:
#  LASSO evaluation

pred_returns = x_oos[['1yr', 'unempl', 'Inflation']] @ lasso_beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

539.0374879996183

In [312]:
beta = lstsq(x[['1yr', 'unempl', 'Inflation']],y,rcond=0)[0]
pred_returns = x_oos[['1yr', 'unempl', 'Inflation']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

539.0374879996183

In [313]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 10.6%.

The in sample R2 for the out of sample period is 14.8%.


In [314]:
### Sharpe ratio

print(((strat_returns.mean())-rf.mean())/ strat_returns.std())
print()
print((y_oos.mean()-rf.mean()) / y.std())

0.23214144249636384

0.19433975856468655


---------------------------------------------------------------------------------------------------------------------------
Now to replicate out of sample

In [315]:
### Training the data on the first half (208 data points in y)
best_subset(x_in,y_in,p_in, random=True,fold=5)

(['Mom', 'unempl', 'Curve'],
 1     4723.290414
 2     4595.401003
 3     4544.949513
 4     4592.170260
 5     4590.924004
 6     4683.694311
 7     4689.139039
 8     4710.483300
 9     4784.698767
 10    4796.941527
 11    4825.331600
 12    4825.331600
 13    4825.331600
 14    4825.331600
 dtype: float64)

In [316]:
forward_stepwise(x_in,y_in,p_in,random=True,fold=5)

(['Mom', 'VWM', '10yr'],
 1     4807.915063
 2     4822.116671
 3     4654.563908
 4     4685.377330
 5     4679.082366
 6     4752.488538
 7     4780.496079
 8     4761.567120
 9     4804.726211
 10    4810.196630
 11    4810.196630
 12    4825.331600
 13    4825.331600
 14    4825.331600
 dtype: float64,
 ['Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Default',
  'Inflation',
  'Term',
  'Curve'])

In [317]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x_in[['Mom', 'unempl', 'Curve']].std(ddof=0)
y_scale = y_in.std(ddof=0)
std_x = x_in / x_scale
std_y = y_in / y_scale
std_x = std_x[['Mom', 'unempl', 'Curve']]

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

Optimal Alpha = 0.0053641784068227845


Mom      -1.697401
unempl    0.350081
Curve     1.548181
dtype: float64

In [318]:
#  LASSO evaluation

pred_returns = x_oos[['Mom', 'unempl', 'Curve']] @ lasso_beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

673.4066355281525

In [319]:
### Evaluating the model out of sample

beta = lstsq(x_in[['Mom', 'unempl', 'Curve']],y_in,rcond=0)[0]
pred_returns = x_oos[['Mom', 'unempl', 'Curve']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

673.4066355281525

In [320]:
((1+(y_oos/100)).cumprod()[-1]-1)*100

422.4388819372857

In [321]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -15.0%.

The in sample R2 for the out of sample period is 14.8%.


In [322]:
### Sharpe ratio

print(((strat_returns.mean())-rf.mean())/ strat_returns.std())
print()
print((y_oos.mean()-rf.mean()) / y.std())

0.19945127344923944

0.19433975856468655
