In [114]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from numpy.linalg import lstsq
from itertools import combinations
from statsmodels.api import OLS
from sklearn.linear_model import LassoCV, RidgeCV

In [20]:
# Import data
## Very slow cell to run ##

# momentum
momentum = pdr.get_data_famafrench("F-F_Momentum_Factor_daily", start="1920", end="2020-12-31")[0]
momentum = momentum.resample("M").last().squeeze()


# Value weighted market
vwm = pd.read_csv("data/VWM.csv",index_col="Date")
vwm.index = pd.to_datetime(vwm.index, format="%Y%m")
vwm = vwm.resample("M").last().squeeze()

# 1, 5 and 10 year constant maturities
const_mat_10 = pdr.get_data_fred("DGS10", "1920-01-01", "2020-12-31")
const_mat_10 = const_mat_10.resample("M").last().squeeze()
const_mat_5 = pdr.get_data_fred("DGS5", "1920-01-01", "2020-12-31")
const_mat_5 = const_mat_5.resample("M").last().squeeze()
const_mat_1 = pdr.get_data_fred("DGS1", "1920-01-01", "2020-12-31")
const_mat_1 = const_mat_1.resample("M").last().squeeze()

# AAA and BAA (Moody's)
aaa = pdr.get_data_fred("DAAA", "1920-01-01", "2020-12-31")
aaa = aaa.resample("M").last().squeeze()
baa = pdr.get_data_fred("DBAA", "1920-01-01", "2020-12-31")
baa = baa.resample("M").last().squeeze()

# Unemployment rate (US)
unrate = pdr.get_data_fred("UNRATE", "1920-01-01", "2020-12-31")
unrate = unrate.resample("M").last().squeeze()

# core CPi US - consumer price index for all urban consumers
core_cpi = pdr.get_data_fred("CPIAUCSL", "1920-01-01", "2020-12-31")
core_cpi = core_cpi.resample("M").last().squeeze()

# Industrial Productivity
ind_prod = pdr.get_data_fred("INDPRO", "1920-01-01", "2020-12-31")
ind_prod = ind_prod.resample("M").last().squeeze()

# Risk free rate
ff_factors = pdr.get_data_famafrench("F-F_Research_Data_Factors_daily", start="1920", end="2020-12-31")[0]
rf = ff_factors["RF"].resample("M").last()
rf = rf["1986-02-28":]

In [21]:
# Variable construction:

term = const_mat_10 - const_mat_1
curve = const_mat_10 - 2 * const_mat_5 + const_mat_1
default = aaa - baa

inflation = pd.Series.copy(core_cpi)
for i in range(12):
    inflation[i] = np.nan
for i in range(12,887):
    inflation[i] = np.log(core_cpi[i]) - np.log(core_cpi[i-12])
inflation = inflation.dropna()

variables = pd.DataFrame([term,curve,default,inflation]).T
variables.columns = ["Term", "Curve", "Default", "Inflation"]
common_variables = variables.dropna()
common_variables

Unnamed: 0_level_0,Term,Curve,Default,Inflation
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986-01-31,1.51,-0.41,-1.40,0.038966
1986-02-28,0.70,-0.26,-1.49,0.031484
1986-03-31,0.67,-0.27,-1.58,0.021307
1986-04-30,0.84,-0.42,-1.22,0.015763
1986-05-31,1.17,-0.71,-1.17,0.016652
...,...,...,...,...
2020-07-31,0.44,0.24,-1.12,0.010241
2020-08-31,0.60,0.28,-0.97,0.013129
2020-09-30,0.57,0.25,-1.12,0.013994
2020-10-31,0.75,0.25,-1.10,0.011964


In [22]:
np.log(core_cpi[14]) - np.log(core_cpi[2])

0.06595796779179741

In [23]:
data = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod]).T
data.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod"]
common_data = data.dropna()
common_data

Unnamed: 0_level_0,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1986-01-31,0.34,1.21,9.08,8.53,7.57,9.97,11.37,6.7,109.900,57.3104
1986-02-28,0.49,7.66,8.13,7.91,7.43,9.13,10.62,7.2,109.700,56.9344
1986-03-31,0.76,5.48,7.39,7.19,6.72,8.73,10.31,7.2,109.100,56.5420
1986-04-30,-0.36,-0.79,7.38,7.17,6.54,8.94,10.16,7.1,108.700,56.5599
1986-05-31,-0.29,5.11,8.05,7.82,6.88,9.24,10.41,7.2,109.000,56.6823
...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.15,2.47,0.66,0.29,0.16,2.34,3.59,11.1,257.214,97.8019
2020-07-31,1.17,5.78,0.55,0.21,0.11,2.03,3.15,10.2,258.723,101.8924
2020-08-31,1.74,7.64,0.72,0.28,0.12,2.42,3.39,8.4,259.681,102.6619
2020-09-30,0.16,-3.62,0.69,0.28,0.12,2.32,3.44,7.9,260.209,102.6008


In [24]:
common_sample = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod,term,curve,default,inflation]).T
common_sample.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod","Term", "Curve", "Default", "Inflation"]
common_sample = common_sample.dropna()
common_sample

Unnamed: 0_level_0,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod,Term,Curve,Default,Inflation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1986-01-31,0.34,1.21,9.08,8.53,7.57,9.97,11.37,6.7,109.900,57.3104,1.51,-0.41,-1.40,0.038966
1986-02-28,0.49,7.66,8.13,7.91,7.43,9.13,10.62,7.2,109.700,56.9344,0.70,-0.26,-1.49,0.031484
1986-03-31,0.76,5.48,7.39,7.19,6.72,8.73,10.31,7.2,109.100,56.5420,0.67,-0.27,-1.58,0.021307
1986-04-30,-0.36,-0.79,7.38,7.17,6.54,8.94,10.16,7.1,108.700,56.5599,0.84,-0.42,-1.22,0.015763
1986-05-31,-0.29,5.11,8.05,7.82,6.88,9.24,10.41,7.2,109.000,56.6823,1.17,-0.71,-1.17,0.016652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.15,2.47,0.66,0.29,0.16,2.34,3.59,11.1,257.214,97.8019,0.50,0.24,-1.25,0.007070
2020-07-31,1.17,5.78,0.55,0.21,0.11,2.03,3.15,10.2,258.723,101.8924,0.44,0.24,-1.12,0.010241
2020-08-31,1.74,7.64,0.72,0.28,0.12,2.42,3.39,8.4,259.681,102.6619,0.60,0.28,-0.97,0.013129
2020-09-30,0.16,-3.62,0.69,0.28,0.12,2.32,3.44,7.9,260.209,102.6008,0.57,0.25,-1.12,0.013994


Importing useful functions

In [25]:
## Cross Validation function

def xval_5fold(y, x , random=False, seed = 20201231,fold=5):
    # Use numpy arrays for simplicity
    y = np.asarray(y)
    x = np.asarray(x)
    n = y.shape[0]
    if random:
        # If randomisation is needed, use either the default or provided seed
        rg = np.random.default_rng(seed)
        # Generate a set of index values to use to randomly reorder the data
        # After randomisation, we can use the data as if it is is inorder!
        ind = rg.permutation(np.arange(n))
        y = y[ind]
        x = x[ind]
    # Compute the block size
    block = n / fold
    sse = 0.0
    for i in range(int(fold)):
        # Start and end of each block need to be integers or we lose an observation
        # Rounding ensures that we get all observations since int rounds down
        st = int(np.round(i * block))
        en = int(np.round((i + 1) * block))
        # Construct the indicies of the observations that we leave out
        leave_out = np.r_[st:en]
        # The included are the one that we don't leave out
        include = np.setdiff1d(np.arange(n), leave_out)
        # Compute the regression coefficients
        beta = lstsq(x[include], y[include], rcond=None)[0]
        # Compute the residuals and add to the sse
        resid = y[st:en]- x[st:en] @ beta
        sse += resid @ resid
    return sse

# Randomisation is better when we suspect the model may not be totally stable

In [26]:
# forward stepwise function: 
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def forward_stepwise(x, y, p,random=False,fold=5):
    included = []

    for i in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
                included.append(next_var)
    
    fsr_sse_sv = {}
    for i in range(1,p+1):
        fsr_sse_sv[i] = xval_5fold(y,x[included[:i]],random=random, fold=fold)
    fsr_sse_sv = pd.Series(fsr_sse_sv)
    forward_step_model = included[:fsr_sse_sv.idxmin()]
    return forward_step_model, pd.Series(fsr_sse_sv), included

In [27]:
### Best subset regression function

def best_subset(x, y, p, random=False, fold=5):

    for i in range(1, p+1):
        count = 0
        for comb in combinations(x.columns, i):
            count += 1
            if count > 1:
                break

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]
            beta = lstsq(reg, y , rcond=None)[0]
            resid = y - reg @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv

#print(bsr_model)
#bsr_sse_xv

Problem 1, predicting momentum

In [132]:
# Selecting a model to predict momentum returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y = common_sample["Mom"]["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

y_in = common_sample["Mom"][1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y_oos = common_sample["Mom"][209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

x_scale = x.std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale


In [66]:
# Best subset model selection, momentum in sample
### Cell takes a minute to run ###

best_subset(x, y, p, random=False, fold=5)

(['10yr', '5yr', 'unempl', 'Default'],
 1     182.340085
 2     183.163474
 3     181.002456
 4     179.000691
 5     197.627927
 6     192.826593
 7     189.923527
 8     189.294736
 9     189.867361
 10    192.249382
 11    194.681215
 12    194.681215
 13    194.681215
 14    194.681215
 dtype: float64)

In [64]:
### Forward stepwise model selection, in sample
forward_stepwise(x, y, p, random=False, fold=5)

(['Mom', 'VWM', '10yr', '5yr', '1yr', 'AAA', 'BAA', 'unempl'],
 1     182.340085
 2     183.507617
 3     185.034513
 4     184.442231
 5     184.916710
 6     184.384152
 7     184.336901
 8     180.665378
 9     184.632058
 10    192.762705
 11    192.762705
 12    194.681215
 13    194.681215
 14    194.681215
 dtype: float64,
 ['Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Term',
  'Inflation',
  'Curve',
  'Default'])

In [133]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x[['10yr', '5yr', 'unempl', 'Default']].std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale
std_x = std_x[['10yr', '5yr', 'unempl', 'Default']]

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

Optimal Alpha = 0.00010679033922355593


10yr      -0.446567
5yr        0.431619
unempl     0.101191
Default    0.296963
dtype: float64

In [79]:
beta

array([-0.50016993,  0.48144405,  0.10839534,  0.29559332])

In [134]:
beta = lstsq(x[['10yr', '5yr', 'unempl', 'Default']],y,rcond=0)[0]

pred_returns = x_oos[['10yr', '5yr', 'unempl', 'Default']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

13.748104452210107

In [127]:
cum_strat_returns[-1]

19.398056515096517

In [130]:
strat_returns.std()

0.683087275253292

In [131]:
((cum_strat_returns[-1]/100)-rf.mean())/ strat_returns.std()

0.18358460251874528

In [35]:
(((((1+(y_oos/100)).cumprod()-1))[-1])-rf.mean())/ y_oos.std()

-0.029648874231109585

In [135]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 9.1%.

The in sample R2 for the out of sample period is 11.0%.


Amazing - but now to attempt a replication out of sample

In [96]:
### Training the data on the first half (208 data points in y)
best_subset(x_in, y_in, p_in, random=True,fold=5)

(['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod'],
 1     54.401696
 2     53.343351
 3     54.208793
 4     53.182563
 5     52.979963
 6     52.677767
 7     53.390887
 8     52.998369
 9     53.346552
 10    54.759236
 11    56.640050
 12    56.640050
 13    56.640050
 14    56.640050
 dtype: float64)

In [95]:
forward_stepwise(x_in, y_in, p_in, random=True,fold=5)

(['Mom', 'VWM', '10yr', '5yr'],
 1     54.401696
 2     54.790824
 3     55.103541
 4     54.345972
 5     55.200944
 6     55.712932
 7     56.640742
 8     58.628300
 9     58.955278
 10    56.950648
 11    56.950648
 12    56.640050
 13    56.640050
 14    56.640050
 dtype: float64,
 ['Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Term',
  'Inflation',
  'Curve',
  'Default'])

In [97]:
### LASSO estimation
# initialise the Lasso CV model

x_scale = x_in[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']].std(ddof=0)
y_scale = y_in.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale
std_x = std_x[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']]

lasso_cv = LassoCV(fit_intercept=False)
lasso_cv = lasso_cv.fit(std_x,std_y)
print(f"Optimal Alpha = {lasso_cv.alpha_}")
lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
lasso_beta

Optimal Alpha = 0.2702617481343985


VWM         0.0
10yr        0.0
BAA         0.0
unempl      0.0
CPI         0.0
Ind_Prod    0.0
dtype: float64

In [149]:
### Evaluating the model out of sample

beta = lstsq(x_in[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']],y_in,rcond=0)[0]
pred_returns = x_oos[['VWM', '10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

10.677780642598588

In [40]:
(1 + strat_returns/100).cumprod()[-1]-1

0.10677780642598589

In [41]:
strat_returns.std()

0.49385887245944676

In [42]:
((cum_strat_returns[-1]/100)-rf.mean())/ strat_returns.std()

0.19175734829923413

In [43]:
(((1+(y_oos/100)).cumprod()[-1]-1)-rf.mean()) / y_oos.std()

-0.029648874231109585

In [101]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -80.6%.

The in sample R2 for the out of sample period is 11.0%.


In [45]:
oos_sse

234.44105172558352

------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------


Problem 1, in sample, predicting Value Weighted Market

In [150]:
# Selecting a model to predict Value weighted market returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y = common_sample["VWM"]["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

y_in = common_sample["VWM"][1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

y_oos = common_sample["VWM"][209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

In [47]:
# Best subset model selection,
### Cell takes a minute to run ###

best_subset(x,y,p)

(['1yr', 'unempl', 'Inflation'],
 1     8358.412126
 2     8300.299283
 3     8294.052150
 4     8365.302367
 5     8355.186006
 6     8601.719588
 7     8744.172252
 8     8837.944410
 9     9517.177930
 10    9577.530606
 11    9885.568453
 12    9885.568453
 13    9885.568453
 14    9885.568453
 dtype: float64)

In [48]:
### Forward stepwise model selection
forward_stepwise(x,y,p)

(['Mom', 'VWM', '10yr'],
 1      8846.187187
 2      8799.999413
 3      8642.075091
 4      8647.099104
 5      8820.054162
 6      8985.518802
 7      9259.716664
 8      9087.161438
 9      9248.401769
 10    10550.409232
 11    10550.409232
 12     9885.568453
 13     9885.568453
 14     9885.568453
 dtype: float64,
 ['Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Curve',
  'Inflation',
  'Term',
  'Default'])

In [156]:
beta = lstsq(x[['1yr', 'unempl', 'Inflation']],y,rcond=0)[0]
pred_returns = x_oos[['1yr', 'unempl', 'Inflation']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

531.6911469437641

In [152]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 10.6%.

The in sample R2 for the out of sample period is 14.7%.


In [153]:
### Sharpe ratio
print(((cum_strat_returns[-1]/100)-rf.mean())/ strat_returns.std())
print()
print((((1+(y_oos/100)).cumprod()[-1]-1) - rf.mean()) / y_oos.std())

1.2760645661537844

0.9993963757464968


In [52]:
### Training the data on the first half (208 data points in y)
best_subset(x_in,y_in,p_in, random=True,fold=5)

(['Mom', 'unempl', 'Curve'],
 1     4723.290414
 2     4595.401003
 3     4544.949513
 4     4592.170260
 5     4590.924004
 6     4683.694311
 7     4689.139039
 8     4710.483300
 9     4784.698767
 10    4796.941527
 11    4825.331600
 12    4825.331600
 13    4825.331600
 14    4825.331600
 dtype: float64)

In [53]:
forward_stepwise(x_in,y_in,p_in,random=True,fold=5)

(['Mom', 'VWM', '10yr'],
 1     4807.915063
 2     4822.116671
 3     4654.563908
 4     4685.377330
 5     4679.082366
 6     4752.488538
 7     4780.496079
 8     4761.567120
 9     4804.726211
 10    4810.196630
 11    4810.196630
 12    4825.331600
 13    4825.331600
 14    4825.331600
 dtype: float64,
 ['Mom',
  'VWM',
  '10yr',
  '5yr',
  '1yr',
  'AAA',
  'BAA',
  'unempl',
  'CPI',
  'Ind_Prod',
  'Default',
  'Inflation',
  'Term',
  'Curve'])

In [154]:
### Evaluating the model out of sample

beta = lstsq(x_in[['Mom', 'unempl', 'Curve']],y_in,rcond=0)[0]
pred_returns = x_oos[['Mom', 'unempl', 'Curve']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

673.4066355281525

In [55]:
((1+(y_oos/100)).cumprod()[-1]-1)*100

422.4388819372857

In [56]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -15.1%.

The in sample R2 for the out of sample period is 14.7%.


In [57]:
### Sharpe ratio
print(((cum_strat_returns[-1]/100)-rf.mean())/ strat_returns.std())
print()
print((((1+(y_oos/100)).cumprod()[-1]-1) - rf.mean()) / y_oos.std())

1.1774668498815901

0.9993963757464968
