In [1486]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from numpy.linalg import lstsq
from itertools import combinations
from statsmodels.api import OLS
from sklearn.linear_model import LassoCV, RidgeCV
import statsmodels.api as sm
from scipy import stats

In [1487]:
# Import data
## slow cell to run ##

# momentum
momentum = pdr.get_data_famafrench("F-F_Momentum_Factor_daily", start="1920", end="2020-12-31")[0]
momentum = momentum.resample("M").last().squeeze()
momentum = momentum["1986-01-31":]

#momentum = pd.read_csv("data/F-F_Momentum_Factor.csv",index_col="Date")
#momentum.index = pd.to_datetime(momentum.index,format="%Y%m")
#momentum = momentum.resample("M").last().squeeze()


# Value weighted market
vwm = pd.read_csv("data/VWM.csv",index_col="Date")
vwm.index = pd.to_datetime(vwm.index, format="%Y%m")
vwm = vwm.resample("M").last().squeeze()
vwm = vwm["1986-01-31":]

# 1, 5 and 10 year constant maturities
const_mat_10 = pdr.get_data_fred("DGS10", "1920-01-01", "2020-12-31")
const_mat_10 = const_mat_10.resample("M").last().squeeze()
const_mat_5 = pdr.get_data_fred("DGS5", "1920-01-01", "2020-12-31")
const_mat_5 = const_mat_5.resample("M").last().squeeze()
const_mat_1 = pdr.get_data_fred("DGS1", "1920-01-01", "2020-12-31")
const_mat_1 = const_mat_1.resample("M").last().squeeze()

# AAA and BAA (Moody's)
aaa = pd.read_csv("data/AAA.csv",parse_dates=True,index_col="Date")
aaa = aaa.resample("M").last().squeeze()
baa = pdr.get_data_fred("DBAA", "1920-01-01", "2020-12-31")
baa = baa.resample("M").last().squeeze()

# Unemployment rate (US)
unrate = pdr.get_data_fred("UNRATE", "1920-01-01", "2020-12-31")
unrate = unrate.resample("M").last().squeeze()

# core CPi US - consumer price index for all urban consumers
core_cpi = pdr.get_data_fred("CPIAUCSL", "1920-01-01", "2020-12-31")
core_cpi = core_cpi.resample("M").last().squeeze()

# Industrial Productivity
ind_prod = pdr.get_data_fred("INDPRO", "1920-01-01", "2020-12-31")
ind_prod = ind_prod.resample("M").last().squeeze()

# Risk free rate
#ff_factors = pdr.get_data_famafrench("F-F_Research_Data_Factors_daily", start="1920", end="2020-12-31")[0]
#rf = ff_factors["RF"].resample("M").last()
rf = (( 1 + const_mat_1["1986-02-28":]/100) **(1/12)) -1


In [1750]:
rf_mean = (rf["2003-06-30":]).mean()

In [1749]:
rf_mean

0.0012032831248151785

In [1488]:
# Variable construction:

term = const_mat_10 - const_mat_1
curve = const_mat_10 - 2 * const_mat_5 + const_mat_1
default = aaa - baa

inflation = pd.Series.copy(core_cpi)
for i in range(12):
    inflation[i] = np.nan
for i in range(12,887):
    inflation[i] = np.log(core_cpi[i]) - np.log(core_cpi[i-12])
inflation = inflation.dropna()

variables = pd.DataFrame([term,curve,default,inflation]).T
variables.columns = ["Term", "Curve", "Default", "Inflation"]
common_variables = variables.dropna()
common_variables

Unnamed: 0,Term,Curve,Default,Inflation
1986-01-31,1.51,-0.41,-1.32,0.038966
1986-02-28,0.70,-0.26,-0.95,0.031484
1986-03-31,0.67,-0.27,-1.31,0.021307
1986-04-30,0.84,-0.42,-1.37,0.015763
1986-05-31,1.17,-0.71,-1.32,0.016652
...,...,...,...,...
2020-08-31,0.60,0.28,-1.14,0.013129
2020-09-30,0.57,0.25,-1.13,0.013994
2020-10-31,0.75,0.25,-1.14,0.011964
2020-11-30,0.73,0.23,-0.83,0.011542


In [1012]:
np.log(core_cpi[14]) - np.log(core_cpi[2])

0.06595796779179741

In [1014]:
data = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod]).T
data.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod"]
common_data = data.dropna()
common_data

Unnamed: 0_level_0,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1986-01-31,0.34,1.21,9.08,8.53,7.57,10.05,11.37,6.7,109.900,57.3104
1986-02-28,0.49,7.66,8.13,7.91,7.43,9.67,10.62,7.2,109.700,56.9344
1986-03-31,0.76,5.48,7.39,7.19,6.72,9.00,10.31,7.2,109.100,56.5420
1986-04-30,-0.36,-0.79,7.38,7.17,6.54,8.79,10.16,7.1,108.700,56.5599
1986-05-31,-0.29,5.11,8.05,7.82,6.88,9.09,10.41,7.2,109.000,56.6823
...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.15,2.47,0.66,0.29,0.16,2.44,3.59,11.1,257.214,97.8019
2020-07-31,1.17,5.78,0.55,0.21,0.11,2.14,3.15,10.2,258.723,101.8924
2020-08-31,1.75,7.64,0.72,0.28,0.12,2.25,3.39,8.4,259.681,102.6619
2020-09-30,0.15,-3.62,0.69,0.28,0.12,2.31,3.44,7.8,260.209,102.6008


In [1489]:
import statsmodels.api as sm
common_sample = pd.DataFrame([momentum,vwm, const_mat_10,const_mat_5,const_mat_1,aaa,baa,unrate,core_cpi,ind_prod,term,curve,default,inflation]).T
common_sample.columns = ["Mom", "VWM","10yr","5yr","1yr","AAA","BAA","unempl","CPI","Ind_Prod","Term", "Curve", "Default", "Inflation"]
common_sample = common_sample.dropna()
#common_sample = sm.add_constant(common_sample)
common_sample = common_sample - common_sample.mean(axis=0)
common_sample

Unnamed: 0_level_0,Mom,VWM,10yr,5yr,1yr,AAA,BAA,unempl,CPI,Ind_Prod,Term,Curve,Default,Inflation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1986-01-31,0.320239,0.263158,4.239713,4.200885,4.108995,3.827512,4.190431,0.78134,-76.33349,-30.702938,0.130718,-0.053062,-0.362919,0.013822
1986-02-28,0.470239,6.713158,3.289713,3.580885,3.968995,3.447512,3.440431,1.28134,-76.53349,-31.078938,-0.679282,0.096938,0.007081,0.006341
1986-03-31,0.740239,4.533158,2.549713,2.860885,3.258995,2.777512,3.130431,1.28134,-77.13349,-31.471338,-0.709282,0.086938,-0.352919,-0.003837
1986-04-30,-0.379761,-1.736842,2.539713,2.840885,3.078995,2.567512,2.980431,1.18134,-77.53349,-31.453438,-0.539282,-0.063062,-0.412919,-0.009381
1986-05-31,-0.309761,4.163158,3.209713,3.490885,3.418995,2.867512,3.230431,1.28134,-77.23349,-31.331038,-0.209282,-0.353062,-0.362919,-0.008492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30,1.130239,1.523158,-4.180287,-4.039115,-3.301005,-3.782488,-3.589569,5.18134,70.98051,9.788562,-0.879282,0.596938,-0.192919,-0.018074
2020-07-31,1.150239,4.833158,-4.290287,-4.119115,-3.351005,-4.082488,-4.029569,4.28134,72.48951,13.879062,-0.939282,0.596938,-0.052919,-0.014903
2020-08-31,1.730239,6.693158,-4.120287,-4.049115,-3.341005,-3.972488,-3.789569,2.48134,73.44751,14.648562,-0.779282,0.636938,-0.182919,-0.012015
2020-09-30,0.130239,-4.566842,-4.150287,-4.049115,-3.341005,-3.912488,-3.739569,1.88134,73.97551,14.587462,-0.809282,0.606938,-0.172919,-0.011150


Importing useful functions

In [1490]:
## Cross Validation function

def xval_5fold(y, x , random=False, seed = 20201231,fold=5):
    # Use numpy arrays for simplicity
    y = np.asarray(y)
    x = np.asarray(x)
    n = y.shape[0]
    if random:
        # If randomisation is needed, use either the default or provided seed
        rg = np.random.default_rng(seed)
        # Generate a set of index values to use to randomly reorder the data
        # After randomisation, we can use the data as if it is is inorder!
        ind = rg.permutation(np.arange(n))
        y = y[ind]
        x = x[ind]
    # Compute the block size
    block = n / fold
    sse = 0.0
    for i in range(int(fold)):
        # Start and end of each block need to be integers or we lose an observation
        # Rounding ensures that we get all observations since int rounds down
        st = int(np.round(i * block))
        en = int(np.round((i + 1) * block))
        # Construct the indicies of the observations that we leave out
        leave_out = np.r_[st:en]
        # The included are the one that we don't leave out
        include = np.setdiff1d(np.arange(n), leave_out)
        # Compute the regression coefficients
        beta = lstsq(x[include], y[include], rcond=None)[0]
        # Compute the residuals and add to the sse
        resid = y[st:en]- x[st:en] @ beta
        sse += resid @ resid
    return sse

# Randomisation is better when we suspect the model may not be totally stable

In [1689]:
# forward stepwise function: 
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def forward_stepwise(x, y, p,random=False,fold=5):
    included = []

    for i in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
        included.append(next_var)
    
    fsr_sse_sv = {}
    for i in range(1,p+1):
        fsr_sse_sv[i] = xval_5fold(y,x[included[:i]],random=random, fold=fold)
    fsr_sse_sv = pd.Series(fsr_sse_sv)
    forward_step_model = included[:fsr_sse_sv.idxmin()]
    return forward_step_model, pd.Series(fsr_sse_sv)

In [1705]:
# Backward stepwise 

def back_stepwise(x,y,p,random=False,fold=5):


    included = list(x.columns)
    
    removed = []
    p = x.shape[1]

    for i in range(p):
        best_sse = np.inf
        for col in included:
            try_col = included[:]
            try_col.remove(col)
            try_x = x[try_col]
            beta = lstsq(try_x,y,rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_drop = col
        removed.append(next_drop)
        included.remove(next_drop)

    #print(removed)

    included = removed[::-1]

    bsr_sse_sv = {}
    for i in range(1,p+1):
        bsr_sse_sv[i] = xval_5fold(y,x[included[:i]],random=random,fold=fold)
    bsr_sse_sv = pd.Series(bsr_sse_sv)
    backstep_model = included[:bsr_sse_sv.idxmin()]
    return backstep_model

In [1719]:
# Hybrid stepwise function
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def hybrid_stepwise(x, y, p,random=False,fold=5):
    included = []

    for i in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
        included.append(next_var)

    end_models = {}
    for i in range(1,p+1):
        x_col = included[:i]
        regressors = x[x_col]
        end_models[i] = back_stepwise(regressors,y,p)

    hsr_sse_sv = {}

    for i in range(1,p+1):
        hsr_sse_sv[i] = xval_5fold(y,x[end_models[i]],random=random, fold=fold)
    hsr_sse_sv = pd.Series(hsr_sse_sv)
    hybrid_step_model = end_models[hsr_sse_sv.idxmin()]

    return hybrid_step_model

In [1492]:
### Best subset regression function

def best_subset(x, y, p, random=False, fold=5):

    """for i in range(1, p+1):
        count = 0
        for comb in combinations(x.columns, i):
            count += 1
            if count > 1:
                break"""

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]
            beta = lstsq(reg, y , rcond=None)[0]
            resid = y - reg @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv

#print(bsr_model)
#bsr_sse_xv

In [1024]:
# Fully cross validated best subset - too cumbersome
"""def best_subset(x, y, p, random=False, fold=5):

    best_models = {}
    for i in range( 1, p+1):
        best_sse = np.inf
        for comb in combinations(x.columns, i):
            reg = x[list(comb)]

            sse = xval_5fold(y,reg,random=random,fold=fold)



            #beta = lstsq(reg, y , rcond=None)[0]
            #resid = y - reg @ beta
            #sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                best_models[i] = list(comb)

    bsr_sse_xv = {}
    for n_var in best_models:
        bsr_sse_xv[n_var] = xval_5fold(y, x[best_models[n_var]], random=random,fold=fold)
    bsr_sse_xv = pd.Series(bsr_sse_xv)
    bsr_model = best_models[bsr_sse_xv.idxmin()]

    return bsr_model, bsr_sse_xv"""

In [1587]:
# General to Specific selection

def gts(x,y):
    cv = stats.norm.ppf(.995)

    included  = list(x.columns)
    y = y
    for i in range(16):
        x = x.loc[:,included]
        res = sm.OLS(y,x).fit(cov_type="HC0")
        tstats = res.tvalues
        if tstats.abs().min() < cv:
            sorted_tstats = tstats.abs().sort_values()
            remove = sorted_tstats.index[0]
            included.remove(remove)
        else:
            break
    return included

In [1494]:
# Ridge regression -- only works without a constant

def ridge_reg(x,y):
    x_scale = x.std(ddof=0)
    y_scale = y.std(ddof=0)
    std_x = x / x_scale
    std_y = y / y_scale

    ridge_cv = RidgeCV(fit_intercept=False, alphas=np.linspace(1, 100, 101))
    ridge_cv = ridge_cv.fit(std_x, std_y)
    print(f"Optimal alpha = {ridge_cv.alpha_}")
    start = ridge_cv.alpha_ - (1/3 * ridge_cv.alpha_)
    end = ridge_cv.alpha_ + (1/3 * ridge_cv.alpha_)

    ridge_cv = RidgeCV(fit_intercept=False, alphas=np.linspace(start, end, 2001))
    ridge_cv = ridge_cv.fit(std_x, std_y)
    print(f"Optimal alpha = {ridge_cv.alpha_}")
    start = ridge_cv.alpha_ - (1/3 * ridge_cv.alpha_)
    end = ridge_cv.alpha_ + (1/3 * ridge_cv.alpha_)

    ridge_cv = RidgeCV(fit_intercept=False, alphas=np.linspace(start, end, 2001))
    ridge_cv = ridge_cv.fit(std_x, std_y)
    print(f"Optimal alpha = {ridge_cv.alpha_}")

    return ridge_cv.coef_ * (y_scale / x_scale)

In [1495]:
### LASSO function

def lasso_reg(x,y):

    x_scale = x.std(ddof=0)
    y_scale = y.std(ddof=0)
    std_x = x / x_scale
    std_y = y / y_scale
    std_x = std_x

    lasso_cv = LassoCV(fit_intercept=False)
    lasso_cv = lasso_cv.fit(std_x,std_y)
    #print(f"Optimal Alpha = {lasso_cv.alpha_}")
    lasso_beta = lasso_cv.coef_  * (y_scale / x_scale)
    return lasso_beta

Problem 1, predicting momentum
--------------------------------------------------------

In [1781]:
# Selecting a model to predict momentum returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

# y = common_sample["Mom"]["1986-02-28":]
y = momentum["1986-02-28":"2020-10-31"]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

#y_in = common_sample["Mom"][1:209]
y_in = momentum[1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

#y_oos = common_sample["Mom"][209:]
y_oos = momentum[209:-1]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

x_scale = x.std(ddof=0)
y_scale = y.std(ddof=0)
std_x = x / x_scale
std_y = y / y_scale

In [1445]:
y

Date
1986-02-28    0.49
1986-03-31    0.76
1986-04-30   -0.36
1986-05-31   -0.29
1986-06-30    0.34
              ... 
2020-06-30    1.15
2020-07-31    1.17
2020-08-31    1.75
2020-09-30    0.15
2020-10-31   -2.36
Freq: M, Name: Mom   , Length: 417, dtype: float64

In [1320]:
# Best subset model selection, momentum in sample
### Cell takes a minute to run so keep blocked off for now###

#best_subset(x, y, p, random=True, fold=5)

(['Mom', '1yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod', 'Curve'],
 1     190.540998
 2     186.313345
 3     185.311511
 4     183.331755
 5     180.540560
 6     179.946257
 7     179.778476
 8     184.507811
 9     185.125053
 10    188.265548
 11    189.829165
 12    189.829165
 13    189.829165
 14    189.829165
 dtype: float64)

In [1726]:
hybrid_stepwise(x,y,p)

['Term', 'Curve', 'unempl', 'Ind_Prod', 'Default', 'Mom']

In [1718]:
### Forward stepwise model selection, in sample
forward_stepwise(x, y, p, fold=5)

(['Default', 'Mom', 'Term', 'unempl', 'Curve', 'Ind_Prod'],
 1     182.057043
 2     181.382338
 3     180.812383
 4     185.546405
 5     181.681202
 6     178.239288
 7     179.434001
 8     180.827496
 9     182.385478
 10    183.052752
 11    184.175372
 12    184.175372
 13    184.175372
 14    184.175372
 dtype: float64)

In [1339]:
# 
gts_model = gts(x,y)

In [1524]:
# Ridge regression and evaluation ( also ridge of gts model/ another model)

pred_returns = x_oos[gts_model] @ ridge_reg(x[gts_model],y)

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

Optimal alpha = 1.0
Optimal alpha = 0.6666666666666667
Optimal alpha = 0.664888888888889


10.987634761519406

In [1521]:
#Lasso evaluation

pred_returns = x_oos[gts_model] @ lasso_reg(x[gts_model],y)

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

11.750557828461506

In [1679]:
## Evaluating the model #######

beta = lstsq(x[['Term', 'Curve', 'unempl', 'Ind_Prod', 'Default', 'Mom']],y,rcond=0)[0]
pred_returns = x_oos[['Term', 'Curve', 'unempl', 'Ind_Prod', 'Default', 'Mom']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 3
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 1
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 0.5
    else:
        portfolio_weight = -1
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

27.534772769774783

Test out Models Here, in sample Momentum
----------------------------------
Add in the desired regressors, evaluate with the trading strategy by running first cell
then run next 2 cells for r_squared, out of sample SSE and sharpe ratio

In [1390]:
# For reference

common_sample.columns

Index(['Mom', 'VWM', '10yr', '5yr', '1yr', 'AAA', 'BAA', 'unempl', 'CPI',
       'Ind_Prod', 'Term', 'Curve', 'Default', 'Inflation'],
      dtype='object')

In [1772]:

beta = lstsq(x[['5yr']],y,rcond=0)[0]

pred_returns = x_oos[['5yr']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.mean():
        portfolio_weight = 4
    else:
        portfolio_weight = -4
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

-6.713644392532925

In [1773]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")
oos_sse

The out of sample R2 for the out of sample period is 0.1%.

The in sample R2 for the out of sample period is 11.2%.


129.80940441577607

In [1770]:
### Sharpe ratio
print(strat_returns.std())
print()
print(((strat_returns.mean())-rf.mean())/ strat_returns.std() * np.sqrt(12))
print()
print((y_oos.mean()-rf.mean()) / y_oos.std() * np.sqrt(12))

3.1536392335060035

0.014530008062703893

-0.023147350695825655


In [1774]:
### Sharpe ratio

print(   (strat_returns.mean()-rf_mean)   / strat_returns.std() * np.sqrt(12)  )
print()
print(   (y_oos.mean()-rf_mean) / y_oos.std() * np.sqrt(12))

0.016277509902698314

-0.016175872392965935


Momentum Out of Sample start
-------------------------------------

In [1635]:
### Training the data on the first half (208 data points in y)
# best_subset(x_in, y_in, p_in, random=True,fold=5)

(['5yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod'],
 1     54.858268
 2     53.674750
 3     55.005890
 4     51.719652
 5     51.414647
 6     51.936056
 7     51.763589
 8     51.878720
 9     51.995365
 10    52.209599
 11    53.778890
 12    53.778890
 13    53.778890
 14    53.778890
 dtype: float64)

In [1688]:
fwd_mom = forward_stepwise(x_in, y_in, p_in, random=True,fold=5)[0]
forward_stepwise(x_in, y_in, p_in, random=True,fold=5)

(['unempl', 'Mom', 'Curve', 'Term', 'Ind_Prod', 'CPI', '10yr', 'BAA', 'VWM'],
 1     54.858268
 2     54.275525
 3     54.726502
 4     55.154637
 5     54.521437
 6     53.463956
 7     54.149940
 8     53.415905
 9     53.358809
 10    53.673594
 11    53.778890
 12    53.778890
 13    53.778890
 14    53.778890
 dtype: float64,
 ['unempl',
  'Mom',
  'Curve',
  'Term',
  'Ind_Prod',
  'CPI',
  '10yr',
  'BAA',
  'VWM',
  'Inflation',
  'AAA',
  '5yr',
  '1yr',
  'Default'])

In [1687]:
hybrid_stepwise(x_in,y_in,p_in,random=True)

['Term', 'Curve', 'Ind_Prod', 'Mom']

In [1588]:
gts(x_in,y_in)

['10yr', 'BAA', 'unempl', 'CPI', 'Ind_Prod']

In [1636]:
### Evaluating the model out of sample with LASSO


pred_returns = x_oos[fwd_mom] @ lasso_reg(x_in[fwd_mom],y_in)


strat_returns = np.ones(209)
for i in range(209):
    
    if pred_returns[i] > 0:
        portfolio_weight = 2
    #elif pred_returns[i] > y_in.quantile(.5):
       # portfolio_weight = 1
   # elif pred_returns[i] > y_in.quantile(.25):
      #  portfolio_weight = 0.5
    else:
        portfolio_weight = -1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

48.362312921122964

Test out Models Here, out of sample Momentum
----------------------------------
Add in the desired regressors, evaluate with the trading strategy by running first cell
then run next 2 cells for r_squared, out of sample SSE and sharpe ratio

In [1391]:
# For reference

common_sample.columns

Index(['Mom', 'VWM', '10yr', '5yr', '1yr', 'AAA', 'BAA', 'unempl', 'CPI',
       'Ind_Prod', 'Term', 'Curve', 'Default', 'Inflation'],
      dtype='object')

In [1800]:
### Evaluating the model out of sample without LASSO

beta = lstsq(x_in[['Mom']],y_in,rcond=0)[0]
pred_returns = x_oos[['Mom']] @ beta

strat_returns = np.ones(209)
for i in range(209):

    if pred_returns[i] > y_in.mean():
        portfolio_weight = 4
    else:
        portfolio_weight = -4
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

118.31086079688396

In [1801]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 2.2%.

The in sample R2 for the out of sample period is 11.2%.


### Sharpe ratio
print()
print()
print(((strat_returns.mean())-rf.mean())/ strat_returns.std() * np.sqrt(12))
print()
print((y_oos.mean()-rf.mean()) / y.std() * np.sqrt(12))

In [1796]:
strat_returns.std()

3.125512027253984

In [1802]:
### Sharpe ratio

print(   (strat_returns.mean()-rf_mean)   / strat_returns.std() * np.sqrt(12)  )
print()
print(   (y_oos.mean()-rf_mean) / y_oos.std() * np.sqrt(12))

0.4665499191499994

-0.016175872392965935


------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------


Problem 1, in sample, predicting Value Weighted Market

In [1775]:
# Selecting a model to predict Value weighted market returns in sample
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

#y = common_sample["VWM"]["1986-02-28":]
y = vwm["1986-02-28":]
x = common_sample["1986-01-28":"2020-09-30"]
t, p = x.shape
x.index = x.index.shift(1)


# Setup in_sample data

#y_in = common_sample["VWM"][1:209]
y_in = vwm[1:209]
x_in = common_sample[0:208]
t_in, p_in = x_in.shape
x_in.index = x_in.index.shift(1)

# Set up the out of sample evaluation data
# Setup y, x, t and p - cutting off first and last values to create a lagged effect of our regressors

#y_oos = common_sample["VWM"][209:]
y_oos = vwm[209:]
x_oos = common_sample[208:-1]
t_oos, p_oos = x_oos.shape
x_oos.index = x_oos.index.shift(1)

In [1276]:
# Best subset model selection,
### Cell takes a minute to run ###

#best_subset(x,y,p,random=True)

(['const', 'Ind_Prod', 'Curve', 'Inflation'],
 1     8327.472263
 2     8257.845899
 3     8228.621257
 4     8124.477852
 5     8211.041703
 6     8232.513209
 7     8222.062818
 8     8270.586424
 9     8262.895462
 10    8329.127862
 11    8417.319234
 12    8493.365914
 13    8493.365914
 14    8493.365914
 15    8493.365914
 dtype: float64)

In [1642]:
### Forward stepwise model selection
forward_stepwise(x,y,p,random=True)

(['Inflation', 'Ind_Prod', 'Curve'],
 1     8716.790155
 2     8631.037308
 3     8486.446491
 4     8572.198378
 5     8579.347889
 6     8581.180765
 7     8632.179932
 8     8645.427570
 9     8698.605091
 10    8761.890536
 11    8866.053193
 12    8866.053193
 13    8866.053193
 14    8866.053193
 dtype: float64,
 ['Inflation',
  'Ind_Prod',
  'Curve',
  'Default',
  'unempl',
  'Mom',
  'Term',
  'AAA',
  'CPI',
  '10yr',
  'VWM',
  '5yr',
  '1yr',
  'BAA'])

In [1591]:
gts_model = gts(x, y)
gts_model

['Ind_Prod', 'Curve', 'Inflation']

In [1643]:
#  Ridge evaluation

pred_returns = x_oos @ ridge_reg(x ,y )

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

Optimal alpha = 100.0
Optimal alpha = 133.33333333333331
Optimal alpha = 147.1111111111111


481.5340102276922

In [1567]:
#  LASSO evaluation ---- doesn't work so well on vwm

pred_returns = x_oos @ lasso_reg(x ,y )

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

422.4388819372857

Test out Models Here, In sample VWM
----------------------------------
Add in the desired regressors, evaluate with the trading strategy by running first cell
then run next 2 cells for r_squared, out of sample SSE and sharpe ratio

In [1394]:
# For reference

common_sample.columns

Index(['Mom', 'VWM', '10yr', '5yr', '1yr', 'AAA', 'BAA', 'unempl', 'CPI',
       'Ind_Prod', 'Term', 'Curve', 'Default', 'Inflation'],
      dtype='object')

In [1644]:
beta = lstsq(x[['BAA', 'Ind_Prod', 'Curve', 'Inflation']],y,rcond=0)[0]
pred_returns = x_oos[['BAA', 'Ind_Prod', 'Curve', 'Inflation']] @ beta

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

543.4419521167162

In [1646]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- oos_sse/ oos_tss
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is 7.4%.

The in sample R2 for the out of sample period is 16.8%.


In [1645]:
### Sharpe ratio

print(   (strat_returns.mean()-rf_mean)   / strat_returns.std() * np.sqrt(12)  )
print()
print(   (y_oos.mean()-rf_mean) / y_oos.std() * np.sqrt(12))

0.7698711610903691

0.7254891133486322


---------------------------------------------------------------------------------------------------------------------------
Now to replicate out of sample

In [1713]:
### Training the data on the first half (208 data points in y)
#best_subset(x_in,y_in,p_in, random=False,fold=5)

(['Mom', '10yr', 'BAA', 'CPI', 'Curve'],
 1     4677.584105
 2     4540.001022
 3     4500.247111
 4     4507.740455
 5     4500.200689
 6     4569.637874
 7     4544.641567
 8     4567.434273
 9     4572.297738
 10    4607.939177
 11    4677.095427
 12    4677.095427
 13    4677.095427
 14    4677.095427
 dtype: float64)

In [1712]:
hybrid_stepwise(x_in,y_in,p_in,random=False,fold=5)

(['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
 1     4677.584105
 2     4540.001022
 3     4500.247111
 4     4500.247111
 5     4500.200689
 6     4500.200689
 7     4500.200689
 8     4500.200689
 9     4500.200689
 10    4500.200689
 11    4500.200689
 12    4513.341205
 13    4513.341205
 14    4502.075524
 dtype: float64,
 {1: ['CPI'],
  2: ['CPI', 'Mom'],
  3: ['CPI', 'Mom', 'BAA'],
  4: ['CPI', 'Mom', 'BAA'],
  5: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  6: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  7: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  8: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  9: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  10: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  11: ['CPI', 'Mom', 'BAA', 'Curve', '10yr'],
  12: ['CPI', 'Mom', 'Curve', 'AAA', 'Default'],
  13: ['CPI', 'Mom', 'Curve', 'AAA', 'Default'],
  14: ['CPI', 'Mom', 'AAA', '1yr', 'Default']})

In [1707]:
fwd_step = forward_stepwise(x_in,y_in,p_in,random=False,fold=5)
fwd_step

(['CPI', 'Mom', 'BAA', 'Curve'],
 1     4716.826525
 2     4616.090323
 3     4597.268692
 4     4563.080093
 5     4581.552838
 6     4679.654265
 7     4715.408913
 8     4781.056685
 9     4800.327304
 10    4874.562877
 11    4893.520329
 12    4893.520329
 13    4893.520329
 14    4893.520329
 dtype: float64)

In [1597]:
gts(x_in,y_in)

['CPI']

In [1649]:
#  LASSO evaluation

pred_returns = x_oos[fwd_step[0]] @ ridge_reg(x_in[fwd_step[0]], y_in)

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 1.5
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 1.5
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

Optimal alpha = 23.77
Optimal alpha = 23.627380000000002
Optimal alpha = 23.627380000000002


390.77143452013024

In [1648]:
#  LASSO evaluation

pred_returns = x_oos[fwd_step[0]] @ lasso_reg(x_in[fwd_step[0]], y_in)

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 1.5
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 1.5
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1- portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

392.1727560747218

Test out Models Here, out of sample VWM
----------------------------------
Add in the desired regressors, evaluate with the trading strategy by running first cell
then run next 2 cells for r_squared, out of sample SSE and sharpe ratio

In [1479]:
# For reference

common_sample.columns

Index(['Mom', 'VWM', '10yr', '5yr', '1yr', 'AAA', 'BAA', 'unempl', 'CPI',
       'Ind_Prod', 'Term', 'Curve', 'Default', 'Inflation'],
      dtype='object')

In [1776]:
### Evaluating the model out of sample

beta = lstsq(x_in[["Mom"]],y_in,rcond=0)[0]
pred_returns = x_oos[["Mom"]] @ beta

#['Mom', 'unempl', 'Curve']

strat_returns = np.ones(209)
for i in range(209):
    if pred_returns[i] > y_in.quantile(.75):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.5):
        portfolio_weight = 2
    elif pred_returns[i] > y_in.quantile(.25):
        portfolio_weight = 1
    else:
        portfolio_weight = 1
    
    """if pred_returns[i] > y_in.mean():
        portfolio_weight = 1.5
    else:
        portfolio_weight = 0.5"""

    strat_returns[i] = (portfolio_weight * y_oos[i]) + (1 - portfolio_weight) * rf[i+208]


cum_strat_returns = ((1+strat_returns/100).cumprod()-1)*100
cum_strat_returns[-1]

365.15312981482106

In [1460]:
((1+(y_oos/100)).cumprod()[-1]-1)*100

422.4388819372857

In [1777]:
resid_oos = y_oos - pred_returns
oos_sse = resid_oos @ resid_oos
oos_tss = (y_oos **2).sum()
oos_r2 = 1- (oos_sse/ oos_tss)
print(f"The out of sample R2 for the out of sample period is {100*oos_r2:0.1f}%.")
print()
r2 = OLS(y_oos, x_oos).fit().rsquared
print(f"The in sample R2 for the out of sample period is {100*r2:0.1f}%.")

The out of sample R2 for the out of sample period is -13.4%.

The in sample R2 for the out of sample period is 16.8%.


In [1778]:
### Sharpe ratio

print(   (strat_returns.mean()-rf_mean)   / strat_returns.std() * np.sqrt(12)  )
print()
print(   (y_oos.mean()-rf_mean) / y_oos.std() * np.sqrt(12))

0.5657832668035775

0.7254891133486322
