# Machine Learning Benchmark

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import yfinance as yf
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import sys
sys.path.append('/Users/cheng/Google Drive/PhD/Research/Non-Myopic Equity Risk Premium/code/module/')
from analysis import get_return_forecast_performance


## Data Processing

In [4]:
# Read both CSV inputs; in each file the first column is used as the index
# and pandas is asked to parse it as dates.
csv_options = dict(index_col=0, parse_dates=True)
feature_df = pd.read_csv('../../data/econ_predictors.csv', **csv_options)
IVV_IEF_daily_ret = pd.read_csv('../../data/IVV_IEF_daily_ret.csv', **csv_options)

In [5]:
# Sample period applied to both the predictors and the return series.
DATE_START = '2002-08-01'
DATE_END = '2019-12-31'

# Keep only the predictor rows whose index label falls inside the sample period.
feature_df = feature_df.loc[DATE_START:DATE_END]

# Bucket the daily returns by calendar month (freq='M'), take the mean of each
# bucket, then apply the same date window as for the predictors.
# NOTE(review): this yields the *average daily* return of each month rather
# than a compounded monthly return -- confirm that this is the intended target.
month_buckets = pd.Grouper(freq='M')
IVV_IEF_monthly_ret = IVV_IEF_daily_ret.groupby(month_buckets).mean().loc[DATE_START:DATE_END]

In [6]:
# Standardize every predictor column with a StandardScaler.
# NOTE(review): the scaler is fitted on the whole of `feature_df`, i.e. on the
# same rows that the rolling windows below later use as test points -- confirm
# that this look-ahead in the scaling statistics is acceptable.
scaler = StandardScaler().fit(feature_df)
X = scaler.transform(feature_df)

In [7]:
# Binary target: 1 where the IVV value of the monthly series is strictly above
# the threshold, 0 otherwise. (The variable name is misspelled but kept as-is.)
thredshould = 0.0001
ivv_above = IVV_IEF_monthly_ret['IVV'].gt(thredshould)
# Turn the boolean Series into integers and shape it as a single column.
label = ivv_above.astype('int64').to_numpy().reshape(-1, 1)

In [8]:
# Pair each feature row with the label of the *following* row: the last
# feature row and the first label are dropped so both pieces have equal length.
# The label ends up as the last column of `data`.
features_lagged = X[:-1]
next_label = label[1:]
data = np.concatenate((features_lagged, next_label), axis=1)

In [9]:
# Rolling-window row indices into `data`.
# Each training window is `window_size` consecutive rows; the matching test
# row is the single row immediately after the window.
window_size = 1 * 12
year_start = 0
year_end = len(data)
year_range = np.arange(year_start, year_end)

# Every row that still has `window_size` rows after it can start a window.
window_starts = year_range[:-window_size]
training_idx_list = np.array([np.arange(first, first + window_size) for first in window_starts])

# Last training row of each window + 1, kept as a column vector of shape (n_windows, 1).
testing_idx_list = training_idx_list[:, [window_size - 1]] + 1

## Model

In [10]:
def get_ml_predict_prob(X_train, y_train, X_test, seed, model, positive_label=1):
    """Fit `model` and return the predicted probability of the positive class.

    Only the first row of `X_test` is scored.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to `model.fit`.
    X_test : features passed to `model.predict_proba`; the first row is used.
    seed : value handed to `np.random.seed` before fitting.
    model : object exposing `fit` and `predict_proba` (and, ideally, `classes_`).
    positive_label : label whose probability is returned (default 1).

    Returns
    -------
    float
        Probability assigned to `positive_label` for the first test row, or
        0.0 when the fitted model never saw `positive_label` during training.

    Notes
    -----
    Side effect: reseeds NumPy's global random state.
    """
    np.random.seed(seed)
    model.fit(X_train, y_train)
    first_row_proba = model.predict_proba(X_test)[0]

    # The columns of predict_proba follow `model.classes_`. Do not assume the
    # positive class sits in column 1: a training window containing a single
    # class produces a single column, and indexing [1] would raise IndexError.
    classes = getattr(model, 'classes_', None)
    if classes is None:
        # Model does not expose its class order; keep the historical behaviour.
        return first_row_proba[1]

    positive_cols = np.flatnonzero(np.asarray(classes) == positive_label)
    if positive_cols.size == 0:
        # The positive class was absent from the training data.
        return 0.0

    return first_row_proba[positive_cols[0]]

In [11]:
def get_ml_predict_prob_avg(X_train, y_train, X_test, model, n_seeds=30):
    """Average the positive-class probability over several random seeds.

    The model is refitted once per seed (seeds 0 .. n_seeds - 1) through
    `get_ml_predict_prob`, and the resulting probabilities are averaged.

    Parameters
    ----------
    X_train, y_train, X_test : forwarded unchanged to `get_ml_predict_prob`.
    model : estimator forwarded to `get_ml_predict_prob`; the same instance is
        refitted for every seed.
    n_seeds : number of seeds to average over (default 30, the value that was
        previously hard-coded).

    Returns
    -------
    float
        Mean of the per-seed probabilities.

    Raises
    ------
    ValueError
        If `n_seeds` is smaller than 1 (the mean of no values would be NaN).
    """
    if n_seeds < 1:
        raise ValueError("n_seeds must be at least 1")

    seed_range = np.arange(0, n_seeds)

    # One fit/predict round per seed, then average the probabilities.
    ml_prob_list = [
        get_ml_predict_prob(X_train, y_train, X_test, seed, model)
        for seed in seed_range
    ]
    return np.mean(ml_prob_list)

In [12]:
# The two benchmark classifiers used by the prediction cells below.
# k-nearest-neighbours classifier configured with 4 neighbours.
neigh = KNeighborsClassifier(n_neighbors=4)
# Support-vector classifier; probability=True is set because the helpers
# above call predict_proba on the model.
clf = svm.SVC(probability=True)

## Training and prediction

In [13]:
# Column layout of `data`: the label was appended as the last column, and every
# column before it is a feature. Derive the split point from the array itself
# instead of hard-coding 12, so a different number of predictors cannot
# silently turn a feature column into the label.
label_col = data.shape[1] - 1

# Index with the 2-D window index arrays: one block of rows per rolling window.
X_train, y_train = data[training_idx_list, :label_col], data[training_idx_list, label_col]
X_test, y_test = data[testing_idx_list, :label_col], data[testing_idx_list, label_col]

In [14]:
# One (training features, training labels, test features) triple per rolling
# window. Distinct loop names are used so the outer arrays are not shadowed.
windows = list(zip(X_train, y_train, X_test))

# Seed-averaged positive-class probability per window, first for the SVM and
# then for the k-NN model.
svm_prob_list = [
    get_ml_predict_prob_avg(train_x, train_y, test_x, model=clf)
    for train_x, train_y, test_x in windows
]
knn_prob_list = [
    get_ml_predict_prob_avg(train_x, train_y, test_x, model=neigh)
    for train_x, train_y, test_x in windows
]

In [23]:
# Index values of `feature_df` at the test-row positions; the result keeps the
# (n_windows, 1) shape of `testing_idx_list`.
feature_dates = feature_df.index.values
date_index = feature_dates[testing_idx_list]

In [154]:
# Turn each probability list into a two-column weight table:
# 'IVV' receives the predicted probability p and 'IEF' receives 1 - p.
weight_dates = date_index.reshape(-1)  # flatten to one label per row
svm_prob = np.array(svm_prob_list)
knn_prob = np.array(knn_prob_list)

w_svm = pd.DataFrame(
    np.column_stack((svm_prob, 1 - svm_prob)),
    columns=['IVV', 'IEF'],
    index=weight_dates,
)
w_knn = pd.DataFrame(
    np.column_stack((knn_prob, 1 - knn_prob)),
    columns=['IVV', 'IEF'],
    index=weight_dates,
)

## Data Output

In [155]:
# Write each weight table to its CSV file in the data directory.
for weights, csv_path in (
    (w_svm, '../../data/weights_svm.csv'),
    (w_knn, '../../data/weights_knn.csv'),
):
    weights.to_csv(csv_path)