In [23]:
# General packages
import pandas as pd
import numpy as np
import hvplot.pandas
import datetime as dt

# Packages related to machine learning
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from dateutil.relativedelta import relativedelta
    #for nueral networs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# needed for API
import pandas_datareader as pdr
import yfinance as yfin
yfin.pdr_override()
from dotenv import load_dotenv
import os
import json
import requests

# Sentiment Score
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen
from urllib.request import Request
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#turn off warning signs for cleaner code
from warnings import filterwarnings
filterwarnings("ignore")

# import modules
from functions.vix_mod import vix_analysis
from functions.spy_mod import spy_analysis
from functions.econ_mod import get_econ_data
from functions.sent_mod import market_sent

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [78]:
def create_variable_tables ():
    spy_df = spy_analysis()
    econ_df = get_econ_data()
    vix_df, cluster_model = vix_analysis()
    sentiment_df = market_sent()

    # Set Up DataFrame for Testing
    X_prep = pd.concat([vix_df, spy_df], axis=1)
    X_prep['y']=X_prep['spy_change'].shift(-1)*100
    X_prep = X_prep.dropna()
    X_prep = pd.concat([X_prep, econ_df, sentiment_df], axis=1)
    X_prep = X_prep.dropna(subset='spy_close')
    X_prep[np.isnan(X_prep)] = 0
    X_prep = X_prep.drop(columns=['spy_close','high','low'])

    X_full = X_prep.drop(columns=['y'])
    y_full = pd.DataFrame()
    y_full = X_prep['y']*100

    # Create Variable Tables
    X_0 = X_prep[X_prep['labels']==0]
    X_1 = X_prep[X_prep['labels']==1]
    X_2 = X_prep[X_prep['labels']==2]

    y_0 = X_prep[X_prep['labels']==0]['y']
    y_1 = X_prep[X_prep['labels']==1]['y']
    y_2 = X_prep[X_prep['labels']==2]['y']
    
    return X_full, X_0, X_1 , X_2, y_full, y_0, y_1, y_2, X_prep

X_full, X_0, X_1 , X_2, y_full, y_0, y_1, y_2, X_prep = create_variable_tables()
X_full.head()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,vix_close,vix_change,labels,vix_days_in_label,vix_con_direction,volume,spy_change,volume_change,spy_con_direction,3_day_change,...,MORTGAGE30US,PRIME,MICH,TOTALSA,UMCSENT,HOUST,RECPROUSM156N,REAINTRATREARAT1YE,REAINTRATREARAT10Y,Sentiment
2003-09-08,18.26,0.004953,0.0,21.0,1.0,32632800.0,0.008266,0.031466,1.0,0.003095,...,0.018987,-0.058824,0.12,-0.053444,-0.017917,0.057829,0.02,-0.417762,0.083081,0.0
2003-09-09,18.85,0.032311,0.0,22.0,2.0,35053200.0,-0.006559,0.074171,-1.0,-0.003965,...,0.018987,-0.058824,0.12,-0.053444,-0.017917,0.057829,0.02,-0.417762,0.083081,0.0
2003-09-10,20.01,0.061538,2.0,1.0,3.0,45904900.0,-0.010097,0.309578,-2.0,-0.00846,...,0.018987,-0.058824,0.12,-0.053444,-0.017917,0.057829,0.02,-0.417762,0.083081,0.0
2003-09-11,19.25,-0.037981,0.0,1.0,-1.0,38396300.0,0.002943,-0.163569,1.0,-0.013695,...,0.018987,-0.058824,0.12,-0.053444,-0.017917,0.057829,0.02,-0.417762,0.083081,0.0
2003-09-12,18.68,-0.02961,0.0,2.0,-2.0,42524800.0,0.001858,0.107523,2.0,-0.005339,...,-0.043478,-0.058824,0.12,-0.053444,-0.017917,0.057829,0.02,-0.417762,0.083081,0.0


In [119]:
def create_train_test ():
    # Define train periods
    X_full, X_0, X_1 , X_2, y_full, y_0, y_1, y_2, X_prep = create_variable_tables()
    start_train = X_full.index.min()
    last_day = dt.datetime.strptime('2020-01-01', '%Y-%m-%d').date()
    end_train = last_day

    # Define test period
    start_test = last_day
    end_test = X_full.index.max()


    #Create train Data Frames
    X_full_train = X_full.loc[start_train: end_train]
    y_full_train = y_full.loc[start_train: end_train]
    X_prep_train = X_prep.loc[start_train: end_train]

    # Create test DataFrames
    X_full_test = X_full.loc[start_test: end_test]
    y_full_test = y_full.loc[start_test: end_test]
    X_prep_test = X_prep.loc[start_test: end_test]
    
    return X_full_train, y_full_train, X_full_test, y_full_test, X_prep_train, X_prep_test

X_full_train, y_full_train, X_full_test, y_full_test, X_prep_train, X_prep_test = create_train_test()
display(X_full_test.head())
display(y_full_test.head())

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,vix_close,vix_change,labels,vix_days_in_label,vix_con_direction,volume,spy_change,volume_change,spy_con_direction,3_day_change,...,MORTGAGE30US,PRIME,MICH,TOTALSA,UMCSENT,HOUST,RECPROUSM156N,REAINTRATREARAT1YE,REAINTRATREARAT10Y,Sentiment
2020-01-02,12.47,-0.095065,0.0,59.0,-2.0,59151200.0,0.009352,0.036335,2.0,0.006226,...,-0.005348,-0.05,0.086957,-0.012885,0.005035,0.020156,0.26,1.060426,-0.017829,0.0
2020-01-03,14.02,0.124298,0.0,60.0,1.0,77709700.0,-0.007572,0.313747,-1.0,0.004142,...,-0.005348,-0.05,0.086957,-0.012885,0.005035,0.020156,0.26,1.060426,-0.017829,0.0
2020-01-06,13.85,-0.012126,0.0,61.0,-1.0,55653900.0,0.003815,-0.283823,1.0,0.00553,...,-0.005348,-0.05,0.086957,-0.012885,0.005035,0.020156,0.26,1.060426,-0.017829,0.0
2020-01-07,13.79,-0.004332,0.0,62.0,-2.0,40496400.0,-0.002812,-0.272353,-1.0,-0.006587,...,-0.005348,-0.05,0.086957,-0.012885,0.005035,0.020156,0.26,1.060426,-0.017829,0.0
2020-01-08,13.45,-0.024656,0.0,63.0,-3.0,68296000.0,0.00533,0.686471,1.0,0.006327,...,-0.005348,-0.05,0.086957,-0.012885,0.005035,0.020156,0.26,1.060426,-0.017829,0.0


2020-01-02   -75.721846
2020-01-03    38.149302
2020-01-06   -28.115748
2020-01-07    53.295320
2020-01-08    67.805141
Name: y, dtype: float64

In [120]:
def scale_train ():
    # fit Scale X Variables
    X_full_train, y_full_train, X_full_test, y_full_test, X_prep_train, X_prep_test = create_train_test()
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_full_train)
    X_scaler = X_scaler.transform
    
    # Scale X Variables
    X_full_train_scaled = X_scaler(X_full_train) 
    
    # Sub variables
    X_train_scaled_0 = X_scaler(X_full_train[X_full_train['labels']==0])
    X_train_scaled_1 = X_scaler(X_full_train[X_full_train['labels']==1])
    X_train_scaled_2 = X_scaler(X_full_train[X_full_train['labels']==2])

    y_train_0 = X_prep_train[X_prep_train['labels']==0]['y']
    y_train_1 = X_prep_train[X_prep_train['labels']==1]['y']
    y_train_2 = X_prep_train[X_prep_train['labels']==2]['y']

    
    return X_train_scaled_0, X_train_scaled_1, X_train_scaled_2, y_train_0, y_train_1, y_train_2

X_train_scaled_0, X_train_scaled_1, X_train_scaled_2, y_train_0, y_train_1, y_train_2 = scale_train()

def scale_test ():
    # fit Scale X Variables
    X_full_train, y_full_train, X_full_test, y_full_test, X_prep_train, X_prep_test = create_train_test()
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_full_train)
    X_scaler = X_scaler.transform
    
    # Scale X Variables
    X_full_test_scaled = X_scaler(X_full_test)
    
    # Sub variables
    X_test_scaled_0 = X_scaler(X_full_test[X_prep_test['labels']==0])
    X_test_scaled_1 = X_scaler(X_full_test[X_prep_test['labels']==1])
    X_test_scaled_2 = X_scaler(X_full_test[X_prep_test['labels']==2])

    y_test_0 = X_prep_test[X_prep_test['labels']==0]['y']
    y_test_1 = X_prep_test[X_prep_test['labels']==1]['y']
    y_test_2 = X_prep_test[X_prep_test['labels']==2]['y']

    
    return X_test_scaled_0, X_test_scaled_1, X_test_scaled_2, y_test_0, y_test_1, y_test_2

X_test_scaled_0, X_test_scaled_1, X_test_scaled_2, y_test_0, y_test_1, y_test_2 = scale_test()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [121]:
from keras.models import Sequential
from sklearn import tree

model = tree.DecisionTreeRegressor(random_state=0)
model = model.fit(X_train_scaled_2, y_train_2)
predictions = model.predict(X_test_scaled_2)

results = pd.DataFrame()
results['pred'] = predictions
results['real'] = y_test_2.values
results['m1'] = np.where(results['pred']>=0,1,-1)
results['m2'] = np.where(results['real']>=0,1,-1)
results['accuracy'] = np.where(results['m1']==results['m2'],1,0)
results['predict_r'] = (1+results['pred']/100).cumprod()-1
results['real_r'] = (1+results['real']/100).cumprod()-1
results['strategy_r'] = (1+np.where(results['pred']>=0, 1,-1)*results['real']/100).cumprod() -1
print(results['accuracy'].sum()/results['accuracy'].count())
display(results.head(50))
results.hvplot(y=['real_r', 'strategy_r'])


0.48134328358208955


Unnamed: 0,pred,real,m1,m2,accuracy,predict_r,real_r,strategy_r
0,-2.202646,-3.030192,-1,-1,1,-0.022026,-0.030302,0.030302
1,0.321035,-0.36783,1,-1,0,-0.018887,-0.033869,0.026512
2,-0.669467,-4.491169,-1,-1,1,-0.025455,-0.077259,0.072615
3,0.034042,-2.863268,1,-1,0,-0.025123,-0.10368,0.041903
4,0.320584,4.203297,1,1,1,-0.021998,-0.066005,0.085697
5,0.034042,-3.324146,1,-1,0,-0.021665,-0.097052,0.049607
6,-1.871427,1.441855,-1,1,0,-0.039974,-0.084033,0.034473
7,-1.871427,-0.45986,-1,-1,1,-0.05794,-0.088245,0.03923
8,-1.871427,2.617858,-1,1,0,-0.07557,-0.064377,0.012025
9,-1.871427,-0.931059,-1,-1,1,-0.09287,-0.073088,0.021447
