In [19]:
import pandas as pd
import numpy as np
import sklearn.metrics as met 


In [20]:
# Load the two input CSV files, in the same order as before: predictions first, raw prices second.
spy_predict_df, spy_raw_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./model/spy_predict.csv", "./model/S&P_500_adj.csv")
)

In [21]:
# Remove the 'Vol.' column from the raw frame in place.
del spy_raw_df['Vol.']

In [22]:
# Display the frame's current column labels.
spy_raw_df.columns

Index(['Date', 'Price', 'Open', 'High', 'Low', 'Change %'], dtype='object')

In [23]:
# Rename the raw column headers to lower-case snake_case names, modifying the frame in place.
spy_raw_df.rename(
    columns={
        'Date': 'date',
        'Price': 'price',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Change %': 'percent_change',
    },
    inplace=True,
)

In [24]:
# The four price columns are handled as text: commas are stripped, then the values are cast to float.
for price_col in ('price', 'open', 'high', 'low'):
    spy_raw_df[price_col] = spy_raw_df[price_col].str.replace(',', '').astype(float)

# The percentage column loses its '%' sign and is scaled from percent to a fraction.
pct_text = spy_raw_df['percent_change'].str.replace('%', '')
spy_raw_df['percent_change'] = pct_text.astype(float) / 100

In [25]:
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()

# Parse the date column once, then derive the calendar features from it
# (new column name -> attribute of the .dt accessor), in the original column order.
spy_raw_df["date"] = pd.to_datetime(spy_raw_df["date"])
for feature_name, dt_attr in (
    ("year", "year"),
    ("month", "month"),
    ("day_of_week", "weekday"),
    ("quarter", "quarter"),
):
    spy_raw_df[feature_name] = getattr(spy_raw_df["date"].dt, dt_attr)

# Ordinal-encode the year: the encoder expects a 2-D column, and the result is
# flattened back to 1-D before it overwrites the 'year' column.
year_column = spy_raw_df["year"].values.reshape(-1, 1)
spy_raw_df["year"] = ord_enc.fit_transform(year_column).flatten()

In [26]:
# Dates are converted back to strings and the window is selected by string comparison
# (this relies on the strings rendering as YYYY-MM-DD so they sort like dates).
spy_raw_df["date"] = spy_raw_df["date"].astype(str)
date_text = spy_raw_df['date']
in_window = (date_text >= '2021-06-01') & (date_text <= '2022-08-31')

# Copy so that later edits to the extract do not touch spy_raw_df.
spy_raw_df_extract = spy_raw_df.loc[in_window].copy()

In [27]:
def _produce_prediction(data, window):
    """
    Function that produces the 'truth' values
    At a given row, it looks 'window' rows ahead to see if the price increased (1) or decreased (0)
    :param window: number of days, or rows to look ahead to see what the price did
    """
    
    prediction = (data.shift(-window)['price'] >= data['price'])
    prediction = prediction.iloc[:-window]
    data['target'] = prediction.astype(int)
    
    return data

# Label every row with the next row's price direction (look-ahead of one row).
spy_raw_df_extract = _produce_prediction(spy_raw_df_extract, window=1)

# Remove the raw price column now that the target has been derived from it.
del spy_raw_df_extract['price']

# The trailing row has no look-ahead price, so its target is NaN; dropna() removes it
# (along with any other row that contains a NaN in any column).
spy_raw_df_extract = spy_raw_df_extract.dropna()
print(len(spy_raw_df_extract))

456


In [28]:
# Alias only (no copy is made): both names refer to the same DataFrame object.
spy_processed_df = spy_raw_df_extract

In [29]:
# Split by date: rows dated 2022-06-01 or later form the test set, everything earlier the
# training set. 'date' holds strings here, so the training mask is the exact negation.
in_test_period = spy_processed_df['date'] >= '2022-06-01'
spy_processed_test = spy_processed_df.loc[in_test_period].copy()
spy_processed_train = spy_processed_df.loc[~in_test_period].copy()

In [30]:
from pycaret.classification import *

In [31]:
# Model identifiers passed to create_model() in the grid search.
models = [
    'lr',
    'et',
    'rf',
    'dt',
    'gbc',
    'ada',
    'lightgbm',
]

# Feature-selection thresholds to sweep: 0.1, 0.2, ..., 0.9.
thresholds = [step / 10 for step in range(1, 10)]

In [32]:
# Accumulator for the grid search: model name -> list of per-threshold metric dicts.
results = dict()

In [33]:
# Grid search over every (model, feature-selection threshold) pair. For each pair the
# experiment is set up on the training frame, a model is created, tuned and finalized,
# and the finalized model is scored on the held-back test frame. One metrics dict per
# run is appended to results[model_name].
for model_name in models:
    for threshold in thresholds:
        # setup() is repeated for every run because the threshold is a setup-time option.
        # NOTE(review): nothing here asks setup() to keep its internal train/hold-out split
        # in time order -- confirm that is intended alongside fold_strategy='timeseries'.
        data_model = setup(data=spy_processed_train,
                           target='target',
                           session_id=123,
                           normalize=True,
                           ignore_features=['date'],
                           feature_selection=True,
                           feature_selection_threshold=threshold,  # tune here
                           fold_strategy='timeseries',
                           use_gpu=True,
                           silent=True)

        model = create_model(model_name, fold=10)

        tuned_model = tune_model(model, optimize='Precision')

        evaluate_model(tuned_model)

        # Called without a data argument and the return value is discarded; it is kept
        # for the metrics table it displays in the notebook.
        predict_model(tuned_model)

        final_model = finalize_model(tuned_model)

        unseen_predictions = predict_model(final_model, data=spy_processed_test)

        y_test = unseen_predictions["target"].values
        y_pred = [float(x) for x in unseen_predictions["Label"].values]

        # NOTE(review): ROCAUC below is computed from the hard predicted labels, not from
        # scores/probabilities, so it reflects a single operating point rather than a
        # ranking-based AUC. Left unchanged so the recorded numbers keep their meaning.
        results.setdefault(model_name, []).append({
            'THRESHOLD': threshold,
            "ACCURACY": met.accuracy_score(y_test, y_pred),
            "PRECISION": met.precision_score(y_test, y_pred),
            "RECALL": met.recall_score(y_test, y_pred),
            "F1SCORE": met.f1_score(y_test, y_pred),
            "ROCAUC": met.roc_auc_score(y_test, y_pred),
        })

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5652,0.3875,0.8667,0.619,0.7222,-0.1616,-0.2254
1,0.6087,0.4821,0.75,0.7059,0.7273,0.0372,0.0374
2,0.6957,0.825,0.8667,0.7222,0.7879,0.2648,0.2791
3,0.5652,0.6077,0.6154,0.6154,0.6154,0.1154,0.1154
4,0.6087,0.7698,0.5714,0.7273,0.64,0.2247,0.2326
5,0.6522,0.7632,0.7368,0.8235,0.7778,-0.011,-0.0114
6,0.6087,0.6569,0.6471,0.7857,0.7097,0.1266,0.1323
7,0.7826,0.7745,0.9412,0.8,0.8649,0.3275,0.3579
8,0.5217,0.5159,0.7143,0.5882,0.6452,-0.0675,-0.0706
9,0.6957,0.7333,0.8667,0.7222,0.7879,0.2648,0.2791


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.5636,0.6393,0.6528,0.6714,0.662,0.0469,0.047




Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.6374,0.7079,0.9107,0.6456,0.7556,0.1263,0.1592


In [34]:
# Display the collected metrics: one list of per-threshold dicts for each model name.
results

{'lr': [{'THRESHOLD': 0.1,
   'ACCURACY': 0.6483516483516484,
   'PRECISION': 0.74,
   'RECALL': 0.6607142857142857,
   'F1SCORE': 0.6981132075471698,
   'ROCAUC': 0.6446428571428571},
  {'THRESHOLD': 0.2,
   'ACCURACY': 0.7362637362637363,
   'PRECISION': 0.8076923076923077,
   'RECALL': 0.75,
   'F1SCORE': 0.7777777777777779,
   'ROCAUC': 0.7321428571428571},
  {'THRESHOLD': 0.3,
   'ACCURACY': 0.6923076923076923,
   'PRECISION': 0.7692307692307693,
   'RECALL': 0.7142857142857143,
   'F1SCORE': 0.7407407407407408,
   'ROCAUC': 0.6857142857142858},
  {'THRESHOLD': 0.4,
   'ACCURACY': 0.6263736263736264,
   'PRECISION': 0.6410256410256411,
   'RECALL': 0.8928571428571429,
   'F1SCORE': 0.746268656716418,
   'ROCAUC': 0.5464285714285715},
  {'THRESHOLD': 0.5,
   'ACCURACY': 0.6043956043956044,
   'PRECISION': 0.6136363636363636,
   'RECALL': 0.9642857142857143,
   'F1SCORE': 0.75,
   'ROCAUC': 0.49642857142857144},
  {'THRESHOLD': 0.6,
   'ACCURACY': 0.6263736263736264,
   'PRECISION':

In [35]:
# Flatten the nested results dict into one row per (model, threshold) run.
# Each row is the stored metrics dict with the model name prepended, so the columns are
# 'Model' followed by the metric keys in the order they were recorded. The column names
# come from the rows themselves rather than from one hard-coded model's first entry.
results_df = pd.DataFrame(
    [
        {'Model': model_name, **metrics}
        for model_name in models
        for metrics in results[model_name]
    ]
)

print(results_df)

       Model  THRESHOLD  ACCURACY  PRECISION    RECALL   F1SCORE    ROCAUC
0         lr        0.1  0.648352   0.740000  0.660714  0.698113  0.644643
1         lr        0.2  0.736264   0.807692  0.750000  0.777778  0.732143
2         lr        0.3  0.692308   0.769231  0.714286  0.740741  0.685714
3         lr        0.4  0.626374   0.641026  0.892857  0.746269  0.546429
4         lr        0.5  0.604396   0.613636  0.964286  0.750000  0.496429
..       ...        ...       ...        ...       ...       ...       ...
58  lightgbm        0.5  0.637363   0.653333  0.875000  0.748092  0.566071
59  lightgbm        0.6  0.626374   0.644737  0.875000  0.742424  0.551786
60  lightgbm        0.7  0.626374   0.634146  0.928571  0.753623  0.535714
61  lightgbm        0.8  0.637363   0.645570  0.910714  0.755556  0.555357
62  lightgbm        0.9  0.637363   0.645570  0.910714  0.755556  0.555357

[63 rows x 7 columns]


In [36]:
# Write the summary table to a CSV file (relative path, so it lands in the working directory).
results_df.to_csv(path_or_buf="SPY Prediction Without Sentiment.csv")