In [19]:
import pandas as pd
import numpy as np
import sklearn.metrics as met 


In [20]:
# Load the two CSV inputs: the raw S&P 500 table and the spy_predict table
# that is merged onto it by date later in the notebook.
spy_raw_df = pd.read_csv(filepath_or_buffer="./model/S&P_500_adj.csv")
spy_predict_df = pd.read_csv(filepath_or_buffer="./model/spy_predict.csv")

In [21]:
# Drop the volume column. `errors='ignore'` keeps this cell safe to re-run:
# a plain `del` raises KeyError once the column is already gone.
spy_raw_df = spy_raw_df.drop(columns=['Vol.'], errors='ignore')

In [22]:
# Display the column labels currently present on the raw frame.
spy_raw_df.columns

Index(['Date', 'Price', 'Open', 'High', 'Low', 'Change %'], dtype='object')

In [23]:
# Map the capitalised CSV headers to snake_case names; the frame is renamed
# in place.
snake_case_names = {
    'Date': 'date',
    'Price': 'price',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Change %': 'percent_change',
}
spy_raw_df.rename(columns=snake_case_names, inplace=True)

In [24]:
# Convert the text columns to floats. Each conversion is skipped when the
# column is already numeric, so re-running the cell neither fails on the
# `.str` accessor nor divides `percent_change` by 100 a second time.
for column in ['price', 'open', 'high', 'low']:
    if not pd.api.types.is_numeric_dtype(spy_raw_df[column]):
        # Strip the ',' thousands separators before parsing.
        spy_raw_df[column] = (
            spy_raw_df[column].str.replace(',', '', regex=False).astype(float)
        )

if not pd.api.types.is_numeric_dtype(spy_raw_df['percent_change']):
    # Strip the '%' sign and rescale from percent to a fraction.
    spy_raw_df['percent_change'] = (
        spy_raw_df['percent_change'].str.replace('%', '', regex=False).astype(float) / 100
    )

In [25]:
from sklearn.preprocessing import OrdinalEncoder

# Parse the date column, then derive the calendar features from it.
spy_raw_df["date"] = pd.to_datetime(spy_raw_df["date"])
date_parts = spy_raw_df["date"].dt
spy_raw_df["year"] = date_parts.year
spy_raw_df["month"] = date_parts.month
spy_raw_df["day_of_week"] = date_parts.weekday
spy_raw_df["quarter"] = date_parts.quarter

# Replace each calendar year with the code OrdinalEncoder assigns to it.
# The encoder expects a 2-D input, hence the reshape to a single column.
ord_enc = OrdinalEncoder()
year_as_column = spy_raw_df["year"].to_numpy().reshape(-1, 1)
spy_raw_df["year"] = ord_enc.fit_transform(year_as_column).ravel()

In [26]:
# Store the dates as strings again, then keep the rows dated 2021-06-01
# through 2022-08-31 (both ends included) as an independent copy.
spy_raw_df["date"] = spy_raw_df["date"].astype(str)
in_study_window = spy_raw_df['date'].between('2021-06-01', '2022-08-31')
spy_raw_df_extract = spy_raw_df.loc[in_study_window].copy()

In [27]:
def _produce_prediction(data, window):
    """
    Function that produces the 'truth' values
    At a given row, it looks 'window' rows ahead to see if the price increased (1) or decreased (0)
    :param window: number of days, or rows to look ahead to see what the price did
    """
    
    prediction = (data.shift(-window)['price'] >= data['price'])
    prediction = prediction.iloc[:-window]
    data['target'] = prediction.astype(int)
    
    return data

# Label each row with the next-row price direction, then drop the raw price.
# The guard keeps the cell re-runnable: once 'price' has been removed there is
# nothing left to label, so only the clean-up below is repeated.
if 'price' in spy_raw_df_extract.columns:
    spy_raw_df_extract = _produce_prediction(spy_raw_df_extract, window=1)
    spy_raw_df_extract = spy_raw_df_extract.drop(columns=['price'])

# The final row has no following row to compare against, so its target is NaN;
# dropna() removes it together with any other row containing a missing value.
spy_raw_df_extract = spy_raw_df_extract.dropna()
print(len(spy_raw_df_extract))

456


In [28]:
# Left-join spy_predict_df onto the extracted rows, matching on 'date', so
# every extracted row is kept.
spy_processed_df = pd.merge(
    left=spy_raw_df_extract,
    right=spy_predict_df,
    how='left',
    left_on='date',
    right_on='date',
)

In [29]:
# Split by date: rows dated 2022-06-01 or later form the test set, every
# earlier row is used for training.
split_date = '2022-06-01'
is_train_row = spy_processed_df['date'] < split_date
is_test_row = spy_processed_df['date'] >= split_date
spy_processed_train = spy_processed_df.loc[is_train_row].copy()
spy_processed_test = spy_processed_df.loc[is_test_row].copy()

In [30]:
from pycaret.classification import *

In [31]:
# Sweep grid: the model identifiers handed to create_model and the values
# tried for feature_selection_threshold (0.1 through 0.9 in steps of 0.1).
models = [
    'lr',
    'et',
    'rf',
    'dt',
    'gbc',
    'ada',
    'lightgbm',
]
thresholds = [tenth / 10 for tenth in range(1, 10)]

In [32]:
# Accumulator for the sweep: model name -> list of per-threshold metric dicts.
results = dict()

In [33]:
# Sweep every (model, feature-selection threshold) combination: set up a
# PyCaret experiment on the training rows, create and tune the model for
# precision, finalize it, and score it on the held-out test rows.
for model_name in models:
    # Reset this model's entries so that re-running the cell replaces them
    # rather than appending a duplicate set of rows to `results`.
    results[model_name] = []

    for threshold in thresholds:
        setup(data=spy_processed_train,
              target='target',
              session_id=123,
              normalize=True,
              ignore_features=['date'],
              feature_selection=True,
              feature_selection_threshold=threshold,  # the value being swept
              fold_strategy='timeseries',
              use_gpu=True,
              silent=True)

        model = create_model(model_name, fold=10)
        tuned_model = tune_model(model, optimize='Precision')

        # Called for their displayed output only; the return values are unused.
        evaluate_model(tuned_model)
        predict_model(tuned_model)

        final_model = finalize_model(tuned_model)
        unseen_predictions = predict_model(final_model, data=spy_processed_test)

        y_test = unseen_predictions["target"].values
        # Cast the predicted labels to float (presumably so they match the
        # numeric type of the target column - TODO confirm).
        y_pred = [float(x) for x in unseen_predictions["Label"].values]

        results[model_name].append({
            'THRESHOLD': threshold,
            "ACCURACY": met.accuracy_score(y_test, y_pred),
            "PRECISION": met.precision_score(y_test, y_pred),
            "RECALL": met.recall_score(y_test, y_pred),
            "F1SCORE": met.f1_score(y_test, y_pred),
            # NOTE(review): this is fed hard 0/1 labels, not predicted
            # probabilities, so for this binary target it equals balanced
            # accuracy rather than a ranking-based ROC AUC.
            "ROCAUC": met.roc_auc_score(y_test, y_pred),
        })

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6522,0.55,0.9333,0.6667,0.7778,0.0707,0.0986
1,0.5652,0.5982,0.6875,0.6875,0.6875,-0.0268,-0.0268
2,0.8696,0.9,0.9333,0.875,0.9032,0.7039,0.7073
3,0.4783,0.6,0.5385,0.5385,0.5385,-0.0615,-0.0615
4,0.5217,0.7222,0.7857,0.5789,0.6667,-0.1145,-0.1328
5,0.6522,0.6316,0.7368,0.8235,0.7778,-0.011,-0.0114
6,0.6087,0.6275,0.7059,0.75,0.7273,0.0372,0.0374
7,0.7391,0.7353,0.8824,0.7895,0.8333,0.2418,0.2499
8,0.4348,0.5476,0.6429,0.5294,0.5806,-0.2616,-0.2734
9,0.6957,0.7583,0.8,0.75,0.7742,0.309,0.3105


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.5727,0.6438,0.7083,0.6623,0.6846,0.0249,0.025




Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.6374,0.6602,0.9464,0.6386,0.7626,0.1044,0.1534


In [34]:
# Display the collected metrics: for each model name, a list of per-threshold
# metric dicts.
results

{'lr': [{'THRESHOLD': 0.1,
   'ACCURACY': 0.6483516483516484,
   'PRECISION': 0.6875,
   'RECALL': 0.7857142857142857,
   'F1SCORE': 0.7333333333333334,
   'ROCAUC': 0.6071428571428571},
  {'THRESHOLD': 0.2,
   'ACCURACY': 0.6373626373626373,
   'PRECISION': 0.7254901960784313,
   'RECALL': 0.6607142857142857,
   'F1SCORE': 0.6915887850467289,
   'ROCAUC': 0.6303571428571428},
  {'THRESHOLD': 0.3,
   'ACCURACY': 0.6263736263736264,
   'PRECISION': 0.7115384615384616,
   'RECALL': 0.6607142857142857,
   'F1SCORE': 0.6851851851851851,
   'ROCAUC': 0.6160714285714285},
  {'THRESHOLD': 0.4,
   'ACCURACY': 0.6373626373626373,
   'PRECISION': 0.7169811320754716,
   'RECALL': 0.6785714285714286,
   'F1SCORE': 0.6972477064220183,
   'ROCAUC': 0.625},
  {'THRESHOLD': 0.5,
   'ACCURACY': 0.6813186813186813,
   'PRECISION': 0.684931506849315,
   'RECALL': 0.8928571428571429,
   'F1SCORE': 0.7751937984496124,
   'ROCAUC': 0.6178571428571429},
  {'THRESHOLD': 0.6,
   'ACCURACY': 0.6813186813186813,

In [35]:
# Flatten `results` (model name -> list of metric dicts) into one row per
# (model, threshold) pair. Column names come from the metric dicts themselves,
# so nothing depends on a particular model such as 'lr' being present, and the
# comprehension variables do not overwrite the fitted `model` from the sweep.
results_df = pd.DataFrame(
    [
        {'Model': model_name, **metrics}
        for model_name in models
        for metrics in results[model_name]
    ]
)

print(results_df)

       Model  THRESHOLD  ACCURACY  PRECISION    RECALL   F1SCORE    ROCAUC
0         lr        0.1  0.648352   0.687500  0.785714  0.733333  0.607143
1         lr        0.2  0.637363   0.725490  0.660714  0.691589  0.630357
2         lr        0.3  0.626374   0.711538  0.660714  0.685185  0.616071
3         lr        0.4  0.637363   0.716981  0.678571  0.697248  0.625000
4         lr        0.5  0.681319   0.684932  0.892857  0.775194  0.617857
..       ...        ...       ...        ...       ...       ...       ...
58  lightgbm        0.5  0.626374   0.634146  0.928571  0.753623  0.535714
59  lightgbm        0.6  0.615385   0.623529  0.946429  0.751773  0.516071
60  lightgbm        0.7  0.626374   0.625000  0.982143  0.763889  0.519643
61  lightgbm        0.8  0.615385   0.623529  0.946429  0.751773  0.516071
62  lightgbm        0.9  0.637363   0.638554  0.946429  0.762590  0.544643

[63 rows x 7 columns]


In [36]:
# Write the model comparison table to disk as CSV.
results_df.to_csv(path_or_buf="SPY Prediction With Sentiment.csv")