In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the combined dataset of each company, one CSV file per company.
# The paths are relative to the notebook's working directory; adjust the
# "./model/" prefix if the files live somewhere else.
amazon_df = pd.read_csv("./model/amazon_combined_latest.csv")
apple_df = pd.read_csv("./model/apple_combined_latest.csv")
google_df = pd.read_csv("./model/google_combined_latest.csv")
meta_df = pd.read_csv("./model/META_combined_latest.csv")
msft_df = pd.read_csv("./model/microsoft_combined_latest.csv")
nvidia_df = pd.read_csv("./model/nvidia_combined_latest.csv")
samsung_df = pd.read_csv("./model/samsung_combined_latest.csv")
tencent_df = pd.read_csv("./model/tencent_combined_latest.csv")
tesla_df = pd.read_csv("./model/tesla_combined_latest.csv")
tsmc_df = pd.read_csv("./model/tsmc_combined_latest.csv")

In [3]:
def _produce_prediction(data, window, split_on_date):
    """
    Build the binary 'target' column and split the rows into test and train sets.

    For every row, the close price `window` rows ahead is compared with the
    row's own close price: the target is 1 when the future price is greater
    than or equal to the current one and 0 otherwise. The last `window` rows
    have no future price, so their target is missing; they are dropped together
    with every other row that contains a missing value. The 'fin_close' column
    is removed from the returned frames.

    The input frame is left untouched (the work happens on a copy), so a cell
    calling this function can safely be re-run.

    :param data: DataFrame containing at least the 'fin_close' and 'date' columns
    :param window: number of days, or rows to look ahead to see what the price did
                   (must be >= 1)
    :param split_on_date: rows whose 'date' is >= this value go to the test set,
                          all earlier rows to the train set. Dates are compared
                          as strings, so both sides need a format that sorts
                          chronologically (e.g. 'YYYY-MM-DD').
    :return: tuple (data_test, data_train) of independent DataFrames
    :raises ValueError: if window is smaller than 1
    """
    if window < 1:
        # iloc[:-0] would select nothing and silently yield two empty frames.
        raise ValueError("window must be >= 1, got {!r}".format(window))

    # Work on a copy so the caller's frame keeps 'fin_close' and gains no 'target'.
    labelled = data.copy()

    future_close = labelled['fin_close'].shift(-window)
    # The trailing `window` rows compare against NaN; cut them off so they end
    # up as missing targets (and are dropped below) instead of a spurious 0.
    went_up = (future_close >= labelled['fin_close']).iloc[:-window]
    labelled['target'] = went_up.astype(int)

    labelled = (
        labelled
        .drop(columns=['fin_close'])
        .dropna()
        .assign(date=lambda frame: frame['date'].astype(str))
    )

    data_test = labelled.loc[labelled['date'] >= split_on_date].copy()
    data_train = labelled.loc[labelled['date'] < split_on_date].copy()

    return data_test, data_train

In [4]:
from pycaret.classification import *

In [5]:
# Amazon -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
amazon_df_test, amazon_df_train = _produce_prediction(
    amazon_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
amazon_model = setup(
    data=amazon_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.6,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6981,0.7405,0.8571,0.7317,0.7895,0.2677,0.2784
1,0.7075,0.7604,0.7922,0.8026,0.7974,0.272,0.2721
2,0.6887,0.7038,0.9014,0.7111,0.795,0.1839,0.2083
3,0.6981,0.7372,0.8356,0.7531,0.7922,0.2459,0.2504
4,0.7075,0.7639,0.8667,0.7558,0.8075,0.2112,0.22
5,0.7075,0.7606,0.775,0.8267,0.8,0.2582,0.2601
6,0.6604,0.7333,0.7867,0.7468,0.7662,0.1471,0.1477
7,0.6698,0.7517,0.8649,0.7191,0.7853,0.0964,0.1046
8,0.6887,0.7222,0.8267,0.7561,0.7898,0.1944,0.1973
9,0.6415,0.7027,0.8243,0.7093,0.7625,0.0482,0.0505


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [None]:
# Show the dtype of every column in the Amazon training split.
amazon_df_train.dtypes.tolist()

[dtype('O'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64')]

In [None]:
# Score the Amazon test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Amazon modelling cell.
amazon_df_predictions = predict_model(final_model, data=amazon_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6711,0.7386,0.9155,0.6843,0.7832,0.1568,0.1896


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(amazon_df_predictions[column].value_counts())

1.0    396
0.0     60
Name: Label, dtype: int64
1.0    296
0.0    160
Name: target, dtype: int64


In [None]:
# Apple -- trained without the sentiment features.
# Keep only the calendar and financial columns.
apple_df_extract = apple_df[[
    'date',
    'is_trading_day',
    'year',
    'month',
    'day_of_week',
    'quarter',
    'fin_volume',
    'fin_revenue',
    'fin_eps',
    'fin_pe_ratio',
    'fin_open',
    'fin_high',
    'fin_low',
    'fin_close',
    'fin_heikin_trend_uptrend',
]].copy()

# Label each row with the next-row price direction and split on the cut-off date.
apple_df_test, apple_df_train = _produce_prediction(
    apple_df_extract, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
# The sentiment columns are deliberately absent from numeric_features here.
apple_model = setup(
    data=apple_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=['fin_revenue'],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.5,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6132,0.6295,0.9701,0.625,0.7602,-0.0372,-0.1058
1,0.6981,0.7148,0.9733,0.7087,0.8202,0.0076,0.0153
2,0.6698,0.6647,1.0,0.6667,0.8,0.0364,0.1361
3,0.6415,0.7026,0.9851,0.6408,0.7765,0.045,0.1057
4,0.7358,0.7412,0.9873,0.7429,0.8478,-0.0185,-0.0571
5,0.7547,0.7509,1.0,0.75,0.8571,0.1017,0.2315
6,0.6415,0.8146,0.9853,0.6442,0.7791,0.0147,0.0409
7,0.7453,0.7573,1.0,0.7404,0.8508,0.0972,0.226
8,0.6604,0.6536,0.9714,0.6667,0.7907,0.0344,0.0671
9,0.6887,0.7372,1.0,0.6827,0.8114,0.0751,0.1975


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.674,0.7002,0.9677,0.6832,0.801,0.0303,0.0557


In [None]:
# Score the Apple test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Apple modelling cell.
apple_df_predictions = predict_model(final_model, data=apple_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6842,0.7283,0.8599,0.7233,0.7857,0.2024,0.2137


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(apple_df_predictions[column].value_counts())

1.0    365
0.0     91
Name: Label, dtype: int64
1.0    307
0.0    149
Name: target, dtype: int64


In [None]:
# Google -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
google_df_test, google_df_train = _produce_prediction(
    google_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
google_model = setup(
    data=google_df_train,
    target='target',
    session_id=123,
    normalize=True,
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    ignore_features=['date'],
    feature_selection=True,
    feature_selection_threshold=0.9,
    fold_strategy='timeseries',
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
# The new estimator is bound to `model` (as in the other company cells) so
# that tune_model() tunes the estimator created for THIS experiment rather
# than a stale `model` object left over from a previously executed cell.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6604,0.5376,0.9859,0.6667,0.7955,-0.0187,-0.0685
1,0.6698,0.7337,1.0,0.6667,0.8,0.0364,0.1361
2,0.5755,0.6352,0.9242,0.604,0.7305,-0.0915,-0.1732
3,0.6509,0.7229,0.8806,0.6705,0.7613,0.1544,0.176
4,0.783,0.8134,0.9494,0.7979,0.8671,0.3006,0.3378
5,0.6698,0.7274,0.8025,0.7738,0.7879,0.0443,0.0445
6,0.717,0.699,0.9125,0.7604,0.8295,0.0352,0.041
7,0.6981,0.7062,0.9221,0.732,0.8161,0.0325,0.0408
8,0.6981,0.6192,0.9,0.75,0.8182,-0.0291,-0.034
9,0.6698,0.696,0.971,0.67,0.7929,0.0982,0.1632


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6819,0.6901,0.9706,0.6875,0.8049,0.0648,0.1128


In [None]:
# Score the Google test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Google modelling cell.
google_df_predictions = predict_model(final_model, data=google_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6842,0.7278,0.9575,0.691,0.8027,0.1053,0.1548


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(google_df_predictions[column].value_counts())

1.0    424
0.0     32
Name: Label, dtype: int64
1.0    306
0.0    150
Name: target, dtype: int64


In [None]:
# META -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
meta_df_test, meta_df_train = _produce_prediction(
    meta_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
meta_model = setup(
    data=meta_df_train,
    target='target',
    session_id=123,
    normalize=True,
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    ignore_features=['date'],
    feature_selection=True,
    feature_selection_threshold=0.8,
    fold_strategy='timeseries',
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
# The new estimator is bound to `model` (as in the other company cells) so
# that tune_model() tunes the estimator created for THIS experiment rather
# than a stale `model` object left over from a previously executed cell.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5943,0.6075,0.9219,0.6082,0.7329,0.0198,0.03
1,0.6604,0.6491,0.9552,0.6598,0.7805,0.13,0.1887
2,0.6887,0.6356,0.8919,0.7253,0.8,0.1303,0.1457
3,0.6415,0.7191,0.9531,0.6354,0.7625,0.1378,0.2004
4,0.6792,0.6798,0.8961,0.7263,0.8023,-0.0006,-0.0007
5,0.6981,0.6798,0.9351,0.7273,0.8182,0.0053,0.0072
6,0.6887,0.6724,0.9211,0.7216,0.8092,0.0267,0.034
7,0.7547,0.7618,1.0,0.7451,0.8539,0.1807,0.3152
8,0.6509,0.7214,0.9559,0.6566,0.7784,0.0746,0.1181
9,0.6604,0.6887,0.9583,0.6765,0.7931,-0.016,-0.03


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.664,0.6939,0.9201,0.6865,0.7863,0.0724,0.0934


In [None]:
# Score the META test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the META modelling cell.
meta_df_predictions = predict_model(final_model, data=meta_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6623,0.7433,0.8915,0.6831,0.7735,0.1532,0.1763


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(meta_df_predictions[column].value_counts())

1.0    385
0.0     71
Name: Label, dtype: int64
1.0    295
0.0    161
Name: target, dtype: int64


In [None]:
# Microsoft -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
msft_df_test, msft_df_train = _produce_prediction(
    msft_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
msft_model = setup(
    data=msft_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.3,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6981,0.4637,1.0,0.6981,0.8222,0.0,0.0
1,0.6792,0.6985,0.9861,0.6827,0.8068,0.0207,0.0533
2,0.6604,0.7083,0.9714,0.6667,0.7907,0.0344,0.0671
3,0.717,0.6763,0.9744,0.7308,0.8352,-0.0365,-0.0831
4,0.7547,0.8031,1.0,0.7524,0.8587,0.0542,0.1669
5,0.7358,0.7244,0.9744,0.7451,0.8444,0.0631,0.1059
6,0.717,0.7246,0.9737,0.7255,0.8315,0.0547,0.0954
7,0.6981,0.7385,0.9733,0.7087,0.8202,0.0076,0.0153
8,0.6887,0.6585,0.9467,0.71,0.8114,0.0146,0.022
9,0.6509,0.665,0.9583,0.6699,0.7886,-0.0549,-0.1173


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6759,0.6922,0.9825,0.6822,0.8053,0.0017,0.0044


In [None]:
# Score the Microsoft test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Microsoft modelling cell.
msft_df_predictions = predict_model(final_model, data=msft_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6425,0.6891,0.9161,0.6642,0.7701,0.0513,0.0681


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(msft_df_predictions[column].value_counts())

1.0    411
0.0     45
Name: Label, dtype: int64
1.0    298
0.0    158
Name: target, dtype: int64


In [None]:
# Nvidia -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
nvidia_df_test, nvidia_df_train = _produce_prediction(
    nvidia_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
nvidia_model = setup(
    data=nvidia_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.4,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6321,0.4975,0.971,0.6442,0.7746,-0.0371,-0.1015
1,0.6981,0.7141,1.0,0.6923,0.8182,0.0783,0.2018
2,0.5943,0.6962,0.9524,0.6,0.7362,0.0257,0.0471
3,0.7453,0.6597,0.9481,0.7604,0.8439,0.1947,0.2363
4,0.6981,0.7418,1.0,0.6923,0.8182,0.0783,0.2018
5,0.7453,0.7454,0.962,0.76,0.8492,0.0983,0.1379
6,0.6604,0.7459,0.9167,0.6875,0.7857,0.0422,0.0548
7,0.6887,0.7279,0.8875,0.7474,0.8114,-0.0442,-0.0502
8,0.6792,0.7451,0.9583,0.69,0.8023,0.0595,0.0941
9,0.6698,0.7308,0.9306,0.6907,0.7929,0.0598,0.0807


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.66,0.6509,0.9125,0.6894,0.7854,0.0384,0.0491


In [None]:
# Score the Nvidia test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Nvidia modelling cell.
nvidia_df_predictions = predict_model(final_model, data=nvidia_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6667,0.7179,0.9867,0.6674,0.7962,0.0411,0.0984


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(nvidia_df_predictions[column].value_counts())

1.0    445
0.0     11
Name: Label, dtype: int64
1.0    301
0.0    155
Name: target, dtype: int64


In [None]:
# Samsung -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
samsung_df_test, samsung_df_train = _produce_prediction(
    samsung_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
samsung_model = setup(
    data=samsung_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.5,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.581,0.5729,0.9524,0.5941,0.7317,-0.028,-0.0609
1,0.5429,0.5065,0.8308,0.5934,0.6923,-0.1077,-0.1346
2,0.6571,0.5819,0.9559,0.6633,0.7831,0.0785,0.1225
3,0.6381,0.6006,0.875,0.6848,0.7683,-0.0045,-0.0053
4,0.5905,0.5899,0.875,0.6154,0.7226,0.0242,0.0306
5,0.581,0.6353,0.9062,0.6042,0.725,-0.0239,-0.0359
6,0.6286,0.671,0.871,0.6353,0.7347,0.1634,0.1879
7,0.6286,0.63,0.9077,0.6413,0.7516,0.095,0.1219
8,0.6,0.6193,0.9,0.6,0.72,0.1091,0.1414
9,0.5714,0.5935,0.8361,0.593,0.6939,0.0441,0.052


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6044,0.6664,0.8591,0.6229,0.7221,0.0927,0.1085


In [None]:
# Score the Samsung test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Samsung modelling cell.
samsung_df_predictions = predict_model(final_model, data=samsung_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.588,0.6327,0.7217,0.5929,0.651,0.1598,0.1644


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(samsung_df_predictions[column].value_counts())

1.0    280
0.0    152
Name: Label, dtype: int64
1.0    230
0.0    202
Name: target, dtype: int64


In [None]:
# Tencent -- trained without the sentiment features.
# Keep only the calendar and financial columns.
tencent_df_extract = tencent_df[[
    'date',
    'is_trading_day',
    'year',
    'month',
    'day_of_week',
    'quarter',
    'fin_volume',
    'fin_revenue',
    'fin_eps',
    'fin_pe_ratio',
    'fin_open',
    'fin_high',
    'fin_low',
    'fin_close',
    'fin_heikin_trend_uptrend',
]].copy()

# Label each row with the next-row price direction and split on the cut-off date.
tencent_df_test, tencent_df_train = _produce_prediction(
    tencent_df_extract, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
tencent_model = setup(
    data=tencent_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=['fin_revenue'],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.5,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Random forest: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('rf', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6476,0.7023,0.5942,0.82,0.6891,0.3056,0.3271
1,0.6476,0.6733,0.7258,0.6923,0.7087,0.2635,0.264
2,0.6381,0.6706,0.7541,0.6667,0.7077,0.2374,0.2405
3,0.6,0.6322,0.625,0.75,0.6818,0.1552,0.1599
4,0.6,0.6293,0.5738,0.6863,0.625,0.2037,0.2074
5,0.7143,0.7179,0.7241,0.75,0.7368,0.4246,0.4249
6,0.5714,0.6489,0.3968,0.7812,0.5263,0.2049,0.245
7,0.6762,0.7325,0.6324,0.8269,0.7167,0.3542,0.3718
8,0.619,0.6379,0.6066,0.6981,0.6491,0.2369,0.2397
9,0.6095,0.7465,0.5634,0.8,0.6612,0.2319,0.2523


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.6647,0.7278,0.6396,0.7787,0.7023,0.3268,0.3351


In [None]:
# Score the Tencent test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Tencent modelling cell.
tencent_df_predictions = predict_model(final_model, data=tencent_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.6458,0.6966,0.5161,0.7,0.5942,0.2925,0.3032


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
# Both results are printed explicitly: a notebook only echoes the LAST bare
# expression of a cell, so without print() the Label counts were never shown.
print(tencent_df_predictions["Label"].value_counts())
print(tencent_df_predictions["target"].value_counts())

1.0    217
0.0    215
Name: target, dtype: int64

In [None]:
# Tesla -- trained without the sentiment features.
# Keep only the calendar and financial columns.
tesla_df_extract = tesla_df[[
    'date',
    'is_trading_day',
    'year',
    'month',
    'day_of_week',
    'quarter',
    'fin_volume',
    'fin_revenue',
    'fin_eps',
    'fin_pe_ratio',
    'fin_open',
    'fin_high',
    'fin_low',
    'fin_close',
    'fin_heikin_trend_uptrend',
]].copy()

# Label each row with the next-row price direction and split on the cut-off date.
tesla_df_test, tesla_df_train = _produce_prediction(
    tesla_df_extract, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
tesla_model = setup(
    data=tesla_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=['fin_revenue'],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.4,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# LightGBM: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lightgbm', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6476,0.6179,0.9697,0.6465,0.7758,0.0874,0.1504
1,0.619,0.6181,0.95,0.6064,0.7403,0.1411,0.2065
2,0.6,0.5673,0.8594,0.625,0.7237,0.0609,0.0722
3,0.6286,0.6183,0.9077,0.6413,0.7516,0.095,0.1219
4,0.7048,0.6417,0.9014,0.7273,0.805,0.2248,0.2484
5,0.6762,0.6067,0.8986,0.6966,0.7848,0.1713,0.1962
6,0.581,0.5715,0.8358,0.6292,0.7179,-0.0373,-0.0436
7,0.6095,0.6583,0.9508,0.6042,0.7389,0.0981,0.1537
8,0.6,0.6715,0.9333,0.5957,0.7273,0.0982,0.1436
9,0.6667,0.8142,0.9552,0.6667,0.7853,0.1355,0.1942


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.6426,0.6763,0.93,0.6399,0.7582,0.1552,0.2032




In [None]:
# Score the Tesla test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the Tesla modelling cell.
tesla_df_predictions = predict_model(final_model, data=tesla_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.6667,0.6141,0.9717,0.6691,0.7925,0.074,0.1304


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(tesla_df_predictions[column].value_counts())

1.0    411
0.0     21
Name: Label, dtype: int64
1.0    283
0.0    149
Name: target, dtype: int64


In [None]:
# TSMC -- trained with the sentiment features included.
# Label each row with the next-row price direction and split on the cut-off date.
tsmc_df_test, tsmc_df_train = _produce_prediction(
    tsmc_df, window=1, split_on_date='2021-06-01'
)

# Set up the PyCaret experiment on the training split only.
tsmc_model = setup(
    data=tsmc_df_train,
    target='target',
    ignore_features=['date'],
    numeric_features=[
        'twitter_polarity_vader_compound',
        'news_title_finbert_argmax',
        'news_text_finbert_argmax',
        'fin_revenue',
    ],
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.1,
    fold_strategy='timeseries',
    session_id=123,
    use_gpu=True,
)

# Logistic regression: cross-validate, tune for accuracy, inspect, finalize.
model = create_model('lr', fold=10)
tuned_model = tune_model(model, optimize='Accuracy')
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6887,0.6373,1.0,0.6857,0.8136,0.0395,0.142
1,0.6981,0.6706,0.9737,0.7115,0.8222,-0.0367,-0.0871
2,0.6226,0.7418,1.0,0.6154,0.7619,0.0569,0.1712
3,0.6981,0.742,0.9865,0.7019,0.8202,0.0242,0.0598
4,0.7264,0.7623,1.0,0.7238,0.8398,0.0471,0.1553
5,0.717,0.7329,1.0,0.717,0.8352,0.0,0.0
6,0.7264,0.7741,1.0,0.7238,0.8398,0.0471,0.1553
7,0.783,0.7931,1.0,0.7788,0.8757,0.1173,0.2496
8,0.6415,0.637,1.0,0.6415,0.7816,0.0,0.0
9,0.6604,0.7887,1.0,0.6538,0.7907,0.0665,0.1855


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.666,0.6911,0.9736,0.6762,0.7981,-0.0104,-0.0241


In [None]:
# Score the TSMC test split with the finalized model.
# NOTE: `final_model` is a shared name that every company's modelling cell
# reassigns, so run this cell directly after the TSMC modelling cell.
tsmc_df_predictions = predict_model(final_model, data=tsmc_df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6425,0.6915,0.9119,0.6626,0.7675,0.0722,0.0932


In [None]:
# Compare the distribution of predicted labels with that of the true targets.
for column in ("Label", "target"):
    print(tsmc_df_predictions[column].value_counts())

1.0    406
0.0     50
Name: Label, dtype: int64
1.0    295
0.0    161
Name: target, dtype: int64


In [None]:
# Print the (rows, columns) shape of every prediction frame, one per line,
# to check that the test windows line up across companies.
for predictions in (
    amazon_df_predictions,
    apple_df_predictions,
    google_df_predictions,
    meta_df_predictions,
    msft_df_predictions,
    nvidia_df_predictions,
    samsung_df_predictions,
    tencent_df_predictions,
    tesla_df_predictions,
    tsmc_df_predictions,
):
    print(predictions.shape)

(456, 40)
(456, 17)
(456, 40)
(456, 40)
(456, 40)
(456, 40)
(432, 40)
(432, 17)
(432, 17)
(456, 40)


In [None]:
# Give each frame's generic 'Label' column a company-specific name so the
# predictions can sit side by side after merging. The rename happens in place.
for predictions, pred_column in (
    (amazon_df_predictions, 'pred_amazon'),
    (apple_df_predictions, 'pred_apple'),
    (google_df_predictions, 'pred_google'),
    (meta_df_predictions, 'pred_meta'),
    (msft_df_predictions, 'pred_msft'),
    (nvidia_df_predictions, 'pred_nvidia'),
    (samsung_df_predictions, 'pred_samsung'),
    (tencent_df_predictions, 'pred_tencent'),
    (tesla_df_predictions, 'pred_tesla'),
    (tsmc_df_predictions, 'pred_tsmc'),
):
    predictions.rename(columns={'Label': pred_column}, inplace=True)

In [None]:
# Assemble one table with a prediction column per company, keyed by date.
# Amazon's dates form the backbone: every other frame is left-joined onto it,
# so dates missing from another company's frame show up as NaN in its column.
snp_df = amazon_df_predictions[["date", "pred_amazon"]]
for predictions, pred_column in (
    (apple_df_predictions, "pred_apple"),
    (google_df_predictions, "pred_google"),
    (meta_df_predictions, "pred_meta"),
    (msft_df_predictions, "pred_msft"),
    (nvidia_df_predictions, "pred_nvidia"),
    (samsung_df_predictions, "pred_samsung"),
    (tencent_df_predictions, "pred_tencent"),
    (tesla_df_predictions, "pred_tesla"),
    (tsmc_df_predictions, "pred_tsmc"),
):
    snp_df = snp_df.merge(predictions[["date", pred_column]], how='left', on='date')

In [None]:
# Write the merged per-company predictions to 'spy_predict.csv' in the current
# working directory; index=False leaves the DataFrame's row index out of the file.
snp_df.to_csv('spy_predict.csv', index=False)