# Stock price tendency prediction

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

from notebook_config import setup_notebook
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Run the project's notebook setup hook imported from notebook_config above
# (its effects are defined in that module and are not visible in this notebook).
setup_notebook()

In [2]:
from tqdm.notebook import trange, tqdm
from datasets.stocks_data_wrapper import StocksDataWrapper
from helpers.data_helper import *

In [3]:
# Directory holding the raw quotation files. The baseline loop below builds each
# file path as DATA_PATH + <ticker> + FILE_SUFFIX.
DATA_PATH = '../../data/'
# NOTE(review): `quotation` is not referenced by any cell visible in this notebook.
quotation = 'AMZN'
FILE_SUFFIX = '.txt'
# NOTE(review): `price_column` is not referenced by any visible cell either -
# presumably the name of the price column; confirm it is still needed.
price_column = 'Close'

In [4]:
# Where results are written, and which experiment variant this notebook runs.
OUTPUT_PATH = 'results/'
N_CLASSES = 2
# Metrics are grouped per number of target classes.
METRICS_PATH = f"{OUTPUT_PATH}metrics/{N_CLASSES}_classes/"
# Label used in the file name of the metrics produced by this notebook.
PIPELINE_LABEL = 'random_walk'

# A difference threshold is only supplied in the 3-class setting; otherwise None is
# passed on to the data wrapper. Compare by value (==): `is` tests object identity,
# which for int literals only works by accident of CPython's small-int cache and
# raises a SyntaxWarning on Python 3.8+.
THRESH_DIFF = 0.005 if N_CLASSES == 3 else None

### Data preparation

In [5]:
# Ticker symbols processed by the baseline loop below (one data file per ticker).
quotations = [
    'AAL', 'AAPL', 'AMZN', 'CMCSA', 'COST', 'GM', 'GOOG',
    'IBM', 'JNJ', 'KO', 'PEP', 'TSLA', 'WMT', 'XOM',
]

# Feature columns listed per prediction horizon. The keys are the `predict_n`
# values handed to StocksDataWrapper.read_from in the baseline loop.
days_predict_feature_set = {
    1: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'BG_L_Band', 'GAP', 'MACD_diff',
    ],
    5: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'SMA(20) - SMA(10)', 'BG_H_Band_Indicator', 'MACD',
    ],
    10: [
        'Volume', 'Close', 'BodyLen', 'Difference',
        'SMA(20) - SMA(10)', 'EMA_Diff', 'MACD_diff',
    ],
    20: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'EMA(14)', 'BG_H_Band', 'MACD_diff',
    ],
    50: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'BG_L_Band', 'GAP', 'MACD_diff',
    ],
}

In [6]:
# Accumulator filled by the baseline loop: predict_n -> ticker -> {'RandomWalk': metrics}.
predict_n_metrics_dict = dict()

In [7]:
import random  # not used in this cell; the price-walk cell at the end of the notebook calls random.shuffle
from sklearn.metrics import accuracy_score, f1_score

# Seeded generator so the baseline - and the metrics file written from it - is
# reproducible across kernel restarts.
RANDOM_WALK_SEED = 42
rng = np.random.default_rng(RANDOM_WALK_SEED)

# The random-walk baseline only looks at the 'Tendency' column, so the per-horizon
# feature lists stored in days_predict_feature_set are not needed here; only the
# horizons (the dict keys) are iterated.
features_list = ['Tendency']

for predict_n in tqdm(days_predict_feature_set):
    quot_metrics_dict = {}
    for quot in tqdm(quotations):
        data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}{quot}{FILE_SUFFIX}", compute_features=True,
                                                   predict_n=predict_n, thresh_diff=THRESH_DIFF, normalize=True)
        X_train, X_test, y_train, y_test = data_wrapper.get_datasets(n_splits=1, val_size=0.2,
                                                                     y_column='Next',
                                                                     features_list=features_list)

        # Empirical distribution of the tendency values seen in the training split.
        # NOTE(review): assumes the values of the 'Tendency' feature use the same
        # encoding as the 'Next' target they are scored against - confirm, since the
        # data is loaded with normalize=True.
        unique_elements, counts_elements = np.unique(X_train, return_counts=True)
        element_freq = counts_elements / counts_elements.sum()

        # Draw one prediction per test sample according to that prior distribution.
        tendency_walk = rng.choice(unique_elements, size=len(X_test), p=element_freq)

        # scikit-learn metrics take (y_true, y_pred). The order matters for the
        # weighted F1, whose class weights are the supports of the true labels.
        metrics = {
            'acc': accuracy_score(y_test, tendency_walk),
            'f1': f1_score(y_test, tendency_walk, average='weighted'),
        }

        quot_metrics_dict[quot] = {'RandomWalk': metrics}

    # quot_metrics_dict is rebuilt from scratch for every horizon, so no copy is needed.
    predict_n_metrics_dict[predict_n] = quot_metrics_dict

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [8]:
# Show the directory the metrics files are written to and read from.
print(METRICS_PATH)

results/metrics/2_classes/


In [9]:
# Persist the random-walk metrics in their own file (named after PIPELINE_LABEL),
# separate from the combined full_metrics_dict.txt handled further down.
save_dict(dict_save=predict_n_metrics_dict, path=f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt")

In [10]:
# Spot-check the baseline results for the predict_n == 1 horizon.
print(predict_n_metrics_dict[1])

{'AAL': {'RandomWalk': {'acc': 0.45779685264663805, 'f1': 0.45795740731429774}}, 'AAPL': {'RandomWalk': {'acc': 0.5135908440629471, 'f1': 0.5124678994683763}}, 'AMZN': {'RandomWalk': {'acc': 0.5236051502145923, 'f1': 0.523540220148023}}, 'CMCSA': {'RandomWalk': {'acc': 0.49277456647398843, 'f1': 0.4946461052575604}}, 'COST': {'RandomWalk': {'acc': 0.5093910073989756, 'f1': 0.5089971351989212}}, 'GM': {'RandomWalk': {'acc': 0.48906560636182905, 'f1': 0.4900682589880868}}, 'GOOG': {'RandomWalk': {'acc': 0.5021459227467812, 'f1': 0.501501320323584}}, 'IBM': {'RandomWalk': {'acc': 0.510472972972973, 'f1': 0.5104037442343454}}, 'JNJ': {'RandomWalk': {'acc': 0.5064377682403434, 'f1': 0.5063161515130448}}, 'KO': {'RandomWalk': {'acc': 0.5107296137339056, 'f1': 0.510525213353005}}, 'PEP': {'RandomWalk': {'acc': 0.4949372215471851, 'f1': 0.4966658084685576}}, 'TSLA': {'RandomWalk': {'acc': 0.4933078393881453, 'f1': 0.4921940384906754}}, 'WMT': {'RandomWalk': {'acc': 0.5135908440629471, 'f1': 0.

In [11]:
# Load the combined metrics saved by earlier runs and turn the top-level keys
# (the prediction horizons) into ints in a single pass.
previous_metrics = {
    int(horizon): per_quotation
    for horizon, per_quotation in read_dict(f"{METRICS_PATH}full_metrics_dict.txt").items()
}
# Display the entry for predict_n == 1 as a sanity check.
previous_metrics[1]

{'AAL': {'DTree': {'acc': 0.5207439198855508, 'f1': 0.5342762796807659},
  'RandomForest': {'acc': 0.5007153075822603, 'f1': 0.5320477382211538},
  'SVM': {'acc': 0.47067238912732473, 'f1': 0.5630758763862446},
  'LR': {'acc': 0.47782546494992845, 'f1': 0.4817446940781028},
  'LinearNN': {'acc': 0.47639484978540775,
   'roc_auc': 0.5065847883989345,
   'f1': 0.3350949484127063},
  'LSTM': {'acc': 0.5329512893982808, 'f1': 0.37057547599282326},
  'RandomWalk': {'acc': 0.4975651675737611, 'f1': 0.4975651675737611}},
 'AAPL': {'DTree': {'acc': 0.48068669527896996, 'f1': 0.5666769203237425},
  'RandomForest': {'acc': 0.4563662374821173, 'f1': 0.6055698033952478},
  'SVM': {'acc': 0.44778254649499283, 'f1': 0.6185770750988142},
  'LR': {'acc': 0.44778254649499283, 'f1': 0.6185770750988142},
  'LinearNN': {'acc': 0.5364806866952789,
   'roc_auc': 0.489980797563277,
   'f1': 0.4167863715814626},
  'LSTM': {'acc': 0.5501432664756447, 'f1': 0.39048986012319326},
  'RandomWalk': {'acc': 0.497565

# One-off clean-up of the metrics loaded from disk: inside every quotation's entry,
# discard the stray 'acc' / 'f1' keys and any key equal to the quotation's own name,
# keeping every other entry untouched.
for predict_n, quot_metrics in previous_metrics.items():
    for quot, metrics in quot_metrics.items():
        stray_keys = ['acc', 'f1', quot]
        cleaned = {}
        for name, scores in metrics.items():
            if name in stray_keys:
                continue
            cleaned[name] = scores
        # Rebinding an existing key leaves the dict size unchanged, so it is safe
        # to do while iterating over quot_metrics.
        quot_metrics[quot] = cleaned

# Overwrite the combined metrics file with the cleaned structure.
save_dict(dict_save=previous_metrics, path=f"{METRICS_PATH}full_metrics_dict.txt")

In [12]:
from copy import deepcopy

# merge_metric_dicts is called for its in-place effect on its first argument.
# dict.copy() is shallow: the nested per-horizon / per-quotation dicts would still be
# shared with previous_metrics, so the merge would silently modify it as well and
# re-running this cell would merge into already-merged data. A deep copy keeps
# previous_metrics exactly as loaded.
full_metrics = deepcopy(previous_metrics)
merge_metric_dicts(full_metrics, predict_n_metrics_dict)

In [13]:
# Display the merged metrics for visual inspection.
full_metrics

{1: {'AAL': {'DTree': {'acc': 0.5207439198855508, 'f1': 0.5342762796807659},
   'RandomForest': {'acc': 0.5007153075822603, 'f1': 0.5320477382211538},
   'SVM': {'acc': 0.47067238912732473, 'f1': 0.5630758763862446},
   'LR': {'acc': 0.47782546494992845, 'f1': 0.4817446940781028},
   'LinearNN': {'acc': 0.47639484978540775,
    'roc_auc': 0.5065847883989345,
    'f1': 0.3350949484127063},
   'LSTM': {'acc': 0.5329512893982808, 'f1': 0.37057547599282326},
   'RandomWalk': {'acc': 0.45779685264663805, 'f1': 0.45795740731429774}},
  'AAPL': {'DTree': {'acc': 0.48068669527896996, 'f1': 0.5666769203237425},
   'RandomForest': {'acc': 0.4563662374821173, 'f1': 0.6055698033952478},
   'SVM': {'acc': 0.44778254649499283, 'f1': 0.6185770750988142},
   'LR': {'acc': 0.44778254649499283, 'f1': 0.6185770750988142},
   'LinearNN': {'acc': 0.5364806866952789,
    'roc_auc': 0.489980797563277,
    'f1': 0.4167863715814626},
   'LSTM': {'acc': 0.5501432664756447, 'f1': 0.39048986012319326},
   'Random

In [14]:
# Write the merged metrics back to the combined metrics file - the same path the
# previous metrics were read from, so that file is overwritten.
save_dict(full_metrics, f"{METRICS_PATH}full_metrics_dict.txt")

In [15]:
# For every prediction horizon, save one heatmap per metric
# (rows: quotations, columns: models) as "<predict_n>_<metric>.png" in METRICS_PATH.
for predict_n, quot_metrics in full_metrics.items():
    for metric_name in ('acc', 'f1'):
        # Build the quotation x model table directly. A model that is missing for a
        # quotation, or an entry that lacks this metric, becomes NaN instead of
        # raising while the value is being extracted.
        metric_df = pd.DataFrame({
            quot: {
                model: scores.get(metric_name, np.nan) if isinstance(scores, dict) else np.nan
                for model, scores in model_metrics.items()
            }
            for quot, model_metrics in quot_metrics.items()
        }).T

        # Explicit figure/axes so the figure that is saved and closed is exactly the
        # one that was drawn on, independent of pyplot's "current figure" state.
        fig, ax = plt.subplots()
        sns.heatmap(metric_df, cmap='mako', linewidths=0.5, annot=True, ax=ax)
        ax.set_title(f"{metric_name} - predict_n={predict_n}")
        fig.savefig(f"{METRICS_PATH}{predict_n}_{metric_name}.png")
        # Close each figure right away; the loop produces many of them.
        plt.close(fig)

# NOTE(review): `data_wrapper` and `random` are whatever the baseline-loop cell left
# behind (the wrapper of the last quotation/horizon processed there), so this cell
# only works after that cell has run. It also assumes the wrapper supports
# DataFrame-style column access and boolean filtering - confirm.
tendency_counts = data_wrapper['Tendency'].value_counts().to_dict()

price_walk = []
for direction, count in tendency_counts.items():
    # Rows whose tendency equals the direction currently being sampled.
    same_direction_rows = data_wrapper[data_wrapper['Tendency'] == direction]

    # Sample, with replacement, as many price differences as that direction occurs.
    sampled_diffs = np.random.choice(same_direction_rows['Difference'], size=count, replace=True)

    price_walk += [(direction, diff) for diff in sampled_diffs]

# Interleave the directions, which were appended one group at a time.
random.shuffle(price_walk)

print(price_walk[:10])