# Random walk stock tendency prediction

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

from notebook_config import setup_notebook
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

setup_notebook()

In [2]:
from tqdm.notebook import trange, tqdm
from datasets.stocks_data_wrapper import StocksDataWrapper
from helpers.data_helper import *

In [3]:
DATA_PATH = '../../data/'
quotation = 'AMZN'
FILE_SUFFIX = '.txt'
price_column = 'Close'

In [5]:
OUTPUT_PATH = 'results/'
N_CLASSES = 2
METRICS_PATH = f"{OUTPUT_PATH}metrics/{N_CLASSES}_classes/"
METRICS_PATH = f"{OUTPUT_PATH}metrics/regression/"
PIPELINE_LABEL = 'random_walk'

### Data preparation

In [6]:
quotations = ['AAL', 'AAPL', 'AMZN','CMCSA', 'COST', 'GM', 'GOOG', 'IBM', 'JNJ', 'KO','PEP', 'TSLA', 'WMT', 'XOM']
days_predict_feature_set = {
    1:['Volume', 'Close', 'LowLen', 'Difference', 'BG_L_Band', 'GAP', 'MACD_diff'],
    5:['Volume', 'Close', 'LowLen', 'Difference', 'SMA(20) - SMA(10)', 'BG_H_Band_Indicator', 'MACD'],
    10:['Volume', 'Close', 'BodyLen', 'Difference', 'SMA(20) - SMA(10)', 'EMA_Diff', 'MACD_diff'],
    20:['Volume', 'Close', 'LowLen', 'Difference', 'EMA(14)', 'BG_H_Band', 'MACD_diff'],
    50:['Volume', 'Close', 'LowLen', 'Difference', 'BG_L_Band','GAP','MACD_diff'],
}

In [7]:
predict_n_metrics_dict = {}

In [10]:
data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}{quotation}{FILE_SUFFIX}", compute_features=True, 
                                           predict_n=5, thresh_diff=THRESH_DIFF, normalize=True)

In [7]:
import random
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

for predict_n, features_list in tqdm(days_predict_feature_set.items()):
    quot_metrics_dict = {}
    for quot in tqdm(quotations):
        data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}{quot}{FILE_SUFFIX}", compute_features=True, 
                                                   predict_n=predict_n, thresh_diff=THRESH_DIFF, normalize=True)
        features_list = ['Tendency']
        X_train, X_test, y_train, y_test = data_wrapper.get_datasets(n_splits=1, val_size=0.2, 
                                                                     y_column='Next', 
                                                                     features_list=features_list)
        
        
        unique_elements, counts_elements = np.unique(X_train, return_counts=True)
        # compute element probabilities
        element_freq = [v /  sum(counts_elements) for v in counts_elements]
        
        # choose n = len(X_test) elements according to the prior computed probability
        tendency_walk = np.random.choice(unique_elements, size=len(X_test), p=element_freq)
        
        metrics= {'acc':accuracy_score(tendency_walk, y_test), 
                  'f1':f1_score(tendency_walk, y_test, average='weighted')
        }
        
        quot_metrics_dict[quot] = {'RandomWalk':metrics}
        
        
    predict_n_metrics_dict[predict_n] = quot_metrics_dict.copy()

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [11]:
predict_n_metrics_dict = {}

In [20]:
import random
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

for predict_n, features_list in tqdm(days_predict_feature_set.items()):
    quot_metrics_dict = {}
    for quot in tqdm(quotations):
        data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}{quot}{FILE_SUFFIX}", compute_features=True, 
                                                   predict_n=predict_n, thresh_diff=THRESH_DIFF, normalize=True)
        features_list = ['Close']
        X_train, X_test, y_train, y_test = data_wrapper.get_datasets(n_splits=1, val_size=0.2, 
                                                                     y_column='NextPrice', 
                                                                     features_list=features_list)
        
        # choose n = len(X_test) elements
        prices_walk = np.random.choice(X_train.squeeze(), size=len(X_test))
        
        metrics= {'mse':mean_squared_error(prices_walk, y_test), 
                  'mae':mean_absolute_error(prices_walk, y_test), 
                  'mape':mean_absolute_percentage_error(prices_walk, y_test),
        }

        quot_metrics_dict[quot] = {'RandomWalk':metrics}
        
    predict_n_metrics_dict[predict_n] = quot_metrics_dict.copy()

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [21]:
predict_n_metrics_dict

{1: {'AAL': {'RandomWalk': {'mse': 0.13232223208556113,
    'mae': 0.30531158602995084,
    'mape': 3.4475889450512174}},
  'AAPL': {'RandomWalk': {'mse': 0.17597319389388721,
    'mae': 0.3631381314370611,
    'mape': 15.789164834942758}},
  'AMZN': {'RandomWalk': {'mse': 0.28456004130557067,
    'mae': 0.49774331855896675,
    'mape': 24.007528575355074}},
  'CMCSA': {'RandomWalk': {'mse': 0.28392384554024597,
    'mae': 0.5050117089878567,
    'mape': 1253554709750.9727}},
  'COST': {'RandomWalk': {'mse': 0.25164144369539343,
    'mae': 0.4542527340675398,
    'mape': 2811482005801.083}},
  'GM': {'RandomWalk': {'mse': 0.08179163483681472,
    'mae': 0.228303354345853,
    'mape': 0.6913770700215232}},
  'GOOG': {'RandomWalk': {'mse': 0.2706730085275981,
    'mae': 0.4864307435668934,
    'mape': 4068657643002.2026}},
  'IBM': {'RandomWalk': {'mse': 0.3562452065402905,
    'mae': 0.5604074106443208,
    'mape': 13.322117100431615}},
  'JNJ': {'RandomWalk': {'mse': 0.2935266804192242

In [23]:
print(f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt")

results/metrics/regression/random_walk_dict.txt


In [24]:
save_dict(dict_save=predict_n_metrics_dict, path=f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt")

In [25]:
print(predict_n_metrics_dict[1])

{'AAL': {'RandomWalk': {'mse': 0.13232223208556113, 'mae': 0.30531158602995084, 'mape': 3.4475889450512174}}, 'AAPL': {'RandomWalk': {'mse': 0.17597319389388721, 'mae': 0.3631381314370611, 'mape': 15.789164834942758}}, 'AMZN': {'RandomWalk': {'mse': 0.28456004130557067, 'mae': 0.49774331855896675, 'mape': 24.007528575355074}}, 'CMCSA': {'RandomWalk': {'mse': 0.28392384554024597, 'mae': 0.5050117089878567, 'mape': 1253554709750.9727}}, 'COST': {'RandomWalk': {'mse': 0.25164144369539343, 'mae': 0.4542527340675398, 'mape': 2811482005801.083}}, 'GM': {'RandomWalk': {'mse': 0.08179163483681472, 'mae': 0.228303354345853, 'mape': 0.6913770700215232}}, 'GOOG': {'RandomWalk': {'mse': 0.2706730085275981, 'mae': 0.4864307435668934, 'mape': 4068657643002.2026}}, 'IBM': {'RandomWalk': {'mse': 0.3562452065402905, 'mae': 0.5604074106443208, 'mape': 13.322117100431615}}, 'JNJ': {'RandomWalk': {'mse': 0.2935266804192242, 'mae': 0.49201338246557486, 'mape': 2.7935365756520083}}, 'KO': {'RandomWalk': {'m

In [26]:
previous_metrics = read_dict(f"{METRICS_PATH}full_metrics_dict.txt")

previous_metrics = {int(k):v for k,v in previous_metrics.items()}
previous_metrics[1]

{'AAL': {'DTree': {'mse': 0.00040002611775352936,
   'mae': 0.01517712473316475,
   'mape': 0.03865700863537383},
  'RandomForest': {'mse': 0.00026469454356206737,
   'mae': 0.011830471121597129,
   'mape': 0.0317035815048061},
  'SVM': {'mse': 0.0012326616718660055,
   'mae': 0.027130217484537124,
   'mape': 0.08522520213496947},
  'LR': {'mse': 0.0002459189601130199,
   'mae': 0.011412489432978266,
   'mape': 0.03143306038231758},
  'LinearNN': {'mse': 0.17441429197788239,
   'mae': 0.3135688900947571,
   'mape': 1.242722988128662},
  'LSTM': {'mse': 0.005345122227051616,
   'mae': 0.05257157798733653,
   'mape': 0.21990598038471207}},
 'AAPL': {'DTree': {'mse': 0.0639864274135731,
   'mae': 0.1675341345697533,
   'mape': 0.5330979677282099},
  'RandomForest': {'mse': 0.06479528918158887,
   'mae': 0.1696178740919568,
   'mape': 0.5438718114409096},
  'SVM': {'mse': 0.13882785240228357,
   'mae': 0.31834349124692446,
   'mape': 1.9994437529407039},
  'LR': {'mse': 0.00016161902012559

In [44]:
data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}WMT{FILE_SUFFIX}", compute_features=True, 
                                               predict_n=50, thresh_diff=THRESH_DIFF, normalize=True)

data_wrapper['Close'].describe()

count    3442.000000
mean        0.290156
std         0.208339
min         0.000000
25%         0.113462
50%         0.273361
75%         0.368276
max         1.000000
Name: Close, dtype: float64

for predict_n, quot_metrics in previous_metrics.items():
    for quot, metrics in quot_metrics.items():
        #print(metrics.keys())
        quot_metrics[quot] = {k:v for k,v in metrics.items() if k not in ['acc', 'f1', quot]}
        #metrics[quot] = {k:v for k,v in metrics.items() if k not in ['acc', 'f1', quot]}
        
save_dict(dict_save=previous_metrics, path=f"{METRICS_PATH}full_metrics_dict.txt")

In [27]:
full_metrics = previous_metrics.copy()
merge_metric_dicts(full_metrics, predict_n_metrics_dict)

In [29]:
full_metrics[1]['AAL']

{'DTree': {'mse': 0.00040002611775352936,
  'mae': 0.01517712473316475,
  'mape': 0.03865700863537383},
 'RandomForest': {'mse': 0.00026469454356206737,
  'mae': 0.011830471121597129,
  'mape': 0.0317035815048061},
 'SVM': {'mse': 0.0012326616718660055,
  'mae': 0.027130217484537124,
  'mape': 0.08522520213496947},
 'LR': {'mse': 0.0002459189601130199,
  'mae': 0.011412489432978266,
  'mape': 0.03143306038231758},
 'LinearNN': {'mse': 0.17441429197788239,
  'mae': 0.3135688900947571,
  'mape': 1.242722988128662},
 'LSTM': {'mse': 0.005345122227051616,
  'mae': 0.05257157798733653,
  'mape': 0.21990598038471207},
 'RandomWalk': {'mse': 0.13232223208556113,
  'mae': 0.30531158602995084,
  'mape': 3.4475889450512174}}

In [30]:
save_dict(full_metrics, f"{METRICS_PATH}full_metrics_dict.txt")

In [32]:
from helpers.data_helper import save_predictions_heatmaps

save_predictions_heatmaps(path=METRICS_PATH, metrics_dict=full_metrics, metrics_names_list=['mse', 'mae', 'mape'], reversed=True)

price_walk = []
for direction, count in data_wrapper['Tendency'].value_counts().to_dict().items():
    # filter the dataset by the current direction chosen
    df_filtered_direction = data_wrapper[data_wrapper['Tendency'] == direction]
    
    # pick at random some prices according to the current price direction chosen
    price_diffs = np.random.choice(df_filtered_direction['Difference'], size=count, replace=True)
    
    price_walk.extend([(direction, diff) for diff in price_diffs])
    
random.shuffle(price_walk)

print(price_walk[:10])