In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from notebook_config import setup_notebook
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import trange, tqdm
import seaborn as sns

# Apply the project's shared notebook setup (imported from notebook_config above;
# what it configures is defined outside this notebook).
setup_notebook()

In [2]:
import torch
import torch.nn as nn
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# To force CPU execution, set device_name = 'cpu' instead.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda


In [3]:
from helpers.data_helper import *
from datasets.stocks_data_wrapper import StocksDataWrapper

In [4]:
# Input files are addressed as f"{DATA_PATH}{ticker}{FILE_SUFFIX}" by the training loop.
DATA_PATH = "../../data/"
FILE_SUFFIX = ".txt"

# Name of the price column (not referenced by the cells visible in this notebook).
price_column = "Close"

In [None]:
# Where results are written; metrics are grouped by the number of target classes.
OUTPUT_PATH = 'results/'
N_CLASSES = 2
METRICS_PATH = f"{OUTPUT_PATH}metrics/{N_CLASSES}_classes/"

# Label used in the file name of this pipeline's saved metrics.
PIPELINE_LABEL = 'linear_nn'

# Threshold forwarded to StocksDataWrapper.read_from as `thresh_diff`; only set for
# the 3-class setup (presumably the band that defines the extra class — confirm).
# `==` compares the value; the previous `is 3` tested object identity, which only
# happened to work through CPython's small-int caching and warns on Python 3.8+.
THRESH_DIFF = 0.005 if N_CLASSES == 3 else None

### Prepare data

In [5]:
# Run switches and limits (not referenced by the cells visible in this notebook).
PERFORM_GRID_SEARCH = True
MAX_SIZE = 5000
n_splits = 5

# Tickers to train and evaluate; each maps to one input file.
quotations = [
    "AAL", "AAPL", "AMZN", "CMCSA", "COST", "GM", "GOOG",
    "IBM", "JNJ", "KO", "PEP", "TSLA", "WMT", "XOM",
]

# Feature columns to use for each prediction horizon (the key is passed as `predict_n`).
days_predict_feature_set = {
    1: ["Volume", "Close", "LowLen", "Difference", "BG_L_Band", "GAP", "MACD_diff"],
    5: ["Volume", "Close", "LowLen", "Difference", "SMA(20) - SMA(10)", "BG_H_Band_Indicator", "MACD"],
    10: ["Volume", "Close", "BodyLen", "Difference", "SMA(20) - SMA(10)", "EMA_Diff", "MACD_diff"],
    20: ["Volume", "Close", "LowLen", "Difference", "EMA(14)", "BG_H_Band", "MACD_diff"],
    50: ["Volume", "Close", "LowLen", "Difference", "BG_L_Band", "GAP", "MACD_diff"],
}

# Uncomment for a quick single-ticker / single-horizon debugging run:
# quotations = ['CMCSA']
# days_predict_feature_set = {1: days_predict_feature_set[1]}

In [6]:
# Accumulator filled by the training loop: prediction horizon -> ticker -> model label -> metrics.
predict_n_metrics_dict = dict()

In [7]:
from torch.utils.data import DataLoader
from datasets.torch_datasets import StocksDataset
from models.pytorch_linear_model import LinearModel
from trainers.pytorch_classification_trainer import PytorchClassificationTrainer

# Seed applied before every model is built so that weight initialisation (and hence
# the reported metrics) is repeatable on a fresh Restart & Run All.
# Some CUDA kernels may still be non-deterministic.
SEED = 42

# Train and evaluate one linear classifier per (prediction horizon, ticker) pair and
# collect the evaluation metrics into predict_n_metrics_dict.
for predict_n, features_list in tqdm(days_predict_feature_set.items(), desc="horizons"):
    quot_metrics_dict = {}
    for quot in tqdm(quotations, desc=f"predict_n={predict_n}"):
        # Load the ticker's file and let the wrapper compute and normalise features
        # for the current horizon.
        data_wrapper = StocksDataWrapper.read_from(
            f"{DATA_PATH}{quot}{FILE_SUFFIX}",
            compute_features=True,
            predict_n=predict_n,
            thresh_diff=THRESH_DIFF,
            normalize=True,
        )

        X_train, X_test, y_train, y_test = data_wrapper.get_datasets(
            n_splits=1,
            val_size=0.2,
            y_column='Next',
            features_list=features_list,
        )
        # NOTE(review): the output size is derived from the labels present in the
        # training split, not from the N_CLASSES setting — confirm they always agree.
        n_classes = len(np.unique(y_train))

        train_dataset = StocksDataset(X_train, y_train)
        test_dataset = StocksDataset(X_test, y_test)

        # shuffle=False keeps the samples in dataset order; drop_last=True discards
        # the final incomplete training batch.
        train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)
        test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

        config = dict(
            lr=1e-5,
            loss=nn.CrossEntropyLoss,
            n_epochs=100,
            optimizer=torch.optim.Adam,
        )

        # Re-seed per model so each run is reproducible regardless of which
        # tickers/horizons were trained before it.
        torch.manual_seed(SEED)
        model = LinearModel(config, input_dim=X_train.shape[1], output_dim=n_classes, device=device)

        trainer = PytorchClassificationTrainer(model, device=device, use_wandb=False, project_label="Pipeline")
        trainer.train(dataloader=train_dataloader)

        metrics = trainer.evaluate(test_dataloader)
        # Metrics are nested under a model label so they can be merged with the
        # results of other pipelines later on.
        quot_metrics_dict[quot] = {'LinearNN': metrics}

    # quot_metrics_dict is rebuilt on every outer iteration, so it can be stored directly.
    predict_n_metrics_dict[predict_n] = quot_metrics_dict

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [8]:
# Display the collected metrics: prediction horizon -> ticker -> model label -> metric values.
predict_n_metrics_dict

{1: {'AAL': {'LinearNN': {'acc': 0.47639484978540775,
    'roc_auc': 0.5065847883989345,
    'f1': 0.3350949484127063}},
  'AAPL': {'LinearNN': {'acc': 0.5364806866952789,
    'roc_auc': 0.489980797563277,
    'f1': 0.4167863715814626}},
  'AMZN': {'LinearNN': {'acc': 0.4520743919885551,
    'roc_auc': 0.5,
    'f1': 0.2814886854549427}},
  'CMCSA': {'LinearNN': {'acc': 0.470616570327553,
    'roc_auc': 0.5,
    'f1': 0.3012069369210739}},
  'COST': {'LinearNN': {'acc': 0.45987478656801367,
    'roc_auc': 0.499019469783501,
    'f1': 0.29406606744138064}},
  'GM': {'LinearNN': {'acc': 0.5129224652087475,
    'roc_auc': 0.49830685486423193,
    'f1': 0.3560544179760239}},
  'GOOG': {'LinearNN': {'acc': 0.5221745350500715,
    'roc_auc': 0.5192323548812665,
    'f1': 0.5223910235138527}},
  'IBM': {'LinearNN': {'acc': 0.4885135135135135,
    'roc_auc': 0.5016630116959065,
    'f1': 0.33269723499081294}},
  'JNJ': {'LinearNN': {'acc': 0.51931330472103,
    'roc_auc': 0.5298550391164288,
 

### Add to metrics

In [9]:
# Persist this pipeline's metrics under the metrics folder, named after PIPELINE_LABEL.
save_dict(
    dict_save=predict_n_metrics_dict,
    path=f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt",
)

In [10]:
# Load the metrics stored in linear_metrics_dict.txt (written outside this notebook).
loaded_metrics = read_dict(f"{METRICS_PATH}linear_metrics_dict.txt")

# Normalise the horizon keys to int so they line up with predict_n_metrics_dict
# (presumably the stored file yields string keys — confirm).
previous_metrics = {int(horizon): per_ticker for horizon, per_ticker in loaded_metrics.items()}

# Spot-check the entries for the 1-day horizon.
previous_metrics[1]

In [11]:
import copy

# Deep-copy before merging: a shallow .copy() would share the nested per-ticker
# dicts with previous_metrics, so an in-place merge would also modify
# previous_metrics and re-running this cell would merge into already-merged data.
full_metrics = copy.deepcopy(previous_metrics)

# The return value is not used, so merge_metric_dicts is expected to update
# full_metrics in place — confirm against helpers.data_helper.
merge_metric_dicts(full_metrics, predict_n_metrics_dict)

save_dict(full_metrics, f"{METRICS_PATH}full_metrics_dict.txt")

In [12]:
# NOTE(review): `pd` is never imported explicitly in the visible notebook; presumably
# it arrives through `from helpers.data_helper import *` — confirm.
def save_metric_heatmap(quot_metrics, metric_key, out_path, title=None):
    """Save a ticker x model heatmap of one metric and return the plotted frame.

    Parameters
    ----------
    quot_metrics : dict
        Mapping ticker -> model label -> metrics dict.
    metric_key : str
        Key to extract from every metrics dict (e.g. 'acc' or 'f1').
    out_path : str
        File path the figure is written to.
    title : str, optional
        Title drawn above the heatmap.

    Returns
    -------
    pandas.DataFrame
        Rows are tickers, columns are model labels, values are the selected metric.
        A ticker that lacks a model label yields NaN instead of raising.
    """
    metric_df = pd.DataFrame.from_dict(
        {
            quot: {label: scores[metric_key] for label, scores in label_metrics.items()}
            for quot, label_metrics in quot_metrics.items()
        },
        orient='index',
    )

    # Explicit figure/axes instead of the pyplot state machine, closed right after
    # saving so figures do not pile up in memory across the loop.
    fig, ax = plt.subplots()
    sns.heatmap(metric_df, cmap='mako', linewidths=0.5, annot=True, ax=ax)
    if title is not None:
        ax.set_title(title)
    fig.savefig(out_path)
    plt.close(fig)
    return metric_df


# One accuracy heatmap and one F1 heatmap per prediction horizon.
for predict_n, quot_metrics in full_metrics.items():
    acc_df = save_metric_heatmap(
        quot_metrics, 'acc', f"{METRICS_PATH}{predict_n}_acc.png",
        title=f"Accuracy (predict_n={predict_n})",
    )
    f1_df = save_metric_heatmap(
        quot_metrics, 'f1', f"{METRICS_PATH}{predict_n}_f1.png",
        title=f"F1 score (predict_n={predict_n})",
    )