# PyTorch LinearModel Pipeline

## Setup

In [2]:
%load_ext autoreload
%autoreload 2>

import copy

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm.notebook import trange, tqdm

from notebook_config import setup_notebook

# Project-specific notebook initialisation imported from notebook_config.
# NOTE(review): its effects are not visible in this notebook — presumably it prepares
# import paths and display defaults; confirm in notebook_config.
setup_notebook()

In [3]:
import torch
import torch.nn as nn
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
#device = torch.device('cpu')
print(device)

cuda


In [4]:
from helpers.data_helper import *
from datasets.stocks_data_wrapper import StocksDataWrapper

In [5]:
# Input files are addressed as f"{DATA_PATH}{ticker}{FILE_SUFFIX}" by the training loop.
DATA_PATH, FILE_SUFFIX = '../../data/', '.txt'
# Name of the price column in the input data.
price_column = 'Close'

In [6]:
# Root folder for everything this notebook writes.
OUTPUT_PATH = 'results/'
N_CLASSES = 2
# File-name prefix for the metrics produced by this pipeline.
PIPELINE_LABEL = 'linear_nn'

# This notebook runs the regression pipeline, so metrics are stored in the regression folder.
# (The classification layout, previously assigned and immediately overwritten, is
#  OUTPUT_PATH + "metrics/<N_CLASSES>_classes/".)
METRICS_PATH = f"{OUTPUT_PATH}metrics/regression/"

# Compare by value: `is` tests object identity and must not be used with int literals.
THRESH_DIFF = 0.005 if N_CLASSES == 3 else None

### Prepare data

In [6]:
# Run-level settings.
PERFORM_GRID_SEARCH = True
MAX_SIZE = 5000
n_splits = 5

# Tickers to process; the training loop reads one input file per ticker.
quotations = [
    'AAL', 'AAPL', 'AMZN', 'CMCSA', 'COST', 'GM', 'GOOG',
    'IBM', 'JNJ', 'KO', 'PEP', 'TSLA', 'WMT', 'XOM',
]

# predict_n (passed to StocksDataWrapper.read_from) -> feature columns used for that setting.
days_predict_feature_set = {
    1: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'BG_L_Band', 'GAP', 'MACD_diff',
    ],
    5: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'SMA(20) - SMA(10)', 'BG_H_Band_Indicator', 'MACD',
    ],
    10: [
        'Volume', 'Close', 'BodyLen', 'Difference',
        'SMA(20) - SMA(10)', 'EMA_Diff', 'MACD_diff',
    ],
    20: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'EMA(14)', 'BG_H_Band', 'MACD_diff',
    ],
    50: [
        'Volume', 'Close', 'LowLen', 'Difference',
        'BG_L_Band', 'GAP', 'MACD_diff',
    ],
}

# Shortcuts for a quick single-ticker / single-setting run:
#quotations=['CMCSA']
#days_predict_feature_set = {1:days_predict_feature_set[1]}

In [7]:
# Accumulator filled by the training loop: predict_n -> ticker -> model label -> metrics.
predict_n_metrics_dict = dict()

In [8]:
from torch.utils.data import DataLoader
from datasets.torch_datasets import StocksDataset
from models.pytorch_linear_model import LinearModel
from trainers.pytorch_classification_trainer import PytorchClassificationTrainer
from trainers.pytorch_regression_trainer import PytorchRegressionTrainer

# Train and evaluate one LinearModel regressor for every (predict_n, ticker) pair and collect
# the test metrics as predict_n -> ticker -> {'LinearNN': metrics}.
for predict_n, features_list in tqdm(days_predict_feature_set.items(), desc='predict_n'):
    quot_metrics_dict = {}
    for quot in tqdm(quotations, desc=f'predict_n={predict_n}'):
        data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}{quot}{FILE_SUFFIX}",
                                                   compute_features=True,
                                                   predict_n=predict_n,
                                                   thresh_diff=THRESH_DIFF, normalize=True)

        # Regression target is the 'NextPrice' column, restricted to this setting's features.
        X_train, X_test, y_train, y_test = data_wrapper.get_datasets(n_splits=1, val_size=0.2,
                                                                     y_column='NextPrice',
                                                                     features_list=features_list)

        train_dataset = StocksDataset(X_train, y_train)
        test_dataset = StocksDataset(X_test, y_test)

        # shuffle=False keeps both loaders in dataset order; drop_last=True discards a final
        # incomplete training batch.
        train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)
        test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

        # Classification variant, kept for reference only (not executed):
        #   config = dict(lr=1e-5, loss=nn.CrossEntropyLoss, n_epochs=100, optimizer=torch.optim.Adam)
        #   n_classes = len(np.unique(y_train))
        #   model = LinearModel(config, input_dim=X_train.shape[1], output_dim=n_classes, device=device)
        #   trainer = PytorchClassificationTrainer(model, device=device, use_wandb=False, project_label="Pipeline")

        config = dict(
            lr=1e-5,
            loss=nn.MSELoss,
            n_epochs=100,
            optimizer=torch.optim.Adam,
        )
        # Single output unit: the model predicts one continuous value per sample.
        model = LinearModel(config, input_dim=X_train.shape[1], output_dim=1, device=device)
        trainer = PytorchRegressionTrainer(model, device=device, use_wandb=False, project_label="Pipeline")
        trainer.train(dataloader=train_dataloader)

        metrics = trainer.evaluate(test_dataloader)
        # Convert to built-in floats: numpy float32 values are not JSON-serializable.
        metrics = {k: float(m) for k, m in metrics.items()}
        # Nest under a model label so metric dicts from several methods (ML, MLP, LSTM) can be merged.
        quot_metrics_dict[quot] = {'LinearNN': metrics}

    # Classification-only CSV export, kept for reference only (not executed):
    #   metrics_df = pd.DataFrame.from_dict(quot_metrics_dict).T
    #   acc_df = metrics_df.applymap(lambda metrics: metrics['accuracy'])
    #   f1_df = metrics_df.applymap(lambda metrics: metrics['f1_score'])
    #   acc_df.to_csv(f"metrics/acc_{predict_n}.csv", index=True)
    #   f1_df.to_csv(f"metrics/f1_{predict_n}.csv", index=True)

    # A fresh dict is built on every outer iteration, so it can be stored directly.
    predict_n_metrics_dict[predict_n] = quot_metrics_dict

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [12]:
# Display the metrics collected by the training loop
# (predict_n -> ticker -> model label -> {'mse', 'mae', 'mape'}).
predict_n_metrics_dict

{1: {'AAL': {'LinearNN': {'mse': 0.17441429197788239,
    'mae': 0.3135688900947571,
    'mape': 1.242722988128662}},
  'AAPL': {'LinearNN': {'mse': 0.76552814245224,
    'mae': 0.6214923858642578,
    'mape': 1.066606879234314}},
  'AMZN': {'LinearNN': {'mse': 0.022700581699609756,
    'mae': 0.11364807188510895,
    'mape': 0.1938926875591278}},
  'CMCSA': {'LinearNN': {'mse': 0.49718743562698364,
    'mae': 0.4886718988418579,
    'mape': 0.7041550278663635}},
  'COST': {'LinearNN': {'mse': 0.05603839457035065,
    'mae': 0.18956239521503448,
    'mape': 0.32471588253974915}},
  'GM': {'LinearNN': {'mse': 0.06297202408313751,
    'mae': 0.19711221754550934,
    'mape': 6950458228736.0}},
  'GOOG': {'LinearNN': {'mse': 0.04310254380106926,
    'mae': 0.16003645956516266,
    'mape': 0.23966863751411438}},
  'IBM': {'LinearNN': {'mse': 1.2479772567749023,
    'mae': 0.9533796906471252,
    'mape': 1.2415010929107666}},
  'JNJ': {'LinearNN': {'mse': 0.18543632328510284,
    'mae': 0.41

### Add to metrics

# Sanity check: print every stored metric next to its float() conversion and that value's type.
for predict_n, quot_metrics in predict_n_metrics_dict.items():
    for quot, metrics_dict in quot_metrics.items():
        # Flatten the per-label metric dicts of this ticker into a single stream of values.
        metric_values = (m for metric in metrics_dict.values() for m in metric.values())
        for m in metric_values:
            as_float = float(m)
            print(f"{m} -> {as_float} : {type(as_float)}")
        #print(f"{quot} -> {type(m) for m in list(metrics.values())}")

In [13]:
# Persist this pipeline's metrics under METRICS_PATH, in a file named after PIPELINE_LABEL.
save_dict(
    dict_save=predict_n_metrics_dict,
    path=f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt",
)

In [15]:
# Load the metrics stored in linear_regs_dict.txt and key them by integer predict_n
# (presumably the saved file holds the keys as strings — confirm against save_dict/read_dict).
previous_metrics = {
    int(horizon): per_ticker
    for horizon, per_ticker in read_dict(f"{METRICS_PATH}linear_regs_dict.txt").items()
}

# Show the entries stored for predict_n = 1.
previous_metrics[1]

{'AAL': {'DTree': {'mse': 0.00040002611775352936,
   'mae': 0.01517712473316475,
   'mape': 0.03865700863537383},
  'RandomForest': {'mse': 0.00026469454356206737,
   'mae': 0.011830471121597129,
   'mape': 0.0317035815048061},
  'SVM': {'mse': 0.0012326616718660055,
   'mae': 0.027130217484537124,
   'mape': 0.08522520213496947},
  'LR': {'mse': 0.0002459189601130199,
   'mae': 0.011412489432978266,
   'mape': 0.03143306038231758}},
 'AAPL': {'DTree': {'mse': 0.0639864274135731,
   'mae': 0.1675341345697533,
   'mape': 0.5330979677282099},
  'RandomForest': {'mse': 0.06479528918158887,
   'mae': 0.1696178740919568,
   'mape': 0.5438718114409096},
  'SVM': {'mse': 0.13882785240228357,
   'mae': 0.31834349124692446,
   'mape': 1.9994437529407039},
  'LR': {'mse': 0.0001616190201255988,
   'mae': 0.007981748225476632,
   'mape': 0.016132374904576908}},
 'AMZN': {'DTree': {'mse': 0.04761110137745437,
   'mae': 0.14974067628169763,
   'mape': 0.34008283427615543},
  'RandomForest': {'mse':

In [16]:
# Merge on an independent deep copy: a shallow .copy() shares the nested per-ticker dicts with
# previous_metrics, so a merge that updates those dicts in place would also add this pipeline's
# entries to previous_metrics.
full_metrics = copy.deepcopy(previous_metrics)
merge_metric_dicts(full_metrics, predict_n_metrics_dict)

# Store the combined metrics of all methods next to the individual metric files.
save_dict(full_metrics, f"{METRICS_PATH}full_metrics_dict.txt")

In [8]:
# Reload the merged metrics from disk and key them by integer predict_n
# (presumably the saved file holds the keys as strings — confirm against save_dict/read_dict).
full_metrics = {
    int(horizon): per_ticker
    for horizon, per_ticker in read_dict(f"{METRICS_PATH}full_metrics_dict.txt").items()
}

# Show the entries stored for predict_n = 1.
full_metrics[1]

{'AAL': {'DTree': {'mse': 0.00040002611775352936,
   'mae': 0.01517712473316475,
   'mape': 0.03865700863537383},
  'RandomForest': {'mse': 0.00026469454356206737,
   'mae': 0.011830471121597129,
   'mape': 0.0317035815048061},
  'SVM': {'mse': 0.0012326616718660055,
   'mae': 0.027130217484537124,
   'mape': 0.08522520213496947},
  'LR': {'mse': 0.0002459189601130199,
   'mae': 0.011412489432978266,
   'mape': 0.03143306038231758},
  'LinearNN': {'mse': 0.17441429197788239,
   'mae': 0.3135688900947571,
   'mape': 1.242722988128662}},
 'AAPL': {'DTree': {'mse': 0.0639864274135731,
   'mae': 0.1675341345697533,
   'mape': 0.5330979677282099},
  'RandomForest': {'mse': 0.06479528918158887,
   'mae': 0.1696178740919568,
   'mape': 0.5438718114409096},
  'SVM': {'mse': 0.13882785240228357,
   'mae': 0.31834349124692446,
   'mape': 1.9994437529407039},
  'LR': {'mse': 0.0001616190201255988,
   'mae': 0.007981748225476632,
   'mape': 0.016132374904576908},
  'LinearNN': {'mse': 0.7655281424

In [9]:
from helpers.data_helper import save_predictions_heatmaps

# Pass the merged metrics to the project helper for the three regression metrics.
# NOTE(review): the meaning of `reversed=True` is defined in helpers.data_helper — confirm there.
save_predictions_heatmaps(
    path=METRICS_PATH,
    metrics_dict=full_metrics,
    metrics_names_list=['mse', 'mae', 'mape'],
    reversed=True,
)

# Save one heatmap per classification metric ('acc', 'f1') and predict_n value.
# The merged dictionary of a regression run only carries 'mse'/'mae'/'mape', in which case
# indexing metrics['acc'] raised KeyError; a metric with no values is now reported and skipped.
for predict_n, quot_metrics in full_metrics.items():
    for metric_name in ('acc', 'f1'):
        # Rows are tickers, columns are model labels; NaN marks a missing value.
        metric_df = pd.DataFrame({
            quot: {
                label: metrics.get(metric_name, float('nan'))
                for label, metrics in label_metrics.items()
            }
            for quot, label_metrics in quot_metrics.items()
        }).T.astype(float)

        if not metric_df.notna().any().any():
            print(f"No '{metric_name}' values for predict_n={predict_n}; heatmap skipped.")
            continue

        # Explicit figure/axes so each heatmap is drawn, saved and closed independently.
        fig, ax = plt.subplots()
        sns.heatmap(metric_df, cmap='mako', linewidths=0.5, annot=True, ax=ax)
        fig.savefig(f"{METRICS_PATH}{predict_n}_{metric_name}.png")
        plt.close(fig)