In [35]:
import os
import re
from datetime import date, datetime, timedelta
from importlib import reload
from pathlib import Path
from time import time
from typing import Annotated

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import pytz
import talib
import torch
import torch.nn as nn
import torchmetrics
import vectorbt as vbt
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchviz import make_dot

import get_data
from tools import dataframe_reformat, inspect_code, plotting, training, wandb_api

log_wandb = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {"num_workers": 2, "pin_memory": True} if use_cuda else {"num_workers": 4}
print(f"[INFO]: Computation device: {device}")


[INFO]: Computation device: cpu


In [36]:
if log_wandb:
    import wandb

    wandb_api.login()
    run = wandb.init(
        project="binance",
        group="Initial GAN",
        job_type="test",
    )







VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
close_generator,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
d_loss,█████████████████████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▂▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
fake_discriminator,█████████████████▇▇▇▇▇▇▇▇▇▆▆▆▅▅▅▅▅▄▄▄▃▃▁
fake_generator,█████████████████▇▇▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▁
g_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆█
real_discriminator,█████████████████▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▆▄▄▄▄▃▃▁
sign_generator,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_close_generator,█▃▁▂▁▁▁▁▁▁▁▁▂▂▃▃▁▁▂▁▅▅▁▂▂▂▂▂▄▂▂▂▂▂▃▂▂▂▂▂

0,1
close_generator,0.01797
d_loss,-79520368.0
epoch,61.0
fake_discriminator,-422543616.0
fake_generator,-430317664.0
g_loss,430317664.0
real_discriminator,-343023392.0
sign_generator,0.11508
trainer/global_step,31123.0
val_close_generator,0.01582


In [37]:
if log_wandb:
    config = wandb.config
else:
    config = {}


config["job_type"] = run.job_type if "run" in locals() else "test"
config["log_wandb"] = log_wandb
config["train_test_split"] = 0.7
config["nb_previous_close"] = 20
config["batch_size"] = 16
config["learning_rate_generator"] = 0.0001
config["learning_rate_discriminator"] = 0.0004
config["beta1"] = 0.5
config["beta2"] = 0.9


In [38]:
class DataModule(pl.LightningDataModule):
    def __init__(
        self,
        config,
        csv_file=None,
        train_df=None,
        test_df=None,
        train_dataset=None,
        validation_dataset=None,
    ):
        super().__init__()
        self.config = config

        if csv_file is not None:
            self.df = pd.read_csv(csv_file, delimiter=";")
            self.df["BEGINNING_DATE"] = pd.to_datetime(
                self.df["BEGINNING_DATE"], dayfirst=True
            )
            self.df["ENDING_DATE"] = pd.to_datetime(
                self.df["ENDING_DATE"], dayfirst=True
            )
            self.df["TICKER"] += "-USD"

        self.train_df = train_df.convert_dtypes() if train_df is not None else None
        self.test_df = test_df.convert_dtypes() if test_df is not None else None
        self.train_dataset = train_dataset
        self.validation_dataset = validation_dataset

    def preprocess_klines(
        self,
        data=None,
        ticker=None,
        beginning_date=None,
        ending_date=None,
        interval="1d",
    ):
        if data is None:
            data = get_data.select_data(
                ticker,
                interval,
                beginning_date=beginning_date,
                ending_date=ending_date,
            )
        data.dropna(axis=0, inplace=True)
        data.drop(labels="Date", axis=1, inplace=True)
        data.replace(
            to_replace=[np.inf, -np.inf, np.float64("inf"), -np.float64("inf")],
            value=0,
            inplace=True,
        )
        idx_close = list(data.columns).index("Close")
        scaler = MinMaxScaler(feature_range=(-1, 1))
        klines = torch.FloatTensor(scaler.fit_transform(data))

        data_close = klines[:, idx_close]
        single_close = torch.FloatTensor(data_close[self.config["nb_previous_close"] :])
        multiple_close = torch.stack(
            [
                torch.FloatTensor(data_close[i : i + self.config["nb_previous_close"]])
                for i in range(len(data_close) - self.config["nb_previous_close"])
            ]
        )
        multiple_klines = torch.stack(
            [
                klines[i : i + self.config["nb_previous_close"], :]
                for i in range(len(klines) - self.config["nb_previous_close"])
            ]
        )
        return multiple_klines, single_close, multiple_close

    def prepare_data(self):
        for _, row in self.df.iterrows():
            _ = get_data.select_data(
                row["TICKER"],
                "1d",
                beginning_date=row["BEGINNING_DATE"],
                ending_date=row["ENDING_DATE"],
            )

    def setup(self, stage):
        klines_training_sets = []
        single_close_training_sets = []
        multiple_close_training_sets = []
        klines_validation_sets = []
        single_close_validation_sets = []
        multiple_close_validation_sets = []
        for _, row in self.df.iterrows():
            klines, single_close, multiple_close = self.preprocess_klines(
                ticker=row["TICKER"],
                beginning_date=row["BEGINNING_DATE"],
                ending_date=row["ENDING_DATE"],
            )
            n = len(klines)
            klines_training_sets.append(
                klines[: int(n * self.config["train_test_split"]), :, :]
            )
            single_close_training_sets.append(
                single_close[: int(n * self.config["train_test_split"])]
            )
            multiple_close_training_sets.append(
                multiple_close[: int(n * self.config["train_test_split"])]
            )

            klines_validation_sets.append(
                klines[int(n * self.config["train_test_split"]) :, :, :]
            )
            single_close_validation_sets.append(
                single_close[int(n * self.config["train_test_split"]) :]
            )
            multiple_close_validation_sets.append(
                multiple_close[int(n * self.config["train_test_split"]) :]
            )
        assert len(klines_training_sets) == len(single_close_validation_sets)
        assert len(klines_training_sets) == len(multiple_close_training_sets)
        self.klines_training_sets = torch.cat(klines_training_sets)
        self.single_close_training_sets = torch.cat(
            single_close_training_sets
        ).unsqueeze(-1)
        self.multiple_close_training_sets = torch.cat(multiple_close_training_sets)

        self.klines_validation_sets = torch.cat(klines_validation_sets)
        self.single_close_validation_sets = torch.cat(
            single_close_validation_sets
        ).unsqueeze(-1)
        self.multiple_close_validation_sets = torch.cat(multiple_close_validation_sets)

        self.train_dataset = TensorDataset(
            self.klines_training_sets,
            self.multiple_close_training_sets,
            self.single_close_training_sets,
        )
        self.validation_dataset = TensorDataset(
            self.klines_validation_sets,
            self.multiple_close_validation_sets,
            self.single_close_validation_sets,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.config["batch_size"],
            shuffle=True,
            **kwargs,
        )

    def val_dataloader(self):
        return DataLoader(
            self.validation_dataset,
            batch_size=self.config["batch_size"],
            shuffle=False,
            **kwargs,
        )

    def test_dataloader(self):
        return self.val_dataloader()

    def predict_dataloader(self):
        return DataLoader(
            [image for image, _ in self.validation_dataset],
            batch_size=self.config["batch_size"],
            shuffle=False,
            **kwargs,
        )


dm = DataModule(config, "DATE.csv")
dm.prepare_data()
dm.setup(stage="fit")
train_dataloader = dm.train_dataloader()
val_dataloader = dm.val_dataloader()


In [39]:
print(len(train_dataloader) * config["batch_size"])
print(len(val_dataloader) * config["batch_size"])
klines, single_close, multiple_close = next(iter(train_dataloader))
print(klines.shape, single_close.shape, multiple_close.shape)


8032
3472
torch.Size([16, 20, 29]) torch.Size([16, 20]) torch.Size([16, 1])


In [40]:
class Generator(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.conv_layer = nn.Sequential(
            nn.Conv1d(config["nb_previous_close"], 32, kernel_size=2),
            nn.LeakyReLU(),
        )
        self.lstm_layer = nn.LSTM(
            28, 64, num_layers=1, batch_first=True, bidirectional=True, dropout=0.3
        )
        self.fc_layers = nn.Sequential(
            nn.ReLU(),
            # nn.Flatten(),
            nn.Linear(in_features=2 * 64, out_features=64),
            nn.LeakyReLU(),
            # nn.Dropout2d(0.2),
            nn.Linear(in_features=64, out_features=32),
            nn.LeakyReLU(),
            # nn.Dropout2d(0.2),
            nn.Linear(in_features=32, out_features=1),
        )

    def forward(self, x):
        x = self.conv_layer(x)
        output, (hidden_state, cell_state) = self.lstm_layer(x)
        hidden_state = torch.permute(hidden_state, (1, 0, 2)).reshape(-1, 2 * 64)
        x = self.fc_layers(hidden_state)
        return x


class Discriminator(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.conv_layer = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=2),
            nn.LeakyReLU(),
            nn.Conv1d(32, 64, kernel_size=2),
            nn.LeakyReLU(),
        )
        _, nb_filters, width = self.conv_layer(torch.rand(1, 1, 21)).shape
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=nb_filters * width, out_features=64),
            nn.LeakyReLU(),
            # nn.Dropout2d(0.2),
            nn.Linear(in_features=64, out_features=32),
            nn.LeakyReLU(),
            # nn.Dropout2d(0.2),
            nn.Linear(in_features=32, out_features=1),
        )

    def forward(self, z):
        x = self.conv_layer(z.unsqueeze(1))
        x = self.fc_layers(x)
        return x


class GAN(pl.LightningModule):
    def __init__(
        self,
        config,
    ):
        super().__init__()
        self.config = config

        # networks
        self.generator = Generator(self.config)
        self.discriminator = Discriminator(self.config)
        self.automatic_optimization = False

    def forward(self, z):
        return self.generator(z)

    def adversarial_loss(self, y_hat, y):
        return nn.BCELoss()(y_hat, y)

    def generator_loss(self, fake, y_hat, y):
        fake_generator = torch.mean(fake)
        close_generator = torchmetrics.MeanSquaredError()(y_hat, y)
        sign_generator = torch.mean(torch.abs(torch.sign(y_hat) - torch.sign(y)))
        return fake_generator, close_generator, sign_generator

    def discriminator_loss(self, real, fake, y_hat, y):
        fake_discriminator = torch.mean(fake)
        real_discriminator = torch.mean(real)
        return fake_discriminator, real_discriminator

    def _training_step_generator(self, batch, opt=None):
        klines, multiple_close, y = batch
        fake_close = self(klines)
        fake = self.discriminator(torch.cat([multiple_close, fake_close], dim=1))
        fake_generator, close_generator, sign_generator = self.generator_loss(
            fake, fake_close, y
        )
        g_loss = -fake_generator + 0.5 * close_generator + 0.5 * sign_generator
        if opt is not None:
            opt.zero_grad()
            self.manual_backward(g_loss)
            opt.step()

        return {
            "g_loss": g_loss,
            "fake_generator": fake_generator,
            "close_generator": close_generator,
            "sign_generator": sign_generator,
        }

    def _training_step_discriminator(self, batch, opt=None, steps=5):
        klines, multiple_close, y = batch
        D_loss = 0
        Fake_discriminator = 0
        Real_discriminator = 0
        for _ in range(steps):
            fake_close = self(klines)
            fake = self.discriminator(torch.cat([multiple_close, fake_close], dim=1))
            real = self.discriminator(torch.cat([multiple_close, y], dim=1))
            fake_discriminator, real_discriminator = self.discriminator_loss(
                real, fake, fake_close, y
            )
            d_loss = fake_discriminator - real_discriminator
            if opt is not None:
                opt.zero_grad()
                self.manual_backward(d_loss)
                opt.step()

            D_loss += d_loss / steps
            Fake_discriminator += fake_discriminator / steps
            Real_discriminator += real_discriminator / steps

        return {
            "d_loss": D_loss,
            "fake_discriminator": Fake_discriminator,
            "real_discriminator": Real_discriminator,
        }

    def training_step(self, batch, batch_idx):
        opt_g, opt_d = self.optimizers()

        metrics = {}
        metrics.update(self._training_step_generator(batch, opt_g))
        metrics.update(self._training_step_discriminator(batch, opt_d, steps=1))

        self.log_dict(
            metrics,
            prog_bar=True,
            on_step=False,
            on_epoch=True,
        )

    def validation_step(self, batch, batch_idx):
        metrics = {}
        metrics.update(self._training_step_generator(batch))
        metrics.update(self._training_step_discriminator(batch, steps=1))
        metrics = {
            "val_" + metric_name: metric_value
            for metric_name, metric_value in metrics.items()
        }

        self.log_dict(
            metrics,
            prog_bar=True,
            on_step=False,
            on_epoch=True,
        )
        return metrics

    def configure_optimizers(self):
        opt_g = torch.optim.Adam(
            self.generator.parameters(),
            lr=self.config["learning_rate_generator"],
            # betas=(self.config["beta1"], self.config["beta2"]),
        )
        opt_d = torch.optim.Adam(
            self.discriminator.parameters(),
            lr=self.config["learning_rate_discriminator"],
            betas=(self.config["beta1"], self.config["beta2"]),
        )
        return opt_g, opt_d


model = GAN(config)

model_checkpoint = pl.callbacks.model_checkpoint.ModelCheckpoint(
    dirpath=run.dir if "run" in locals() else "tmp/",
    filename="{epoch}-{val_loss:.3f}",
    monitor="_generatorg_loss",
    mode="min",
    verbose=True,
    save_last=True,
)

script_checkpoint = training.ScriptCheckpoint(
    dirpath=run.dir if "run" in locals() else "tmp/",
)

callbacks = [script_checkpoint]
log = None
if config["job_type"] == "train" or False:
    callbacks.append(model_checkpoint)
    print(f"[INFO]: saving models.")
else:
    print(f"[INFO]: not saving models.")
if config["job_type"] == "debug":
    log = "all"

if config["log_wandb"]:
    wandb_logger = pl.loggers.WandbLogger()
    wandb_logger.watch(model, log=log, log_graph=True)
else:
    wandb_logger = None
trainer = pl.Trainer(
    max_epochs=150,
    callbacks=callbacks,
    logger=wandb_logger,
    devices="auto",
    accelerator="auto",
    #     limit_train_batches=3,
    #     limit_val_batches=3,
)
trainer.fit(model, dm)


  rank_zero_warn(
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(

  | Name          | Type          | Params
------------------------------------------------
0 | generator     | Generator     | 59.8 K
1 | discriminator | Discriminator | 84.3 K
------------------------------------------------
144 K     Trainable params
0         Non-trainable params
144 K     Total params
0.576     Total estimated model params size (MB)


[INFO]: not saving models.


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
compteur = 0
for i in range(len(previous_x)):
    for j in range(len(previous_x[i])):
        if not torch.equal(previous_x[i][0], previous_x[i][j]):
            compteur += 1
print(compteur)


0


In [None]:
compteur = 0
for i in range(len(preprevious_x)):
    for j in range(len(preprevious_x[i])):
        if not torch.equal(preprevious_x[i][0], preprevious_x[i][j]):
            compteur += 1
print(compteur)


0


In [None]:
compteur = 0
for i in range(len(following_x)):
    for j in range(len(following_x[i])):
        if not torch.equal(following_x[i][0], following_x[i][j]):
            compteur += 1
print(compteur)


0


In [None]:
batch = previous_x[-1]
klines1 = batch[14]
klines2 = batch[15]
print(klines1.shape)
print(
    model.generator.fc_layers(
        klines1.unsqueeze(0),
    )
)


IndexError: list index out of range