In [1]:
import seaborn as sns 
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import pandas as pd
import numpy as np
from tqdm import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict



In [2]:
tqdm.pandas()

In [3]:
pl.seed_everything(42)

Global seed set to 42


42

In [4]:
!gdown --id 174pzX55XaNzWgjZSgX6Us13fnBvPCca6

Downloading...
From: https://drive.google.com/uc?id=174pzX55XaNzWgjZSgX6Us13fnBvPCca6
To: c:\Users\HRUT\OneDrive - Novozymes A S\Desktop\mtf\Binance_BTCUSDT_minute.csv

  0%|          | 0.00/37.2M [00:00<?, ?B/s]
  1%|▏         | 524k/37.2M [00:00<00:08, 4.16MB/s]
  3%|▎         | 1.05M/37.2M [00:00<00:07, 4.71MB/s]
  4%|▍         | 1.57M/37.2M [00:00<00:07, 4.94MB/s]
  6%|▌         | 2.10M/37.2M [00:00<00:07, 5.00MB/s]
  8%|▊         | 3.15M/37.2M [00:00<00:06, 5.24MB/s]
 11%|█▏        | 4.19M/37.2M [00:00<00:06, 5.29MB/s]
 14%|█▍        | 5.24M/37.2M [00:01<00:06, 5.30MB/s]
 17%|█▋        | 6.29M/37.2M [00:01<00:05, 5.34MB/s]
 20%|█▉        | 7.34M/37.2M [00:01<00:05, 5.32MB/s]
 23%|██▎       | 8.39M/37.2M [00:01<00:05, 5.36MB/s]
 25%|██▌       | 9.44M/37.2M [00:01<00:05, 5.33MB/s]
 28%|██▊       | 10.5M/37.2M [00:02<00:05, 5.02MB/s]
 31%|███       | 11.5M/37.2M [00:02<00:04, 5.50MB/s]
 34%|███▍      | 12.6M/37.2M [00:02<00:04, 5.50MB/s]
 37%|███▋      | 13.6M/37.2M [00:02<00:04, 5.2

In [5]:
df = pd.read_csv("Binance_BTCUSDT_minute.csv", parse_dates=['date'])
# df = df.sort_values(by='date').reset_index(drop=True)

In [6]:
df['prev_close'] = df.shift(1)['close']

In [7]:

df['close_change'] = df.progress_apply(
    lambda row: 0 if np.isnan(row.prev_close) else row.close - row.prev_close, axis=1
)

100%|██████████| 273288/273288 [00:06<00:00, 41377.35it/s]


In [8]:
df.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close,close_change
0,1616285160000,2021-03-21 00:06:00,BTC/USDT,58217.32,58339.37,58208.39,58331.16,46.047371,2682795.0,1271,,0.0
1,1616285100000,2021-03-21 00:05:00,BTC/USDT,58051.79,58262.06,58040.25,58215.14,99.395476,5780768.0,2924,58331.16,-116.02
2,1616285040000,2021-03-21 00:04:00,BTC/USDT,57926.3,58059.44,57830.37,58049.58,92.501519,5360804.0,2784,58215.14,-165.56
3,1616284980000,2021-03-21 00:03:00,BTC/USDT,57816.93,57960.0,57811.41,57930.28,92.864307,5376217.0,2699,58049.58,-119.3
4,1616284920000,2021-03-21 00:02:00,BTC/USDT,57908.93,57967.0,57811.41,57816.93,202.32754,11711080.0,5072,57930.28,-113.35


In [9]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    row_data = dict(
        day_of_week = row.date.dayofweek,
        day_of_month = row.date.day, 
        week_of_year = row.date.week,
        month = row.date.month,
        open = row.open,
        high = row.high,
        low = row.low,
        close_change = row.close_change,
        close = row.close
    )

    rows.append(row_data)

features_df = pd.DataFrame(rows)

100%|██████████| 273288/273288 [00:25<00:00, 10538.07it/s]


In [10]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,6,21,11,3,58217.32,58339.37,58208.39,0.0,58331.16
1,6,21,11,3,58051.79,58262.06,58040.25,-116.02,58215.14
2,6,21,11,3,57926.3,58059.44,57830.37,-165.56,58049.58
3,6,21,11,3,57816.93,57960.0,57811.41,-119.3,57930.28
4,6,21,11,3,57908.93,57967.0,57811.41,-113.35,57816.93


In [11]:
training_size = int(len(features_df) * 0.9)
training_size

245959

In [12]:
train_df , test_df = features_df[:training_size], features_df[training_size+1:]
train_df.shape, test_df.shape

((245959, 9), (27328, 9))

In [13]:
scaler = MinMaxScaler(feature_range=(-1,1))
scaler = scaler.fit(train_df)

In [14]:
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index = train_df.index,
    columns=train_df.columns
)
train_df.shape

(245959, 9)

In [15]:
test_df = pd.DataFrame(
    scaler.transform(test_df),
    index = test_df.index,
    columns=test_df.columns
)

test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
245960,-0.333333,0.933333,0.5,0.454545,-0.986917,-0.987523,-0.986377,-0.24384,-0.987044
245961,-0.333333,0.933333,0.5,0.454545,-0.986869,-0.987488,-0.986349,-0.241675,-0.986963
245962,-0.333333,0.933333,0.5,0.454545,-0.986959,-0.987503,-0.986377,-0.242183,-0.986915
245963,-0.333333,0.933333,0.5,0.454545,-0.986839,-0.987473,-0.986348,-0.244243,-0.987005
245964,-0.333333,0.933333,0.5,0.454545,-0.987095,-0.987458,-0.986459,-0.241086,-0.986885


In [16]:
def create_sequences(input_data:pd.DataFrame, target_column, sequence_length):
    sequences = []
    data_size = len(input_data)
    for i in tqdm(range(data_size - sequence_length)):
        sequence = input_data[i:i+sequence_length]
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        sequences.append((sequence,label))

    return sequences

In [17]:
SEQUENCE_LENGTH = 150

train_sequences = create_sequences(train_df, 'close', SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, 'close', SEQUENCE_LENGTH)


100%|██████████| 245809/245809 [00:18<00:00, 13445.00it/s]
100%|██████████| 27178/27178 [00:01<00:00, 16019.67it/s]


In [18]:
class BTCDataset(Dataset):
    def __init__(self,sequences):
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        sequence, label = self.sequences[idx]

        return dict(
            sequence=torch.Tensor(sequence.to_numpy()),
            label = torch.tensor(label).float()
        )


In [None]:
class BTCPriceDataModule(pl.LightningDataModule):

    def __init__(self, train_sequences, test_sequences, batch_size = 8):
        super().__init__()
        self.train_sequence = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = BTCDataset(self.train_sequence)
        self.test_dataset = BTCDataset(self.test_sequences)

    def train_dataloader(self):
        print("coming here")
        return DataLoader(
            self.train_dataset,
            batch_size = self.batch_size,
            shuffle = False,
            num_workers=2
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=1
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=1
        )   

In [None]:
N_EPOCHS = 8
BATCH_SIZE = 128

data_module = BTCPriceDataModule(train_sequences, test_sequences, batch_size=BATCH_SIZE)
data_module.setup()

len(data_module.train_dataloader())

# for i in data_module.train_dataloader():
#     print(i['sequence'].shape)
#     print(i['label'].shape)
#     break

coming here


1921

In [None]:
class PricePredictionModel(nn.Module):

    def __init__(self, n_features, n_hidden=128, n_layers=2):
        super().__init__()

        self.n_hidden = n_hidden
        self.lstm = nn.LSTM(
            input_size = n_features,
            hidden_size = n_hidden,
            batch_first = True, 
            num_layers = n_layers,
            dropout = 0.2
        )

        self.regressor = nn.Linear(n_hidden, 1)


    def forward(self, x):
        self.lstm.flatten_parameters()

        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]

        return self.regressor(out)

In [None]:
class BTCPricePredictor(pl.LightningModule):

    def __init__(self, n_features: int):
        super().__init__()

        self.model = PricePredictionModel(n_features)
        self.criterion = nn.MSELoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0

        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))

        return loss, output

    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]

        loss, outputs = self(sequences, labels)
        self.log("train_loss", loss, prog_bar = True, logger = True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]

        loss, outputs = self(sequences, labels)
        self.log("validation_loss", loss, prog_bar = True, logger = True)
        return loss
    
    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]

        loss, outputs = self(sequences, labels)
        self.log("test_loss", loss, prog_bar = True, logger = True)
        return loss
    
    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=0.0001)

In [None]:
model = BTCPricePredictor(n_features=train_df.shape[1])

In [None]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename='best-checkpoint',
    save_top_k= 1,
    verbose=True,
    monitor='val_loss',
    mode = 'min'
)

logger = TensorBoardLogger('lightning_logs', name='btc-price')

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)


trainer = pl.Trainer(
    logger = logger,
    callbacks=[early_stopping_callback, checkpoint_callback],
    max_epochs= N_EPOCHS,
    enable_progress_bar=True,
    num_sanity_val_steps=1
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# trainer.fit(model, data_module)