# **Microsoft Stock (MSFT) FORECASTING USING PyTorch LSTM**



## **INTRODUCTION**

Time series data is a sequence of data points recorded at (usually) regular intervals.
Many classical methods exist for modelling time series data. In recent years, Long Short-Term Memory (LSTM) networks have become a very useful tool for this kind of data.
In this project, we forecast the daily High price of Microsoft stock (MSFT) using an LSTM model and PyTorch.

## **Install necessary libraries**

In [None]:
!pip install torch==1.11.0
!pip install --quiet pytorch-lightning
!pip install --quiet tqdm



## **Import necessary libraries**

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler


import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import os
from pandas.plotting import register_matplotlib_converters
from torch import nn, optim

%matplotlib inline
%config InlineBackend.figure_format='retina'

# --- Plot styling -----------------------------------------------------------
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# Replace the seaborn default colour cycle with the custom palette above.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = (14, 10)
register_matplotlib_converters()

# --- Reproducibility --------------------------------------------------------
# Seed NumPy and PyTorch with the same value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


In [None]:
# Reuse the notebook-wide seed constant instead of repeating the literal,
# so changing RANDOM_SEED re-seeds Lightning as well.
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

# **Data Preprocessing**

## **Import Data**

The dataset (`MSFT.csv`) is available on Kaggle [here](https://www.kaggle.com/datasets/varpit94/microsoft-stock-data).

In [None]:
# Google Colab only: opens an upload dialog so the CSV can be placed in the runtime.
# The next cell reads the file by the name 'MSFT.csv'.
# NOTE(review): `io` is imported here but not used in the visible notebook.
from google.colab import files 
import io 
uploaded = files.upload()

Saving MSFT.csv to MSFT (1).csv


In [None]:
# Load the full price history and parse the Date column as datetimes.
data = pd.read_csv('MSFT.csv', parse_dates=['Date'])
# Keep only rows from position 8000 onward (the most recent part of the history).
# .copy() detaches the slice from the full frame so that columns added in later
# cells do not trigger pandas' SettingWithCopyWarning.
data = data[8000:].copy()

## **Visualize data**

In [None]:
# Line chart of the daily High price over the selected period.
high_trace = go.Scatter(
    x=data['Date'],
    y=data['High'],
    mode='lines',
    name='High',
    line=dict(color='#a5ade6'),
)

fig = go.Figure()
fig.add_trace(high_trace)

# Dark theme and title.
fig.update_layout(
    font_color="#d7dbf5",
    paper_bgcolor="#020938",
    plot_bgcolor="#020938",
    title="Visualize High value of Microsoft Stock (MSFT) prices ",
)
# Show the x axis line and tick labels, hide the vertical grid.
fig.update_layout(xaxis=dict(showline=True, showgrid=False, showticklabels=True))
fig.show()

In [None]:
# Previous row's High, aligned to the current row (the first row has no predecessor, so it is NaN).
data["prev_High"] = data["High"].shift(1)
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,prev_High
8000,2017-12-05,81.339996,82.680000,80.980003,81.589996,77.394432,26152300,
8001,2017-12-06,81.550003,83.139999,81.430000,82.779999,78.523239,26162100,82.680000
8002,2017-12-07,82.540001,82.800003,82.000000,82.489998,78.248146,23184500,83.139999
8003,2017-12-08,83.629997,84.580002,83.330002,84.160004,79.832291,24489100,82.800003
8004,2017-12-11,84.290001,85.370003,84.120003,85.230003,80.847244,22857900,84.580002
...,...,...,...,...,...,...,...,...
9078,2022-03-18,295.369995,301.000000,292.730011,300.429993,300.429993,43317000,295.609985
9079,2022-03-21,298.890015,300.140015,294.899994,299.160004,299.160004,28351200,301.000000
9080,2022-03-22,299.799988,305.000000,298.769989,304.059998,304.059998,27599700,300.140015
9081,2022-03-23,300.510010,303.230011,297.720001,299.489990,299.489990,25715400,305.000000


In [None]:
# tqdm._tqdm_notebook is a deprecated private alias; tqdm.notebook is the public module
# that provides the same tqdm_notebook name.
from tqdm.notebook import tqdm_notebook
# Register the progress_apply methods on pandas objects.
tqdm.pandas()


In [None]:
# Day-over-day change in High, computed column-wise instead of row by row.
# Rows without a previous High (the first row) get a change of 0, as before.
high_delta = data["High"] - data["prev_High"]
data["High_change"] = high_delta.where(data["prev_High"].notna(), 0)
# Order chronologically and renumber rows from 0 so later positional slices line up.
data = data.sort_values(by="Date").reset_index(drop=True)


100%|██████████| 1083/1083 [00:00<00:00, 24476.68it/s]


In [None]:
# One feature record per row of `data`: calendar fields taken from Date plus the price columns.
# The key order below is the column order of features_df.
rows = [
    {
        "day_of_week": row.Date.dayofweek,
        "day_of_month": row.Date.day,
        "week_of_year": row.Date.week,
        "month": row.Date.month,
        "Open": row.Open,
        "High": row.High,
        "Low": row.Low,
        "High_change": row.High_change,
        "Close": row.Close,
    }
    for _, row in tqdm(data.iterrows(), total=data.shape[0])
]
features_df = pd.DataFrame(rows)

100%|██████████| 1083/1083 [00:00<00:00, 8302.93it/s]


In [None]:
# Display the engineered feature table built in the previous cell.
features_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Open,High,Low,High_change,Close
0,1,5,49,12,81.339996,82.680000,80.980003,0.000000,81.589996
1,2,6,49,12,81.550003,83.139999,81.430000,0.459999,82.779999
2,3,7,49,12,82.540001,82.800003,82.000000,-0.339996,82.489998
3,4,8,49,12,83.629997,84.580002,83.330002,1.779999,84.160004
4,0,11,50,12,84.290001,85.370003,84.120003,0.790001,85.230003
...,...,...,...,...,...,...,...,...,...
1078,4,18,11,3,295.369995,301.000000,292.730011,5.390015,300.429993
1079,0,21,12,3,298.890015,300.140015,294.899994,-0.859985,299.160004
1080,1,22,12,3,299.799988,305.000000,298.769989,4.859985,304.059998
1081,2,23,12,3,300.510010,303.230011,297.720001,-1.769989,299.489990


## **Split into test and train**

In [None]:
# Chronological split: the first 90% of rows are used for training, the tail is held out.
train_size=int(len(features_df)*.9) 

train_data = features_df[:train_size]
# NOTE(review): the held-out slice starts at train_size+1, so the row at position
# train_size ends up in neither split. The later `test_df=data[train_size+1:]` cell
# uses the same offset, so the two must be changed together.
test_data =features_df[train_size+1:]

test_data

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Open,High,Low,High_change,Close
975,2,20,42,10,309.209991,309.700012,306.109985,0.400024,307.410004
976,3,21,42,10,307.170013,311.019989,306.359985,1.319977,310.760010
977,4,22,42,10,310.399994,311.089996,307.799988,0.070007,309.160004
978,0,25,43,10,309.359985,309.399994,306.459991,-1.690002,308.130005
979,1,26,43,10,311.000000,312.399994,308.600006,3.000000,310.109985
...,...,...,...,...,...,...,...,...,...
1078,4,18,11,3,295.369995,301.000000,292.730011,5.390015,300.429993
1079,0,21,12,3,298.890015,300.140015,294.899994,-0.859985,299.160004
1080,1,22,12,3,299.799988,305.000000,298.769989,4.859985,304.059998
1081,2,23,12,3,300.510010,303.230011,297.720001,-1.769989,299.489990


## **Scale data**

In [None]:
# Fit the [-1, 1] min-max scaler on the training rows only.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_data)

In [None]:
# Apply the scaler (fitted on the training rows) to BOTH splits.
# test_data is turned into model inputs and labels by create_sequences further down,
# so it has to live in the same scaled space as the data the model is trained on.
# Index and column labels are preserved.
train_data = pd.DataFrame(
    scaler.transform(train_data),
    index=train_data.index,
    columns=train_data.columns,
)
test_data = pd.DataFrame(
    scaler.transform(test_data),
    index=test_data.index,
    columns=test_data.columns,
)

## **Create sequences**

In [None]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):
  """Cut a frame into (window, next value) pairs.

  Args:
    input_data: frame whose rows are consecutive time steps.
    target_column: name of the column to read the label from.
    sequence_length: number of rows in every window.

  Returns:
    A list of ``(window, label)`` tuples. ``window`` is the slice of
    ``sequence_length`` rows starting at a given offset and ``label`` is the
    ``target_column`` value of the row right after that window. The list is
    empty when the frame has no more than ``sequence_length`` rows.
  """
  n_windows = len(input_data) - sequence_length
  return [
      (
          input_data[start:start + sequence_length],
          input_data.iloc[start + sequence_length][target_column],
      )
      for start in tqdm(range(n_windows))
  ]

In [None]:
# Window size: each sample holds 90 consecutive rows and is labelled with the next row's High.
SEQUENCE_LENGTH = 90
train_sequences, test_sequences = (
    create_sequences(split, "High", SEQUENCE_LENGTH)
    for split in (train_data, test_data)
)
                 
                
               


100%|██████████| 884/884 [00:00<00:00, 10005.36it/s]

100%|██████████| 18/18 [00:00<00:00, 5672.24it/s]


# **Create PyTorch dataset**

In [None]:
class MSFTDataset(Dataset):
    """Torch dataset over pre-built ``(window, label)`` pairs.

    Each item is a dict with a float tensor ``sequence`` built from the
    window's values and a float scalar tensor ``label``.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        window, target = self.sequences[idx]
        features = torch.Tensor(window.to_numpy())
        return {"sequence": features, "label": torch.tensor(target).float()}

In [None]:
class MSFTPriceDataModule(pl.LightningDataModule):
  """Lightning data module serving the train and test sequence lists.

  The validation and test loaders both read the dataset built from
  ``test_sequences``; no separate validation split is held.
  """

  def __init__(self, train_sequences, test_sequences, batch_size=64):
    super().__init__()
    self.train_sequences = train_sequences
    self.test_sequences = test_sequences
    self.batch_size = batch_size

  def setup(self, stage=None):
    # Wrap the raw (window, label) lists as torch datasets.
    self.train_dataset = MSFTDataset(self.train_sequences)
    self.test_dataset = MSFTDataset(self.test_sequences)

  def _make_loader(self, dataset):
    # Single place for the loader settings shared by all three loaders.
    return DataLoader(
        dataset,
        batch_size=self.batch_size,
        shuffle=False,
        num_workers=1,
    )

  def train_dataloader(self):
    return self._make_loader(self.train_dataset)

  def val_dataloader(self):
    return self._make_loader(self.test_dataset)

  def test_dataloader(self):
    return self._make_loader(self.test_dataset)

In [None]:
# Training hyper-parameters.
N_EPOCHS = 8
BATCH_SIZE = 64

# Build the data module and create its datasets right away.
data_module = MSFTPriceDataModule(
    train_sequences=train_sequences,
    test_sequences=test_sequences,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [None]:
# Stand-alone dataset over the training sequences.
# NOTE(review): `train_dataset` is not read again in the visible notebook (the data module
# builds its own datasets in setup()) — confirm whether this cell is still needed.
train_dataset= MSFTDataset(train_sequences)

# **Build LSTM model**

In [None]:
class PricePredictionModel(nn.Module):
  """Stacked LSTM followed by a linear layer that outputs one value per sequence.

  Args:
    n_features: number of input features per time step.
    n_hidden: size of the LSTM hidden state.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability between stacked LSTM layers. It is ignored
      (set to 0) when ``n_layers`` is 1, where ``nn.LSTM`` would not apply it
      and would emit a warning instead.
  """

  def __init__(self, n_features, n_hidden=128, n_layers=2, dropout=0.2):
    super().__init__()

    self.n_hidden = n_hidden
    self.lstm = nn.LSTM(
      input_size=n_features,
      hidden_size=n_hidden,
      batch_first=True,
      num_layers=n_layers,
      dropout=dropout if n_layers > 1 else 0.0,
    )
    self.regressor = nn.Linear(n_hidden, 1)

  def forward(self, x):
    """Return the regression output computed from the last layer's final hidden state.

    ``x`` is fed to a ``batch_first=True`` LSTM; the result has one row per
    batch element and a single column.
    """
    self.lstm.flatten_parameters()
    _, (hidden, _) = self.lstm(x)
    # hidden[-1] is the final hidden state of the top LSTM layer.
    return self.regressor(hidden[-1])

In [None]:
class MSFTPricePredictor(pl.LightningModule):
  """Lightning wrapper that trains ``PricePredictionModel`` with an MSE loss."""

  def __init__(self, n_features:int):
    super().__init__()
    self.model = PricePredictionModel(n_features)
    self.criterion = nn.MSELoss()

  def forward(self, x, labels=None):
    """Return ``(loss, output)``; the loss is 0 when no labels are given."""
    output = self.model(x)
    if labels is None:
      return 0, output
    # The model emits one column per sample, so the labels get a matching trailing axis.
    return self.criterion(output, labels.unsqueeze(dim=1)), output

  def _shared_step(self, batch, metric_name):
    # Common body of the train/val/test steps: forward pass, then log the loss.
    loss, _ = self(batch["sequence"], batch["label"])
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss

  def training_step(self, batch, batch_idx):
    return {"loss": self._shared_step(batch, "train_loss")}

  def validation_step(self, batch, batch_idx):
    return self._shared_step(batch, "val_loss")

  def test_step(self, batch, batch_idx):
    return self._shared_step(batch, "test_loss")

  def configure_optimizers(self):
    return optim.AdamW(self.parameters(), lr=0.0001)

In [None]:
# One input feature per column of the training frame.
model = MSFTPricePredictor(n_features=len(train_data.columns))

# **Training**

In [None]:
# Load the TensorBoard notebook extension and open it on ./lightning_logs,
# the directory the TensorBoardLogger in the next cell is configured to write to.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 227), started 0:43:19 ago. (Use '!kill 227' to kill it.)

<IPython.core.display.Javascript object>

In [None]:
# Keep the single checkpoint with the lowest val_loss, written as ./best-checkpoint.ckpt.
checkpoint_callback = ModelCheckpoint(
    dirpath="./",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)
logger = pl.loggers.TensorBoardLogger(
    save_dir='./lightning_logs',
    name='lightning_logs',
    version='my_name',
)
# Stop when val_loss has not improved for 2 consecutive validation runs.
early_stopping_callback = EarlyStopping(monitor="val_loss", patience=2)

trainer = pl.Trainer(
    logger=logger,
    # Callbacks must be passed through `callbacks`. The old `checkpoint_callback=` argument
    # is only an on/off flag, so a ModelCheckpoint handed to it is never attached and
    # Lightning falls back to a default checkpoint under the logger directory.
    # TQDMProgressBar replaces the deprecated `progress_bar_refresh_rate=30`.
    callbacks=[
        checkpoint_callback,
        early_stopping_callback,
        pl.callbacks.TQDMProgressBar(refresh_rate=30),
    ],
    max_epochs=N_EPOCHS,
    # Use one GPU when available and fall back to CPU otherwise, instead of requiring gpus=1.
    accelerator="auto",
    devices=1,
)


Setting `Trainer(checkpoint_callback=<pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint object at 0x7ff6b53152d0>)` is deprecated in v1.5 and will be removed in v1.7. Please consider using `Trainer(enable_checkpointing=<pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint object at 0x7ff6b53152d0>)`.


Setting `Trainer(progress_bar_refresh_rate=30)` is deprecated in v1.5 and will be removed in v1.7. Please pass `pytorch_lightning.callbacks.progress.TQDMProgressBar` with `refresh_rate` directly to the Trainer's `callbacks` argument instead. Or, to disable the progress bar pass `enable_progress_bar = False` to the Trainer.

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the model on the loaders provided by the data module.
# NOTE(review): `history` is not read anywhere in the visible notebook; confirm whether
# Trainer.fit returns anything worth keeping before relying on it.
history =trainer.fit(model, data_module)


Checkpoint directory ./lightning_logs/lightning_logs/my_name/checkpoints exists and is not empty.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | PricePredictionModel | 203 K 
1 | criterion | MSELoss              | 0     
---------------------------------------------------
203 K     Trainable params
0         Non-trainable params
203 K     Total params
0.814     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The number of training batches (14) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Write the trainer's current state to best-checkpoint.ckpt in the working directory.
# NOTE(review): this runs after fit() has finished, so it stores the weights as they are at
# the end of training; despite the file name nothing here selects by val_loss.
trainer.save_checkpoint("best-checkpoint.ckpt")


In [None]:
# Rebuild the module from the saved checkpoint. n_features is passed explicitly
# because the constructor needs it to re-create the network.
trained_model = MSFTPricePredictor.load_from_checkpoint(
    checkpoint_path="best-checkpoint.ckpt",
    n_features=train_data.shape[1],
)

In [None]:
# Prepare the loaded module for inference (LightningModule.freeze; per the Lightning docs
# this disables gradients and switches the module to eval mode).
trained_model.freeze()


# **Forecasting**

In [None]:
# Run the trained model over every test window, one sample at a time,
# collecting the model output and the matching label as plain Python floats.
test_dataset = MSFTDataset(test_sequences)
predictions = []
labels = []
for idx in tqdm(range(len(test_dataset))):
  sample = test_dataset[idx]
  # unsqueeze adds the leading batch axis the model expects.
  _, output = trained_model(sample["sequence"].unsqueeze(dim=0))
  predictions.append(output.item())
  labels.append(sample["label"].item())


  0%|          | 0/18 [00:00<?, ?it/s][A
 28%|██▊       | 5/18 [00:00<00:00, 48.85it/s][A
 56%|█████▌    | 10/18 [00:00<00:00, 48.43it/s][A
100%|██████████| 18/18 [00:00<00:00, 45.52it/s]


In [None]:
# Per-column offsets of the fitted scaler, one entry per feature column in train_data order.
scaler.min_

array([-1.00000000e+00, -1.06666667e+00, -1.03846154e+00, -1.18181818e+00,
       -1.72728897e+00, -1.73320625e+00, -1.73050384e+00,  7.97012238e-04,
       -1.72299505e+00])

In [None]:
# Per-column scale factors of the fitted scaler, one entry per feature column in train_data order.
scaler.scale_

array([0.5       , 0.06666667, 0.03846154, 0.18181818, 0.00894135,
       0.008868  , 0.00902079, 0.0796813 , 0.00886132])

In [None]:
# Single-column scaler used to map model outputs back to price units.
# The model is trained to predict the "High" column, so the inverse transform must use the
# offset/scale of that column. The previous index -1 picked the last column ("Close").
target_idx = train_data.columns.get_loc("High")
descaler = MinMaxScaler()
descaler.min_, descaler.scale_ = scaler.min_[target_idx], scaler.scale_[target_idx]

In [None]:
def descale(descaler, values):
  """Invert the scaling of a flat sequence of values.

  The values are arranged as a single column, passed through
  ``descaler.inverse_transform`` and returned as a flat 1-D array.
  """
  column = np.expand_dims(np.array(values), axis=1)
  restored = descaler.inverse_transform(column)
  return restored.flatten()

In [None]:
# Map predictions and labels back through the same single-column descaler.
predictions_descaled, labels_descaled = (
    descale(descaler, series) for series in (predictions, labels)
)

In [None]:
# Rows of the original `data` frame for the held-out period, using the same
# train_size+1 offset as the features_df split so both frames cover the same rows.
test_df=data[train_size+1:]
# Sanity check: both frames should have the same number of rows.
len(test_data), len(test_df)

(108, 108)

In [None]:
# create_sequences takes its first label from position SEQUENCE_LENGTH, so dropping the
# first SEQUENCE_LENGTH rows leaves exactly the rows the test labels were taken from.
test_sequences_data= test_df.iloc[SEQUENCE_LENGTH:] 
# Sanity check: one remaining row per test sequence.
len(test_sequences_data), len(test_sequences)


(18, 18)

# **Forecast Evaluation**

In [None]:
def adjust(val, length= 6):
    """Return ``str(val)`` left-justified and space-padded to at least ``length`` characters."""
    text = str(val)
    return text.ljust(length)

def forecast_accuracy(forecast, actual):
    """Compute common error metrics between a forecast and the actual values.

    Returns a dict with keys ``mape`` (mean absolute percentage error),
    ``me`` (mean error), ``mae`` (mean absolute error), ``mpe`` (mean
    percentage error) and ``rmse`` (root mean squared error). Errors are
    taken as ``forecast - actual``; the percentage metrics divide by ``actual``.
    """
    error = forecast - actual
    abs_error = np.abs(error)

    return {
        'mape': np.mean(abs_error / np.abs(actual)),
        'me': np.mean(error),
        'mae': np.mean(abs_error),
        'mpe': np.mean(error / actual),
        'rmse': np.mean(error ** 2) ** .5,
    }

# Report the error metrics of the descaled predictions against the actual High values.
# The header previously said "BTC", but this notebook forecasts MSFT.
print('Forecast Accuracy of: MSFT HIGH')
accuracy_prod = forecast_accuracy(predictions_descaled, test_sequences_data['High'])
for k, v in accuracy_prod.items():
    print(adjust(k), ': ', round(v,4))

## **Visualize actual vs forecast**

In [None]:
# Overlay the forecast and the actual High over the evaluated test dates.
# The two traces get different colours; they previously shared one colour and
# could not be told apart.
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_sequences_data['Date'], y=predictions_descaled,
                    mode='lines',
                    name='predicted',
                    line=dict(color='#a5ade6')))
fig.add_trace(go.Scatter(x=test_sequences_data['Date'], y=test_sequences_data['High'],
                    mode='lines',
                    name='Real',
                    line=dict(color='#FFDD00')))

fig.update_layout(font_color="#d7dbf5",
                  paper_bgcolor="#020938",
                  plot_bgcolor="#020938",
                  title="Actual vs forecast High value of Microsoft Stock (MSFT)")
fig.update_layout(
    xaxis=dict(
        showline=True,
        showgrid=False,
        showticklabels=True,
        ),
    )
fig.show()