In [1]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch import optim


In [2]:

# Load the stock price history (continuous daily sequences, one row per ticker and date).
# The path is relative to the notebook's working directory.
stocks_df = pd.read_csv('stock_dataframe.csv')
print(stocks_df.head(3))  # preview the first three rows and the available columns


                        Date       Open       High        Low      Close  \
0  2000-01-03 00:00:00-05:00  25.215814  25.330655  24.690826  24.772856   
1  2000-01-04 00:00:00-05:00  24.379101  24.887683  23.788490  23.788490   
2  2000-01-05 00:00:00-05:00  23.919738  25.265018  23.919738  24.477537   

      Volume Brand_Name Ticker   Industry_Tag Country  Dividends  \
0  2173400.0         3m    MMM  manufacturing     usa        0.0   
1  2713800.0         3m    MMM  manufacturing     usa        0.0   
2  3699400.0         3m    MMM  manufacturing     usa        0.0   

   Stock Splits  Capital Gains   Date_only    Margin  Ticker_id  \
0           0.0            NaN  2000-01-03  0.442958          0   
1           0.0            NaN  2000-01-04  0.590611          0   
2           0.0            NaN  2000-01-05 -0.557799          0   

   Normalized_Close  
0          0.020708  
1          0.015483  
2          0.019141  


In [3]:
# Map every ticker symbol to a contiguous integer id, numbered in order of first
# appearance in the dataframe. len(ticker2idx) is later used as the number of
# rows in the model's embedding table (num_tickers).
ticker2idx = {ticker: i for i, ticker in enumerate(stocks_df["Ticker"].unique())}

In [4]:

#  Function to create  the  training-test dataset from the dataframe that contains  time-series sequences of stock prices 
def create_sequences(df, seq_length=4, features=["Open", "Close"], split_ratio=0.8):
    """Build sliding-window samples per ticker and split them chronologically.

    For every ticker the rows are sorted by ``Date``; each window of
    ``seq_length`` consecutive rows (restricted to ``features``) becomes one
    input sample, and the ``Close`` value of the row immediately after the
    window becomes its label. The first ``split_ratio`` share of a ticker's
    windows goes to the training set and the remainder to the test set, so the
    test windows are always the most recent ones for that ticker.

    Args:
        df: DataFrame with at least the columns ``Ticker``, ``Date``, ``Close``
            and every column named in ``features``.
        seq_length: Number of consecutive rows in one input window.
        features: Column names used as per-step input features (any iterable
            of column names; it is copied, never modified).
        split_ratio: Fraction of each ticker's windows assigned to training.

    Returns:
        A 7-tuple ``(X_train, y_train, ids_train, X_test, y_test, ids_test,
        group_length)``. The ``X_*`` arrays hold the windows, ``y_*`` the
        labels, ``ids_*`` the integer ticker id of every sample (ids follow the
        order of first appearance of the ticker in ``df``), and
        ``group_length`` is a list with the row count of each ticker group in
        ``groupby`` order.
    """
    # Copy so the shared default list is never aliased and tuples are accepted.
    feature_cols = list(features)
    ticker2idx = {ticker: i for i, ticker in enumerate(df["Ticker"].unique())}

    X_train, y_train, ids_train = [], [], []
    X_test, y_test, ids_test = [], [], []
    group_length = []

    for ticker, group in df.groupby("Ticker"):
        group = group.sort_values("Date").reset_index(drop=True)
        ticker_id = ticker2idx[ticker]
        group_length.append(len(group))

        # Extract the needed columns once per ticker; slicing NumPy arrays in
        # the window loop avoids a pandas .iloc lookup for every single sample.
        feature_values = group[feature_cols].values
        close_values = group["Close"].values

        # A ticker with no more than seq_length rows yields an empty range.
        window_starts = range(len(group) - seq_length)
        ticker_sequences = [feature_values[i:i + seq_length] for i in window_starts]
        ticker_labels = [close_values[i + seq_length] for i in window_starts]

        # Chronological split: earliest windows train, latest windows test.
        split_idx = int(split_ratio * len(ticker_sequences))

        X_train.extend(ticker_sequences[:split_idx])
        y_train.extend(ticker_labels[:split_idx])
        ids_train.extend([ticker_id] * split_idx)

        X_test.extend(ticker_sequences[split_idx:])
        y_test.extend(ticker_labels[split_idx:])
        ids_test.extend([ticker_id] * (len(ticker_sequences) - split_idx))

    return (
        np.array(X_train), np.array(y_train), np.array(ids_train),
        np.array(X_test), np.array(y_test), np.array(ids_test), group_length
    )
# Build the train/test arrays from the full dataframe using the function's defaults:
# 4-step windows over the Open and Close columns, split 80/20 per ticker.
# group_length keeps the number of rows of every ticker for later sanity checks.
X_train, y_train, ids_train, X_test, y_test, ids_test,group_length = create_sequences(stocks_df)

In [5]:
# Sanity check on the per-ticker history length:
#   counter_of_groups -> tickers with fewer than 160 rows
#   counter_of_error  -> tickers whose 20% share of rows is below 160
counter_of_groups = 0
counter_of_error = 0
for group_num in group_length:
    counter_of_groups += int(group_num < 160)
    counter_of_error += int(group_num * 0.2 < 160)
print('Num of Groups with less than 32 samples-sequences(160 days) : \n', counter_of_groups)
print('errors : \n', counter_of_error)

Num of Groups with less than 32 samples-sequences(160 days) : 
 0
errors : 
 3


In [6]:

# Dataset Class
class StockDataset(Dataset):
    """Torch dataset pairing each price window with its label and ticker id.

    Args:
        X: Array-like of input windows; converted to a float32 tensor.
        y: Array-like of regression targets, one per window; converted to float32.
        tickers_ids: Array-like of integer ticker ids, one per window;
            converted to a long tensor (the dtype ``nn.Embedding`` expects).

    Raises:
        ValueError: If the three inputs do not contain the same number of samples.
    """

    def __init__(self, X, y, tickers_ids):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.ticker_ids = torch.tensor(tickers_ids, dtype=torch.long)

        # __len__ is derived from X only, so a length mismatch would otherwise
        # surface later as an IndexError (or go unnoticed) inside the DataLoader.
        if not (len(self.X) == len(self.y) == len(self.ticker_ids)):
            raise ValueError(
                "X, y and tickers_ids must have the same number of samples, got "
                f"{len(self.X)}, {len(self.y)} and {len(self.ticker_ids)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(window, label, ticker_id)`` triple stored at ``idx``."""
        return self.X[idx], self.y[idx], self.ticker_ids[idx]



In [7]:
class StockPredictor(nn.Module):
    """LSTM regressor whose input is augmented with a learned per-ticker embedding.

    Every time step of the input window is concatenated with the embedding of
    the sample's ticker, the result is fed through a (possibly stacked) LSTM,
    and the hidden state of the last time step is projected by a linear layer.

    Args:
        embedding_dim: Size of the ticker embedding vector.
        num_tickers: Number of distinct ticker ids (rows of the embedding table).
        hidden_size: Hidden state size of the LSTM.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked LSTM layers.
        num_features: Number of input features per time step (before the
            embedding is appended).
    """

    def __init__(self, embedding_dim, num_tickers, hidden_size, output_size, num_layers, num_features):
        super().__init__()
        self.embedding = nn.Embedding(num_tickers, embedding_dim)
        # The LSTM sees the raw features plus the ticker embedding at every step.
        # batch_first=True -> tensors are (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=num_features + embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, ticker_ids):
        """Predict one output vector per sample.

        Args:
            x: Float tensor of shape (batch, seq_len, num_features).
            ticker_ids: Long tensor of shape (batch,) with the ticker id of
                every sample.

        Returns:
            Tensor of shape (batch, output_size).
        """
        stock_embeddings = self.embedding(ticker_ids)  # (batch, embedding_dim)
        # Broadcast the same embedding to every time step. expand() creates a
        # view without copying; torch.cat below materialises the joined tensor.
        stock_embeddings = stock_embeddings.unsqueeze(1).expand(-1, x.size(1), -1)  # (batch, seq_len, embedding_dim)
        # Concatenate along the feature dimension.
        x = torch.cat([x, stock_embeddings], dim=-1)  # (batch, seq_len, num_features + embedding_dim)
        # output holds the top layer's hidden state at every time step:
        # (batch, seq_len, hidden_size). The final (h_n, c_n) states are not needed.
        output, _ = self.lstm(x)
        last_time_step = output[:, -1, :]  # (batch, hidden_size)
        stock_predictions = self.fc(last_time_step)  # (batch, output_size)
        return stock_predictions
  

In [27]:
# Wrap the train/test arrays in Dataset objects
train_dataset = StockDataset(X_train, y_train,ids_train)
test_dataset = StockDataset(X_test, y_test, ids_test)
# Feed the datasets to the DataLoaders.
# Training batches (32) are reshuffled every epoch; test batches (16) keep their order.
# drop_last=True discards the final incomplete batch of each loader.
# NOTE(review): on the test loader this leaves up to 15 samples out of the evaluation -
# confirm this is intended; the model itself accepts any batch size.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False,drop_last=True)


In [9]:

# Pull a single batch from the training loader to inspect what the model will receive
batch = next(iter(train_loader))
# Unpack the batch: (input windows, labels, ticker ids).
# tensor_seq is reused below to read the number of input features from its last dimension.
tensor_seq, labels, ticker_ids = batch
print("Number of training Samples:",len(train_dataset))
print("Number of test Samples",len(test_dataset))

Number of training Samples: 201276
Number of test Samples 50380


In [33]:

# Model hyperparameters
embedding_dim = 8  # size of the learned per-ticker embedding vector
num_tickers = len(ticker2idx)  # one embedding row per distinct ticker
hidden_size = 128  # LSTM hidden state size
output_size=1  # a single regression value per sample
num_layers=3  # number of stacked LSTM layers
batch_size=32  # NOTE: not passed to the DataLoaders, which hard-code their own batch sizes
num_features = int(tensor_seq.size(-1)) # number of input features per time step = last dimension of a sample batch
lr=0.001  # Learning rate
# Create the model
Stock_predictor =StockPredictor(embedding_dim=embedding_dim,num_tickers= num_tickers, hidden_size=hidden_size,output_size=output_size,num_layers=num_layers,num_features=num_features)
# Optimizer: Adam over all model parameters
optimizer = optim.Adam(Stock_predictor.parameters(),lr=lr)
# Criterion / loss function: mean squared error (regression)
loss_function= nn.MSELoss() 


In [11]:

def training_function(model, train_loader, loss_function, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    After every epoch the mean of the per-batch losses is printed.

    Args:
        model: Module called as ``model(sequence, ticker_ids)``.
        train_loader: Sized iterable yielding ``(sequence, labels, ticker_ids)``
            batches.
        loss_function: Callable ``loss_function(predictions, labels)`` returning
            a scalar tensor.
        optimizer: Optimizer constructed over ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    # Put the model in training mode
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        for sequence, labels, ticker_ids in train_loader:
            # Match the (batch, 1) shape of the predictions; a 1-D target would
            # silently broadcast inside the loss.
            labels = labels.view(-1, 1)

            # Clear the gradients left over from the previous step
            optimizer.zero_grad()
            # Forward pass through the model that was passed in (not a global)
            predictions = model(sequence, ticker_ids)
            loss = loss_function(predictions, labels)  # scalar tensor
            # Back propagation and parameter update
            loss.backward()
            optimizer.step()

            # .item() converts the single-element tensor to a Python float
            running_loss += loss.item()

        # Print the mean batch loss of this epoch
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")



In [12]:
# Train for 50 epochs; the mean batch loss is printed after every epoch.
training_function(model=Stock_predictor,train_loader=train_loader,loss_function=loss_function,optimizer=optimizer,num_epochs=50)

Epoch 1/50, Loss: 2164.2261707378575
Epoch 2/50, Loss: 606.1973296576235
Epoch 3/50, Loss: 337.893788085473
Epoch 4/50, Loss: 200.57270093698875
Epoch 5/50, Loss: 173.92933164771523
Epoch 6/50, Loss: 230.68768862849922
Epoch 7/50, Loss: 236.21091439942094
Epoch 8/50, Loss: 174.80154826461387
Epoch 9/50, Loss: 141.0195274433042
Epoch 10/50, Loss: 124.29000898147922
Epoch 11/50, Loss: 166.9941652658459
Epoch 12/50, Loss: 124.91790830391942
Epoch 13/50, Loss: 118.21204648635681
Epoch 14/50, Loss: 146.8734488595952
Epoch 15/50, Loss: 131.55231296872321
Epoch 16/50, Loss: 135.0920653179096
Epoch 17/50, Loss: 140.91049451731047
Epoch 18/50, Loss: 141.72822759839863
Epoch 19/50, Loss: 231.7305499640794
Epoch 20/50, Loss: 183.3345580881873
Epoch 21/50, Loss: 145.35376166721363
Epoch 22/50, Loss: 197.96053347462575
Epoch 23/50, Loss: 161.06431647850383
Epoch 24/50, Loss: 144.0351850611357
Epoch 25/50, Loss: 178.59761438547787
Epoch 26/50, Loss: 193.6725819332362
Epoch 27/50, Loss: 198.676181949

## Normalization, feature optimization, and other strategies are needed to reduce the loss

In [None]:
def evaluation_function(model, test_loader, loss_function, log_every=32):
    """Evaluate ``model`` on ``test_loader`` and print the average loss per sample.

    Each batch loss is weighted by the number of samples actually present in
    that batch, so the result does not depend on any global batch-size
    variable and stays correct when batches differ in size. This assumes
    ``loss_function`` returns the mean over the batch (the default reduction
    of ``nn.MSELoss``).

    Args:
        model: Module called as ``model(sequence, ticker_ids)``.
        test_loader: Iterable yielding ``(sequence, labels, ticker_ids)`` batches.
        loss_function: Callable ``loss_function(predictions, labels)`` returning
            a scalar tensor.
        log_every: Print all predictions and labels accumulated so far every
            ``log_every`` batches; ``0`` or ``None`` disables these prints.

    Returns:
        None. Prints the average loss (``nan`` if the loader yields no batches).
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():  # Turn off gradient tracking
        for idx, (sequence, labels, ticker_ids) in enumerate(test_loader, start=1):
            # Match the (batch, 1) shape of the predictions
            labels = labels.view(-1, 1)
            stock_predictions = model(sequence, ticker_ids)  # Forward pass
            loss = loss_function(stock_predictions, labels)

            # Weight the batch-mean loss by the real batch size
            num_in_batch = labels.size(0)
            total_loss += loss.item() * num_in_batch
            total_samples += num_in_batch

            all_predictions.append(stock_predictions)
            all_labels.append(labels)

            if log_every and idx % log_every == 0:
                seen_predictions = torch.cat(all_predictions).squeeze()
                seen_labels = torch.cat(all_labels).squeeze()
                print("f:predictions:\n {} \n  vs \n labels: {}".format(seen_predictions, seen_labels))

    # Guard against an empty loader instead of dividing by zero
    avg_loss = total_loss / total_samples if total_samples else float("nan")
    print(f"Average Loss: {avg_loss:.4f}")

In [39]:
# Evaluate the trained model on the held-out test batches.
evaluation_function(model=Stock_predictor,test_loader=test_loader,loss_function=loss_function)

f:predictions:
 tensor([0.0845, 0.0845, 0.0846, 0.0846, 0.0847, 0.0846, 0.0846, 0.0846, 0.0845,
        0.0846, 0.0846, 0.0845, 0.0845, 0.0845, 0.0845, 0.0845, 0.0844, 0.0844,
        0.0844, 0.0844, 0.0844, 0.0844, 0.0843, 0.0842, 0.0843, 0.0842, 0.0842,
        0.0841, 0.0841, 0.0842, 0.0842, 0.0841, 0.0840, 0.0840, 0.0840, 0.0840,
        0.0840, 0.0840, 0.0839, 0.0839, 0.0839, 0.0839, 0.0838, 0.0838, 0.0838,
        0.0838, 0.0839, 0.0839, 0.0840, 0.0839, 0.0838, 0.0838, 0.0838, 0.0839,
        0.0839, 0.0838, 0.0837, 0.0837, 0.0836, 0.0836, 0.0835, 0.0835, 0.0835,
        0.0833, 0.0833, 0.0832, 0.0831, 0.0831, 0.0831, 0.0831, 0.0832, 0.0832,
        0.0831, 0.0831, 0.0830, 0.0831, 0.0831, 0.0831, 0.0830, 0.0831, 0.0831,
        0.0831, 0.0831, 0.0830, 0.0831, 0.0831, 0.0832, 0.0834, 0.0836, 0.0841,
        0.0840, 0.0839, 0.0836, 0.0834, 0.0837, 0.0837, 0.0839, 0.0839, 0.0839,
        0.0840, 0.0842, 0.0844, 0.0843, 0.0843, 0.0842, 0.0841, 0.0842, 0.0844,
        0.0844, 0.0842, 