In [None]:
#Yash Oswal (yoswal@binghamton.edu)
#Praneeth Purini (ppurini@binghamton.edu)

In [None]:
#Necessary Imports
# !pip install ta
import numpy as np
import pandas as pd
import os
import pickle
import torch
# import ta
from google.colab import drive
drive.mount('/content/drive')

#Data Source Location
os.chdir("/content/drive/My Drive/mining-on-stock")

Mounted at /content/drive


Task 1 Start

In [None]:
#Fetching training data from the pkl file.
with open('training_set.pkl', 'rb') as train_file:
  train_data = pickle.load(train_file)

#We can also use pd.read_pickle fuction

In [None]:
# Concatenate the daily percentage changes for all the stocks into a single list
change_list = [df['Close'].pct_change() for df in train_data]
all_changes = sorted([item for sublist in change_list for item in sublist])
all_changes = np.nan_to_num(all_changes)

# Compute the two thresholds that divide the list into three equal-sized intervals
level1 = np.percentile(all_changes, 33.33)
level2 = np.percentile(all_changes, 66.67)

# Assign labels to the daily percentage changes for each stock
labels = []
for df in train_data:
    close_pct_change = df['Close'].pct_change() 
    label = pd.cut(close_pct_change, bins=[-np.inf, level1, level2, np.inf], labels=['decrease', 'no big change', 'increase'])
    labels.append(label)
    
# Output the thresholds and the number of data points in each level
print(f'Threshold 1: {level1:.6f}')
print(f'Threshold 2: {level2:.6f}')
labels = pd.concat(labels)
print(f'Total number of data points: {len(labels)}')
print(f'Number of data points in decrease level: {(labels == "decrease").sum()}')
print(f'Number of data points in no big change level: {(labels == "no big change").sum()}')
print(f'Number of data points in increase level: {(labels == "increase").sum()}')

Threshold 1: -0.002106
Threshold 2: 0.001981
Total number of data points: 4404000
Number of data points in decrease level: 1467857
Number of data points in no big change level: 1466290
Number of data points in increase level: 1467853


Task 1 End

Task 2 Start

In [None]:
from features import *
def feature_engineering_func():
  #These are the functions that we have considered using for making additional features in our dataset. 
  feature_funcs = [('awesome_oscillator', None),
  ('bollinger_percent_b', None),
  ('cfo', None),
  ('cmo', None),
  ('coppock_curve', None),
  ('detrended_price_oscillator', None),
  ('disparity_index', None),
  ('elder_impulse_system', None),
  ('gator_oscillator', None), 
  ('historical_volatility', None),
  ('intraday_momentum_index', None),
  ('linear_reg_slope', None),
  ('macd', None),
  ('macd_divergence', None), 
  ('momentum_indicator', None), 
  ('qstick', 14), 
  ('projected_volume_at_time', 14),
  ('projected_aggregate_volume', 14),
  ('price_volume_trend', None),
  ('positive_volume_index', None),
  ('twiggs_money_flow', None),
  ('volume_underlay', None),
  ('on_balance_volume', None),
  ('moving_average', 30),
  ('parabolic_sar', (0.02, 0.2)),
  ('pretty_good_oscillator', (10, 20)),
  ('price_momentum_oscillator', (12, 24)),
  ('price_oscillator', (12, 26, 9)),
  ('price_rate_of_change', 14),
  ('KST', (10, 15, 20, 30, 10, 10, 10, 15)),
  ('SpecialK', (10, 15, 20, 30, 10, 10, 10, 15)),
  ('ravi', (7, 65)),
  ('rsi', 14),
  ('rsi_divergence', (14, 9)),
  ('rainbow_moving_average', 30),
  ('rainbow_oscillator', (10, 1.0)),
  ('random_walk_index', 14),
  ('relative_vigor_index', 14),
  ('schaff_trend_cycle', 10),
  ('standard_deviation', 14),
  ('stochastic_divergence', 14),
  ('stochastic_momentum_index', (14, 3)),
  ('stochastic_rsi', (14, 3, 3)),
  ('stochastics', (14, 3, 3)),
  ('trix', 15),
  ('trade_volume_index', None),
  ('trend_intensity_index', 14),
  ('true_range', None),
  ('typical_price', None),
  ('ultimate_oscillator', (7, 14, 28, 4, 2, 1)),
  ('vwap', None),
  ('vertical_horizontal_filter', 28),
  ('vortex_indicator', 14),
  ('williams_r', 14),
  ('zigzag', 5),
  ('calculate_adx', None), 
  ('atr_bands', None),
  ('accumulative_swing_index', None),
  ('alligator', None),
  ('aroon', None),
  ('aroon_oscillator', None),
  ('average_true_range', None),
  ('cog', None),
  ('cpr', None),
  ('chv', None),
  ('cmf', None),
  ('ci', None),
  ('cci', None),
  ('donchian_channel', None),
  ('ehlers_fisher_transform', None),
  ('elder_ray_index', None),
  ('fractal_chaos_bands', None),
  ('fractal_chaos_oscillator', None),
  ('gopalakrishnan_range_index', None),
  ('high_minus_low', None),
  ('highest_high', None),
  ('ichimoku_clouds', None),
  ('keltner_channel', None),
  ('linear_reg_forecast', None),
  ('mass_index', None),
  ('median_price', None),
  ('money_flow_index', None)]

  #There are about 82 functions above from which we have obtained more than 100 new features.
  #With careful evaluation, we finally considered using 101 features for task-3 removing all other erratic features.
  
  # Concatenate the daily percentage changes for all the stocks into a single list
  change_list = [df['Close'].pct_change() for df in train_data]
  all_changes = sorted([item for sublist in change_list for item in sublist])
  all_changes = np.nan_to_num(all_changes)

  # Compute the two thresholds that divide the list into three equal-sized intervals
  level1 = np.percentile(all_changes, 33.33)
  level2 = np.percentile(all_changes, 66.67)

  # Assign labels to the daily percentage changes for each stock
  labels = []
  for df in train_data:
      close_pct_change = df['Close'].pct_change() 
      label = pd.cut(close_pct_change, bins=[-np.inf, level1, level2, np.inf], labels=['decrease', 'no big change', 'increase'])
      labels.append(label)
      
  # Output the thresholds and the number of data points in each level
  labels = pd.concat(labels)
  
  # Define a function to calculate the new features  
  def calculate_new_features(df):
      df_new = df.copy()
      for func in feature_funcs:
        func_name = func[0]
        func_args = func[1]        
        func_module = __import__('features', globals(), locals(), [func_name], 0) # Import the function dynamically using its name
        func_to_call = getattr(func_module, func_name)
        try:
          if func_args is None: 
              result = func_to_call(df)
          elif isinstance(func_args, tuple):
              result = func_to_call(df, *func_args)
          else:
              result = func_to_call(df, func_args)   
          if isinstance(result, pd.DataFrame):
            for i in result.columns.tolist():
              column_name = f'{func_name} ({i})'
              df_new[column_name] = result[i]
          elif isinstance(result, pd.Series):
              column_name = f'{func_name} ({result.name})'
              df_new[column_name] = result
        except AttributeError:            
            print(f"{func_name} function not found in the features module")
        except Exception as e:
            print(f"An error occurred while calling {func_name}: {str(e)}")
      return df_new

  # Create a new list to store the processed dataframes
  processed_data = []

  # Iterate over each dataframe in the list
  for df in train_data.copy():      
      df_new = calculate_new_features(df) # Apply the calculate_new_features function to the current dataframe
      processed_data.append(df_new) # Append the updated dataframe to the new list
  
  #Now checking if training data has any infinity values in them 
  list_inf = []
  for i in range(0, len(train_data)):
      inf_locs = np.where(np.isinf(train_data[i].iloc[:, :-1]))
      list_inf.append((i, inf_locs))  
  #Dropping rows that contain nan's in the label column for all the data frames in the train data.
  #Those are actually the first rows of each dataframe for which the nan's in the labels caused because of percent change.  
  i = 0
  while(i < len(train_data)):
    train_data[i] = train_data[i].dropna(subset = ['Label'])
    i = i +1
  #Now each dataframe has 2201 rows.
  #Droping 16th column as we dropeed in training
  #Droping 16th column as we dropeed in training
  i = 0
  while(i < len(train_data)):
    train_data[i] = train_data[i].drop(train_data[i].columns[16], axis =1)
    i = i +1
  list_inf = []
  for i in range(0, len(train_data)):
      inf_locs = np.where(np.isinf(train_data[i].iloc[:, :-1]))
      list_inf.append((i, inf_locs))

  #16th column is the culprit having inf values
  for i, j in list_inf:
    if len(j[0])!=0:
      k = 0
      while(k < len(train_data)):
        # select column containing infinite values
        col = train_data[k].iloc[:, j[1]]
      # replace infinite values with NaN
        col.replace([np.inf, -np.inf], np.nan, inplace=True)
      # replace NaN values with maximum value of column
        max_val = col.max()
        col.fillna(max_val, inplace=True)
        k = k + 1
        #Removing the above obtained features

  for i in range(0, len(train_data)):
    for k in ['coppock_curve (coppock)','twiggs_money_flow (Twiggs Money Flow)' ,'volume_underlay (Volume Ratio MA)', 'stochastic_rsi (Close)', 'vwap (None)', 'donchian_channel (upper_dc)', 'donchian_channel (lower_dc)', 'ichimoku_clouds (senkou_span_b)']:
      train_data[i] = train_data[i].drop(k, axis=1)
  #10
  for i in range(0, len(train_data)):
    for j in ['cfo (cfo)', 'ichimoku_clouds (chikou_span)', 'cci (cci)', 'chv (chv)', 'cmf (cmf)', 'money_flow_index (Money Flow Index)', 'relative_vigor_index (None)']:
      train_data[i][j].fillna(method='bfill', inplace=True)
    for j in ['KST (Close)', 'SpecialK (Close)', 'calculate_adx (ADX)']:
      train_data[i][j].fillna(method='ffill', inplace=True)
  #14
  for i in range(0, len(train_data)):
    for j in range(0, train_data[0].shape[1]):
      train_data[i].iloc[:, j].fillna(method='bfill', inplace=True)
      train_data[i].iloc[:, j].fillna(method='ffill', inplace=True)
      train_data = pd.concat(train_data, axis =0)
  return train_data

In [13]:
#Load Feature Engineered Stock dataset: 100 Features including labels
with open('data_with_features_clean_train.pkl', 'rb') as train_file: #To do: Change file name.
  train_data = pickle.load(train_file)
del train_file

In [None]:
#Last 5 data points of the first stock
print(f'Last 5 datapoints of the first stock among 2000 stocks')
stock_len = 2201
train_data.iloc[stock_len-5: stock_len] #To do: Once the concatenated file is ready.

Last 5 datapoints of the first stock among 2000 stocks


Unnamed: 0,Open,High,Low,Close,Volume,awesome_oscillator (AO),bollinger_percent_b (percent_b),cfo (cfo),cmo (cmo),detrended_price_oscillator (dpo),...,ichimoku_clouds (senkou_span_a),ichimoku_clouds (chikou_span),keltner_channel (Keltner Center),keltner_channel (Keltner Upper),keltner_channel (Keltner Lower),linear_reg_forecast (LRF),mass_index (Mass_Index),median_price (Median Price),money_flow_index (Money Flow Index),Label
2197,0.680849,0.683633,0.672997,0.674055,0.351048,0.010157,0.744441,0.068595,21.168686,0.004813,...,0.683299,0.689504,0.677062,0.692473,0.66165,0.689504,1.080827,0.678315,7.843752,decrease
2198,0.674389,0.67489,0.671326,0.674055,0.143069,0.01002,0.72316,0.068595,19.300231,0.005677,...,0.682909,0.689504,0.677206,0.692105,0.662307,0.689504,0.9666,0.673108,7.179743,no big change
2199,0.67411,0.676227,0.67244,0.676004,0.312048,0.010509,0.755154,0.068595,21.857938,0.008792,...,0.682505,0.689504,0.677414,0.691979,0.662849,0.689504,0.896525,0.674333,7.179743,increase
2200,0.675725,0.675837,0.66882,0.669321,0.27923,0.008306,0.536443,0.068595,18.491048,0.003247,...,0.682505,0.689504,0.677475,0.692413,0.662537,0.689504,0.925453,0.672328,7.535366,decrease
2201,0.669293,0.671326,0.662973,0.664198,0.552933,0.005024,0.362143,0.068595,15.904361,-0.000906,...,0.679735,0.689504,0.677201,0.691448,0.662953,0.689504,0.977753,0.667149,7.707911,decrease


Task 2 End

Custom DataLoader Class to process, load the data in chunks

---



In [None]:
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
import multiprocessing

class MyDataset(Dataset):
    def __init__(self, df, stock_len, seq_length):
        self.X = []
        self.y = []
        
        labels = df.iloc[:, -1].values # Extract labels from the last column
        cnt = len(labels)
        
        df = df.iloc[:, :-1] # Remove labels column from the DataFrame        
        values = torch.tensor(df.values) # Convert DataFrame to tensor
        
        # Create input/output sequences
        while cnt != 0:
          for i in range(seq_length, stock_len):
            self.X.append(values[i-seq_length:i, :])
            self.y.append(labels[i])
          cnt -= stock_len
        
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __len__(self):
        return len(self.X)

def data_generator(train_data, stock_len, seq_length, batch_size, num_workers):
    train_dataset = MyDataset(train_data, stock_len, seq_length)
    
    # Create DataLoader
    # Optimzation 1: Dataset and Dataloaders for batch processing
    # Optimization 2: (num_of_workers, pin_memory(GPU specific))    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)

    for X_batch, y_batch in train_loader:
        # Convert labels to tensor
        y_train = np.array(y_batch)
        labels_encoded = torch.zeros((len(y_train), 3), dtype=torch.float32)
        labels_encoded[y_train == 'increase', 2] = 1
        labels_encoded[y_train == 'no big change', 1] = 1
        labels_encoded[y_train == 'decrease', 0] = 1        
        y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()

        yield X_batch, y_train_tensor

    # Free up memory
    del train_dataset, train_loader
    del labels_encoded
    del y_train, X_batch, y_batch, y_train_tensor

def prepare_train_data(train_data = None, stock_len=2201, seq_length = 100, batch_size = 32):
  num_workers=multiprocessing.cpu_count() #parallel computing

  # Optimization 4 (Using data generator)
  generator = data_generator(train_data, stock_len, seq_length, batch_size, num_workers) 

  return generator

In [None]:
#num_layers = 1
#nn.dropout(p = 0.2)
def perform_task_3_train():
  for itr in range(0, 10):
    class LSTM1(nn.Module):
      def __init__(self, input_dim=params_dict["input_dim"][itr], 
                  hidden_dim_LSTM=params_dict["hidden_dim_LSTM"][itr],
                  hidden_dim_FC=params_dict["hidden_dim_FC"][itr],
                  output_dim=params_dict["output_dim"][itr],
                  num_LSTM_layers=params_dict["num_layers_LSTM"][itr],
                  num_layers_FC=params_dict["num_layers_FC"][itr]):
          
            super(LSTM1, self).__init__()
            self.lstm = nn.LSTM(input_dim, hidden_dim_LSTM, num_layers=num_LSTM_layers, batch_first=True) #num_layers = 2
            self.dropout = nn.Dropout(p=0.2)
            self.fc = nn.Linear(hidden_dim_LSTM, output_dim)
            self.softmax = nn.Softmax(dim=1)
            self.num_LSTM_layers = num_LSTM_layers
            self.hidden_dim_LSTM = hidden_dim_LSTM 
            
            # Use Xavier initialization for weights
            init.xavier_uniform_(self.lstm.weight_ih_l0)
            init.orthogonal_(self.lstm.weight_hh_l0)
            init.constant_(self.lstm.bias_ih_l0, 0.0)
            init.constant_(self.lstm.bias_hh_l0, 0.0)
            init.xavier_uniform_(self.fc.weight)
            init.constant_(self.fc.bias, 0.0)

      def forward(self, x):
          h0 = torch.zeros(self.num_LSTM_layers, x.size(0), self.hidden_dim_LSTM).requires_grad_().to(device)
          c0 = torch.zeros(self.num_LSTM_layers, x.size(0), self.hidden_dim_LSTM).requires_grad_().to(device)
          out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
          out = self.dropout(out)
          out = self.fc(out[:, -1, :])
          out = out.view(-1, 3)
          out = self.softmax(out)
          return out
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  #--------------------------------------------------------------------------------------------------------------------------------------------  
    def train_model1(batch_generator, model_id=params_dict["model_id"][itr], 
                  input_dim=params_dict["input_dim"][itr], 
                  learning_rate=params_dict["learning rate"][itr], 
                  num_epochs=params_dict["num_epochs"][itr], 
                  hidden_dim_LSTM=params_dict["hidden_dim_LSTM"][itr],
                  output_dim=params_dict["output_dim"][itr]):
          # Evaluate each model for 10 epochs
          # Initialize the model
          learning_rate = learning_rate
          num_epochs = num_epochs
          input_dim = input_dim
          hidden_dim_LSTM = hidden_dim_LSTM
          # output_dim = y_train_tensor.shape[1]
          output_dim = 3

          # Define the loss function and optimizer
          criterion = nn.CrossEntropyLoss()  

          # Initialize the model
          print('Model no', model_id)
          model = LSTM1(input_dim, hidden_dim_LSTM, output_dim).to(device)

          # Define the optimizer and scheduler for the current model
          optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
          scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

          # Checkpointing parameters for the current model
          checkpoint_interval = 1
          checkpoint_file = f'lstm_task_3_checkpoint_{model_id}.pth'

          # Define early stopping parameters
          best_loss = float('inf')
          early_stop_counter = 0
          early_stop_patience = 5

          # Train the current model for 10 epochs
          for epoch in range(num_epochs):
              # Set model to train mode
              model.train()
              # model.double()

              # Iterate over batches
              for x_batch, y_batch in tqdm(generator):
                  # Move data to device
                  x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                  # Cast x_batch to float32
                  x_batch = x_batch.float()
                  print(x_batch.shape)

                  # Forward pass
                  outputs = model(x_batch)
                  loss = criterion(outputs, y_batch)

                  # Backward and optimize
                  optimizer.zero_grad()
                  loss.backward()
                  optimizer.step()

              # Print the loss for every 10 epochs
              if (epoch + 1) % 10 == 0:
                  print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

              # Check if current model has reached early stopping criteria
              print(loss.item())
              if loss.item() < best_loss:
                  best_loss = loss.item()
                  early_stop_counter = 0
              else:
                  early_stop_counter += 1

              if early_stop_counter >= early_stop_patience:
                  print(f"Training for model {itr} stopped early at epoch {epoch+1} due to early stopping")
                  break

              # Save checkpoint at regular intervals
              if (epoch + 1) % checkpoint_interval == 0:
                  torch.save({
                      'epoch': epoch + 1,
                      'model_state_dict': model.state_dict(),
                      'optimizer_state_dict': optimizer.state_dict(),
                      'loss': loss.item(),
                  }, checkpoint_file)

              # Step the scheduler
              scheduler.step()

          # Delete variables to free up memory
          del model, optimizer, scheduler
  #------------------------------------------------------------------------------------------------------------------------------------------------------------
    # Empty the cache to free up GPU memory
    torch.cuda.empty_cache()
    seq_length = 100 # Define sequence length
    batch_size = 32 # Create data loader
    stocks_500 = 2201*500
    generator = prepare_train_data(train_data.iloc[:stocks_500, :], 2201, seq_length, batch_size)
  #------------------------------------------------------------------------------------------------------------------------------------------------------------
    train_model1(generator, itr)
# perform_task_3_train()

In [11]:
num_layers_FC = ([1]*5) + ([2]*5)
num_layers_FC.reverse()
hidden_dim_LSTM = [i for i in range(21, 31)]
hidden_dim_LSTM.reverse()
params_dict = {"input_dim": [100]*10, 
               "hidden_dim_FC": [i for i in range(21, 31)],
               "output_dim": [3]*10, 
               "num_layers_LSTM":([1]*5) + ([2]*5), 
               "learning rate":[0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09], 
               "num_epochs": [i for i in range(10, 21)],
               "model_id":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] }
params_dict["num_layers_FC"] = num_layers_FC
params_dict["hidden_dim_LSTM"] = hidden_dim_LSTM 

In [34]:
metrics = []
#Load Feature Engineered Stock dataset: 100 Features including labels
with open('data_with_features_clean_test.pkl', 'rb') as test_file: #To do: Change file name.
  test_data = pickle.load(test_file)
del test_file

def perform_task_3():
  for itr2 in range(0, 10):
    class LSTM1(nn.Module):
      def __init__(self, input_dim=params_dict["input_dim"][itr2], 
                  hidden_dim_LSTM=params_dict["hidden_dim_LSTM"][itr2],
                  hidden_dim_FC=params_dict["hidden_dim_FC"][itr2],
                  output_dim=params_dict["output_dim"][itr2],
                  num_LSTM_layers=params_dict["num_layers_LSTM"][itr2],
                  num_layers_FC=params_dict["num_layers_FC"][itr2]):
          
            super(LSTM1, self).__init__()
            self.lstm = nn.LSTM(input_dim, hidden_dim_LSTM, num_layers=num_LSTM_layers, batch_first=True) #num_layers = 2
            self.dropout = nn.Dropout(p=0.2)
            self.fc = nn.Linear(hidden_dim_LSTM, output_dim)
            self.softmax = nn.Softmax(dim=1)
            self.num_LSTM_layers = num_LSTM_layers
            self.hidden_dim_LSTM = hidden_dim_LSTM 
            
            # Use Xavier initialization for weights
            init.xavier_uniform_(self.lstm.weight_ih_l0)
            init.orthogonal_(self.lstm.weight_hh_l0)
            init.constant_(self.lstm.bias_ih_l0, 0.0)
            init.constant_(self.lstm.bias_hh_l0, 0.0)
            init.xavier_uniform_(self.fc.weight)
            init.constant_(self.fc.bias, 0.0)

      def forward(self, x):
          h0 = torch.zeros(self.num_LSTM_layers, x.size(0), self.hidden_dim_LSTM).requires_grad_().to(device)
          c0 = torch.zeros(self.num_LSTM_layers, x.size(0), self.hidden_dim_LSTM).requires_grad_().to(device)
          out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
          out = self.dropout(out)
          out = self.fc(out[:, -1, :])
          out = out.view(-1, 3)
          out = self.softmax(out)
          return out
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  #------------------------------------------------------------------------------------------------------------------------------------------  
    def perform_validation1(num_models=1, file_desc=itr2, data_generator=None):
      # Evaluate each model on the primary training set and the validation set  
      checkpoint_file = f'lstm_task_3_checkpoint_{file_desc}.pth' # Load the trained model from the checkpoint file
      checkpoint = torch.load(checkpoint_file, map_location=torch.device('cpu'))  
      model = LSTM1(params_dict["input_dim"][itr2], params_dict["hidden_dim_LSTM"][itr2], 3).to(device)
      model.load_state_dict(checkpoint['model_state_dict'])

      # Evaluate the current model on the passed data  
      result_pred_labels = []  
      precisions = []
      accuracies = []
      for X_data, y_data in data_generator:        
        with torch.no_grad():
          X_data = X_data.float()
          outputs = model.forward(X_data.to(device))

        # Convert predicted and true labels to one-hot encodings
        predicted_labels = torch.argmax(outputs, dim=1).to(device)
        true_labels = torch.argmax(y_data, dim=1).to(device)

        # Calculate precision for each class    
        for i in range(3):
          if i == 2: # positive class
              true_positives = torch.sum((predicted_labels == i) & (true_labels == i))
              false_positives = torch.sum((predicted_labels == i) & (true_labels != 2))
              precision = true_positives.float() / (true_positives + false_positives).float()            
              accuracies.append(torch.mean((predicted_labels == true_labels).float() * (true_labels == i).float()))
          else: # negative class
              true_negatives = torch.sum((predicted_labels == i) & (true_labels == i) & (true_labels != 2))
              false_positives = torch.sum((predicted_labels == i) & (true_labels != i) & (true_labels != 2))
              precision = true_negatives.float() / (true_negatives + false_positives).float()
              accuracies.append(torch.mean((predicted_labels == true_labels).float() * (true_labels != i).float()))
          if not torch.isnan(precision):        
            precisions.append(precision)
      # Calculate average precision and accuracy
      avg_precision = torch.mean(torch.tensor(precisions))
      avg_accuracy = torch.mean(torch.tensor(accuracies))

      # Calculate percentage of positive predictions
      percent_positive = torch.mean((predicted_labels == 2).float())
      
      # Free Memory  
      del model, outputs, X_data, y_data, true_labels, predicted_labels, precisions, accuracies, true_negatives, false_positives, precision
      del result_pred_labels
      torch.cuda.empty_cache()
      return [avg_precision, avg_accuracy, percent_positive]
  #-----------------------------------------------------------------------------------------------------------------------------------------------  
    start = 0
    curr = {}
    generator = prepare_train_data(train_data.iloc[500*2201:599*2201, :], 2201, seq_length=100, batch_size=1000)
  #----------------------------------------------------------------------------------------------------------------------------------------------------
    curr["train"] = perform_validation1(num_models=1, file_desc=itr2, data_generator=generator)
    
    generator = prepare_train_data(test_data, 2201, seq_length=100, batch_size=1000)
  #----------------------------------------------------------------------------------------------------------------------------------------------------
    curr["val"] = perform_validation1(num_models=1, file_desc=itr2, data_generator=generator)
    metrics.append(curr)
  return metrics

metrics = perform_task_3()

  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()
  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()
  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()
  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()
  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()
  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()


In [39]:
def print_table_task3(metrics):
    header = ['Model', 'Train', '', '', 'Validation', '', '', '', '']
    subheader = ['', '', 'Precision', 'Accuracy', '% Positive', 'Precision', 'Accuracy', '% Positive']
    print(" {:<10} {:<10} {:<14} {:<10} {:<10} {:<14} {:<14} {:<14}".format(*header))
    print("{:<10} {:<0} {:<10} {:<10} {:<14} {:<10} {:<10} {:<14}".format(*subheader))

    for i in range(len(metrics)):
        row =[i+1] + [f"{val:.4f}" for val in metrics[i]['train']] + [f"{val:.4f}" for val in metrics[i]['val']]
        print("{:<11} {:<10} {:<10} {:<14} {:<10} {:<10} {:<14}".format(*row))
    return
print_table_task3(metrics)

 Model      Train                                Validation                                             
            Precision  Accuracy   % Positive     Precision  Accuracy   % Positive    
1           0.3991     0.1421     0.3694         0.4851     0.1936     0.3154        
2           0.3744     0.2222     0.0050         0.4613     0.0632     0.0140        
3           0.5268     0.2455     0.0140         0.3223     0.0521     0.0020        
4           0.6488     0.2458     0.0000         0.2485     0.0509     0.0000        
5           0.5892     0.2456     0.0010         0.3069     0.0511     0.0000        
6           0.8606     0.2458     0.0000         0.2485     0.0509     0.0000        
7           0.8606     0.2458     0.0000         0.2485     0.0509     0.0000        
8           0.8606     0.2458     0.0000         0.2484     0.0509     0.0000        
9           0.5507     0.2455     0.0000         0.2484     0.0509     0.0000        
10          0.5036     0.2024     0

Task 3 Start

TASK 3 End

LSTM ARCHITECTURE

---



In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score
from tqdm import tqdm

class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, isBatchNorm):
        super(LSTM, self).__init__()
        self.isBatchNorm = isBatchNorm
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True) #num_layers = 2
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
        if self.isBatchNorm:
          self.bn = nn.BatchNorm1d(num_features=30) #Added New
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
        # Use Xavier initialization for weights
        init.xavier_uniform_(self.lstm.weight_ih_l0)
        init.orthogonal_(self.lstm.weight_hh_l0)
        init.constant_(self.lstm.bias_ih_l0, 0.0)
        init.constant_(self.lstm.bias_hh_l0, 0.0)
        init.xavier_uniform_(self.fc.weight)
        init.constant_(self.fc.bias, 0.0)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_dim).requires_grad_().to(device)
        c0 = torch.zeros(1, x.size(0), self.hidden_dim).requires_grad_().to(device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        if self.isBatchNorm:
          out = self.bn(out) #Added New
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        out = self.relu(out)
        out = self.softmax(out)
        out = out.view(-1, 3)
        return out

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Helper Function to train the model architecture

---



In [None]:
def train_model(model_id, generator, param_dict):
  # Evaluate each model for 10 epochs
  
  # Initialize the model
  learning_rate = param_dict["learning_rate"]
  num_epochs = param_dict["num_epochs"]
  input_dim = param_dict["input_dim"]
  hidden_dim = param_dict["hidden_dim"] #64
  output_dim = 3
  optimizer_name = param_dict["optimizer_name"]
  isBatchNorm = param_dict["isBatchNorm"]
  
  # Define the loss function and optimizer
  criterion = nn.CrossEntropyLoss()  

  # Initialize the model
  print('Model no', model_id)
  model = LSTM(input_dim, hidden_dim, output_dim, isBatchNorm).to(device)

  # Define the optimizer and scheduler for the current model
  # Define the optimizer and scheduler for the current model
  if optimizer_name == "Adam":
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  elif optimizer_name == "SGD":
      optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
  else:
      raise ValueError(f"Unsupported optimizer {optimizer_name}")
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1) #Step Size for learning rate.

  # Checkpointing parameters for the current model
  checkpoint_interval = 1
  checkpoint_file = f'lstm_checkpoint_{model_id}.pth'

  # Define early stopping parameters
  best_loss = float('inf')
  early_stop_counter = 0
  early_stop_patience = 3

  # Train the current model for 10 epochs
  for epoch in range(num_epochs):
    # Set model to train mode
    model.train()
    # model.double()

    # Iterate over batches
    for x_batch, y_batch in tqdm(generator):
        # Move data to device
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        # Cast x_batch to float32        
        x_batch = x_batch.float()
        
        # Forward pass
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss for every 10 epochs
    if (epoch + 1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

    # Check if current model has reached early stopping criteria    
    if loss.item() < best_loss:
        best_loss = loss.item()
        early_stop_counter = 0
    else:
        early_stop_counter += 1

    if early_stop_counter >= early_stop_patience:
        print(f"Training for model {model_id} stopped early at epoch {epoch+1} due to early stopping")
        break

    # Save checkpoint at regular intervals
    if (epoch + 1) % checkpoint_interval == 0:
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.item(),
            # 'bn_state_dict': model.bn.state_dict()
        }, checkpoint_file)

    # Step the scheduler
    scheduler.step()

  # Delete variables to free up memory
  del model, optimizer, scheduler

  # Empty the cache to free up GPU memory
  torch.cuda.empty_cache()


Function performs two steps
1. Incremental PCA
2. SelectKBest
---





In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import IncrementalPCA
from sklearn.feature_selection import SelectKBest, f_regression

def perform_PCA(data = None, topK = 10, topPCA=20):
  # Separate the target variable (stock price) from the features
  y = data['Label']
  X = data.drop(['Label'], axis=1)

  # Split the dataset into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

  # Instantiate IncrementalPCA and fit to the training data in chunks
  batch_size = 1000
  
  ipca = IncrementalPCA(n_components=topPCA, batch_size=batch_size)
  for i in range(0, len(X_train), batch_size):
    X_train_batch = X_train.iloc[i:i+batch_size]
    ipca.partial_fit(X_train_batch)

  # Transform the training and testing data in chunks
  X_train_pca = []
  for i in range(0, len(X_train), batch_size):
    X_train_batch = X_train.iloc[i:i+batch_size]
    X_train_pca_batch = ipca.transform(X_train_batch)
    X_train_pca.append(X_train_pca_batch)
  X_train_pca = np.vstack(X_train_pca)

  X_test_pca = []
  for i in range(0, len(X_test), batch_size):
    X_test_batch = X_test.iloc[i:i+batch_size]
    X_test_pca_batch = ipca.transform(X_test_batch)
    X_test_pca.append(X_test_pca_batch)
  X_test_pca = np.vstack(X_test_pca)
  
  # Instantiate SelectKBest and fit to the training data
  selector = SelectKBest(f_regression, k=topK)  
  selector.fit(X_train_pca, y_train.cat.codes.astype('int').to_numpy())

  # Transform the training and testing data
  X_train_selected = selector.transform(X_train_pca)
  X_test_selected = selector.transform(X_test_pca)

  selected_indices = selector.get_support(indices=True)

  # Select only the columns with the selected indices
  X_train_pca_df = pd.DataFrame(data=X_train_selected, columns=X_train.columns[selected_indices])
  X_test_pca_df = pd.DataFrame(data=X_test_selected, columns=X_train.columns[selected_indices])

  # Assign labels to X_train_pca_df
  X_train_pca_df['Label'] = y_train.values
  X_test_pca_df['Label'] = y_test.values
  
  del X_train_selected, X_train_pca, X_train_batch
  del X_test_selected, X_test_pca, X_test_batch
  
  return X_train_pca_df, X_test_pca_df

Helper function to calculate the 3 metrics
1. Preciosn
2. Accuracy
3. Percentage of Positive Prediction

---



In [None]:
def perform_validation(num_models=1, file_desc=None, data_generator=None, param_dict=None):
  # Evaluate each model on the primary training set and the validation set  
  checkpoint_file = f'lstm_checkpoint_{file_desc}.pth' # Load the trained model from the checkpoint file
  checkpoint = torch.load(checkpoint_file, map_location=torch.device('cpu'))  
  model = LSTM(param_dict["input_dim"], param_dict["hidden_dim"], 3, param_dict["isBatchNorm"]).to(device)
  model.load_state_dict(checkpoint['model_state_dict'])
  isValidation = param_dict["isValidation"]
  isSkipResult = param_dict["isSkipResult"]

  # Evaluate the current model on the passed data  
  result_pred_labels = []  
  precisions = []
  accuracies = []
  for X_data, y_data in data_generator:        
    with torch.no_grad():
      X_data = X_data.float()
      outputs = model.forward(X_data.to(device))

    # Convert predicted and true labels to one-hot encodings
    predicted_labels = torch.argmax(outputs, dim=1).to(device)
    if not isSkipResult:
      result_pred_labels.extend(predicted_labels)
    true_labels = torch.argmax(y_data, dim=1).to(device)

    # Calculate precision for each class    
    for i in range(3):
      if i == 2: # positive class
          true_positives = torch.sum((predicted_labels == i) & (true_labels == i))
          false_positives = torch.sum((predicted_labels == i) & (true_labels != 2))
          precision = true_positives.float() / (true_positives + false_positives).float()
          accuracies.append(torch.mean((predicted_labels == true_labels).float() * (true_labels == i).float()))
      else: # negative class
          true_negatives = torch.sum((predicted_labels == i) & (true_labels == i) & (true_labels != 2))
          false_positives = torch.sum((predicted_labels == i) & (true_labels != i) & (true_labels != 2))
          precision = true_negatives.float() / (true_negatives + false_positives).float()
          accuracies.append(torch.mean((predicted_labels == true_labels).float() * (true_labels != i).float()))
      precisions.append(precision)

  # Calculate average precision and accuracy
  avg_precision = torch.mean(torch.tensor(precisions))
  avg_accuracy = torch.mean(torch.tensor(accuracies))

  # Calculate percentage of positive predictions
  percent_positive = torch.mean((predicted_labels == 2).float())
  
  if not isSkipResult:
    if isValidation:
      predictions[1].append(result_pred_labels)
    else:
      predictions[0].append(result_pred_labels)
  
  # Free Memory  
  del model, outputs, X_data, y_data, true_labels, predicted_labels, precisions, accuracies, true_negatives, false_positives, precision
  del result_pred_labels
  torch.cuda.empty_cache()
  return [avg_precision, avg_accuracy, percent_positive]

Perform Voting

In [None]:
def perform_voting(final_predictions, batch_generator):
  # Voting Technique
  cnt = 0
  precisions = []
  accuracies = []
  for X_batch, y_batch in batch_generator:    
    pred = final_predictions[cnt:cnt+len(y_batch)]
    true_labels = torch.argmax(y_batch, dim=1).to(device)
    for i in range(3):
      if i == 2: # positive class
          true_positives = torch.sum((pred == i) & (true_labels == i))
          false_positives = torch.sum((pred == i) & (true_labels != 2))
          precision = true_positives.float() / (true_positives + false_positives).float()
          accuracies.append(torch.mean((pred == true_labels).float() * (true_labels == i).float()))
      else: # negative class
          true_negatives = torch.sum((pred == i) & (true_labels == i) & (true_labels != 2))
          false_positives = torch.sum((pred == i) & (true_labels != i) & (true_labels != 2))
          precision = true_negatives.float() / (true_negatives + false_positives).float()
          accuracies.append(torch.mean((pred == true_labels).float() * (true_labels != i).float()))
      precisions.append(precision)
    
    cnt += len(y_batch)

  # Calculate average precision and accuracy
  avg_precision = torch.mean(torch.tensor(precisions))
  avg_accuracy = torch.mean(torch.tensor(accuracies))

  # Calculate percentage of positive predictions
  percent_positive = torch.mean((pred == 2).float())
  del X_batch, y_batch, true_labels, precisions, accuracies, true_negatives, false_positives, precision, pred
  torch.cuda.empty_cache()
  return [avg_precision, avg_accuracy, percent_positive]  

Helper Function to Print Metric Table

In [None]:
def print_table(metrics):
    header = ['Model', 'Train', '', '', 'Validation', '', '', '', '']
    subheader = ['', '', 'Precision', 'Accuracy', '% Positive', 'Precision', 'Accuracy', '% Positive']
    print(" {:<10} {:<10} {:<14} {:<10} {:<10} {:<14} {:<14} {:<14}".format(*header))
    print("{:<10} {:<0} {:<10} {:<10} {:<14} {:<10} {:<10} {:<14}".format(*subheader))

    for i in range(len(metrics)):
        row =[i+1] + [f"{val:.4f}" for val in metrics[i]['train']] + [f"{val:.4f}" for val in metrics[i]['val']]
        print("{:<11} {:<10} {:<10} {:<14} {:<10} {:<10} {:<14}".format(*row))
    return

In [None]:
def print_test_table(metrics):
    header = ['Model', 'Validation', '', '', '', '']
    subheader = ['', '', 'Precision', 'Accuracy', '% Positive']
    print(" {:<10} {:<10} {:<14} {:<14} {:<14}".format(*header))
    print("{:<10} {:<0} {:<10} {:<10} {:<14}".format(*subheader))

    for i in range(len(metrics)):
        row =[i+1]  + [f"{val:.4f}" for val in metrics[i]['val']]
        print("{:<11} {:<10} {:<10} {:<14}".format(*row))
    return

All the required functions are defined above these cell.

Task 4 Start

In [None]:
#Perform PCA and Feature Selection to get top n features from m selected Principal Components (Upto 10 models trained).
X_train_pca_df, X_test_pca_df = perform_PCA(train_data[:2201*800], 10, 20) 

#Train atleast 10 models on the PCA Transformed data.
seq_length = 30 # Define sequence length
batch_size = stock_len # Create data loader

param_dict = {
    "input_dim": 10,
    "hidden_dim": 32,
    "learning_rate": 0.0001,
    "num_epochs": 20,
    "optimizer_name": 'Adam',
    "isBatchNorm": False    
}

cnt = 0
while cnt < 10:
  #Generator function to load data in batches/chunks and get max performance
  generator = prepare_train_data(X_train_pca_df, 2201, seq_length, batch_size)
  train_model(f'{cnt}_pca_800_step4', generator, param_dict)
  del generator
  cnt += 1

Model no 0_pca_800_step4


  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
632it [04:06,  2.56it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 0_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 1_pca_800_step4


632it [03:57,  2.66it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 1_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 2_pca_800_step4


632it [04:16,  2.46it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 2_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 3_pca_800_step4


632it [04:37,  2.28it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 3_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 4_pca_800_step4


632it [03:50,  2.74it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 4_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 5_pca_800_step4


632it [04:04,  2.59it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 5_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 6_pca_800_step4


632it [03:55,  2.68it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 6_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 7_pca_800_step4


632it [04:12,  2.50it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 7_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 8_pca_800_step4


632it [03:56,  2.67it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training for model 8_pca_800_step4 stopped early at epoch 4 due to early stopping
Model no 9_pca_800_step4


632it [04:13,  2.49it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]

Training for model 9_pca_800_step4 stopped early at epoch 4 due to early stopping





In [None]:
# Concatenate tensors of empty(Provding early shape can provide performance improvement)
predictions = [[],[]]
metrics = []
param_dict = {
    "input_dim": 10,
    "hidden_dim": 32,
    "learning_rate": 0.0001,
    "num_epochs": 20,
    "optimizer_name": 'Adam',
    "isBatchNorm": False,
    "isValidation": False,
    "isSkipResult": False
}

cnt = 0
while cnt < 10:
  curr = {}
  batch_generator = prepare_train_data(X_train_pca_df.iloc[:2201*200], 2201, 30, stock_len) #Using subset of training data due to ram issue but model trained on huge data.
  param_dict["isValidation"] = False
  curr["train"] = perform_validation(1, f'{cnt}_pca_800_step4', batch_generator, param_dict)
  batch_generator = prepare_train_data(X_test_pca_df, 2201, 30, stock_len)
  param_dict["isValidation"] = True
  curr["val"] = perform_validation(1, f'{cnt}_pca_800_step4', batch_generator, param_dict)
  metrics.append(curr)
  del batch_generator, curr
  cnt += 1
print_table(metrics)

  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()
  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()


 Model      Train                                Validation                                             
            Precision  Accuracy   % Positive     Precision  Accuracy   % Positive    
1           0.5628     0.2093     0.2305         0.4165     0.1700     0.1758        
2           0.5736     0.2018     0.1857         0.4398     0.1899     0.1093        
3           0.5867     0.2137     0.2537         0.4476     0.1933     0.1864        
4           0.5748     0.2207     0.2736         0.4451     0.1967     0.1686        
5           0.5654     0.1910     0.1360         0.4595     0.1791     0.0987        
6           0.5430     0.2157     0.2919         0.4328     0.1891     0.2141        
7           0.5520     0.2315     0.3118         0.4502     0.1904     0.1830        
8           0.5834     0.2091     0.1874         0.4239     0.1899     0.1326        
9           0.5549     0.2398     0.3599         0.4110     0.2017     0.2551        
10          0.5871     0.1955     0

In [None]:
# Concatenate tensors of empty(Provding early shape can provide performance improvement)
predictions = [[],[]]
metrics = []
param_dict = {
    "input_dim": 10,
    "hidden_dim": 32,
    "learning_rate": 0.0001,
    "num_epochs": 20,
    "optimizer_name": 'Adam',
    "isBatchNorm": False,
    "isValidation": False,
    "isSkipResult": False
}

cnt = 0
while cnt < 10:
  curr = {}
  batch_generator = prepare_train_data(X_train_pca_df.iloc[:2201*200], 2201, 30, stock_len) #Using subset of training data due to ram issue but model trained on huge data.
  param_dict["isValidation"] = False
  curr["train"] = perform_validation(1, f'{cnt}_pca_800', batch_generator, param_dict)
  batch_generator = prepare_train_data(X_test_pca_df, 2201, 30, stock_len)
  param_dict["isValidation"] = True
  curr["val"] = perform_validation(1, f'{cnt}_pca_800', batch_generator, param_dict)
  metrics.append(curr)
  del batch_generator, curr
  cnt += 1
print_table(metrics)

 Model      Train                                Validation                                             
            Precision  Accuracy   % Positive     Precision  Accuracy   % Positive    
1           0.5828     0.2209     0.2852         0.4429     0.1788     0.1320        
2           0.5894     0.2213     0.2852         0.4291     0.1925     0.1398        
3           0.5534     0.1900     0.1708         0.4150     0.1499     0.1120        
4           0.6004     0.2360     0.3068         0.4236     0.1973     0.2080        
5           0.5987     0.2190     0.2172         0.4439     0.1868     0.1375        
6           0.5907     0.2103     0.2156         0.4269     0.1790     0.1337        
7           0.5778     0.2157     0.2388         0.4240     0.1807     0.1608        
8           0.5544     0.2123     0.2272         0.4101     0.1682     0.2546        
9           0.6055     0.2148     0.1990         0.4450     0.1870     0.1021        
10          0.5791     0.2237     0

Task 4 End

Task 5 Start

In [None]:
#Voting
from collections import Counter

curr = {}
for i in range(2):
  final_predictions = [Counter(p).most_common(1)[0][0] for p in zip(*predictions[i])]
  final_predictions = torch.stack(final_predictions)
  if i == 0:
    batch_generator = prepare_train_data(X_train_pca_df.iloc[:2201*200], 2201, 30, stock_len)
    curr["train"] = perform_voting(final_predictions, batch_generator)
  elif i == 1:
    batch_generator = prepare_train_data(X_test_pca_df, 2201, 30, stock_len)
    curr["val"] = perform_voting(final_predictions, batch_generator)
print_table([curr])
del final_predictions, batch_generator, curr

  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()


 Model      Train                                Validation                                             
            Precision  Accuracy   % Positive     Precision  Accuracy   % Positive    
1           0.5828     0.2209     0.2852         0.4429     0.1788     0.1320        


In [None]:
#Voting 2
from collections import Counter

curr = {}
for i in range(2):
  final_predictions = [Counter(p).most_common(1)[0][0] for p in zip(*predictions[i])]
  final_predictions = torch.stack(final_predictions)
  if i == 0:
    batch_generator = prepare_train_data(X_train_pca_df.iloc[:2201*200], 2201, 30, stock_len)
    curr["train"] = perform_voting(final_predictions, batch_generator)
  elif i == 1:
    batch_generator = prepare_train_data(X_test_pca_df, 2201, 30, stock_len)
    curr["val"] = perform_voting(final_predictions, batch_generator)
print_table([curr])
del final_predictions, batch_generator, curr

  y_train_tensor = torch.tensor(labels_encoded).float().clone().detach()


 Model      Train                                Validation                                             
            Precision  Accuracy   % Positive     Precision  Accuracy   % Positive    
1           0.5628     0.2093     0.2305         0.4165     0.1700     0.1758        


Task 5 End

Validaiton Set Testing on Best Model

In [None]:
def perform_test_on_validation_set():
  
  #Load Feature Engineered Stock dataset: 100 Features including labels
  with open('data_with_features_clean_test.pkl', 'rb') as test_file: #To do: Change file name.
    test_data = pickle.load(test_file)
  del test_file  

  #Perform PCA and Feature Selection to get top n features from m selected Principal Components.
  X_train_pca_df, X_test_pca_df = perform_PCA(test_data, 10, 20)
  dataset = pd.concat([X_train_pca_df, X_test_pca_df])
  del X_train_pca_df, X_test_pca_df, test_data

  #Call Evaluation
  predictions = [[],[]]
  metrics = []
  param_dict = {
      "input_dim": 10,
      "hidden_dim": 32,
      "learning_rate": 0.0001,
      "num_epochs": 10,
      "optimizer_name": 'Adam',
      "isBatchNorm": False,
      "isValidation": True,
      "isSkipResult": True
  }
  batch_generator = prepare_train_data(dataset, 2201, 30, stock_len)
  curr = {}
  curr["val"] = perform_validation(1, '1_pca_800', batch_generator, param_dict)
  
  return curr

curr = perform_test_on_validation_set()

In [None]:
print_test_table([curr])

 Model      Validation                                             
            Precision  Accuracy   % Positive    
1           0.4554     0.1783     0.3724        


Validation Set testing on Best Model End

Testing Set 1


In [None]:
def perform_test_on_testing_set1():
  
  # Load Feature Engineered Stock dataset: 100 Features including labels
  with open('testing_set1_processed.pkl', 'rb') as test_file: #Feature Engineered File
    test_data = pickle.load(test_file)
  del test_file

  # Perform PCA and Feature Selection to get top n features| from m selected Principal Components.
  X_train_pca_df, X_test_pca_df = perform_PCA(test_data, 10, 20)
  dataset = pd.concat([X_train_pca_df, X_test_pca_df])
  del X_train_pca_df, X_test_pca_df, test_data

  # Call Evaluation
  predictions = [[],[]]
  metrics = []

  param_dict = {
      "input_dim": 10,
      "hidden_dim": 32,
      "learning_rate": 0.0001,
      "num_epochs": 10,
      "optimizer_name": 'Adam',
      "isBatchNorm": False,
      "isValidation": True,
      "isSkipResult": True
  }
  batch_generator = prepare_train_data(dataset, 888, 30, 1000)
  curr = {}  
  curr["val"] = perform_validation(1, '1_pca_800', batch_generator, param_dict)
  return curr

curr = perform_test_on_testing_set1()

In [None]:
print_test_table([curr])

 Model      Validation                                             
            Precision  Accuracy   % Positive    
1           0.4247     0.1587     0.2414        


Testing Set 1 End

Testing Set 2 Start

In [None]:
def perform_test_on_testing_set2():
  
  # Load Feature Engineered Stock dataset: 100 Features including labels
  with open('testing_set2_processed.pkl', 'rb') as test_file: #Feature Engineered File
    test_data = pickle.load(test_file)
  del test_file
  
  # Perform PCA and Feature Selection to get top n features| from m selected Principal Components.
  X_train_pca_df, X_test_pca_df = perform_PCA(test_data, 10, 20)
  dataset = pd.concat([X_train_pca_df, X_test_pca_df])
  del X_train_pca_df, X_test_pca_df, test_data

  # # Call Evaluation
  predictions = [[],[]]
  metrics = []

  param_dict = {
      "input_dim": 10,
      "hidden_dim": 32,
      "learning_rate": 0.0001,
      "num_epochs": 10,
      "optimizer_name": 'Adam',
      "isBatchNorm": False,
      "isValidation": True,
      "isSkipResult": True
  }
  batch_generator = prepare_train_data(dataset, 705, 30, 1000)
  curr = {}  
  curr["val"] = perform_validation(1, '1_pca_800', batch_generator, param_dict)
  return curr

curr = perform_test_on_testing_set2()

In [None]:
print_test_table([curr])

 Model      Validation                                             
            Precision  Accuracy   % Positive    
1           0.4306     0.2123     0.2489        


Testing Set 2 End

Testing Set 3 Start

In [None]:
def perform_test_on_testing_set3():

  # Load Feature Engineered Stock dataset: 100 Features including labels
  with open('testing_set3_processed.pkl', 'rb') as test_file: #Feature Engineered File
    test_data = pickle.load(test_file)
  del test_file

  # Perform PCA and Feature Selection to get top n features| from m selected Principal Components.
  X_train_pca_df, X_test_pca_df = perform_PCA(test_data, 10, 20)
  dataset = pd.concat([X_train_pca_df, X_test_pca_df])
  del X_train_pca_df, X_test_pca_df, test_data

  # # Call Evaluation
  predictions = [[],[]]
  metrics = []

  param_dict = {
      "input_dim": 10,
      "hidden_dim": 32,
      "learning_rate": 0.0001,
      "num_epochs": 10,
      "optimizer_name": 'Adam',
      "isBatchNorm": False,
      "isValidation": True,
      "isSkipResult": True
  }
  batch_generator = prepare_train_data(dataset, 4598, 30, 1000)
  curr = {}  
  curr["val"] = perform_validation(1, '1_pca_800', batch_generator, param_dict)
  return curr

curr = perform_test_on_testing_set3()

In [None]:
print_test_table([curr])

 Model      Validation                                             
            Precision  Accuracy   % Positive    
1           0.4517     0.1736     0.2693        


Testing Set 3 End

In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score
from tqdm import tqdm

class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, isBatchNorm):
        super(LSTM, self).__init__()
        self.isBatchNorm = isBatchNorm
        self.hidden_dim = hidden_dim
        self.lstm1 = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)  # Change output_dim to 3 for 3 classes
        if self.isBatchNorm:
            self.bn = nn.BatchNorm1d(num_features=30)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
        # Use Xavier initialization for weights
        init.xavier_uniform_(self.lstm1.weight_ih_l0)
        init.orthogonal_(self.lstm1.weight_hh_l0)
        init.constant_(self.lstm1.bias_ih_l0, 0.0)
        init.constant_(self.lstm1.bias_hh_l0, 0.0)
        init.xavier_uniform_(self.lstm2.weight_ih_l0)
        init.orthogonal_(self.lstm2.weight_hh_l0)
        init.constant_(self.lstm2.bias_ih_l0, 0.0)
        init.constant_(self.lstm2.bias_hh_l0, 0.0)
        init.xavier_uniform_(self.fc1.weight)
        init.constant_(self.fc1.bias, 0.0)
        init.xavier_uniform_(self.fc2.weight)
        init.constant_(self.fc2.bias, 0.0)



    def forward(self, x):
        h01 = torch.zeros(2, x.size(0), self.hidden_dim).requires_grad_().to(device)
        c01 = torch.zeros(2, x.size(0), self.hidden_dim).requires_grad_().to(device)
        out1, (hn1, cn1) = self.lstm1(x, (h01.detach(), c01.detach()))
        h02 = torch.zeros(2, x.size(0), self.hidden_dim).requires_grad_().to(device)
        c02 = torch.zeros(2, x.size(0), self.hidden_dim).requires_grad_().to(device)
        out2, (hn2, cn2) = self.lstm2(out1, (h02.detach(), c02.detach()))
        if self.isBatchNorm:
            out2 = self.bn(out2)
        out2 = self.dropout(out2)
        out2 = self.fc1(out2[:, -1, :])
        out2 = self.relu(out2)
        out2 = self.fc2(out2)  # remove [:, -1]
        out2 = self.softmax(out2)
        out2 = out2.view(-1, 3)

        # out2 = self.fc1(out2[:, -1, :])
        # out2 = self.relu(out2)
        # out2 = self.fc2(out2[:, -1])
        # out2 = self.softmax(out2)
        # out2 = out2.view(-1, 3)
        return out2

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
#Perform PCA and Feature Selection to get top n features from m selected Principal Components (Upto 10 models trained).

X_train_pca_df, X_test_pca_df = perform_PCA(train_data, 20, 50)
dataset = pd.concat([X_train_pca_df,X_test_pca_df])
seq_length = 30 # Define sequence length
stock_len = 2201
batch_size = stock_len # Create data loader

param_dict = {
    "input_dim": 20,
    "hidden_dim": 64,
    "learning_rate": 0.001,
    "num_epochs": 30,    
    "optimizer_name": 'SGD',
    "isBatchNorm": True,
}

# param_dict = {
#     "input_dim": 10,
#     "hidden_dim": 64,
#     "learning_rate": 0.001,
#     "num_epochs": 30,
#     "optimizer": 'SGG',
#     "isBatchNorm": False,
# }

# param_dict = {
#     "input_dim": 10,
#     "hidden_dim": 128,
#     "learning_rate": 0.0005,
#     "num_epochs": 25,
#     "optimizer": torch.optim.Adagrad,
#     "isBatchNorm": True,
# }


#Generator function to load data in batches/chunks and get max performance
generator = prepare_train_data(dataset, stock_len, seq_length, batch_size)
train_model(f'{2}_pca_best_model', generator, param_dict)
del generator


Model no 2_pca_best_model


  y_train_tensor = torch.tensor(labels_encoded, dtype=torch.float32).clone().detach()
1578it [02:01, 13.03it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]

Training for model 2_pca_best_model stopped early at epoch 4 due to early stopping





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# Define the environment
class TradingEnv:
    def __init__(self, data):
        self.data = data
        self.reset()

    def reset(self):
        self.t = 0
        self.profit = 0
        self.bought = False
        self.stock_price = self.data.iloc[self.t, 0]
        self.cash = 10000
        self.shares = 0

    def step(self, action):
        reward = 0
        done = False

        if action == 0 and not self.bought:
            # Buy shares
            self.bought = True
            self.shares = self.cash / self.stock_price
            self.cash = 0
        elif action == 1 and self.bought:
            # Sell shares
            self.bought = False
            self.cash = self.stock_price * self.shares
            self.shares = 0

            # Calculate profit
            self.profit += self.cash - 10000
            reward = self.profit

        # Move to the next time step
        self.t += 1
        if self.t >= len(self.data):
            done = True
        else:
            self.stock_price = self.data.iloc[self.t, 0]

        return reward, done

class PolicyNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.softmax(self.fc4(x), dim=-1)
        return x


# Train the policy network
def train(env, agent, optimizer, device, num_episodes, batch_size, gamma):
    rewards = []
    for i_episode in range(num_episodes):
        state = env.reset()
        log_probs = []
        rewards_episode = []
        done = False
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
            action, log_prob = agent.get_action_and_log_prob(state_tensor)
            next_state, reward, done, _ = env.step(action)
            log_probs.append(log_prob)
            rewards_episode.append(reward)
            state = next_state
        rewards.append(sum(rewards_episode))
        
        # compute returns and advantages
        returns = compute_returns(rewards_episode, gamma)
        advantages = compute_advantages(rewards_episode, agent.value_function(state_tensor), gamma)
        
        # concatenate lists of log probabilities, states, returns, and advantages
        log_probs_tensor = torch.cat(log_probs)
        returns_tensor = torch.tensor(returns, dtype=torch.float32).unsqueeze(1).to(device)
        advantages_tensor = torch.tensor(advantages, dtype=torch.float32).unsqueeze(1).to(device)
        
        # update policy and value function
        loss = agent.update_policy_and_value_function(log_probs_tensor, state_tensor, returns_tensor, advantages_tensor, optimizer, batch_size)
    
    return rewards



In [None]:
import torch.optim as optim
from torch.distributions.categorical import Categorical

# Set up the trading environment
env = TradingEnv(train_data)

# Define the neural network model
model = PolicyNetwork(101, 3)

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Set the number of episodes to run
num_episodes = 100

# Train the model
for episode in range(num_episodes):
    state = env.get_state()
    done = False
    while not done:
        # Get the action probabilities from the model
        logits = model(torch.tensor(state, dtype=torch.float32))
        action_probs = torch.softmax(logits, dim=0)

        # Sample an action from the action probabilities
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        # Take a step in the environment
        reward, done = env.step(action.item())

        # Update the state and compute the loss
        next_state = env.get_state()
        state_tensor = torch.tensor(state, dtype=torch.float32)
        action_tensor = torch.tensor(action.item(), dtype=torch.int64)
        reward_tensor = torch.tensor(reward, dtype=torch.float32)
        next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
        done_tensor = torch.tensor(done, dtype=torch.float32)
        log_prob = action_dist.log_prob(action_tensor)
        loss = -log_prob * reward_tensor

        # Compute the gradients and update the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state
