# LSTM model for the small dataset

In [4]:
from dotenv import load_dotenv, find_dotenv
import sys, os

load_dotenv(find_dotenv())

project_root = os.getenv('PROJECT_ROOT')
sys.path.insert(0, project_root) 

In [5]:
import torch
import torch.optim as optim
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [6]:
from src.prepare_data_for_models import build_train_val_test_iter
from src.supconloss_with_cosine import SupConLossWithConsine

In [7]:
seed = 42  # Choose an integer for your seed

torch.manual_seed(seed)
np.random.seed(seed)

if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

In [8]:
# load data
data_dir = os.environ.get("DATA_DIR")
dataset_fullpath = os.path.join(project_root, data_dir, "output")
small_dataset_output = os.path.join(dataset_fullpath, "model_output", "small_patches")

In [9]:
# load dataframe
small_patches_df = pd.read_csv(os.path.join(dataset_fullpath, "small-patches.csv"))

In [10]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
# small dataset
train_itr_small, val_itr_small, test_itr_small, tokenizer = build_train_val_test_iter(small_dataset_output, small_patches_df, device)

[INFO] Building train, val and test csv files at /student/tdy245/Projects/cmpt828_deeplearning/PaCo/data/output/model_output/small_patches


Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


[INFO] Building train, val and test tabular dataset
** Number of training examples: 946
** Number of validation examples: 118
** Number of testing examples: 119
Batch size: 32


In [12]:
import torch.nn as nn

class LSTMCodeEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.batch_norm_one = nn.BatchNorm1d(embedding_dim) 
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.batch_norm_two = nn.BatchNorm1d(hidden_dim) 
        self.linear_projection = nn.Linear(hidden_dim, 128) 
        
    def forward(self, code_tokens):
        embeddings = self.embedding(code_tokens)
        embeddings = self.batch_norm_one(embeddings) # Apply batch norm one
        output, (hidden, cell) = self.lstm(embeddings)  
        output = self.linear_projection(output)
        # hidden = self.batch_norm_two(hidden[-1])  # Apply batch norm two
        return output 

In [13]:
# not using the function for now
def init_train_var(model, trial):
    criterion = SupConLossWithConsine(device=device)
    optimizer =  optim.Adam(model.parameters(), lr= 1e-4)
    return criterion, optimizer

In [14]:
from torch.utils.tensorboard import SummaryWriter
writer_path = "runs/small_dataset/all_runs/"

In [15]:
def train(model, train_iterator, valid_iterator, batch_size, device, trial):
    nb_epochs = 100

    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam'])

    train_steps = len(train_iterator.dataset) // batch_size
    val_steps = len(valid_iterator.dataset) // batch_size

    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate) 
    criterion = SupConLossWithConsine(device=device)
    
    writer = SummaryWriter(writer_path + f"{trial.number}")

    model.train()
    for epoch in range(nb_epochs):
        epoch_train_loss = 0
        epoch_val_loss = 0

        # train_correct = 0  not using it now since there is no patch classifier
        # val_correct = 0

        for batch in train_iterator:
            buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label # data already in device

            optimizer.zero_grad()
            buggy_embd = model(buggy_tensor)
            patch_embd = model(patch_tensor)
            loss = criterion(buggy_embd, patch_embd, labels) # buggy_embd, patch_embd, label
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss

        with torch.no_grad():
            for batch in valid_iterator:
                buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label
                buggy_embd = model(buggy_tensor)
                patch_embd = model(patch_tensor)

                loss = criterion(buggy_embd, patch_embd, labels) # buggy_embd, patch_embd, label
                
                epoch_val_loss += loss

        mean_train_loss = epoch_train_loss / train_steps
        mean_val_loss = epoch_val_loss / val_steps

        writer.add_scalar("training_loss", mean_train_loss, epoch + 1)
        writer.add_scalar("validation_loss", mean_val_loss, epoch + 1)
        
        # print("[INFO] EPOCH: {}/{}".format(epoch + 1, nb_epochs))
        # print("Train loss: {:.6f}".format(mean_train_loss))
        # print("Val loss: {:.6f}\n".format(mean_val_loss))
    
    return mean_val_loss

In [16]:
def objective(trial):
    code_encoder = LSTMCodeEncoder(tokenizer.vocab_size, 512, 512, num_layers=1)
    code_encoder.to(device)
    val_loss = train(code_encoder, train_itr_small, val_itr_small, 32, device, trial) # bad practice will change later
    return val_loss

In [17]:
import optuna
study = optuna.create_study(direction='minimize')  # Aim to minimize  validation loss
study.optimize(objective, n_trials=10)

[I 2024-03-26 20:03:02,424] A new study created in memory with name: no-name-4f70751d-b34e-4580-bb6b-9e4712c64ae4
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[I 2024-03-26 20:06:06,958] Trial 0 finished with value: 1.3675405979156494 and parameters: {'learning_rate': 1.417452143273518e-05, 'optimizer': 'Adam'}. Best is trial 0 with value: 1.3675405979156494.
[I 2024-03-26 20:09:07,158] Trial 1 finished with value: 2.1669793128967285 and parameters: {'learning_rate': 1.19012

AttributeError: module 'optuna.study' has no attribute 'TrialState'

In [None]:
best_params = study.best_params
print(best_params)