# LSTM model for the large dataset

In [9]:
from dotenv import load_dotenv, find_dotenv
import sys, os

# Load the variables from the .env file located by find_dotenv() into the
# process environment.
load_dotenv(find_dotenv())

# PROJECT_ROOT is prepended to sys.path so the `src.*` modules imported in the
# following cells can be resolved.
project_root = os.getenv('PROJECT_ROOT')
if not project_root:
    # Fail fast: an unset value would otherwise place None on sys.path and
    # only surface later as an opaque TypeError inside os.path.join.
    raise RuntimeError(
        "PROJECT_ROOT is not set; define it in the environment or in a .env file."
    )
sys.path.insert(0, project_root)

In [10]:
import torch
import torch.optim as optim
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [11]:
from src.prepare_data_for_models import build_train_val_test_iter
from src.supconloss_with_cosine import SupConLossWithConsine

In [12]:
# Single integer seed shared by the NumPy and PyTorch random number generators.
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)

# When CUDA is available, request deterministic cuDNN algorithms as well.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

In [13]:
# Dataset locations, all derived from the DATA_DIR environment variable.
data_dir = os.getenv("DATA_DIR")

# <project_root>/<DATA_DIR>/output
dataset_fullpath = os.path.join(project_root, data_dir, "output")

# Directory handed to build_train_val_test_iter for the large-patches dataset.
large_dataset_output = os.path.join(
    dataset_fullpath,
    "model_output",
    "large_patches",
)

In [26]:
# Directory passed to create_and_run_study as the checkpoint location.
best_model_save_path = os.path.join(
    project_root,
    "notebooks",
    "runs",
    "large_dataset",
    "model",
)

In [14]:
# Read the large-patches CSV; the frame is later passed to
# build_train_val_test_iter.
large_patches_df = pd.read_csv(
    os.path.join(dataset_fullpath, "large-patches.csv")
)

In [15]:
# Expose GPUs 0-3 to this process.
# NOTE(review): CUDA_VISIBLE_DEVICES is normally only honoured if it is set
# before CUDA is first initialised in the process — confirm that no earlier
# cell has already touched CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [16]:
# Build the train / validation / test iterators and the tokenizer for the
# large-patches dataset.
(
    train_itr_large,
    val_itr_large,
    test_itr_large,
    tokenizer,
) = build_train_val_test_iter(large_dataset_output, large_patches_df, device)

[INFO] Building train, val and test csv files at /student/tdy245/Projects/cmpt828_deeplearning/PaCo/data/output/model_output/large_patches


Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors


[INFO] Building train, val and test tabular dataset
** Number of training examples: 39755
** Number of validation examples: 4969
** Number of testing examples: 4970
Batch size: 32


In [17]:
import torch.nn as nn

class LSTMCodeEncoder(nn.Module):
    """Encode token-id sequences with an embedding, an LSTM and a linear projection.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each token embedding (also the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        projection_dim: Size of the projected per-token output. Defaults to
            128, the value that was previously hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1,
                 projection_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.batch_norm_one = nn.BatchNorm1d(embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        # Not used in forward(); kept so that state_dicts saved from earlier
        # versions of this class still load without key mismatches.
        self.batch_norm_two = nn.BatchNorm1d(hidden_dim)
        self.linear_projection = nn.Linear(hidden_dim, projection_dim)

    def forward(self, code_tokens):
        """Return the projected LSTM output for every token.

        Args:
            code_tokens: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Float tensor of shape (batch, seq_len, projection_dim).
        """
        embeddings = self.embedding(code_tokens)  # (batch, seq_len, embedding_dim)

        # BatchNorm1d treats dim 1 as the feature/channel axis, so the
        # embedding axis has to be moved there (and back afterwards).
        # Feeding (batch, seq_len, embedding_dim) directly fails whenever
        # seq_len != embedding_dim and normalises the wrong axis otherwise.
        embeddings = self.batch_norm_one(embeddings.transpose(1, 2))
        embeddings = embeddings.transpose(1, 2).contiguous()

        # Only the per-step outputs are used; the final states are discarded.
        output, _ = self.lstm(embeddings)
        return self.linear_projection(output)

In [19]:
# not using the function for now
def init_train_var(model, trial):
    """Build the loss and optimiser for ``model``.

    ``trial`` is accepted but not read.

    Returns:
        A ``(criterion, optimizer)`` tuple: a ``SupConLossWithConsine``
        created for the module-level ``device`` and an Adam optimiser over
        ``model.parameters()`` with a learning rate of 1e-4.
    """
    contrastive_loss = SupConLossWithConsine(device=device)
    adam = optim.Adam(params=model.parameters(), lr=1e-4)
    return contrastive_loss, adam

In [20]:
from torch.utils.tensorboard import SummaryWriter
# Relative base path for TensorBoard logs; train() appends the Optuna trial
# number to it to get one log directory per trial.
writer_path = "runs/large_dataset/all_runs/"

In [30]:
def train(model, train_iterator, valid_iterator, batch_size, device, trial, nb_epochs=50):
    """Train ``model`` for one Optuna trial and return its final validation loss.

    The learning rate and optimiser name are sampled from ``trial``. Every
    epoch runs one pass over ``train_iterator`` with gradient updates and one
    pass over ``valid_iterator`` without gradients; the mean losses are logged
    to TensorBoard under ``writer_path + str(trial.number)``.

    Args:
        model: Encoder called on the buggy and the patch token tensors.
        train_iterator: Iterable of batches exposing ``buggy``, ``patch`` and
            ``numerical_label``.
        valid_iterator: Same batch interface, used for validation.
        batch_size: Unused; kept for backward compatibility. Means are taken
            over the number of batches actually iterated.
        device: Device handed to ``SupConLossWithConsine``.
        trial: Optuna trial used to sample hyper-parameters.
        nb_epochs: Number of epochs to run (default 50).

    Returns:
        Mean validation loss of the last epoch as a Python float (NaN if the
        validation iterator yielded no batches).

    Raises:
        ValueError: If ``nb_epochs`` is smaller than 1.

    Note:
        The model is left in eval mode when this function returns.
    """
    if nb_epochs < 1:
        raise ValueError("nb_epochs must be at least 1")

    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam'])

    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)
    criterion = SupConLossWithConsine(device=device)

    writer = SummaryWriter(writer_path + f"{trial.number}")
    try:
        for epoch in range(nb_epochs):
            # Re-enable training behaviour (batch norm updates) each epoch,
            # because validation below switches the model to eval mode.
            model.train()
            train_loss_sum = 0.0
            train_batches = 0
            for batch in train_iterator:
                # Batches are expected to already live on the target device.
                buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label

                optimizer.zero_grad()
                buggy_embd = model(buggy_tensor)
                patch_embd = model(patch_tensor)
                loss = criterion(buggy_embd, patch_embd, labels)
                loss.backward()
                optimizer.step()

                # .item() detaches the value so the autograd graph of every
                # batch is not kept alive for the whole epoch.
                train_loss_sum += loss.item()
                train_batches += 1

            # Validation: actually evaluate the model on the held-out batches
            # instead of re-adding the last training loss.
            model.eval()
            val_loss_sum = 0.0
            val_batches = 0
            with torch.no_grad():
                for batch in valid_iterator:
                    buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label
                    buggy_embd = model(buggy_tensor)
                    patch_embd = model(patch_tensor)
                    val_loss_sum += criterion(buggy_embd, patch_embd, labels).item()
                    val_batches += 1

            # Average over the batches that were really seen; NaN marks an
            # empty iterator rather than raising ZeroDivisionError.
            mean_train_loss = train_loss_sum / train_batches if train_batches else float("nan")
            mean_val_loss = val_loss_sum / val_batches if val_batches else float("nan")

            writer.add_scalar("training_loss", mean_train_loss, epoch + 1)
            writer.add_scalar("validation_loss", mean_val_loss, epoch + 1)

        writer.add_hparams({'lr': learning_rate}, {'train_loss': mean_train_loss, 'val_loss': mean_val_loss})
    finally:
        # Flush and release the event file even if training fails.
        writer.close()
    return mean_val_loss

In [31]:
import optuna

def create_and_run_study(model_savepath, n_trials = 2):
    """Run an Optuna study that minimises validation loss and keep the best model.

    Each trial builds a fresh ``LSTMCodeEncoder``, trains it with ``train`` on
    the module-level large-dataset iterators and, if its validation loss beats
    every previously completed trial, saves its ``state_dict``.

    Args:
        model_savepath: Directory the best checkpoint is written to; created
            if it does not exist.
        n_trials: Number of Optuna trials to run.

    Returns:
        The finished ``optuna.Study`` (previously nothing was returned), so
        callers can inspect ``best_params`` / ``best_value``.
    """
    # torch.save does not create missing parent directories.
    os.makedirs(model_savepath, exist_ok=True)

    study = optuna.create_study(direction='minimize')   # Aim to minimize validation loss

    def save_model(model, filename):
        """Write ``model``'s state_dict to ``filename`` inside ``model_savepath``."""
        torch.save(model.state_dict(), os.path.join(model_savepath, filename))

    def objective(trial):
        """Train one encoder and return its validation loss as a float."""
        code_encoder = LSTMCodeEncoder(tokenizer.vocab_size, 512, 512, num_layers=1)
        code_encoder.to(device)
        # float() accepts both a plain number and a 0-dim tensor.
        val_loss = float(train(code_encoder, train_itr_large, val_itr_large, 32, device, trial))

        try:
            best_so_far = study.best_value
        except ValueError:
            # Raised while no trial has completed yet: the first trial, or
            # when all earlier trials failed or were pruned.
            best_so_far = None

        if best_so_far is None or val_loss < best_so_far:
            # NOTE(review): the file name says "small" although this study
            # trains on the large-dataset iterators; left unchanged because
            # other code may load the checkpoint by this name.
            save_model(code_encoder, "best_model_small.pth")
        return val_loss

    study.optimize(objective, n_trials=n_trials)
    return study

In [32]:
# Launch the hyper-parameter search: five trials, checkpoints written under
# best_model_save_path.
create_and_run_study(model_savepath=best_model_save_path, n_trials=5)

[I 2024-03-27 19:19:28,476] A new study created in memory with name: no-name-e2203408-b261-4dba-87cf-6a8d95accb20
