# LSTM model for the full dataset

In [15]:
from dotenv import load_dotenv, find_dotenv
import sys, os

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

project_root = os.getenv('PROJECT_ROOT')
if not project_root:
    # Without this guard, None would be put on sys.path and the failure would
    # only surface later as an obscure TypeError inside os.path.join.
    raise RuntimeError(
        "PROJECT_ROOT is not set; define it in the environment or in the .env file"
    )
# Put the project root first on sys.path so the `src.*` imports below resolve.
sys.path.insert(0, project_root)

In [16]:
import torch
import torch.optim as optim
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [17]:
from src.prepare_data_for_models import build_train_val_test_iter
from src.supconloss_with_cosine import SupConLossWithConsine

In [18]:
seed = 42  # Single seed shared by every random number generator used below

# Python's own RNG is seeded as well: the `random` module is used later in
# this notebook to choose which batch gets plotted.
import random

random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so switch it off
    # alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False

In [19]:
# Resolve the dataset folders underneath the project root.
# DATA_DIR is read from the environment (relative directory name).
data_dir = os.getenv("DATA_DIR")
dataset_fullpath = os.path.join(project_root, data_dir, "output")
# Sub-folder that receives the train/val/test files built for the models.
_model_output_parts = ("model_output", "all_patches")
all_dataset_output = os.path.join(dataset_fullpath, *_model_output_parts)

In [20]:
# Directory (under the project root) where the best encoder weights are written.
best_model_save_path = os.path.join(project_root, "notebooks", "runs", "all_dataset", "model")

In [21]:
# Load the patch table from all-patches.csv in the dataset output directory.
all_patches_df = pd.read_csv(os.path.join(dataset_fullpath, "all-patches.csv"))

In [22]:
# Restrict the process to GPUs 0-3, then use CUDA when it is available.
# NOTE(review): torch.cuda.is_available() has already been called in the
# seeding cell above; CUDA_VISIBLE_DEVICES is normally only honoured when it is
# set before CUDA is first queried — confirm this assignment takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [23]:
# Full dataset: build the train/val/test iterators and the tokenizer from all patches.
train_itr_large, val_itr_large, test_itr_large, tokenizer = build_train_val_test_iter(all_dataset_output, all_patches_df, device)

[INFO] Building train, val and test csv files at /student/tdy245/Projects/cmpt828_deeplearning/PaCo/data/output/model_output/all_patches


Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors


[INFO] Building train, val and test tabular dataset
** Number of training examples: 40701
** Number of validation examples: 5088
** Number of testing examples: 5088
Batch size: 32


In [24]:
from src.models.lstm_code_encoder import LSTMCodeEncoder

In [25]:
# not using the function for now
def init_train_var(model, trial):
    """Create the loss function and optimizer used to train ``model``.

    Args:
        model: Module whose parameters the optimizer will update.
        trial: Accepted for signature compatibility; not read here.

    Returns:
        A ``(criterion, optimizer)`` tuple: a ``SupConLossWithConsine`` bound
        to the notebook-level ``device`` and an Adam optimizer with lr 1e-4.
    """
    contrastive_loss = SupConLossWithConsine(device=device)
    adam = optim.Adam(model.parameters(), lr=1e-4)
    return (contrastive_loss, adam)

In [26]:
from torch.utils.tensorboard import SummaryWriter
# Prefix for TensorBoard log directories; a per-run suffix is appended to it.
writer_path = "runs/large_dataset/all_runs/"

In [27]:
def train(model, train_iterator, valid_iterator, batch_size, device, trial):
    """Train ``model`` for one Optuna trial and return the final validation loss.

    The learning rate and optimizer name are sampled from ``trial``. Each epoch
    runs one pass over ``train_iterator`` with gradient updates, followed by one
    gradient-free pass over ``valid_iterator`` in eval mode. Per-epoch mean
    losses are logged to a TensorBoard run named after ``trial.number``.

    Args:
        model: Encoder called on ``batch.buggy.T`` and ``batch.patch.T``.
        train_iterator: Iterable of batches exposing ``buggy``, ``patch`` and
            ``numerical_label`` attributes.
        valid_iterator: Iterable of validation batches with the same attributes.
        batch_size: Kept for backward compatibility. The means are computed
            over the number of batches actually iterated, so this is not read.
        device: Device handed to ``SupConLossWithConsine``.
        trial: Optuna trial used to sample hyper-parameters and name the run.

    Returns:
        float: Mean validation loss of the last epoch (``nan`` if no epoch ran).
    """
    nb_epochs = 20

    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam'])

    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)
    criterion = SupConLossWithConsine(device=device)

    writer = SummaryWriter(writer_path + f"{trial.number}")

    mean_train_loss = float("nan")
    mean_val_loss = float("nan")

    try:
        for epoch in range(nb_epochs):
            # ---- training pass ----
            model.train()  # re-enable training mode after the previous eval pass
            train_loss_sum = 0.0
            train_batches = 0

            for batch in train_iterator:
                # data already in device
                buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label

                optimizer.zero_grad()
                buggy_embd = model(buggy_tensor)
                patch_embd = model(patch_tensor)
                loss = criterion(buggy_embd, patch_embd, labels)
                loss.backward()
                optimizer.step()

                # .item() detaches the value so the autograd graph of each
                # batch can be freed instead of piling up across the epoch.
                train_loss_sum += loss.item()
                train_batches += 1

            # ---- validation pass ----
            model.eval()
            val_loss_sum = 0.0
            val_batches = 0

            with torch.no_grad():
                for batch in valid_iterator:
                    buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label
                    # Actually evaluate the validation batch (previously the
                    # last training loss was re-added here).
                    val_loss = criterion(model(buggy_tensor), model(patch_tensor), labels)
                    val_loss_sum += val_loss.item()
                    val_batches += 1

            # Average over the batches really seen; max() guards empty iterators.
            mean_train_loss = train_loss_sum / max(train_batches, 1)
            mean_val_loss = val_loss_sum / max(val_batches, 1)

            writer.add_scalar("training_loss", mean_train_loss, epoch + 1)
            writer.add_scalar("validation_loss", mean_val_loss, epoch + 1)

        writer.add_hparams({'lr': learning_rate}, {'train_loss': mean_train_loss, 'val_loss': mean_val_loss})
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

    return mean_val_loss

In [31]:
import optuna
import torch.nn as nn

def create_and_run_study(model_savepath, n_trials = 2):
    """Run an Optuna study that minimises validation loss and keep the best weights.

    Every trial builds a fresh ``LSTMCodeEncoder``, trains it with ``train`` on
    the notebook-level iterators, and writes its weights to
    ``best_model_small.pth`` under ``model_savepath`` whenever its validation
    loss beats every completed trial so far.

    Args:
        model_savepath: Directory for the checkpoint; created if missing.
        n_trials: Number of Optuna trials to run.

    Returns:
        The finished ``optuna`` study (previously nothing was returned).
    """
    study = optuna.create_study(direction='minimize')   # Aim to minimize  validation loss
    os.makedirs(model_savepath, exist_ok=True)

    def save_model(model, filename):
        # Save the underlying module so the checkpoint keys carry no
        # "module." prefix and load into a plain LSTMCodeEncoder.
        unwrapped = model.module if isinstance(model, nn.DataParallel) else model
        torch.save(unwrapped.state_dict(), os.path.join(model_savepath, filename))

    def is_best_so_far(val_loss):
        # study.best_value raises ValueError while no trial has completed,
        # which also happens after trial 0 when the earlier trials failed.
        try:
            return val_loss < study.best_value
        except ValueError:
            return True

    def objective(trial):
        code_encoder = LSTMCodeEncoder(tokenizer.vocab_size, 512, 512, num_layers=1)
        code_encoder.to(device)
        if torch.cuda.device_count() > 1:
            # NOTE(review): GPU 0 is skipped here although the model was moved
            # to `device`; nn.DataParallel documents that the parameters must
            # live on device_ids[0] — confirm this combination works.
            device_ids = [i for i in range(1, torch.cuda.device_count() )]
            code_encoder = nn.DataParallel(code_encoder, device_ids=device_ids)
            print(f"Using {len(device_ids)} GPUs")

        # bad practice will change later
        # float() gives Optuna a plain number whether train returns a float or a 0-dim tensor.
        val_loss = float(train(code_encoder, train_itr_large, val_itr_large, 32, device, trial))

        if is_best_so_far(val_loss):
            save_model(code_encoder, "best_model_small.pth")
        return val_loss

    study.optimize(objective, n_trials=n_trials)
    return study

In [32]:
# Run the hyper-parameter search with 5 trials, saving weights under best_model_save_path.
create_and_run_study(best_model_save_path, n_trials=5)

[I 2024-04-02 18:46:48,678] A new study created in memory with name: no-name-4e6fae51-2ec7-4f02-98c3-78f0ffe367b9
[W 2024-04-02 18:46:48,867] Trial 0 failed with parameters: {} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacity of 79.14 GiB of which 32.69 MiB is free. Process 2322674 has 3.74 GiB memory in use. Process 2854659 has 918.00 MiB memory in use. Process 2859346 has 1.71 GiB memory in use. Process 3046407 has 47.39 GiB memory in use. Process 2714500 has 600.00 MiB memory in use. Process 1323156 has 492.00 MiB memory in use. Process 1370091 has 5.81 GiB memory in use. Process 2795125 has 10.49 GiB memory in use. Including non-PyTorch memory, this process has 1.34 GiB memory in use. Process 2839123 has 6.59 GiB memory in use. Of the allocated memory 458.28 MiB is allocated by PyTorch, and 157.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_C

OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacity of 79.14 GiB of which 32.69 MiB is free. Process 2322674 has 3.74 GiB memory in use. Process 2854659 has 918.00 MiB memory in use. Process 2859346 has 1.71 GiB memory in use. Process 3046407 has 47.39 GiB memory in use. Process 2714500 has 600.00 MiB memory in use. Process 1323156 has 492.00 MiB memory in use. Process 1370091 has 5.81 GiB memory in use. Process 2795125 has 10.49 GiB memory in use. Including non-PyTorch memory, this process has 1.34 GiB memory in use. Process 2839123 has 6.59 GiB memory in use. Of the allocated memory 458.28 MiB is allocated by PyTorch, and 157.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [37]:
import random

def create_embedding_plot_with_best_model(model_path, iterator, writer):
    """Log embeddings of one randomly chosen batch to TensorBoard.

    A fresh ``LSTMCodeEncoder`` is loaded from ``model_path`` and applied to
    the buggy and patch inputs of one batch drawn uniformly from ``iterator``.
    Rows labelled ``1`` are logged under the tag ``correct`` and rows labelled
    ``-1`` under the tag ``incorrect``.

    Args:
        model_path: Path to a state dict saved with ``torch.save``.
        iterator: Iterable of batches exposing ``buggy``, ``patch`` and
            ``numerical_label`` attributes.
        writer: TensorBoard ``SummaryWriter``; it is closed before returning.

    Raises:
        ValueError: If ``iterator`` yields no batches.
    """
    best_code_encoder = LSTMCodeEncoder(tokenizer.vocab_size, 512, 512, num_layers=1).to(device)

    # map_location lets a checkpoint written on GPU load on a CPU-only machine.
    state_dict = torch.load(model_path, map_location=device)
    # Checkpoints saved from an nn.DataParallel wrapper prefix every key with
    # "module."; strip it so they load into the plain encoder.
    wrapper_prefix = "module."
    if state_dict and all(key.startswith(wrapper_prefix) for key in state_dict):
        state_dict = {key[len(wrapper_prefix):]: value for key, value in state_dict.items()}
    best_code_encoder.load_state_dict(state_dict)
    best_code_encoder.eval()  # inference only

    try:
        # Reservoir sampling of size one: a uniform pick that keeps a single
        # batch alive instead of materialising the whole iterator in a list.
        random_batch = None
        for seen, batch in enumerate(iterator, start=1):
            if random.randrange(seen) == 0:
                random_batch = batch
        if random_batch is None:
            raise ValueError("iterator yielded no batches to plot")

        with torch.no_grad():
            bug_embd = best_code_encoder(random_batch.buggy.T)
            patch_embd = best_code_encoder(random_batch.patch.T)
            labels = random_batch.numerical_label

            # Separate embeddings based on labels and add each group to TensorBoard
            groups = (
                ('correct', labels == 1, 'correct_patch'),
                ('incorrect', labels == -1, 'incorrect_patch'),
            )
            for tag, mask, patch_name in groups:
                bug_selected = bug_embd[mask]
                patch_selected = patch_embd[mask]

                embeddings = torch.cat([bug_selected, patch_selected], dim=0)
                metadata = ['buggy'] * bug_selected.shape[0] + [patch_name] * patch_selected.shape[0]

                writer.add_embedding(
                    embeddings.reshape(embeddings.shape[0], -1),
                    metadata=metadata,
                    tag=tag,
                    global_step=0,
                )
    finally:
        # Release the writer on failure as well as on success.
        writer.close()

In [38]:
# Dedicated TensorBoard run for the embedding projection of the saved best model.
writer = SummaryWriter(writer_path + "hyperparameter")
# Plot one random batch from the training iterator; the function closes the writer.
create_embedding_plot_with_best_model(os.path.join(best_model_save_path, "best_model_small.pth"), train_itr_large, writer = writer)