In [1]:
from dotenv import load_dotenv, find_dotenv
import sys, os

load_dotenv(find_dotenv())

project_root = os.getenv('PROJECT_ROOT')
sys.path.insert(0, project_root) 

In [2]:
import torch
import torch.optim as optim
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [67]:
# tensorboard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/small_dataset")

In [3]:
from src.prepare_data_for_models import build_train_val_test_iter
from src.supconloss_with_cosine import SupConLossWithConsine

In [4]:
# load data
data_dir = os.environ.get("DATA_DIR")
dataset_fullpath = os.path.join(project_root, data_dir, "output")
small_dataset_output = os.path.join(dataset_fullpath, "model_output", "small_patches")
large_dataset_output = os.path.join(dataset_fullpath, "model_output", "large_patches")

In [5]:
# load dataframe
large_patches_df = pd.read_csv(os.path.join(dataset_fullpath, "large-patches.csv"))
small_patches_df = pd.read_csv(os.path.join(dataset_fullpath, "small-patches.csv")) 

In [59]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,3"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [60]:
# small dataset
train_itr_small, val_itr_small, test_itr_small, tokenizer = build_train_val_test_iter(small_dataset_output, small_patches_df, device)

[INFO] Building train, val and test csv files at /student/tdy245/Projects/cmpt828_deeplearning/PaCo/data/output/model_output/small_patches


Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors


[INFO] Building train, val and test tabular dataset
** Number of training examples: 946
** Number of validation examples: 118
** Number of testing examples: 119
Batch size: 32


In [61]:
import torch.nn as nn

class LSTMCodeEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.batch_norm_one = nn.BatchNorm1d(embedding_dim) 
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.batch_norm_two = nn.BatchNorm1d(hidden_dim) 
        self.linear_projection = nn.Linear(hidden_dim, 128) 
        
    def forward(self, code_tokens):
        embeddings = self.embedding(code_tokens)
        embeddings = self.batch_norm_one(embeddings) # Apply batch norm one
        output, (hidden, cell) = self.lstm(embeddings)  
        output = self.linear_projection(output)
        # hidden = self.batch_norm_two(hidden[-1])  # Apply batch norm two
        return output 

In [70]:
def init_train_var(model):
    criterion = SupConLossWithConsine(device=device)
    optimizer =  optim.Adam(model.parameters(), lr= 1e-5)
    return criterion, optimizer

In [71]:
tokenizer.vocab_size

50265

In [72]:
def train(model, train_iterator, valid_iterator, batch_size):
    nb_epochs = 100
    criterion, optimizer = init_train_var(model)

    train_steps = len(train_iterator.dataset) // batch_size
    val_steps = len(valid_iterator.dataset) // batch_size

    model.train()
    for epoch in range(nb_epochs):
        epoch_train_loss = 0
        epoch_val_loss = 0

        # train_correct = 0  not using it now since there is no patch classifier
        # val_correct = 0

        for batch in train_iterator:
            buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label # data already in device

            optimizer.zero_grad()
            buggy_embd = model(buggy_tensor)
            patch_embd = model(patch_tensor)
            loss = criterion(buggy_embd, patch_embd, labels) # buggy_embd, patch_embd, label
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss

        with torch.no_grad():
            for batch in valid_iterator:
                buggy_tensor, patch_tensor, labels = batch.buggy.T, batch.patch.T, batch.numerical_label
                buggy_embd = model(buggy_tensor)
                patch_embd = model(patch_tensor)

                loss = criterion(buggy_embd, patch_embd, labels) # buggy_embd, patch_embd, label
                
                epoch_val_loss += loss

        mean_train_loss = epoch_train_loss / train_steps
        mean_val_loss = epoch_val_loss / val_steps

        writer.add_scalar("training_loss", mean_train_loss, nb_epochs * train_steps + epoch)
        writer.add_scalar("validation_loss", mean_val_loss, nb_epochs * val_steps + epoch)
        
        print("[INFO] EPOCH: {}/{}".format(epoch + 1, nb_epochs))
        print("Train loss: {:.6f}".format(mean_train_loss))
        print("Val loss: {:.6f}\n".format(mean_val_loss))

In [73]:

code_encoder = LSTMCodeEncoder(tokenizer.vocab_size, 512, 512, num_layers=1)
code_encoder.to(device)
train(code_encoder, train_itr_small, val_itr_small, 32)

[INFO] EPOCH: 1/100
Train loss: 2.093079
Val loss: 2.672560

[INFO] EPOCH: 2/100
Train loss: 2.088153
Val loss: 2.666965

[INFO] EPOCH: 3/100
Train loss: 2.081014
Val loss: 2.654811

[INFO] EPOCH: 4/100
Train loss: 2.055302
Val loss: 2.513335

[INFO] EPOCH: 5/100
Train loss: 1.952000
Val loss: 2.625548

[INFO] EPOCH: 6/100
Train loss: 2.017523
Val loss: 2.519709

[INFO] EPOCH: 7/100
Train loss: 2.025662
Val loss: 2.677988

[INFO] EPOCH: 8/100
Train loss: 2.091460
Val loss: 2.673651

[INFO] EPOCH: 9/100
Train loss: 2.089927
Val loss: 2.666314

[INFO] EPOCH: 10/100
Train loss: 2.078951
Val loss: 2.654359

[INFO] EPOCH: 11/100
Train loss: 2.069102
Val loss: 2.636626

[INFO] EPOCH: 12/100
Train loss: 2.050417
Val loss: 2.604529

[INFO] EPOCH: 13/100
Train loss: 2.002002
Val loss: 2.442308

[INFO] EPOCH: 14/100
Train loss: 2.011471
Val loss: 2.609372

[INFO] EPOCH: 15/100
Train loss: 2.037206
Val loss: 2.578414

[INFO] EPOCH: 16/100
Train loss: 1.969822
Val loss: 2.365319

[INFO] EPOCH: 17/