1. completion, context_positive, context_negative
2. completion, context_positive, [context_negatives]
3. isolate samples from the same repo but from a different completion file
4. isolate samples from the same repo and the same completion file, but with different content (line_type + line)

In [1]:
from torch.utils.data import DataLoader, Dataset
import json
import numpy as np

class CompletionContextDataset(Dataset):
    """Dataset of (completion, context) samples built from EM-labelled items.

    Every input item must provide ``'completion_content'``, a sequence
    ``'EMs'`` and a parallel sequence ``'context_files'``.  A context whose EM
    entry equals 1 is treated as positive, one whose entry equals 0 as
    negative.  Each context is expected to be a sequence whose first element
    is a mapping with a ``'content'`` key; only that first element is used.

    Args:
        input_data: Iterable of items with the keys described above.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_length: Length passed to the tokenizer for padding/truncation.
        test: If False (default), store one
            ``(completion, positive, negative)`` triple for every
            positive/negative pair of an item.  If True, store
            ``(completion, context)`` pairs: all positives of an item first,
            then all of its negatives.
    """

    def __init__(self, input_data, tokenizer, max_length=128, test=False):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.test = test
        self.data = []

        for item in input_data:
            completion = item['completion_content']
            # Plain-Python filtering instead of NumPy fancy indexing: context
            # lists of different lengths cannot be turned into a regular array.
            labelled = list(zip(item['EMs'], item['context_files']))
            positive_contexts = [ctx for em, ctx in labelled if em == 1]
            negative_contexts = [ctx for em, ctx in labelled if em == 0]

            if test:
                for p in positive_contexts:
                    self.data.append((completion, p[0]['content']))
                for n in negative_contexts:
                    self.data.append((completion, n[0]['content']))
            else:
                for p in positive_contexts:
                    for n in negative_contexts:
                        self.data.append((completion, p[0]['content'], n[0]['content']))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize *text*, padded/truncated to ``self.max_length``."""
        return self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )

    def __getitem__(self, idx):
        """Return the tokenizer output for every text of sample *idx*.

        The result is a 3-tuple (completion, positive, negative) for training
        samples and a 2-tuple (completion, context) for test samples.
        """
        # Samples are 3-tuples in training mode and 2-tuples in test mode, so
        # encode whatever the stored tuple holds instead of unpacking three.
        return tuple(self._encode(text) for text in self.data[idx])

In [2]:
import os

# Locate the generated-data file of a given run on the local disk.
project_name = 'lca-eval'
run_id = 'lcdfg1ta'
local_path = '/home/kolomyttseva/Git/learned-retrieval/jsonl'

folder_path = f'{local_path}/{run_id}/generated_data'

# os.listdir returns entries in arbitrary order; sort them so the same file
# is selected on every run and on every machine.
file_names = sorted(os.listdir(folder_path))
if not file_names:
    raise FileNotFoundError(f'No generated data files found in {folder_path}')

file_name = file_names[0]
path = f'{folder_path}/{file_name}'
print(path)

/home/kolomyttseva/Git/learned-retrieval/jsonl/lcdfg1ta/generated_data/pred_medium_context_True_5.jsonl


In [11]:
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split

# Load the generated samples; JSON is UTF-8, so do not rely on the platform
# default encoding.
with open(path, encoding='utf-8') as f:
    data = json.load(f)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# A fixed seed keeps the train/validation split identical between runs.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

train_dataset = CompletionContextDataset(train_data, tokenizer)
val_dataset = CompletionContextDataset(val_data, tokenizer)

print(len(data), len(train_dataset), len(val_dataset))

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# No shuffling for validation: identical batches every epoch make the
# per-batch averaged validation losses comparable across epochs.
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

5 32 42


In [12]:
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# next(iter(train_loader))[0]['input_ids'].shape

In [25]:
import torch
from torch import nn

class ContrastiveLoss(nn.Module):
    """Loss that pushes cos(completion, positive) - cos(completion, negative) towards 1.

    Cosine similarities are computed along the last dimension and the mean
    squared error between the similarity difference and a target of 1 is
    returned.
    """

    def __init__(self):
        super().__init__()
        self.similarity = nn.CosineSimilarity(dim=-1, eps=1e-7)
        self.mse_loss = nn.MSELoss()

    def forward(self, completion, positive_context, negative_context):
        """Return the scalar loss for a batch of embedding triples.

        Args:
            completion: Completion embeddings; the last dimension is the
                embedding dimension.
            positive_context: Embeddings of the positive contexts, same shape.
            negative_context: Embeddings of the negative contexts, same shape.

        Returns:
            Mean squared error between ``positive_score - negative_score``
            and 1.
        """
        # Use the signed similarity: taking its absolute value would give a
        # positive context that is anti-aligned with the completion the same
        # (zero) loss as a perfectly aligned one.
        positive_score = self.similarity(completion, positive_context)
        negative_score = self.similarity(completion, negative_context)

        score_difference = positive_score - negative_score

        target = torch.ones_like(score_difference)
        return self.mse_loss(score_difference, target)

In [20]:
class BiEncoderModel(nn.Module):
    """Two BERT encoders: one for completions, one shared by both contexts."""

    def __init__(self, model_name):
        """Load both encoders from the pretrained checkpoint *model_name*."""
        super().__init__()
        self.completion_encoder = BertModel.from_pretrained(model_name)
        self.context_encoder = BertModel.from_pretrained(model_name)

    def forward(self, completion, positive_context, negative_context):
        """Encode the three inputs and return their ``pooler_output`` values.

        Each argument is a mapping that is unpacked as keyword arguments into
        the corresponding encoder.  Both contexts go through the same
        context encoder.
        """
        query_output = self.completion_encoder(**completion)
        query_embeds = query_output.pooler_output

        positive_output = self.context_encoder(**positive_context)
        positive_embeds = positive_output.pooler_output

        negative_output = self.context_encoder(**negative_context)
        negative_embeds = negative_output.pooler_output

        return query_embeds, positive_embeds, negative_embeds

In [26]:
import torch

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Build the bi-encoder and move its parameters to the selected device.
model = BiEncoderModel('bert-base-uncased')
model = model.to(device)

In [27]:
# completion, _, _ = train_dataset[0]
# print(completion['input_ids'][0].shape)

# completion_encoder = BertModel.from_pretrained('bert-base-uncased').to(device)
# completion_embeds = completion_encoder(**completion.to(device))

# print(completion_embeds.last_hidden_state[0].shape)
# print(completion_embeds.pooler_output[0].shape)

In [28]:
import torch
import torch.nn as nn

def _encoding_to_device(encoding, device):
    """Drop the per-sample singleton dimension of every tensor and move it to *device*.

    Assumes each tensor is shaped (batch, 1, seq_len), i.e. per-sample
    tokenizer outputs of shape (1, seq_len) stacked by the DataLoader.
    Only dimension 1 is squeezed: a bare ``squeeze()`` would also remove the
    batch dimension whenever a batch contains a single sample.
    """
    return {key: tensor.squeeze(1).to(device) for key, tensor in encoding.items()}


def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(completion, positive, negative)`` and
            returning the three embedding tensors.
        dataloader: Sized iterable of ``(completion, positive, negative)``
            batches, each a mapping of tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable mapping the three embeddings to a scalar loss.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        completion, positive_context, negative_context = (
            _encoding_to_device(encoding, device) for encoding in batch
        )

        optimizer.zero_grad()
        completion_embeds, positive_context_embeds, negative_context_embeds = model(completion, positive_context, negative_context)
        loss = criterion(completion_embeds, positive_context_embeds, negative_context_embeds)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print(f'Train Batch {batch_idx+1}/{len(dataloader)}, Loss: {loss.item():.4f}')

    average_loss = total_loss / len(dataloader)
    print(f'Average Training Loss: {average_loss:.4f}')
    return average_loss


def validate(model, dataloader, criterion, device):
    """Evaluate the model without gradient tracking and return the mean per-batch loss.

    Args:
        model: Module called as ``model(completion, positive, negative)``.
        dataloader: Sized iterable of ``(completion, positive, negative)``
            batches, each a mapping of tensors.
        criterion: Callable mapping the three embeddings to a scalar loss.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            completion, positive_context, negative_context = (
                _encoding_to_device(encoding, device) for encoding in batch
            )

            completion_embeds, positive_context_embeds, negative_context_embeds = model(completion, positive_context, negative_context)
            loss = criterion(completion_embeds, positive_context_embeds, negative_context_embeds)

            total_loss += loss.item()
            print(f'Validation Batch {batch_idx+1}/{len(dataloader)}, Loss: {loss.item():.4f}')

    average_loss = total_loss / len(dataloader)
    print(f'Average Validation Loss: {average_loss:.4f}')
    return average_loss

In [29]:
# Loss and optimizer for fine-tuning both encoders of the bi-encoder.
criterion = ContrastiveLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-4)

# Alternate one training pass and one validation pass per epoch.
num_epochs = 5
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    train_loss = train(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss = validate(
        model=model,
        dataloader=val_loader,
        criterion=criterion,
        device=device,
    )
    print(f'Epoch {epoch+1} completed, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

Epoch 1/5
Train Batch 1/4, Loss: 0.9005
Train Batch 2/4, Loss: 0.6546
Train Batch 3/4, Loss: 1.1035
Train Batch 4/4, Loss: 1.0272
Average Training Loss: 0.9215
Validation Batch 1/6, Loss: 0.9790
Validation Batch 2/6, Loss: 0.9948
Validation Batch 3/6, Loss: 0.9896
Validation Batch 4/6, Loss: 0.9950
Validation Batch 5/6, Loss: 0.9948
Validation Batch 6/6, Loss: 1.0002
Average Validation Loss: 0.9922
Epoch 1 completed, Training Loss: 0.9215, Validation Loss: 0.9922
Epoch 2/5
Train Batch 1/4, Loss: 0.9935
Train Batch 2/4, Loss: 0.3662
Train Batch 3/4, Loss: 0.9348
Train Batch 4/4, Loss: 1.0174
Average Training Loss: 0.8280
Validation Batch 1/6, Loss: 0.9989
Validation Batch 2/6, Loss: 0.9987
Validation Batch 3/6, Loss: 0.9351
Validation Batch 4/6, Loss: 0.6791
Validation Batch 5/6, Loss: 0.8074
Validation Batch 6/6, Loss: 0.9991
Average Validation Loss: 0.9030
Epoch 2 completed, Training Loss: 0.8280, Validation Loss: 0.9030
Epoch 3/5
Train Batch 1/4, Loss: 1.0042
Train Batch 2/4, Loss: 0