In [1]:
!pip install transformers




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
from torch.nn import TripletMarginLoss
from torch import optim

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
base_model = RobertaModel.from_pretrained("microsoft/codebert-base")

In [5]:
class CodeBERT_Contrastive(nn.Module):
    def __init__(self, model):
        super(CodeBERT_Contrastive, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        return output.last_hidden_state[:, 0, :]

contrastive_model = CodeBERT_Contrastive(base_model).to(device)

In [6]:
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        pos_dist = F.pairwise_distance(anchor, positive, p=2)
        neg_dist = F.pairwise_distance(anchor, negative, p=2)
        loss = torch.clamp(pos_dist - neg_dist + self.margin, min=0.0)
        return loss.mean()

In [7]:
data = pd.read_csv("Input/3pairs.csv")
data.head()

Unnamed: 0,anchor,positive,negative
0,"def count_Pairs(arr,n): \n cnt = 0; \n f...","def var1(arr, n):\n var2 = 0\n for var3 ...",import heapq as hq\ndef heap_queue_largest(num...
1,def find_lists(Input): \n\tif isinstance(Input...,"def var1(Input):\n if isinstance(var2, list...",def combinations_list(list1):\n if len(list...
2,def histogram(test):\n dict1={}\n list1=...,def var1(test):\n var2 = {}\n var3 = var...,def reverse_words(s):\n return ' '.join...
3,"def below_threshold(l: list, t: int):\n for...","def var1(l: list, t: int):\n for var2 in va...",import re\ndef remove_all_spaces(text):\n retu...
4,def max_Abs_Diff(arr): \n n = len(arr)\n ...,def var1(arr):\n var2 = len(var3)\n var4...,"\ndef digitSum(s):\n if s == """": return 0\n..."


In [8]:
len(data)

530

In [9]:
def encode_pair(row):
    encoded_anchor = tokenizer(row["anchor"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encoded_positive = tokenizer(row["positive"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encoded_negative = tokenizer(row["negative"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    return {"anchor": encoded_anchor, "positive": encoded_positive, "negative": encoded_negative}

encoded_pairs = data.apply(encode_pair, axis=1).tolist()

In [11]:
optimizer = optim.AdamW(contrastive_model.parameters(), lr=5e-5)
triplet_loss_fn = TripletMarginLoss(margin=1.0)
contrastive_model.train()

for epoch in range(5):
    total_loss = 0
    
    for index, row in data.iterrows():
        print(f"{index}/{len(data)}")

        anchor_inputs = tokenizer(row["anchor"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        positive_inputs = tokenizer(row["positive"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        negative_inputs = tokenizer(row["negative"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")

        for batch in [anchor_inputs, positive_inputs, negative_inputs]:
            for key in batch:
                batch[key] = batch[key].to(device)
        
        anchor = contrastive_model(anchor_inputs["input_ids"], anchor_inputs["attention_mask"])
        positive = contrastive_model(positive_inputs["input_ids"], positive_inputs["attention_mask"])
        negative = contrastive_model(negative_inputs["input_ids"], negative_inputs["attention_mask"])

        loss = triplet_loss_fn(anchor, positive, negative)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data)}")


Device in using: cuda:0
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    975 MiB |   2422 MiB |   1556 GiB |   1555 GiB |
|       from large pool |    972 MiB |   2417 MiB |   1147 GiB |   1146 GiB |
|       from small pool |      3 MiB |    239 MiB |    408 GiB |    408 GiB |
|---------------------------------------------------------------------------|
| Active memory         |    975 MiB |   2422 MiB |   1556 GiB |   1555 GiB |
|       from large pool |    972 MiB |   2417 MiB |   1147 GiB |   1146 GiB |
|       from small pool |      3 MiB |    239 MiB |    408 GiB |    408 GiB |
|---------------------------------------

In [12]:
torch.save(contrastive_model.state_dict(), "contrastive_model.pt")
tokenizer.save_pretrained("tokenizer/")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')