In [4]:
# --- Reading the .cpp files under the dataset directory (recursively) and storing them in a list as raw strings ---

import os

def load_cpp_snippets(root_dir, limit=None):
    """Collect the text of every non-empty ``.cpp`` file found under ``root_dir``.

    The tree is walked recursively in sorted (directory name, file name) order
    so that the result -- and in particular which files survive ``limit`` -- is
    the same on every run instead of depending on the file system's listing
    order.

    Args:
        root_dir: Directory to search. A missing directory yields an empty list.
        limit: Maximum number of snippets to return. ``None`` (or any other
            falsy value such as ``0``) means "no limit".

    Returns:
        A list of file contents (leading/trailing whitespace stripped). Files
        that are empty after stripping, or that cannot be read, are skipped.
    """
    snippets = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".cpp"):
                continue
            path = os.path.join(root, file)
            try:
                # errors="ignore" drops undecodable bytes instead of raising.
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    code = f.read().strip()
            except OSError:
                # Best effort: skip unreadable files, but do not swallow
                # unrelated exceptions such as KeyboardInterrupt.
                continue
            if code:
                snippets.append(code)
                if limit and len(snippets) >= limit:
                    return snippets
    return snippets

cpp_dir = "sample-data"                                      # Change to your dataset folder
code_snippets = load_cpp_snippets(cpp_dir, limit=100)        # Set limit as needed

# Fail fast on a wrong or empty dataset folder: with no snippets the DataLoader
# would yield no batches, the training loop would run zero steps, and the
# unchanged pretrained model would still be saved as if it had been fine-tuned.
if not code_snippets:
    raise FileNotFoundError(f"No non-empty .cpp files found under {cpp_dir!r}")

print("Data extracted from folder")

Data extracted from folder


In [5]:
# --- Loading the tokenizer, defining the contrastive loss, and batching the data ---

from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F

# Tokenizer for the "microsoft/graphcodebert-base" checkpoint. The training cell
# loads the model from the same checkpoint name and later saves this tokenizer
# next to the fine-tuned weights.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

def tokenize_batch(batch, max_length=256):
    """Tokenize a batch of raw code strings into padded PyTorch tensors.

    Uses the module-level ``tokenizer``.

    Args:
        batch: The strings to tokenize (e.g. one batch from the DataLoader).
        max_length: Maximum number of tokens kept per snippet; longer inputs
            are truncated. Defaults to 256, the value that used to be
            hard-coded.

    Returns:
        The tokenizer output, requested as PyTorch tensors
        (``return_tensors="pt"``) with padding and truncation enabled.
    """
    return tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

# Follows the researchers' (unsupervised SimCSE) loss function: cross-entropy over temperature-scaled cosine similarities
def simcse_unsup_loss(emb1, emb2, temperature=0.1):
    """In-batch contrastive loss between two embedding views of one batch.

    Every row of ``emb1`` is compared by cosine similarity with every row of
    ``emb2``. The similarities are divided by ``temperature`` and used as the
    logits of a cross-entropy loss whose target for row ``i`` is column ``i``,
    i.e. the matching row of ``emb2`` is the positive and all other rows act
    as negatives.

    Args:
        emb1: Embeddings of the first view; the unsqueeze/``dim=2`` pattern
            below expects a 2-D tensor of shape (batch, dim).
        emb2: Embeddings of the second view, same layout as ``emb1``.
        temperature: Divisor applied to the similarities before the softmax.

    Returns:
        Scalar tensor holding the mean cross-entropy over the batch.
    """
    batch_size = emb1.size(0)
    # (batch, 1, dim) against (1, batch, dim) broadcasts to a (batch, batch)
    # matrix of pairwise cosine similarities.
    pairwise = F.cosine_similarity(emb1.unsqueeze(1), emb2.unsqueeze(0), dim=2)
    logits = pairwise / temperature
    # The positive for row i sits on the diagonal.
    targets = torch.arange(batch_size, device=emb1.device)
    return F.cross_entropy(logits, targets)

# Serve the raw snippet strings in shuffled batches of 16; tokenization happens
# later, once per batch, inside the training loop.
dataloader = DataLoader(
    code_snippets,
    batch_size=16,
    shuffle=True,
)
print("Data loaded")


Data loaded


In [6]:
# --- Training the model ---

from transformers import AutoModel
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained encoder and move its parameters to the chosen device.
model = AutoModel.from_pretrained("microsoft/graphcodebert-base")
model = model.to(device)

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Put the model in training mode (dropout layers active).
# NOTE(review): the two forward passes below receive identical inputs, so they
# presumably rely on dropout noise to produce two different views of each
# snippet (unsupervised SimCSE) -- confirm before switching to eval mode here.
model.train()

for epoch in range(3):
    # Running SUM of the per-batch losses for this epoch (not the mean).
    total_loss = 0
    for batch in tqdm(dataloader):
        # Tokenize the raw strings of this batch, then move every tensor of the
        # tokenizer output onto the model's device.
        tokenized = tokenize_batch(batch)
        tokenized = {k: v.to(device) for k, v in tokenized.items()}

        # Encode the same inputs twice to obtain the two views that are
        # contrasted against each other.
        outputs1 = model(**tokenized)
        outputs2 = model(**tokenized)

        # Use the hidden state of the first token as the snippet embedding.
        out1 = outputs1.last_hidden_state[:, 0, :]  # [CLS]
        out2 = outputs2.last_hidden_state[:, 0, :]  # [CLS]


        # Contrastive loss between the two views, followed by one optimizer
        # step; gradients are cleared afterwards so they do not accumulate
        # into the next batch.
        loss = simcse_unsup_loss(out1, out2)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
    
    # The reported value is the sum over all batches of the epoch, so it scales
    # with the number of batches.
    print(f"Epoch {epoch + 1}: Loss = {total_loss:.4f}")

# Save the fine-tuned model and its tokenizer into the same
# "graphcodebert-cpp-simcse" folder (relative to the current working directory).
# The tokenizer call is the last expression of the cell, so the tuple of file
# paths it returns is displayed as the cell output.
model.save_pretrained("graphcodebert-cpp-simcse")
tokenizer.save_pretrained("graphcodebert-cpp-simcse")

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 7/7 [00:36<00:00,  5.16s/it]


Epoch 1: Loss = 7.7987


100%|██████████| 7/7 [00:33<00:00,  4.85s/it]


Epoch 2: Loss = 2.1588


100%|██████████| 7/7 [00:33<00:00,  4.85s/it]


Epoch 3: Loss = 0.8482


('graphcodebert-cpp-simcse\\tokenizer_config.json',
 'graphcodebert-cpp-simcse\\special_tokens_map.json',
 'graphcodebert-cpp-simcse\\vocab.json',
 'graphcodebert-cpp-simcse\\merges.txt',
 'graphcodebert-cpp-simcse\\added_tokens.json',
 'graphcodebert-cpp-simcse\\tokenizer.json')