In [1]:
! pip install accelerate
! pip install transformers
! pip install torch
! pip install sentence-transformers
! pip install pandas
! pip install datasets




[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import ast
import os

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def ast_tokenizer(code):
    """Convert Python source into the sequence of its AST node type names.

    The source is parsed with ``ast.parse`` and the resulting tree is
    traversed with ``ast.walk``; the class name of every visited node is
    collected in traversal order.

    Args:
        code: Python source code to parse.

    Returns:
        list[str]: Node type names such as ``"Module"`` or ``"Assign"``.
        An empty list is returned when ``code`` cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except Exception:
        # Best-effort tokenisation: any input that cannot be parsed (invalid
        # syntax, a non-string value, ...) yields no tokens. Catching
        # ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can still
        # be interrupted.
        return []
    return [type(node).__name__ for node in ast.walk(tree)]

def ast_to_str(tokens):
    """Join token strings into one space-delimited string.

    Args:
        tokens: Iterable of strings to join.

    Returns:
        str: The tokens separated by single spaces; an empty string when
        ``tokens`` is empty.
    """
    separator = " "
    return separator.join(tokens)

In [4]:
class CodeBERTSimilarity(nn.Module):
    """Siamese similarity model: a shared pretrained encoder plus an MLP head.

    Both inputs are encoded by the same encoder; the element-wise absolute
    difference of the two first-token embeddings is fed to a small
    feed-forward head that ends in a sigmoid.
    """

    def __init__(self, model_name="microsoft/codebert-base"):
        """Load the pretrained encoder and build the scoring head.

        Args:
            model_name: Identifier or path passed to
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own configuration instead of
        # hard-coding 768, so a ``model_name`` with a different hidden width
        # still produces a compatible head.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def encode(self, x):
        """Return the encoder's first-token embedding for every sequence.

        Args:
            x: Mapping of keyword arguments forwarded to the encoder as
                ``self.bert(**x)``.

        Returns:
            The slice ``last_hidden_state[:, 0, :]`` of the encoder output.
        """
        outputs = self.bert(**x)
        return outputs.last_hidden_state[:, 0, :]  # [CLS] token

    def forward(self, x1, x2):
        """Score a batch of input pairs.

        Args:
            x1: Encoder inputs for the first element of each pair.
            x2: Encoder inputs for the second element of each pair.

        Returns:
            Output of the head applied to ``|encode(x1) - encode(x2)|``; the
            final sigmoid keeps every value between 0 and 1.
        """
        v1 = self.encode(x1)
        v2 = self.encode(x2)
        diff = torch.abs(v1 - v2)
        return self.fc(diff)


In [5]:
class CodeSimilarityDataset(Dataset):
    """Dataset of code pairs read from a CSV file, served as AST-token strings.

    Each item is built from the ``text1``, ``text2`` and ``similarity``
    columns of one row.
    """

    def __init__(self, csv_file, tokenizer, part=1):
        """Read the CSV and keep the leading ``1/part`` fraction of its rows.

        Args:
            csv_file: Path of the CSV file to load with ``pandas.read_csv``.
            tokenizer: Stored on the instance as ``self.tokenizer``; it is not
                used by the methods of this class.
            part: Divisor applied to the row count; only the first
                ``len(data) // part`` rows are kept.
        """
        self.data = pd.read_csv(csv_file)
        self.data = self.data[:len(self.data)//part]
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows kept from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(code1, code2, label)`` for the row at ``idx``.

        When either snippet of a row produces an empty token string, the
        following rows are tried in order (wrapping around to the start) until
        a usable one is found.

        Args:
            idx: Positional row index.

        Returns:
            tuple: The two space-joined AST token strings and a float32
            tensor holding the row's ``similarity`` value in one element.

        Raises:
            RuntimeError: If no row of the dataset yields two non-empty
                token strings.
        """
        total = len(self)
        position = idx
        # Bounded scan instead of recursion: a long run of unparsable rows can
        # no longer exhaust the call stack, and a dataset with no usable row
        # fails with a clear message. ``max(total, 1)`` still lets an empty
        # dataset raise the ordinary IndexError from ``iloc``.
        for _ in range(max(total, 1)):
            row = self.data.iloc[position]
            code1 = ast_to_str(ast_tokenizer(row['text1']))
            code2 = ast_to_str(ast_tokenizer(row['text2']))
            if code1 and code2:
                label = torch.tensor([row['similarity']], dtype=torch.float32)
                return code1, code2, label
            position = (position + 1) % total

        raise RuntimeError(
            "CodeSimilarityDataset: no row yields two non-empty AST token strings"
        )

In [6]:
# Tokenizers already loaded by _get_tokenizer, keyed by model name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name="microsoft/codebert-base"):
    """Return the tokenizer for ``model_name``, loading it only on first use."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(model_name)
    return _TOKENIZER_CACHE[model_name]


def collate_fn(batch, tokenizer=None):
    """Tokenize a batch of ``(code1, code2, label)`` samples.

    Args:
        batch: Sequence of ``(code1, code2, label)`` tuples, where the codes
            are strings and each label is a tensor.
        tokenizer: Optional tokenizer to use. When omitted, the
            ``microsoft/codebert-base`` tokenizer is loaded once and reused
            for later calls instead of being reloaded for every batch.

    Returns:
        tuple: ``(x1, x2, labels)`` - the tokenizer outputs for the first and
        second snippets (padded, truncated to 512 tokens, PyTorch tensors)
        and the labels stacked along a new leading dimension.
    """
    codes1, codes2, labels = zip(*batch)
    if tokenizer is None:
        tokenizer = _get_tokenizer()

    x1 = tokenizer(list(codes1), return_tensors="pt", truncation=True, padding=True, max_length=512)
    x2 = tokenizer(list(codes2), return_tensors="pt", truncation=True, padding=True, max_length=512)
    labels = torch.stack(labels)

    return x1, x2, labels

In [7]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Module called as ``model(x1, x2)``.
        dataloader: Iterable yielding ``(x1, x2, label)`` batches, where
            ``x1`` and ``x2`` are mappings of tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(output, label)``.
        device: Device that every batch tensor is moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for x1, x2, label in dataloader:
        # Move each mapping by its own keys, so x2 is handled correctly even
        # if its key set differs from x1's.
        x1 = {name: tensor.to(device) for name, tensor in x1.items()}
        x2 = {name: tensor.to(device) for name, tensor in x2.items()}
        label = label.to(device)

        optimizer.zero_grad()
        output = model(x1, x2)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches (rather than calling len(dataloader)) also supports
    # iterables without a length and makes the empty case explicit.
    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / num_batches

In [8]:
def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without gradients; return mean loss.

    Args:
        model: Module called as ``model(x1, x2)``.
        dataloader: Iterable yielding ``(x1, x2, label)`` batches, where
            ``x1`` and ``x2`` are mappings of tensors.
        criterion: Loss called as ``criterion(output, label)``.
        device: Device that every batch tensor is moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for x1, x2, label in dataloader:
            # Move each mapping by its own keys, so x2 is handled correctly
            # even if its key set differs from x1's.
            x1 = {name: tensor.to(device) for name, tensor in x1.items()}
            x2 = {name: tensor.to(device) for name, tensor in x2.items()}
            label = label.to(device)

            output = model(x1, x2)
            loss = criterion(output, label)
            total_loss += loss.item()
            num_batches += 1

    # Counting batches (rather than calling len(dataloader)) also supports
    # iterables without a length and makes the empty case explicit.
    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / num_batches

In [18]:
# --- Run configuration ---
epochs = 3
batch_size = 8
lr = 2e-5
seed = 42  # fixes head initialisation, the train/val split and batch shuffling

# Seed PyTorch's global RNG before anything random happens so that a
# Restart & Run All reproduces the same run.
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = CodeBERTSimilarity().to(device)

# part=10 keeps only the first tenth of the CSV rows.
dataset = CodeSimilarityDataset("Input/pl_pairs.csv", tokenizer, part=10)

# 80/20 train/validation split, drawn from a dedicated seeded generator so the
# split does not depend on how much of the global RNG was consumed earlier.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(seed),
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# BCELoss expects probabilities; the model's head ends in a sigmoid.
criterion = nn.BCELoss()

for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = validate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")



Epoch 1/3 - Train Loss: 0.3143 - Val Loss: 0.2971




Epoch 2/3 - Train Loss: 0.2946 - Val Loss: 0.3083




Epoch 3/3 - Train Loss: 0.2916 - Val Loss: 0.2761


In [19]:
# Persist the fine-tuned encoder together with its tokenizer.
model.bert.save_pretrained("Models/finetuned_codebert/bert")
tokenizer.save_pretrained("Models/finetuned_codebert/bert")

# Persist the weights of the scoring head. torch.save opens the target file
# directly and fails if its directory is missing, so create the directory first.
# NOTE(review): the head goes to "Models/finetuned_model/" while the encoder
# goes to "Models/finetuned_codebert/" - confirm the differing directory is
# intended.
fc_head_path = "Models/finetuned_model/fc_head.pth"
os.makedirs(os.path.dirname(fc_head_path), exist_ok=True)
torch.save(model.fc.state_dict(), fc_head_path)