In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs stored there can be
# read by the following cells. `google.colab` is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# All three splits live in the same Drive folder; keep the location in one place.
DATA_DIR = "/content/drive/MyDrive/CodeClones"
# Columns the later cells read from every split.
REQUIRED_COLUMNS = {"func_x", "func_y", "Label"}

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
valid = pd.read_csv(f"{DATA_DIR}/valid.csv")

# Fail fast (with a readable message) if a split lacks a required column,
# instead of hitting a KeyError deep inside preprocessing or training.
for split_name, split_df in (("train", train), ("test", test), ("valid", valid)):
    missing_columns = REQUIRED_COLUMNS - set(split_df.columns)
    assert not missing_columns, (
        f"{split_name}.csv is missing columns: {sorted(missing_columns)}"
    )


In [None]:
import re
# Comments and string/char literals are matched in ONE left-to-right pass so
# that whichever construct starts first wins: a `//` inside "http://..." stays
# part of the string, and a quote inside a comment stays part of the comment.
_COMMENT_OR_LITERAL = re.compile(
    r'//[^\n]*'               # line comment (also matches at end of input)
    r'|/\*[\s\S]*?\*/'        # block comment, possibly spanning lines
    r'|"(?:\\.|[^"\\])*"'     # double-quoted string literal
    r"|'(?:\\.|[^'\\])*'"     # single-quoted char literal
)

# Java numeric literals: hex, binary, or decimal with optional fraction,
# exponent, digit-separating underscores and an l/f/d type suffix.
_NUMERIC_LITERAL = re.compile(
    r'\b(?:0[xX][0-9a-fA-F_]+'
    r'|0[bB][01_]+'
    r'|\d[\d_]*(?:\.\d[\d_]*)?(?:[eE][+-]?\d+)?)'
    r'[lLfFdD]?\b'
)


def _replace_comment_or_literal(match):
    """Map a comment to a blank and a string/char literal to a placeholder."""
    # Only comments start with '/'; literals start with a quote character.
    # The surrounding blanks keep neighbouring tokens from being glued together.
    return ' ' if match.group(0).startswith('/') else ' STRING_LITERAL '


def preprocess_java_code(code):
    """Normalize a Java snippet into a space-separated, lower-case token string.

    Steps:
      1. Drop ``//`` and ``/* */`` comments and replace string/char literals
         with the placeholder ``STRING_LITERAL``.
      2. Replace numeric literals with the placeholder ``NUMERIC_LITERAL``.
      3. Lower-case everything (placeholders included).
      4. Split into word tokens and single punctuation characters and join
         them with single spaces, which also collapses all whitespace.

    Args:
        code: Java source text as a ``str``.

    Returns:
        The normalized token string.
    """
    # 1. Comments and string/char literals, handled together (see pattern note).
    code = _COMMENT_OR_LITERAL.sub(_replace_comment_or_literal, code)

    # 2. Numeric literals
    code = _NUMERIC_LITERAL.sub('NUMERIC_LITERAL', code)

    # 3. Normalize case
    code = code.lower()

    # 4. Tokenize: runs of word characters, or any single non-space symbol
    tokens = re.findall(r'\w+|[^\w\s]', code)

    # 5. Re-join with single spaces (removes all other whitespace)
    return ' '.join(tokens)

In [None]:
# Add normalized versions of both snippets of every pair to each split:
# func_x -> function1 and func_y -> function2.
for split_frame in (train, test, valid):
    for raw_column, clean_column in (('func_x', 'function1'), ('func_y', 'function2')):
        split_frame[clean_column] = split_frame[raw_column].apply(preprocess_java_code)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer

class CodeCloneDataset(Dataset):
    """Torch dataset that yields tokenized code pairs with their label.

    Each row of ``dataframe`` must provide the columns ``function1`` and
    ``function2`` (source text) and ``Label`` (numeric).

    Args:
        dataframe: pandas DataFrame holding the pairs.
        tokenizer: optional tokenizer to use; when omitted, the
            ``microsoft/codebert-base`` RoBERTa tokenizer is loaded.
        max_length: every snippet is truncated/padded to this many tokens.
    """

    def __init__(self, dataframe, tokenizer=None, max_length=512):
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        """Number of code pairs."""
        return len(self.dataframe)

    def _encode(self, code):
        """Tokenize one snippet to fixed-length tensors of shape (1, max_length)."""
        return self.tokenizer(
            code,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )

    def __getitem__(self, idx):
        """Return token ids and attention masks of both snippets plus the label.

        The attention masks mark real tokens with 1 and padding with 0, so a
        consumer can keep the encoder from attending to padding.
        """
        # Look the row up once instead of once per column.
        row = self.dataframe.iloc[idx]

        encoded1 = self._encode(row['function1'])
        encoded2 = self._encode(row['function2'])

        # squeeze(0) removes only the batch dimension added by return_tensors='pt'
        # (a bare squeeze() would also drop the sequence dimension if it were 1).
        return {
            'input_ids1': encoded1['input_ids'].squeeze(0),
            'attention_mask1': encoded1['attention_mask'].squeeze(0),
            'input_ids2': encoded2['input_ids'].squeeze(0),
            'attention_mask2': encoded2['attention_mask'].squeeze(0),
            'label': torch.tensor(float(row['Label']), dtype=torch.float32)
        }


In [None]:
from torch.utils.data import DataLoader

# Wrap each split (`train`, `valid`, `test`) in a dataset and a DataLoader.
BATCH_SIZE = 4

# Only the training data is shuffled; evaluation order does not affect the
# metrics, so validation/test keep a fixed order for reproducible runs.
train_dataset = CodeCloneDataset(train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataset = CodeCloneDataset(valid)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = CodeCloneDataset(test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer, AdamW

class SiameseCodeBERT(nn.Module):
    """One shared RoBERTa/CodeBERT encoder used for both branches of a Siamese pair.

    Args:
        model_name: checkpoint name or path passed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super(SiameseCodeBERT, self).__init__()
        self.codebert = RobertaModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask=None):
        """Encode a batch of token-id sequences into one vector per sequence.

        Args:
            input_ids: integer tensor of token ids, one row per sequence.
            attention_mask: optional mask (1 = real token, 0 = padding). When
                omitted it is derived from the encoder's ``pad_token_id`` so
                padded positions are not attended to; without a mask the
                encoder would treat every padding token as real input.

        Returns:
            The encoder's ``pooler_output``: the first-token hidden state after
            the model's pooling head (not the raw first-token embedding).
        """
        if attention_mask is None:
            pad_token_id = getattr(self.codebert.config, 'pad_token_id', None)
            if pad_token_id is not None:
                attention_mask = input_ids.ne(pad_token_id).long()
        outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.pooler_output


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SiameseCodeBERT('microsoft/codebert-base').to(device)

# Optimizer: torch.optim.AdamW replaces the deprecated transformers.AdamW.
# eps and weight_decay are set to the values transformers.AdamW used by default
# (1e-6 and 0.0) so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [None]:
import torch
def compute_loss(out1, out2, labels, margin=1.0):
    """Contrastive loss over a batch of embedding pairs.

    Pairs with ``labels == 0`` are penalized by their squared Euclidean
    distance (pulled together); pairs with ``labels == 1`` are penalized by
    the squared shortfall of their distance below ``margin`` (pushed apart).
    NOTE(review): this treats label 1 as "dissimilar" -- confirm that this
    matches the meaning of the dataset's ``Label`` column.

    Args:
        out1: embeddings of the first element of each pair.
        out2: embeddings of the second element of each pair.
        labels: per-pair labels (0 or 1) as a float tensor.
        margin: distance beyond which label-1 pairs contribute no loss.

    Returns:
        Scalar tensor: the mean loss over the batch.
    """
    distance = torch.nn.functional.pairwise_distance(out1, out2)
    # Term active for label 0: grows with the distance between the embeddings.
    pull_term = (1 - labels) * distance.pow(2)
    # Term active for label 1: zero once the pair is at least `margin` apart.
    push_term = labels * torch.clamp(margin - distance, min=0.0).pow(2)
    return (pull_term + push_term).mean()

# def compute_loss(out1, out2, labels):
#     # Calculate Euclidean distance
#     euclidean_distance = torch.nn.functional.pairwise_distance(out1, out2)
#     # Contrastive Loss
#     loss_contrastive = torch.mean((1-labels) * torch.pow(euclidean_distance, 2) +
#                                   (labels) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
#     return loss_contrastive


In [None]:
# Training loop
# Import only the name that is used instead of `from tqdm import *`.
from tqdm import tqdm

num_epochs = 3


def _pair_loss(batch):
    """Encode both snippets of a batch with the shared model and return the loss."""
    input_ids1 = batch['input_ids1'].to(device)
    input_ids2 = batch['input_ids2'].to(device)
    labels = batch['label'].to(device)
    out1 = model(input_ids1)
    out2 = model(input_ids2)
    return compute_loss(out1, out2, labels)


for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}/{num_epochs}')

    # Train: one optimizer step per batch.
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, f"Training :Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        loss = _pair_loss(batch)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'training loss: {total_loss/len(train_loader)}')

    # Evaluate on the validation split without tracking gradients.
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in valid_loader:
            total_loss += _pair_loss(batch).item()
    print(f'Validation loss: {total_loss/len(valid_loader)}')

Starting epoch 1/3


Training :Epoch 1/3: 100%|██████████| 2094/2094 [26:54<00:00,  1.30it/s]


training loss: 0.8819970927223616
Validation loss: 0.11233298957996
Starting epoch 2/3


Training :Epoch 2/3: 100%|██████████| 2094/2094 [27:15<00:00,  1.28it/s]


training loss: 0.2575129350076261
Validation loss: 0.11295038982598128
Starting epoch 3/3


Training :Epoch 3/3: 100%|██████████| 2094/2094 [27:15<00:00,  1.28it/s]


training loss: 0.2544745570466762
Validation loss: 0.10914368355839417


In [None]:
# Evaluate the trained model on the test split (`test_loader`).
from sklearn.metrics import confusion_matrix
import numpy as np

# A pair is predicted as label 1 when the distance between its two embeddings
# exceeds this value (label 1 is the class the loss pushes apart).
DISTANCE_THRESHOLD = 0.4

model.eval()
all_labels = []
all_predictions = []
total_loss = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids1 = batch['input_ids1'].to(device)
        input_ids2 = batch['input_ids2'].to(device)
        labels = batch['label'].to(device)

        out1 = model(input_ids1)
        out2 = model(input_ids2)

        loss = compute_loss(out1, out2, labels)
        total_loss += loss.item()

        # Computing predictions
        euclidean_distance = torch.nn.functional.pairwise_distance(out1, out2)
        predictions = (euclidean_distance > DISTANCE_THRESHOLD).long()
        # Collect plain Python ints so the confusion matrix sees classes 0 and 1.
        all_labels.extend(labels.long().cpu().tolist())
        all_predictions.extend(predictions.cpu().tolist())

avg_loss = total_loss / len(test_loader)

# labels=[0, 1] forces a 2x2 matrix even if one class never occurs, so the
# indexing below cannot fail.
cm = confusion_matrix(all_labels, all_predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm)

accuracy = np.trace(cm) / np.sum(cm)
print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

# Accuracy alone is misleading when the classes are imbalanced, so also report
# precision / recall / F1 for label 1 (guarding against empty denominators).
tn, fp, fn, tp = cm.ravel()
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'Label-1 Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')



Confusion Matrix:
[[1428   53]
 [ 192   17]]
Test Loss: 0.1083, Test Accuracy: 0.8550


# Increasing the number of epochs (for example, to 15) may give a higher score.