In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [2]:
# Hub id of the pretrained checkpoint shared by the tokenizer and the classifier.
CHECKPOINT = "microsoft/codebert-base"
# Width of the classification head (the notebook assumes 5 patterns).
NUM_PATTERNS = 5

tokenizer = RobertaTokenizer.from_pretrained(CHECKPOINT)
# The classifier head is not part of the checkpoint, so it is freshly
# initialised here and has to be trained before its predictions mean anything.
model = RobertaForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=NUM_PATTERNS,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Integer class ids for the design patterns present in the toy dataset.
ADAPTER, SINGLETON = 0, 1

# (source snippet, class id) pairs; expanded below into records with the
# "code" and "label" keys that the tokenization cells read.
_labelled_snippets = (
    (
        "class Adapter { constructor() { this.adaptee = new Adaptee(); } method() { this.adaptee.call(); } }",
        ADAPTER,
    ),
    (
        "const instance = Singleton.getInstance();",
        SINGLETON,
    ),
)

data = [
    {"code": snippet, "label": class_id}
    for snippet, class_id in _labelled_snippets
]

In [4]:
# Encode every snippet in one call: padded to a common length, truncated to
# at most 512 tokens, and returned as PyTorch tensors.
inputs = tokenizer(
    [sample["code"] for sample in data],
    return_tensors="pt",
    max_length=512,
    padding=True,
    truncation=True,
)
# Class ids as a tensor, in the same order as the encoded snippets.
labels = torch.tensor([sample["label"] for sample in data])

In [11]:
# Pull the snippets and their class ids out of the records as two parallel,
# index-aligned Python lists.
codes = [record["code"] for record in data]
labels = [record["label"] for record in data]

# The code samples are tokenized in the next cell.


In [12]:
# Encode all snippets as one padded/truncated batch of PyTorch tensors
# (at most 512 tokens per snippet).
encoded_inputs = tokenizer(
    codes,
    return_tensors="pt",
    max_length=512,
    padding=True,
    truncation=True,
)

# Ordered (unshuffled) 80/20 split: the first `train_size` samples are used
# for training, everything after them for validation.
train_size = int(0.8 * len(data))
train_rows = slice(None, train_size)
val_rows = slice(train_size, None)

# Apply the same row ranges to every encoded field and to the label list so
# that inputs and labels stay aligned.
train_inputs = {name: column[train_rows] for name, column in encoded_inputs.items()}
val_inputs = {name: column[val_rows] for name, column in encoded_inputs.items()}
train_labels = torch.tensor(labels[train_rows])
val_labels = torch.tensor(labels[val_rows])



In [16]:
class CodeDataset(Dataset):
    """Map-style dataset pairing tokenized code snippets with class labels.

    Args:
        inputs: Mapping that provides indexable 'input_ids' and
            'attention_mask' entries.
        labels: Indexable sequence of labels; its length defines the
            dataset length.
    """

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        # The label sequence alone decides how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with 'input_ids', 'attention_mask' and 'labels'."""
        sample = {
            field: self.inputs[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a CodeDataset
# (one instance for training, one for validation).
train_dataset = CodeDataset(inputs=train_inputs, labels=train_labels)
val_dataset = CodeDataset(inputs=val_inputs, labels=val_labels)



In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for model checkpoints
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=2,  # Batch size per device (train)
    per_device_eval_batch_size=2,  # Batch size per device (eval)
    # Warm up over the first 10% of optimizer steps. A fixed warmup_steps=500
    # is longer than the entire run on a dataset this small, which would keep
    # the learning rate at the very start of its ramp for all of training.
    warmup_ratio=0.1,
    weight_decay=0.01,  # Weight decay to apply (if any)
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=10,  # Log every 10 steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save the model after each epoch
    disable_tqdm=True,  # Turn off progress bars
)

# Initialize the Trainer with the model, training arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



