In [1]:
#!pip install transformers datasets huggingface_hub tensorboard==2.11

In [2]:
#!pip install git-lfs

In [3]:
#!pip install torchvision 

In [4]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Read the code dataset from CSV into a DataFrame; the path is relative to the
# current working directory.
human_data = pd.read_csv("../data/Real_Code/code_data.csv")

In [7]:
# Preview the first rows to check which columns were loaded
human_data.head()

Unnamed: 0,Code,Label,Number of Characters,Number of Lines
0,'use strict';\n\nvar clear = require(...,human-written,2730,80
1,'use strict';\n\nconst TYPE = Symbol.for('type...,human-written,266,20
2,package sodium\n\n// #cgo pkg-config: libsodiu...,human-written,322,19
3,\nfunction collectWithWildcard(test) {\n\ttest...,human-written,886,49
4,<?php\n\ninterface Container {\n /**\n ...,human-written,165,12


In [8]:
# Carve off 80% of the rows for training, then halve what is left into a
# validation set and a test set. Both splits use the same fixed seed.
train_data, remaining_data = train_test_split(
    human_data,
    train_size=0.8,
    random_state=42,
)
validation_data, test_data = train_test_split(
    remaining_data,
    train_size=0.5,
    random_state=42,
)

# Display the number of rows in each split
tuple(map(len, (train_data, validation_data, test_data)))

(8000, 1000, 1000)

In [9]:
# Source text (model input) for each split
train_texts = train_data['Code']
val_texts = validation_data['Code']
test_texts = test_data['Code']

# Matching label column (prediction target) for each split
train_labels = train_data['Label']
val_labels = validation_data['Label']
test_labels = test_data['Label']

In [10]:
# Fast tokenizer matching the "roberta-base" checkpoint
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Convert each split's text column to a plain Python list before tokenizing
train_texts_list, val_texts_list, test_texts_list = (
    column.tolist() for column in (train_texts, val_texts, test_texts)
)

# Tokenize every split with truncation and padding enabled
# (train first, then validation, then test)
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts_list, val_texts_list, test_texts_list)
)

In [25]:
from sklearn.preprocessing import LabelEncoder

class CodeDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-sample encodings with one label each.

    Args:
        encodings: Mapping from feature name to a per-sample sequence;
            ``encodings[key][i]`` must be convertible with ``torch.tensor``.
        labels: Iterable holding one numeric label per sample, in the same
            order as the encodings. Lists, NumPy arrays and pandas Series are
            all accepted.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a plain list so ``self.labels[idx]`` is
        # always a *positional* lookup. A pandas Series taken from a shuffled
        # split keeps its original index, so ``series[idx]`` would look up by
        # index label instead (KeyError, or silently the wrong row).
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, target under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

In [26]:
# Fit the label encoder on the training labels only, then reuse the fitted
# mapping for the validation and test labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_labels)
    for split_labels in (val_labels, test_labels)
)

# Wrap each split's encodings and encoded labels in a CodeDataset
train_dataset = CodeDataset(train_encodings, train_labels_encoded)
val_dataset = CodeDataset(val_encodings, val_labels_encoded)
test_dataset = CodeDataset(test_encodings, test_labels_encoded)

# Batch iterators (batch size 16); only the training loader shuffles
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
)



In [27]:
# Load the "roberta-base" checkpoint into a sequence-classification model
# configured for 2 labels.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Hyper-parameters and output locations passed to the Trainer
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # schedule: three training epochs, 500 learning-rate warm-up steps
    num_train_epochs=3,
    warmup_steps=500,
    # per-device batch sizes for training and for evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # weight-decay strength
    weight_decay=0.01,
)

In [29]:
def _compute_accuracy(pred):
    """Return ``{"accuracy": ...}``: the fraction of samples whose arg-max
    prediction over the last axis equals the reference label id."""
    predicted_ids = pred.predictions.argmax(-1)
    is_correct = pred.label_ids == predicted_ids
    return {"accuracy": is_correct.astype(float).mean()}


# Wire the model, training arguments, datasets and metric into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_compute_accuracy,
)

In [30]:
# Run the Trainer's training loop on the training dataset
trainer.train()

Step,Training Loss


KeyboardInterrupt: 