In [None]:
!pip install transformers datasets huggingface_hub tensorboard==2.11

In [None]:
!pip install git-lfs --yes

In [None]:
!pip install torchvision 

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the two CSV files stored under ../data/ai and ../data/human.
ai_data = pd.read_csv("../data/ai/ai_data.csv")
human_data = pd.read_csv("../data/human/human_data.csv")

In [None]:
# Stack both frames (human rows first, then AI rows) and renumber the index from 0.
data = pd.concat(
    objs=[human_data, ai_data],
    ignore_index=True,
)

In [None]:
# First carve off 80% of the rows for training; the same seed is used for both
# splits so the partition is reproducible.
train_data, remaining_data = train_test_split(
    data,
    random_state=42,
    train_size=0.8,
)

# Then halve what is left into validation and test sets.
validation_data, test_data = train_test_split(
    remaining_data,
    random_state=42,
    train_size=0.5,
)

# Show the number of rows in each split.
tuple(len(split) for split in (train_data, validation_data, test_data))

In [None]:
# For each split, pull out the 'Code' column (model input) and the 'Label' column (target).
train_texts, train_labels = train_data['Code'], train_data['Label']
val_texts, val_labels = validation_data['Code'], validation_data['Label']
test_texts, test_labels = test_data['Code'], test_data['Label']

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Convert each pandas Series to a plain Python list before tokenizing.
train_texts_list, val_texts_list, test_texts_list = (
    texts.tolist() for texts in (train_texts, val_texts, test_texts)
)

# Tokenize every split with identical settings: truncation and padding enabled.
train_encodings, val_encodings, test_encodings = (
    tokenizer(text_list, truncation=True, padding=True)
    for text_list in (train_texts_list, val_texts_list, test_texts_list)
)

In [None]:
from sklearn.preprocessing import LabelEncoder

class CodeDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Fit the label encoder on the training labels only, then reuse that fitted
# mapping for the validation and test labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_labels) for split_labels in (val_labels, test_labels)
)

# Wrap each split's encodings and encoded labels in a CodeDataset.
train_dataset = CodeDataset(train_encodings, train_labels_encoded)
val_dataset = CodeDataset(val_encodings, val_labels_encoded)
test_dataset = CodeDataset(test_encodings, test_labels_encoded)

# Batches of 16 for every split; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = torch.utils.data.DataLoader(val_dataset, shuffle=False, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=16)



In [None]:
import torch
import torch.nn as nn
from transformers import RobertaPreTrainedModel, RobertaModel

class RobertaForCustomSequenceClassification(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a small MLP head with a single sigmoid output.

    The head maps the encoder hidden state at sequence position 0 through
    ``hidden_size -> 128 -> 64 -> 1`` and a sigmoid, so the value returned to
    callers is a probability in (0, 1), not an unbounded logit. It is still
    published under the ``'logits'`` key, which is what existing callers read.

    Args:
        config: Model configuration; ``config.hidden_size`` sizes the first
            linear layer and ``config.num_labels`` is stored on the instance.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)

        # Classification head; the layer order is kept as-is so parameter names
        # (classifier.0 / classifier.3 / classifier.5) match existing checkpoints.
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 128),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.sigmoid = nn.Sigmoid()
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and head, optionally computing the training loss.

        Args:
            input_ids: Token ids forwarded to the RoBERTa encoder.
            attention_mask: Attention mask forwarded to the RoBERTa encoder.
            labels: Optional 0/1 targets; flattened and cast to float for the loss.

        Returns:
            When ``labels`` is given, a dict with ``'loss'`` and ``'logits'``
            (the sigmoid probabilities). Otherwise the probability tensor alone.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # First encoder output, sliced at sequence position 0.
        sequence_output = outputs[0]
        raw_logits = self.classifier(sequence_output[:, 0, :])
        probabilities = self.sigmoid(raw_logits)

        if labels is None:
            return probabilities

        # BCEWithLogitsLoss applies the sigmoid itself, so it must receive the
        # raw head output. Passing the probabilities (as was done before)
        # squashes the values twice and distorts both loss and gradients.
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(raw_logits.view(-1), labels.view(-1).float())

        return {'loss': loss, 'logits': probabilities}

# A single output unit: the custom head emits one probability per example.
config = AutoConfig.from_pretrained("roberta-base", num_labels=1)

# Load the pretrained roberta-base encoder weights. Constructing the class
# directly from `config` only builds the architecture with freshly initialised
# parameters, so the model would be trained from scratch rather than fine-tuned.
# The classifier head has no pretrained counterpart and is still newly initialised.
model = RobertaForCustomSequenceClassification.from_pretrained("roberta-base", config=config)



In [None]:
# Install one of these is enough
# !pip install accelerate -U
!pip install transformers[torch]

In [None]:
# Hyperparameters and output locations handed to the Trainer.
training_args = TrainingArguments(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule: three passes over the training set, 500 warmup steps.
    num_train_epochs=3,
    warmup_steps=500,
    # Regularisation.
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

In [None]:
def _compute_accuracy(pred):
    """Return accuracy for a single-output sigmoid classifier.

    The model emits one probability per example, so predictions have a single
    column. Taking ``argmax(-1)`` over that column always yields 0, which made
    the previous metric report the share of class-0 labels instead of accuracy.
    The probability is thresholded at 0.5 instead.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids`` arrays.

    Returns:
        Dict with the fraction of examples whose thresholded prediction
        matches the label, under the key ``"accuracy"``.
    """
    predicted = (pred.predictions.reshape(-1) >= 0.5).astype(int)
    expected = pred.label_ids.reshape(-1)
    return {"accuracy": (predicted == expected).astype(float).mean()}


trainer = Trainer(
    model=model,                         # the instantiated model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=_compute_accuracy,   # accuracy at a 0.5 probability threshold
)

In [None]:
# Run training with the model, arguments and datasets passed to `trainer` above.
trainer.train()

In [None]:
# Evaluate on the trainer's eval dataset (the validation split); test_dataset is not used here.
trainer.evaluate()