In [1]:
!pip install transformers datasets huggingface_hub tensorboard==2.11

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard==2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl.metadata (1.9 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard==2.11)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0 (from tensorboard==2.11)
  Downloading tensorboard_data_server-0.6.1-py3-none-any.whl.metadata (1.1 kB)
Collecting tensorboard-plugin-wit>=1.6.0 (from tensorboard==2.11)
  Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl.metadata (873 bytes)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.4.16-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m5.6 MB/s[0m eta [36m0:

In [2]:
!pip install git-lfs --yes


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: --yes


In [3]:
!pip install torchvision 

Collecting torchvision
  Downloading torchvision-0.18.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting torch==2.3.0 (from torchvision)
  Downloading torch-2.3.0-cp310-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.3.0->torchvision)
  Downloading typing_extensions-4.11.0-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch==2.3.0->torchvision)
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting mpmath>=0.19 (from sympy->torch==2.3.0->torchvision)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torchvision-0.18.0-cp310-cp310-macosx_11_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading torch-2.3.0-cp310-none-macosx_11_0_arm64.whl (61.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 MB[0m [31m32.0 MB/s[0m eta [36m0:00:0

In [4]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
# Load the two source CSV files; they are combined into one frame later on.
ai_data = pd.read_csv("../data/AI_generated/AI_written_10000.csv")
human_data = pd.read_csv("../data/Real_Code/code_data.csv")

In [18]:
# Stack both frames (human rows first, then AI rows) and renumber the index
# from 0 so rows from the two files do not share index values.
data = pd.concat(
    (human_data, ai_data),
    ignore_index=True,
)

In [21]:
# 80/10/10 split: carve off the training portion first, then halve what is
# left into validation and test sets. The seed is fixed for repeatability.
train_data, remaining_data = train_test_split(
    data,
    train_size=0.8,
    random_state=42,
)
validation_data, test_data = train_test_split(
    remaining_data,
    train_size=0.5,
    random_state=42,
)

# Show the number of rows in each split as the cell output.
tuple(len(split) for split in (train_data, validation_data, test_data))

(16000, 2000, 2000)

In [22]:
# Model inputs: the 'Code' column of each split.
train_texts = train_data['Code']
val_texts = validation_data['Code']
test_texts = test_data['Code']

# Targets: the 'Label' column of each split.
train_labels = train_data['Label']
val_labels = validation_data['Label']
test_labels = test_data['Label']

In [23]:
# Tokenizer matching the `roberta-base` checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# The tokenizer is called on plain Python lists, so convert each Series first.
train_texts_list, val_texts_list, test_texts_list = (
    texts.tolist() for texts in (train_texts, val_texts, test_texts)
)

# Tokenize every split with truncation and padding enabled.
train_encodings, val_encodings, test_encodings = (
    tokenizer(text_list, truncation=True, padding=True)
    for text_list in (train_texts_list, val_texts_list, test_texts_list)
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [24]:
from sklearn.preprocessing import LabelEncoder

class CodeDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence; it
            must support ``.items()`` and each value must be indexable by
            integer position.
        labels: One label per example. May be a list, a NumPy array, or a
            pandas Series (with any index).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features and label of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # A pandas Series indexes by *label* with `[]`; after a shuffled
        # train/test split its index is no longer 0..n-1, so `labels[idx]`
        # would raise KeyError or pick the wrong row. Use positional access
        # whenever the container offers it.
        if hasattr(self.labels, "iloc"):
            label = self.labels.iloc[idx]
        else:
            label = self.labels[idx]
        item['labels'] = torch.tensor(label)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [25]:
# Fit the encoder on the training labels only, then reuse that same mapping
# for the validation and test labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_labels)
    for split_labels in (val_labels, test_labels)
)

# Pair each split's encodings with its encoded labels.
train_dataset, val_dataset, test_dataset = (
    CodeDataset(split_encodings, split_targets)
    for split_encodings, split_targets in (
        (train_encodings, train_labels_encoded),
        (val_encodings, val_labels_encoded),
        (test_encodings, test_labels_encoded),
    )
)

# Batches of 16 for every split; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader, test_loader = (
    torch.utils.data.DataLoader(held_out, batch_size=16, shuffle=False)
    for held_out in (val_dataset, test_dataset)
)



In [26]:
# Load the pretrained `roberta-base` weights with a two-class classification
# head on top.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# !pip install accelerate -U
!pip install transformers[torch]

zsh:1: no matches found: transformers[torch]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Length of training and of the learning-rate warmup.
    num_train_epochs=3,
    warmup_steps=500,
    # Per-device batch sizes for training and for evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Weight-decay strength.
    weight_decay=0.01,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [29]:
def _accuracy(pred):
    """Return the share of examples whose arg-max prediction equals the label."""
    predicted = pred.predictions.argmax(-1)
    return {"accuracy": (pred.label_ids == predicted).astype(float).mean()}


# Wire the model, hyperparameters, datasets and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_accuracy,
)

In [30]:
# Run fine-tuning; the value returned by `train()` is shown as the cell output.
train_output = trainer.train()
train_output

Step,Training Loss


KeyboardInterrupt: 