<a href="https://colab.research.google.com/github/NasserMohamedEid/Text-AI-Detection/blob/main/RoBERTa_traditional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Dependencies



In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import csv
import pandas as pd

## Load Data

In [None]:
# Locations of the train / test / validation CSV splits on the mounted Google Drive.
# The space before the final "/" in the folder name is present in the original paths.
train_data_path, test_data_path, val_data_path = (
    "/content/drive/MyDrive/Graduation_Project_Ottawa/datasets/dataset collection /all_train_data.csv",
    "/content/drive/MyDrive/Graduation_Project_Ottawa/datasets/dataset collection /all_test_data.csv",
    "/content/drive/MyDrive/Graduation_Project_Ottawa/datasets/dataset collection /all_val_data.csv",
)

In [None]:

# Read the three CSV splits, in the order train, test, validation.
train_data, test_data, val_data = [
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, test_data_path, val_data_path)
]

In [None]:
# Three independent, empty frames sharing the ('text', 'label') column layout.
# NOTE(review): none of these frames is referenced by the other visible cells.
train, validation, test = (
    pd.DataFrame(columns=['text', 'label']) for _ in range(3)
)

In [None]:
# Pull the 'text' and 'label' columns of each split out as plain Python lists.
X_train, y_train = train_data['text'].tolist(), train_data['label'].tolist()
X_val, y_val = val_data['text'].tolist(), val_data['label'].tolist()
X_test, y_test = test_data['text'].tolist(), test_data['label'].tolist()

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenise every split with the same settings (truncation, padding, PyTorch
# tensors as the return type), one split at a time.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    for texts in (X_train, X_val, X_test)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Display the tokenised training split ('input_ids' and 'attention_mask' tensors).
train_encodings

{'input_ids': tensor([[    0,  1620,    38,  ...,     1,     1,     1],
        [    0,  1779,    38,  ...,     1,     1,     1],
        [    0,   243,    16,  ...,     1,     1,     1],
        ...,
        [    0,   673, 13034,  ...,     5, 37407,     2],
        [    0,   133,  2157,  ...,     1,     1,     1],
        [    0,   713,   189,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Wrap each split's label list in a tensor (train, validation, test).
train_labels, val_labels, test_labels = (
    torch.tensor(split_labels) for split_labels in (y_train, y_val, y_test)
)

In [None]:
# Display the training label tensor as a quick sanity check.
train_labels

tensor([1, 1, 1,  ..., 0, 1, 0])

In [None]:
class CustomDataset(Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            collection that holds one entry per example.
        labels: Indexable collection of labels; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_own_tensor(value):
        """Return ``value`` as a tensor that shares no storage with its source.

        Calling ``torch.tensor`` on an existing tensor emits a copy-construct
        UserWarning on every access, so existing tensors are copied with
        ``detach().clone()`` instead. Non-tensor values (lists, ints, ...)
        still go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a 'labels' entry."""
        item = {key: self._as_own_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_own_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels collection."""
        return len(self.labels)


In [None]:
# Pair each split's encodings with its labels (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# Ask PyTorch to release its cached CUDA memory before the data loaders and
# the model are created in the following cells.
torch.cuda.empty_cache()

In [None]:
batch_size = 20

# Only the training split is shuffled. Validation and test keep their dataset
# order so the collected predictions line up with val_labels / test_labels.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_split, batch_size=batch_size, shuffle=False)
    for eval_split in (val_dataset, test_dataset)
)

## Modeling

In [None]:
num_classes = 2
num_epochs = 3

# Pretrained RoBERTa with a sequence-classification head sized for num_classes labels.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)
model = model.to("cuda")
optimizer = AdamW(model.parameters(), lr=1e-5)

# The training loop calls scheduler.step() once per batch, so the schedule
# length must be counted in batches (len(train_loader)), not in examples
# (len(train_dataset)). Counting examples made the schedule roughly
# batch_size times too long, so the learning rate barely decayed.
total_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(num_epochs):
    # ---- Training pass over the whole training loader ----
    model.train()
    total_loss = 0.0  # loss accumulated since the last progress print

    for batch_idx, batch in enumerate(train_loader):
        # Move data to GPU
        batch = {name: tensor.to("cuda") for name, tensor in batch.items()}

        optimizer.zero_grad()
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        # Report the mean loss of the last 100 batches, then reset the accumulator.
        if (batch_idx + 1) % 100 != 0:
            continue
        print(f"Epoch {epoch + 1}/{num_epochs} - Batch {batch_idx + 1}/{len(train_loader)} - Loss: {total_loss / 100:.4f}")
        total_loss = 0.0

    # ---- Validation pass: collect the predicted class of every example ----
    model.eval()
    all_val_outputs = []

    with torch.no_grad():
        for val_batch in val_loader:
            # Move data to GPU
            val_batch = {name: tensor.to("cuda") for name, tensor in val_batch.items()}
            predicted_classes = model(**val_batch).logits.argmax(dim=1)
            all_val_outputs.extend(predicted_classes.cpu().numpy())

    val_preds = torch.tensor(all_val_outputs)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{num_epochs} - Validation Accuracy: {val_acc:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1/3 - Batch 100/2141 - Loss: 0.3319
Epoch 1/3 - Batch 200/2141 - Loss: 0.0868
Epoch 1/3 - Batch 300/2141 - Loss: 0.0724
Epoch 1/3 - Batch 400/2141 - Loss: 0.0492
Epoch 1/3 - Batch 500/2141 - Loss: 0.0393
Epoch 1/3 - Batch 600/2141 - Loss: 0.0386
Epoch 1/3 - Batch 700/2141 - Loss: 0.0264
Epoch 1/3 - Batch 800/2141 - Loss: 0.0166
Epoch 1/3 - Batch 900/2141 - Loss: 0.0517
Epoch 1/3 - Batch 1000/2141 - Loss: 0.0255
Epoch 1/3 - Batch 1100/2141 - Loss: 0.0315
Epoch 1/3 - Batch 1200/2141 - Loss: 0.0266
Epoch 1/3 - Batch 1300/2141 - Loss: 0.0211
Epoch 1/3 - Batch 1400/2141 - Loss: 0.0163
Epoch 1/3 - Batch 1500/2141 - Loss: 0.0245
Epoch 1/3 - Batch 1600/2141 - Loss: 0.0215
Epoch 1/3 - Batch 1700/2141 - Loss: 0.0190
Epoch 1/3 - Batch 1800/2141 - Loss: 0.0317
Epoch 1/3 - Batch 1900/2141 - Loss: 0.0068
Epoch 1/3 - Batch 2000/2141 - Loss: 0.0182
Epoch 1/3 - Batch 2100/2141 - Loss: 0.0211
Epoch 1/3 - Validation Accuracy: 0.9925


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 2/3 - Batch 100/2141 - Loss: 0.0093
Epoch 2/3 - Batch 200/2141 - Loss: 0.0144
Epoch 2/3 - Batch 300/2141 - Loss: 0.0127
Epoch 2/3 - Batch 400/2141 - Loss: 0.0120
Epoch 2/3 - Batch 500/2141 - Loss: 0.0136
Epoch 2/3 - Batch 600/2141 - Loss: 0.0089
Epoch 2/3 - Batch 700/2141 - Loss: 0.0124
Epoch 2/3 - Batch 800/2141 - Loss: 0.0151
Epoch 2/3 - Batch 900/2141 - Loss: 0.0052
Epoch 2/3 - Batch 1000/2141 - Loss: 0.0139
Epoch 2/3 - Batch 1100/2141 - Loss: 0.0059
Epoch 2/3 - Batch 1200/2141 - Loss: 0.0071
Epoch 2/3 - Batch 1300/2141 - Loss: 0.0043
Epoch 2/3 - Batch 1400/2141 - Loss: 0.0181
Epoch 2/3 - Batch 1500/2141 - Loss: 0.0085
Epoch 2/3 - Batch 1600/2141 - Loss: 0.0082
Epoch 2/3 - Batch 1700/2141 - Loss: 0.0119
Epoch 2/3 - Batch 1800/2141 - Loss: 0.0105
Epoch 2/3 - Batch 1900/2141 - Loss: 0.0115
Epoch 2/3 - Batch 2000/2141 - Loss: 0.0116
Epoch 2/3 - Batch 2100/2141 - Loss: 0.0043
Epoch 2/3 - Validation Accuracy: 0.9963


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 3/3 - Batch 100/2141 - Loss: 0.0046
Epoch 3/3 - Batch 200/2141 - Loss: 0.0081
Epoch 3/3 - Batch 300/2141 - Loss: 0.0087
Epoch 3/3 - Batch 400/2141 - Loss: 0.0086
Epoch 3/3 - Batch 500/2141 - Loss: 0.0022
Epoch 3/3 - Batch 600/2141 - Loss: 0.0106
Epoch 3/3 - Batch 700/2141 - Loss: 0.0068
Epoch 3/3 - Batch 800/2141 - Loss: 0.0028
Epoch 3/3 - Batch 900/2141 - Loss: 0.0132
Epoch 3/3 - Batch 1000/2141 - Loss: 0.0062
Epoch 3/3 - Batch 1100/2141 - Loss: 0.0038
Epoch 3/3 - Batch 1200/2141 - Loss: 0.0339
Epoch 3/3 - Batch 1300/2141 - Loss: 0.0065
Epoch 3/3 - Batch 1400/2141 - Loss: 0.0044
Epoch 3/3 - Batch 1500/2141 - Loss: 0.0105
Epoch 3/3 - Batch 1600/2141 - Loss: 0.0121
Epoch 3/3 - Batch 1700/2141 - Loss: 0.0079
Epoch 3/3 - Batch 1800/2141 - Loss: 0.0060
Epoch 3/3 - Batch 1900/2141 - Loss: 0.0028
Epoch 3/3 - Batch 2000/2141 - Loss: 0.0067
Epoch 3/3 - Batch 2100/2141 - Loss: 0.0025
Epoch 3/3 - Validation Accuracy: 0.9778


In [None]:
# Final evaluation on the held-out test split, with gradients disabled.
model.eval()
all_test_outputs = []

with torch.no_grad():
    for test_batch in test_loader:
        # Move data to GPU
        test_batch = {name: tensor.to("cuda") for name, tensor in test_batch.items()}
        predicted_classes = model(**test_batch).logits.argmax(dim=1)
        all_test_outputs.extend(predicted_classes.cpu().numpy())

# Compare the collected predictions with the test labels.
test_preds = torch.tensor(all_test_outputs)
test_acc = accuracy_score(test_labels, test_preds)

print(f"Test Accuracy: {test_acc:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


In [None]:
# Persist the fine-tuned model through its save_pretrained method.
# NOTE(review): "path/to/save/model" looks like a placeholder — confirm the intended directory.
save_directory = "path/to/save/model"
model.save_pretrained(save_directory)