<a href="https://colab.research.google.com/github/NasserMohamedEid/Text-AI-Detection/blob/main/RoBERTa_traditional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Install Independencies



In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import csv
import pandas as pd

##Load Data

In [2]:
train_data_path = "/content/drive/MyDrive/Graduation_Project_Ottawa/datasets/dataset collection /all_train_data.csv"
test_data_path="/content/drive/MyDrive/Graduation_Project_Ottawa/datasets/dataset collection /all_test_data.csv"
val_data_path="/content/drive/MyDrive/Graduation_Project_Ottawa/datasets/dataset collection /all_val_data.csv"

In [3]:

train_data= pd.read_csv(train_data_path)
test_data= pd.read_csv(test_data_path)
val_data= pd.read_csv(val_data_path)

In [4]:
train=pd.DataFrame(columns=['text','label'])
validation = pd.DataFrame(columns=['text', 'label'])
test=pd.DataFrame(columns=['text','label'])

In [5]:
X_train=train_data['text'].tolist()
y_train=train_data['label'].tolist()
X_val = val_data['text'].tolist()
y_val = val_data['label'].tolist()
X_test=test_data['text'].tolist()
y_test=test_data['label'].tolist()

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(X_val, truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors="pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
train_encodings

{'input_ids': tensor([[    0,  1620,    38,  ...,     1,     1,     1],
        [    0,  1779,    38,  ...,     1,     1,     1],
        [    0,   243,    16,  ...,     1,     1,     1],
        ...,
        [    0,   673, 13034,  ...,     5, 37407,     2],
        [    0,   133,  2157,  ...,     1,     1,     1],
        [    0,   713,   189,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [8]:
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
test_labels = torch.tensor(y_test)

In [9]:
train_labels

tensor([1, 1, 1,  ..., 0, 1, 0])

##prepare data

In [10]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [11]:
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [12]:
torch.cuda.empty_cache()

In [13]:
batch_size = 20
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

##Modeling

In [14]:
num_classes = 2
num_epochs = 3
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)
model = model.to("cuda")
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset) * num_epochs)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch_idx, batch in enumerate(train_loader):
        batch = {key: val.to("cuda") for key, val in batch.items()}  # Move data to GPU
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        if (batch_idx + 1) % 100 == 0:  # Print every 100 batches
            print(f"Epoch {epoch + 1}/{num_epochs} - Batch {batch_idx + 1}/{len(train_loader)} - Loss: {total_loss / 100:.4f}")
            total_loss = 0.0

    # Validation
    model.eval()
    all_val_outputs = []

    with torch.no_grad():
        for val_batch in val_loader:
            val_batch = {key: val.to("cuda") for key, val in val_batch.items()}  # Move data to GPU
            val_outputs = model(**val_batch)
            all_val_outputs.extend(val_outputs.logits.argmax(dim=1).cpu().numpy())

    val_preds = torch.tensor(all_val_outputs)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{num_epochs} - Validation Accuracy: {val_acc:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1/3 - Batch 100/2141 - Loss: 0.2927
Epoch 1/3 - Batch 200/2141 - Loss: 0.0853
Epoch 1/3 - Batch 300/2141 - Loss: 0.0631
Epoch 1/3 - Batch 400/2141 - Loss: 0.0497
Epoch 1/3 - Batch 500/2141 - Loss: 0.0246
Epoch 1/3 - Batch 600/2141 - Loss: 0.0343
Epoch 1/3 - Batch 700/2141 - Loss: 0.0240
Epoch 1/3 - Batch 800/2141 - Loss: 0.0187
Epoch 1/3 - Batch 900/2141 - Loss: 0.0375
Epoch 1/3 - Batch 1000/2141 - Loss: 0.0142
Epoch 1/3 - Batch 1100/2141 - Loss: 0.0221
Epoch 1/3 - Batch 1200/2141 - Loss: 0.0263
Epoch 1/3 - Batch 1300/2141 - Loss: 0.0169
Epoch 1/3 - Batch 1400/2141 - Loss: 0.0174
Epoch 1/3 - Batch 1500/2141 - Loss: 0.0118
Epoch 1/3 - Batch 1600/2141 - Loss: 0.0344
Epoch 1/3 - Batch 1700/2141 - Loss: 0.0202
Epoch 1/3 - Batch 1800/2141 - Loss: 0.0237
Epoch 1/3 - Batch 1900/2141 - Loss: 0.0216
Epoch 1/3 - Batch 2000/2141 - Loss: 0.0163
Epoch 1/3 - Batch 2100/2141 - Loss: 0.0254
Epoch 1/3 - Validation Accuracy: 0.9746


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 2/3 - Batch 100/2141 - Loss: 0.0147
Epoch 2/3 - Batch 200/2141 - Loss: 0.0119
Epoch 2/3 - Batch 300/2141 - Loss: 0.0035
Epoch 2/3 - Batch 400/2141 - Loss: 0.0138
Epoch 2/3 - Batch 500/2141 - Loss: 0.0015
Epoch 2/3 - Batch 600/2141 - Loss: 0.0165
Epoch 2/3 - Batch 700/2141 - Loss: 0.0105
Epoch 2/3 - Batch 800/2141 - Loss: 0.0103
Epoch 2/3 - Batch 900/2141 - Loss: 0.0082
Epoch 2/3 - Batch 1000/2141 - Loss: 0.0077
Epoch 2/3 - Batch 1100/2141 - Loss: 0.0118
Epoch 2/3 - Batch 1200/2141 - Loss: 0.0135
Epoch 2/3 - Batch 1300/2141 - Loss: 0.0102
Epoch 2/3 - Batch 1400/2141 - Loss: 0.0044
Epoch 2/3 - Batch 1500/2141 - Loss: 0.0105
Epoch 2/3 - Batch 1600/2141 - Loss: 0.0148
Epoch 2/3 - Batch 1700/2141 - Loss: 0.0074
Epoch 2/3 - Batch 1800/2141 - Loss: 0.0068
Epoch 2/3 - Batch 1900/2141 - Loss: 0.0093
Epoch 2/3 - Batch 2000/2141 - Loss: 0.0076
Epoch 2/3 - Batch 2100/2141 - Loss: 0.0055
Epoch 2/3 - Validation Accuracy: 0.9882


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 3/3 - Batch 100/2141 - Loss: 0.0075
Epoch 3/3 - Batch 200/2141 - Loss: 0.0117
Epoch 3/3 - Batch 300/2141 - Loss: 0.0029
Epoch 3/3 - Batch 400/2141 - Loss: 0.0085
Epoch 3/3 - Batch 500/2141 - Loss: 0.0155
Epoch 3/3 - Batch 600/2141 - Loss: 0.0109
Epoch 3/3 - Batch 700/2141 - Loss: 0.0141
Epoch 3/3 - Batch 800/2141 - Loss: 0.0055
Epoch 3/3 - Batch 900/2141 - Loss: 0.0042
Epoch 3/3 - Batch 1000/2141 - Loss: 0.0161
Epoch 3/3 - Batch 1100/2141 - Loss: 0.0080
Epoch 3/3 - Batch 1200/2141 - Loss: 0.0009
Epoch 3/3 - Batch 1300/2141 - Loss: 0.0035
Epoch 3/3 - Batch 1400/2141 - Loss: 0.0066
Epoch 3/3 - Batch 1500/2141 - Loss: 0.0094
Epoch 3/3 - Batch 1600/2141 - Loss: 0.0039
Epoch 3/3 - Batch 1700/2141 - Loss: 0.0092
Epoch 3/3 - Batch 1800/2141 - Loss: 0.0174
Epoch 3/3 - Batch 1900/2141 - Loss: 0.0055
Epoch 3/3 - Batch 2000/2141 - Loss: 0.0119
Epoch 3/3 - Batch 2100/2141 - Loss: 0.0017
Epoch 3/3 - Validation Accuracy: 0.9899


In [16]:
model.eval()
all_test_outputs = []

with torch.no_grad():
    for test_batch in test_loader:
        test_batch = {key: val.to("cuda") for key, val in test_batch.items()}  # Move data to GPU
        test_outputs = model(**test_batch)
        all_test_outputs.extend(test_outputs.logits.argmax(dim=1).cpu().numpy())

test_preds = torch.tensor(all_test_outputs)
test_acc = accuracy_score(test_labels, test_preds)

print(f"Test Accuracy: {test_acc:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Accuracy: 0.9927


In [17]:
model.save_pretrained("/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional")

In [18]:
tokenizer.save_pretrained("/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional")

('/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional/tokenizer_config.json',
 '/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional/special_tokens_map.json',
 '/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional/vocab.json',
 '/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional/merges.txt',
 '/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional/added_tokens.json')

##check the saved model

In [19]:
output_dir = "/content/drive/MyDrive/Graduation_Project_Ottawa/models/traditional"
loaded_tokenizer = RobertaTokenizer.from_pretrained(output_dir)
loaded_model = RobertaForSequenceClassification.from_pretrained(output_dir)
loaded_model = loaded_model.to("cuda")

In [23]:
loaded_model.eval()
all_test_outputs = []

with torch.no_grad():
    for test_batch in test_loader:
        test_batch = {key: val.to("cuda") for key, val in test_batch.items()}  # Move data to GPU
        test_outputs = loaded_model(**test_batch)
        all_test_outputs.extend(test_outputs.logits.argmax(dim=1).cpu().numpy())

test_preds = torch.tensor(all_test_outputs)
test_acc = accuracy_score(test_labels, test_preds)

print(f"Test Accuracy: {test_acc:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Accuracy: 0.9927


In [20]:
def classify_text(input_text, model, tokenizer):
    # Tokenize the input text
    input_encoding = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

    # Move the input tensors to the same device as the model
    input_encoding = {key: val.to(model.device) for key, val in input_encoding.items()}

    # Perform classification using the loaded model
    with torch.no_grad():
        output = model(**input_encoding)

    # Get the predicted label
    predicted_label = torch.argmax(output.logits, dim=1).item()

    return predicted_label

In [21]:
input_text = "Nineteen Eighty-Four (also published as 1984) is a dystopian novel and cautionary tale by English writer George Orwell. It was published on 8 June 1949 by Secker & Warburg as Orwell's ninth and final book completed in his lifetime. Thematically, it centres on the consequences of totalitarianism, mass surveillance and repressive regimentation of people and behaviours within society.[2][3] Orwell, a democratic socialist, modelled the authoritarian state in the novel on the Soviet Union in the era of Stalinism, and Nazi Germany.[4] More broadly, the novel examines the role of truth and facts within societies and the ways in which they can be manipulated."
result = classify_text(input_text, loaded_model, loaded_tokenizer)

In [22]:
result

0