<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/roberta_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face Transformers with its `torch` extra, then install or
# upgrade `accelerate` to the newest available release (`-U`).
!pip install transformers[torch]
!pip install accelerate -U


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Load the dataset: read the combined mutation-results CSV into a DataFrame.
# Later statements read its mutation text columns and its 'class_type' column.
file_path = "/content/combined_mutation_results (1).csv"  # Replace with the actual path to your CSV
df = pd.read_csv(file_path)

# Preprocess the text data
def preprocess_text(text):
    """Return ``text`` lower-cased, tolerating missing or non-string cells.

    Args:
        text: A cell value from a text column. Usually a ``str``; may be
            ``None`` or a float NaN when the CSV cell was empty.

    Returns:
        The lower-cased string. Missing values (``None`` / NaN) become the
        empty string; any other non-string value is converted with ``str``
        before lower-casing.
    """
    if isinstance(text, str):
        return text.lower()
    # NaN is the only value that compares unequal to itself. pandas uses it
    # for empty CSV cells, and calling ``.lower()`` on it raises
    # AttributeError, so map missing values to an empty string instead.
    if text is None or text != text:
        return ""
    return str(text).lower()

# Text columns holding the mutated variants that are merged into one input
mutation_columns = [
    'char_mutated_article',
    'char_mutated_adjective',
    'char_mutated_adverb',
    'word_mutated_articles',
    'word_mutated_adjectives',
    'word_mutated_adverbs',
]

# Lower-case each mutation column in place
for col in mutation_columns:
    df[col] = df[col].apply(preprocess_text)

# Build a single space-separated string per row from all mutation columns
df['mutations_combined'] = df[mutation_columns].apply(' '.join, axis=1)

# Hold out 20% of the rows for testing, then 10% of the remainder for validation
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Shuffle each split with a fixed seed and renumber its rows from zero
train_df, val_df, test_df = (
    shuffle(part, random_state=42).reset_index(drop=True)
    for part in (train_df, val_df, test_df)
)

# Output file names for the three splits
train_file = "train.csv"
val_file = "validation.csv"
test_file = "test.csv"

# Write each split to its CSV file without the index column
for split_frame, split_path in (
    (train_df, train_file),
    (val_df, val_file),
    (test_df, test_file),
):
    split_frame.to_csv(split_path, index=False)

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the combined text of the training and validation splits with the
# same truncation / padding settings
tokenize_options = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_df['mutations_combined'].tolist(), **tokenize_options)
val_encodings = tokenizer(val_df['mutations_combined'].tolist(), **tokenize_options)

# Binary targets: 'human' is class 0, every other listed class_type is class 1
label_mapping = {
    'human': 0,
    'bot': 1,
    'rnn': 1,
    'gpt2': 1,
    'others': 1,
}
train_labels = train_df['class_type'].map(label_mapping).tolist()
val_labels = val_df['class_type'].map(label_mapping).tolist()

# Create PyTorch datasets
class DetectionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a sequence that can be indexed
            by sample position (one entry per sample).
        labels: Sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of the requested sample to a tensor,
        # then attach the sample's label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their labels as PyTorch datasets
train_dataset = DetectionDataset(train_encodings, train_labels)
val_dataset = DetectionDataset(val_encodings, val_labels)

# Load pre-trained RoBERTa-Base model
roberta_base_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# The evaluation-schedule keyword of TrainingArguments was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# The install cell does not pin a version, so pick whichever name the
# installed release accepts instead of hard-coding one of them.
import inspect

_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define the training arguments with reduced batch size
training_args = TrainingArguments(
    output_dir="./roberta_base_model",
    num_train_epochs=50,
    per_device_train_batch_size=32,  # Reduce the batch size here
    per_device_eval_batch_size=32,    # Reduce the batch size here
    logging_dir="./logs",
    logging_steps=100,
    eval_steps=100,  # evaluate on the validation set every 100 steps
    learning_rate=1e-4,
    load_best_model_at_end=True,
    **{_eval_strategy_key: "steps"},
)

# Define the Trainer with the updated training arguments; accuracy is computed
# from the arg-max over the last axis of the predictions.
trainer = Trainer(
    model=roberta_base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, pred.predictions.argmax(-1))},
)

# Train the model
trainer.train()


In [None]:
import pandas as pd
# Reload the test split from disk -- presumably the test.csv saved by the
# training cell (assumes the notebook's working directory is /content).
test_df = pd.read_csv("/content/test.csv")

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
from tabulate import tabulate

def evaluate_mutation(trainer, tokenizer, test_df, mutation_column, label_mapping):
    """Score the trained classifier on one text column of the test set.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes a
            ``predictions`` array (assumed to hold per-class logits).
        tokenizer: Callable applied to the list of texts with
            ``truncation=True, padding=True, max_length=128``.
        test_df: DataFrame containing ``mutation_column`` and ``class_type``.
        mutation_column: Name of the text column to evaluate.
        label_mapping: Mapping from ``class_type`` values to integer labels.

    Returns:
        Tuple ``(accuracy, auc)``: accuracy of the arg-max predictions and
        ROC AUC computed from the soft-max score of class index 1.
    """
    encoded_texts = tokenizer(
        test_df[mutation_column].tolist(),
        truncation=True,
        padding=True,
        max_length=128,
    )
    gold_labels = test_df['class_type'].map(label_mapping).tolist()

    # Run inference on this column's texts
    output = trainer.predict(DetectionDataset(encoded_texts, gold_labels))
    logits = output.predictions

    # Hard decisions for accuracy, soft-max scores for AUC
    hard_predictions = logits.argmax(-1)
    class_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)

    return (
        accuracy_score(gold_labels, hard_predictions),
        roc_auc_score(gold_labels, class_probs[:, 1]),
    )

# Evaluate each mutation separately including original text
mutations = ['original_text', 'char_mutated_article', 'char_mutated_adjective', 'char_mutated_adverb',
             'word_mutated_articles', 'word_mutated_adjectives', 'word_mutated_adverbs']

results = []
for mutation in mutations:
    mutation_accuracy, mutation_auc = evaluate_mutation(trainer, tokenizer, test_df, mutation, label_mapping)
    # The unmutated column is scored against every class_type label, not only
    # human-written samples, so label it 'Original Text' (the name the plotting
    # cells use for the same column) and leave the loop variable untouched.
    row_label = 'Original Text' if mutation == 'original_text' else mutation.capitalize()
    results.append([row_label, mutation_accuracy, mutation_auc])

# Print one table row per evaluated column
print("Roberta Base")
print(tabulate(results, headers=["Mutation", "Accuracy", "AUC"], tablefmt="fancy_grid"))


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the source and destination paths
source_path = "./roberta_base_model"  # Path to the folder you want to copy
destination_path = "/content/drive/MyDrive/roberta_base_model"  # Path in your Google Drive

# Copy the folder to your Google Drive (`cp -r` leaves the source in place)
!cp -r $source_path $destination_path


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def evaluate_mutation(trainer, tokenizer, test_df, mutation_column, label_mapping):
    """Compute the ROC AUC of the trained classifier on one text column.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes a
            ``predictions`` array (assumed to hold per-class logits).
        tokenizer: Callable applied to the list of texts with
            ``truncation=True, padding=True, max_length=128``.
        test_df: DataFrame containing ``mutation_column`` and ``class_type``.
        mutation_column: Name of the text column to evaluate.
        label_mapping: Mapping from ``class_type`` values to integer labels.

    Returns:
        The ROC AUC computed from the soft-max score of class index 1.
    """
    column_texts = test_df[mutation_column].tolist()
    encoded = tokenizer(column_texts, truncation=True, padding=True, max_length=128)

    targets = test_df['class_type'].map(label_mapping).tolist()
    dataset = DetectionDataset(encoded, targets)

    # Inference, then soft-max over the last axis to get class scores
    raw_scores = trainer.predict(dataset).predictions
    scores = torch.nn.functional.softmax(torch.tensor(raw_scores), dim=-1)

    return roc_auc_score(targets, scores[:, 1])

# Columns to evaluate: the original text followed by each mutation variant
mutations = [
    'original_text',
    'char_mutated_article',
    'char_mutated_adjective',
    'char_mutated_adverb',
    'word_mutated_articles',
    'word_mutated_adjectives',
    'word_mutated_adverbs',
]

# Collect one AUC value and one tick label per evaluated column
auc_scores = []
mutation_labels = []
for mutation in mutations:
    mutation_auc = evaluate_mutation(trainer, tokenizer, test_df, mutation, label_mapping)
    auc_scores.append(mutation_auc)
    mutation_labels.append(
        'Original Text' if mutation == 'original_text' else mutation.capitalize()
    )

# One bar colour per evaluated column
colors = ['skyblue', 'salmon', 'lightgreen', 'lightcoral', 'orchid', 'gold', 'cyan']

# Bar chart of AUC per column
plt.figure(figsize=(8, 5))  # Adjust the figure size as needed
bar_positions = np.arange(len(mutations))
plt.bar(bar_positions, auc_scores, color=colors)
plt.xlabel('Mutation')
plt.ylabel('AUC')
plt.title('Area Under the Curve (AUC) for Various Mutations')
plt.xticks(np.arange(len(mutations)), mutation_labels, rotation=45, ha='right')
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np

def evaluate_mutation(trainer, tokenizer, test_df, mutation_column, label_mapping):
    """Compute ROC curve points of the trained classifier on one text column.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes a
            ``predictions`` array (assumed to hold per-class logits).
        tokenizer: Callable applied to the list of texts with
            ``truncation=True, padding=True, max_length=128``.
        test_df: DataFrame containing ``mutation_column`` and ``class_type``.
        mutation_column: Name of the text column to evaluate.
        label_mapping: Mapping from ``class_type`` values to integer labels.

    Returns:
        Tuple ``(fpr, tpr)`` as returned by ``roc_curve`` for the soft-max
        score of class index 1; the thresholds are discarded.
    """
    tokenized = tokenizer(
        test_df[mutation_column].tolist(), truncation=True, padding=True, max_length=128
    )
    true_labels = test_df['class_type'].map(label_mapping).tolist()

    # Predict on this column and turn the raw outputs into class scores
    prediction_output = trainer.predict(DetectionDataset(tokenized, true_labels))
    class_scores = torch.nn.functional.softmax(
        torch.tensor(prediction_output.predictions), dim=-1
    )

    false_positive_rate, true_positive_rate, _ = roc_curve(true_labels, class_scores[:, 1])
    return false_positive_rate, true_positive_rate

# Evaluate each mutation separately including original text
mutations = ['original_text', 'char_mutated_article', 'char_mutated_adjective', 'char_mutated_adverb',
             'word_mutated_articles', 'word_mutated_adjectives', 'word_mutated_adverbs']

plt.figure(figsize=(8, 6))

# Draw one ROC curve per evaluated column
for position, mutation in enumerate(mutations):
    fpr, tpr = evaluate_mutation(trainer, tokenizer, test_df, mutation, label_mapping)
    if mutation == 'original_text':
        mutation_label = 'Original Text'
        color = 'blue'
    else:
        # Build the legend entry from the full column name. Taking only the
        # second '_'-separated token yields 'Mutated' for every column, which
        # makes the curves indistinguishable in the legend.
        mutation_label = mutation.replace('_', ' ').capitalize()
        color = 'C' + str(position)  # Use a unique color index for each mutation
    plt.plot(fpr, tpr, label=mutation_label, color=color)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Various Mutations')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
