<a href="https://colab.research.google.com/github/Inzamam1234/PlagiAI_A-Multi_Document_Authenticity_Detection_System/blob/main/AI_Content_Detector_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets torch accelerate scikit-learn wandb

# Check GPU
import torch
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")


GPU Available: True
GPU Device: Tesla T4


In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import json
from datetime import datetime

In [None]:
print("Loading dataset...")


def _hc3_to_frame(hc3_split):
    """Flatten HC3 records into a DataFrame with one row per answer.

    Each record is expected to carry list-valued `human_answers` and
    `chatgpt_answers` fields (the schema described on the HC3 dataset card).
    Human answers get label 0 and ChatGPT answers get label 1; empty or
    whitespace-only answers are dropped.
    """
    texts, labels = [], []
    for record in hc3_split:
        for label, column in ((0, 'human_answers'), (1, 'chatgpt_answers')):
            for answer in record[column]:
                if answer and answer.strip():
                    texts.append(answer)
                    labels.append(label)
    return pd.DataFrame({'text': texts, 'label': labels})


def _split_frame(frame, seed=42):
    """Shuffle `frame` and cut it 70% / 15% / remainder into train/validation/test."""
    shuffled = frame.sample(frac=1, random_state=seed).reset_index(drop=True)
    train_end = int(0.7 * len(shuffled))
    val_end = train_end + int(0.15 * len(shuffled))
    return DatasetDict({
        'train': Dataset.from_pandas(shuffled[:train_end]),
        'validation': Dataset.from_pandas(shuffled[train_end:val_end]),
        'test': Dataset.from_pandas(shuffled[val_end:])
    })


# Option A: Use existing dataset
try:
    hc3 = load_dataset("Hello-SimpleAI/HC3", "all")
except Exception as exc:  # KeyboardInterrupt / SystemExit still propagate
    # Report why the real dataset is unavailable instead of hiding the error.
    print(f"Could not load HC3 ({type(exc).__name__}: {exc})")
    print("Creating synthetic dataset for demonstration...")

    # Option B: Create synthetic dataset (replace with real data)
    human_texts = [
        "The methodology employed in this research encompasses a comprehensive analysis.",
        "Previous studies have demonstrated significant correlations between variables.",
        "Our experimental results indicate a notable improvement over baseline methods.",
    ] * 500  # Replicate for demo

    ai_texts = [
        "This paper explores various aspects of machine learning applications.",
        "The research presents findings that contribute to the field significantly.",
        "We propose a novel approach to address these challenges effectively.",
    ] * 500  # Replicate for demo

    # Only six distinct sentences exist, so the same texts appear in every
    # split; scores obtained on this data do not measure real detection quality.
    print("WARNING: synthetic data repeats 6 sentences across all splits; "
          "metrics on it are not meaningful.")

    df = pd.DataFrame({
        'text': human_texts + ai_texts,
        'label': [0] * len(human_texts) + [1] * len(ai_texts)  # 0=human, 1=AI
    })
    dataset = _split_frame(df)
else:
    # The raw HC3 records have no 'text'/'label' columns and no
    # validation/test splits, both of which the later cells rely on, so
    # convert and split them the same way as the synthetic data.
    dataset = _split_frame(_hc3_to_frame(hc3['train']))
    print("Loaded HC3 dataset")

print(f"Dataset loaded: {dataset}")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

HC3.py: 0.00B [00:00, ?B/s]

Creating synthetic dataset for demonstration...
Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 450
    })
})


In [None]:
# Checkpoint to fine-tune for text classification.
# Alternative: "roberta-base", "bert-base-uncased"
MODEL_NAME = "microsoft/deberta-v3-base"

print(f"Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Two output classes: 0 = human, 1 = AI (see the labels built in the dataset cell).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

print(f"Model loaded on {device}")


Loading model: microsoft/deberta-v3-base


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cuda


In [None]:
# CELL 5: Tokenization
# ============================================================================
def tokenize_function(examples, max_length=512):
    """Tokenize the 'text' field of a batch, truncating to `max_length` tokens.

    No padding is applied here on purpose: `DataCollatorWithPadding` (created
    below) pads each batch to the length of its longest member, which avoids
    padding every short sentence out to `max_length`.

    Args:
        examples: batch mapping that contains a 'text' entry.
        max_length: maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=max_length
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Pads dynamically, batch by batch, at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Tokenizing dataset...


Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy / precision / recall / F1.

    Args:
        eval_pred: pair that unpacks into (per-class scores, true labels);
            the predicted class is the argmax of the scores along axis 1.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'
        (precision/recall/F1 use binary averaging).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    prf = precision_recall_fscore_support(gold, predicted, average='binary')

    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=prf[0],
        recall=prf[1],
        f1=prf[2],
    )

In [None]:
# Directory that receives checkpoints and, later, the final model.
OUTPUT_DIR = "./ai_detector_model"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",   # evaluate on the validation split after each epoch
    save_strategy="epoch",   # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,   # restore the best checkpoint when training ends
    metric_for_best_model="f1",    # "best" is judged by the f1 from compute_metrics
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=100,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # the notebook fail on a CPU-only runtime, so gate it on GPU availability.
    fp16=torch.cuda.is_available(),
    report_to="none"  # Disable wandb for simple setup
)

In [None]:
# Collect the Trainer inputs in one place before building it.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` — confirm against the installed version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune; validation metrics are reported per epoch (see training_args).
print("Starting training...")
train_result = trainer.train()
print("Training completed!")

print(f"Training metrics: {train_result.metrics}")

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1129,0.000216,1.0,1.0,1.0,1.0
2,0.0006,0.0001,1.0,1.0,1.0,1.0
3,0.0003,8.2e-05,1.0,1.0,1.0,1.0


Training completed!
Training metrics: {'train_runtime': 506.7026, 'train_samples_per_second': 12.433, 'train_steps_per_second': 0.782, 'total_flos': 1657629375897600.0, 'train_loss': 0.028808371314421446, 'epoch': 3.0}


In [None]:
print("Evaluating on test set...")
# The test split was passed to the Trainer neither as train nor as eval data.
test_split = tokenized_dataset["test"]
test_results = trainer.evaluate(eval_dataset=test_split)
print(f"Test Results: {test_results}")


Evaluating on test set...


Test Results: {'eval_loss': 0.00021576751896645874, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 9.0558, 'eval_samples_per_second': 49.692, 'eval_steps_per_second': 3.202, 'epoch': 3.0}


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

# Save training metadata: which checkpoint was fine-tuned, when, with which
# arguments, and the metrics measured on the test split.
metadata = dict(
    model_name=MODEL_NAME,
    training_date=datetime.now().isoformat(),
    test_metrics=test_results,
    training_args=training_args.to_dict(),
)

metadata_path = f"{OUTPUT_DIR}/training_metadata.json"
with open(metadata_path, 'w') as handle:
    handle.write(json.dumps(metadata, indent=2))

print(f"Model saved to {OUTPUT_DIR}")


Model saved to ./ai_detector_model


In [None]:
# ============================================================================
# CELL 11: Test Inference
# ============================================================================
from transformers import pipeline

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
pipeline_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

# Two hand-written sentences used as a quick smoke test.
test_texts = [
    "The experimental methodology was carefully designed to ensure reproducibility.",
    "This research explores innovative approaches to solve complex problems efficiently."
]

print("\nTest Predictions:")
for sample in test_texts:
    prediction = classifier(sample)[0]
    # 'LABEL_1' is reported as AI-generated (label 1 = AI in the training data);
    # any other label is reported as human-written.
    if prediction['label'] == 'LABEL_1':
        verdict = "AI-Generated"
    else:
        verdict = "Human-Written"
    print(f"Text: {sample[:60]}...")
    print(f"Prediction: {verdict} (confidence: {prediction['score']:.3f})\n")


Device set to use cuda:0



Test Predictions:
Text: The experimental methodology was carefully designed to ensur...
Prediction: Human-Written (confidence: 0.987)

Text: This research explores innovative approaches to solve comple...
Prediction: AI-Generated (confidence: 1.000)



In [None]:
# ============================================================================
# CELL 12: Download Model (Zip and Download)
# ============================================================================
!zip -r ai_detector_model.zip {OUTPUT_DIR}

from google.colab import files
files.download('ai_detector_model.zip')

print("âœ… Model training complete!")
print("ðŸ“¥ Download ai_detector_model.zip and extract in your VSCode project")
print("   Suggested location: ./models/ai_detector/")

  adding: ai_detector_model/ (stored 0%)
  adding: ai_detector_model/config.json (deflated 54%)
  adding: ai_detector_model/added_tokens.json (stored 0%)
  adding: ai_detector_model/model.safetensors (deflated 23%)
  adding: ai_detector_model/checkpoint-132/ (stored 0%)
  adding: ai_detector_model/checkpoint-132/config.json (deflated 54%)
  adding: ai_detector_model/checkpoint-132/scaler.pt (deflated 64%)
  adding: ai_detector_model/checkpoint-132/added_tokens.json (stored 0%)
  adding: ai_detector_model/checkpoint-132/model.safetensors (deflated 23%)
  adding: ai_detector_model/checkpoint-132/optimizer.pt (deflated 57%)
  adding: ai_detector_model/checkpoint-132/trainer_state.json (deflated 59%)
  adding: ai_detector_model/checkpoint-132/spm.model (deflated 50%)
  adding: ai_detector_model/checkpoint-132/special_tokens_map.json (deflated 50%)
  adding: ai_detector_model/checkpoint-132/tokenizer_config.json (deflated 73%)
  adding: ai_detector_model/checkpoint-132/tokenizer.json (defla

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Model training complete!
📥 Download ai_detector_model.zip and extract in your VSCode project
   Suggested location: ./models/ai_detector/
