In [None]:
# ========================================
# 📦 1. Install Required Dependencies
# ========================================
!pip install transformers datasets scikit-learn newspaper3k PyMuPDF

# ========================================
# 🏗️ 2. Setup & Initialization
# ========================================
import pandas as pd
import torch
import newspaper
import fitz  # PyMuPDF for PDF processing
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ========================================
# 🌐 3. Load & Preprocess Dataset (LIAR)
# ========================================
def load_and_prepare_data():
    """
    Loads the LIAR dataset, maps original 6-class labels to binary labels,
    and returns train/val/test as HuggingFace Datasets.
    """
    dataset = load_dataset("liar")

    df_train = pd.DataFrame(dataset['train'])
    df_val = pd.DataFrame(dataset['validation'])
    df_test = pd.DataFrame(dataset['test'])

    def map_labels(label):
        if isinstance(label, str):
            label = int(label)
        # Map to binary: 0 = Real, 1 = Fake
        return 0 if label in [4, 5] else 1

    # Apply label mapping and clean data
    for df in [df_train, df_val, df_test]:
        df['label'] = df['label'].apply(map_labels).astype(int)
        df.dropna(subset=['label', 'statement'], inplace=True)

    return {
        'train': Dataset.from_pandas(df_train),
        'validation': Dataset.from_pandas(df_val),
        'test': Dataset.from_pandas(df_test)
    }

dataset = load_and_prepare_data()

# ========================================
# 🔤 4. Tokenization
# ========================================
# Load pretrained Roberta tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(batch):
    """
    Tokenizes the 'statement' field with padding and truncation.
    """
    return roberta_tokenizer(batch['statement'], truncation=True, padding='max_length', max_length=128)

# Apply tokenization to each dataset split
for split in dataset:
    dataset[split] = dataset[split].map(tokenize, batched=True)
    dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# ========================================
# 🏋️ 5. Model Setup & Fine-Tuning
# ========================================
# Load pre-trained Roberta model for binary classification
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    problem_type="single_label_classification"
)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",                     # Output directory for checkpoints
    eval_strategy="epoch",                      # Evaluation frequency
    learning_rate=2e-5,                          # Learning rate
    per_device_train_batch_size=16,             # Training batch size
    per_device_eval_batch_size=16,              # Evaluation batch size
    num_train_epochs=2,                         # Number of epochs
    weight_decay=0.01,                          # Weight decay
    logging_steps=50,                           # Logging interval
    save_strategy="epoch",                      # Save after every epoch
    load_best_model_at_end=True,                # Load best model based on metric
    metric_for_best_model="accuracy"            # Use accuracy to choose best model
)

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Setup Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics
)

# Train the model
print("🚀 Starting fine-tuning...")
trainer.train()

# Evaluate on test set
print("\n🧪 Evaluating on test set...")
test_results = trainer.evaluate(dataset['test'])

print("\n📊 Test set results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

# ========================================
# 🌐 6. Input Parser (Text | URL | PDF)
# ========================================
def extract_text(input_type, value):
    """
    Extracts raw text from a URL, PDF, or plain text.
    """
    try:
        if input_type == 'url':
            article = newspaper.Article(value)
            article.download()
            article.parse()
            return article.text
        elif input_type == 'pdf':
            with fitz.open(value) as doc:
                return "\n".join([page.get_text() for page in doc])
        elif input_type == 'text':
            return value
        else:
            return "Invalid input type"
    except Exception as e:
        return f"Error processing input: {str(e)}"

# ========================================
# 🧠 7. Prediction Only (No Explanation)
# ========================================
def classify_text(text):
    """
    Predicts the label (Real or Fake) and confidence from a given input text.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    confidence = probs[0][pred].item()
    label = "Real" if pred == 0 else "Fake"
    return label, confidence

# ========================================
# 🚀 8. Full Inference Pipeline
# ========================================
def analyze_news(input_type, value):
    """
    End-to-end pipeline that:
    1. Extracts text
    2. Classifies as Fake/Real
    3. Returns result with confidence
    """
    print(f"\n📥 Processing {input_type} input...")
    text = extract_text(input_type, value)

    if text.startswith("Error") or text == "Invalid input type":
        print(f"❌ Error: {text}")
        return

    print(f"\n📄 Extracted text preview:\n{text[:500]}...\n")

    label, confidence = classify_text(text)

    print(f"\n🔍 Prediction Results:")
    print(f"🏷️ Label: {label}")
    print(f"📊 Confidence: {confidence:.2f}")

    return label, confidence

# ========================================
# 🧪 9. Example Inference Usage
# ========================================
# Example 1: Realistic-sounding fake news
analyze_news(
    input_type='text',
    value="Scientists confirm that eating chocolate daily improves longevity by 20 years."
)

# Example 2: Conspiracy-type claim
analyze_news(
    input_type='text',
    value="The moon landing was filmed in a Hollywood studio."
)

# ========================================
# 💾 10. Save & Download the Fine-Tuned Model
# ========================================
# Save locally
model.save_pretrained("./fine_tuned_liar_detector")
roberta_tokenizer.save_pretrained("./fine_tuned_liar_detector")
print("\n💾 Model saved to ./fine_tuned_liar_detector")

# Download as ZIP from Colab
from google.colab import files
import shutil
shutil.make_archive("fine_tuned_liar_detector", 'zip', "./fine_tuned_liar_detector")
files.download("fine_tuned_liar_detector.zip")


In [1]:
# ========================================
# 📦 1. Install Dependencies
# ========================================
!pip install transformers -U datasets scikit-learn newspaper3k PyMuPDF lxml[html_clean]
# !pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  

In [2]:
# ========================================
# 🏗️ 2. Setup & Initialization
# ========================================
import pandas as pd
import torch
import newspaper
import fitz  # PyMuPDF for PDF processing
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# ========================================
# 🌐 3. Load & Preprocess Dataset (LIAR)
# ========================================
def load_and_prepare_data():
    """
    Loads the LIAR dataset, maps original 6-class labels to binary labels,
    and returns train/val/test as HuggingFace Datasets.
    """
    dataset = load_dataset("liar")

    df_train = pd.DataFrame(dataset['train'])
    df_val = pd.DataFrame(dataset['validation'])
    df_test = pd.DataFrame(dataset['test'])

    def map_labels(label):
        if isinstance(label, str):
            label = int(label)
        # Map to binary: 0 = Real, 1 = Fake
        return 0 if label in [4, 5] else 1

    # Apply label mapping and clean data
    for df in [df_train, df_val, df_test]:
        df['label'] = df['label'].apply(map_labels).astype(int)
        df.dropna(subset=['label', 'statement'], inplace=True)

    return {
        'train': Dataset.from_pandas(df_train),
        'validation': Dataset.from_pandas(df_val),
        'test': Dataset.from_pandas(df_test)
    }

dataset = load_and_prepare_data()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

liar.py:   0%|          | 0.00/6.41k [00:00<?, ?B/s]

The repository for liar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/liar.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [4]:
# ========================================
# 🔤 4. Tokenization
# ========================================
# Load pretrained Roberta tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(batch):
    """
    Tokenizes the 'statement' field with padding and truncation.
    """
    return roberta_tokenizer(batch['statement'], truncation=True, padding='max_length', max_length=128)

# Apply tokenization to each dataset split
for split in dataset:
    dataset[split] = dataset[split].map(tokenize, batched=True)
    dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

In [5]:
# ========================================
# 🏋️ 5. Model Setup & Fine-Tuning
# ========================================
# Load pre-trained Roberta model for binary classification
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    problem_type="single_label_classification"
)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",                     # Output directory for checkpoints
    eval_strategy="epoch",                      # Evaluation frequency
    learning_rate=2e-5,                         # Learning rate
    per_device_train_batch_size=16,             # Training batch size
    per_device_eval_batch_size=16,              # Evaluation batch size
    num_train_epochs=1,                         # Number of epochs
    weight_decay=0.01,                          # Weight decay
    logging_steps=50,                           # Logging interval
    save_strategy="epoch",                      # Save after every epoch
    load_best_model_at_end=True,                # Load best model based on metric
    metric_for_best_model="accuracy"            # Use accuracy to choose best model
)

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Setup Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics
)

# Train the model
print("🚀 Starting fine-tuning...")
trainer.train()

# Evaluate on test set
print("\n🧪 Evaluating on test set...")
test_results = trainer.evaluate(dataset['test'])

print("\n📊 Test set results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Starting fine-tuning...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33miconicemon01[0m ([33miconicemon01-city-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5277,0.572186,0.725078,0.840632,0.725078,1.0



🧪 Evaluating on test set...



📊 Test set results:
eval_loss: 0.5196
eval_accuracy: 0.7615
eval_f1: 0.8646
eval_precision: 0.7615
eval_recall: 1.0000
eval_runtime: 8.9114
eval_samples_per_second: 143.9740
eval_steps_per_second: 9.0900
epoch: 1.0000


In [6]:
# ========================================
# 🌐 6. Input Parser (Text | URL | PDF)
# ========================================
def extract_text(input_type, value):
    """
    Extracts raw text from a URL, PDF, or plain text.
    """
    try:
        if input_type == 'url':
            article = newspaper.Article(value)
            article.download()
            article.parse()
            return article.text
        elif input_type == 'pdf':
            with fitz.open(value) as doc:
                return "\n".join([page.get_text() for page in doc])
        elif input_type == 'text':
            return value
        else:
            return "Invalid input type"
    except Exception as e:
        return f"Error processing input: {str(e)}"

In [7]:
# ========================================
# 🧠 7. Prediction Only (No Explanation)
# ========================================
def classify_text(text):
    """
    Predicts the label (Real or Fake) and confidence from a given input text.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    confidence = probs[0][pred].item()
    label = "Real" if pred == 0 else "Fake"
    return label, confidence

In [8]:
# ========================================
# 🚀 8. Full Inference Pipeline
# ========================================
def analyze_news(input_type, value):
    """
    End-to-end pipeline that:
    1. Extracts text
    2. Classifies as Fake/Real
    3. Returns result with confidence
    """
    print(f"\n📥 Processing {input_type} input...")
    text = extract_text(input_type, value)

    if text.startswith("Error") or text == "Invalid input type":
        print(f"❌ Error: {text}")
        return

    print(f"\n📄 Extracted text preview:\n{text[:500]}...\n")

    label, confidence = classify_text(text)

    print(f"\n🔍 Prediction Results:")
    print(f"🏷️ Label: {label}")
    print(f"📊 Confidence: {confidence:.2f}")

    return label, confidence

In [9]:
# ========================================
# 🧪 9. Example Inference Usage
# ========================================
# Example 1: Realistic-sounding fake news
analyze_news(
    input_type='text',
    value="Scientists confirm that eating chocolate daily improves longevity by 20 years."
)

# Example 2: Conspiracy-type claim
analyze_news(
    input_type='text',
    value="The moon landing was filmed in a Hollywood studio."
)


📥 Processing text input...

📄 Extracted text preview:
Scientists confirm that eating chocolate daily improves longevity by 20 years....


🔍 Prediction Results:
🏷️ Label: Fake
📊 Confidence: 0.82

📥 Processing text input...

📄 Extracted text preview:
The moon landing was filmed in a Hollywood studio....


🔍 Prediction Results:
🏷️ Label: Fake
📊 Confidence: 0.62


('Fake', 0.6171628832817078)

In [10]:
# ========================================
# 💾 10. Save & Download the Fine-Tuned Model
# ========================================
# Save locally
model.save_pretrained("./fine_tuned_liar_detector")
roberta_tokenizer.save_pretrained("./fine_tuned_liar_detector")
print("\n💾 Model saved to ./fine_tuned_liar_detector")


💾 Model saved to ./fine_tuned_liar_detector


In [None]:
# Download as ZIP from Colab
from google.colab import files
import shutil
shutil.make_archive("fine_tuned_liar_detector", 'zip', "./fine_tuned_liar_detector")
files.download("fine_tuned_liar_detector.zip")