In [1]:
# ========================================
# 📦 1. Install Dependencies
# ========================================
!pip install transformers -U datasets scikit-learn newspaper3k PyMuPDF lxml[html_clean] peft accelerate

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting lxml[html_clean]
  Downloading lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (

In [2]:
# ========================================
# 🏗️ Import Libraries
# ========================================
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import get_peft_model, LoraConfig, TaskType
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import newspaper
import fitz
import os

In [3]:
# ========================================
# 🌐 Load and Prepare LIAR Dataset
# ========================================
def load_and_prepare_data():
    dataset = load_dataset("liar")

    df_train = pd.DataFrame(dataset['train'])
    df_val = pd.DataFrame(dataset['validation'])
    df_test = pd.DataFrame(dataset['test'])

    def map_labels(label):
        return 0 if label in [4, 5] else 1

    for df in [df_train, df_val, df_test]:
        df['label'] = df['label'].apply(map_labels).astype(int)
        df.dropna(subset=['statement', 'label'], inplace=True)

    return {
        'train': Dataset.from_pandas(df_train),
        'validation': Dataset.from_pandas(df_val),
        'test': Dataset.from_pandas(df_test)
    }

dataset = load_and_prepare_data()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

liar.py:   0%|          | 0.00/6.41k [00:00<?, ?B/s]

The repository for liar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/liar.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [4]:
# ========================================
# 🔤 Tokenization
# ========================================
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(batch):
    return tokenizer(batch['statement'], truncation=True, padding='max_length', max_length=128)

for split in dataset:
    dataset[split] = dataset[split].map(tokenize, batched=True)
    dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

In [5]:
# ========================================
# 🧠 Load Roberta with PEFT LoRA Adapters
# ========================================
base_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


In [7]:
# ========================================
# 🏋️ Training Configuration
# ========================================
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [8]:
# ========================================
# 🚀 Train the PEFT LoRA Model
# ========================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics
)

print("🔧 Starting PEFT LoRA fine-tuning...")
trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🔧 Starting PEFT LoRA fine-tuning...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33miconicemon01[0m ([33miconicemon01-city-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5696,0.586837,0.725078,0.840632,0.725078,1.0
2,0.5708,0.588359,0.725078,0.840632,0.725078,1.0


TrainOutput(global_step=1284, training_loss=0.5647607696390597, metrics={'train_runtime': 393.2065, 'train_samples_per_second': 52.232, 'train_steps_per_second': 3.265, 'total_flos': 1364935190427648.0, 'train_loss': 0.5647607696390597, 'epoch': 2.0})

In [9]:
# ========================================
# 📊 Evaluate on Test Data
# ========================================
print("\n📈 Evaluating on test set...")
test_results = trainer.evaluate(dataset['test'])

print("\n🔍 Test set metrics:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")


📈 Evaluating on test set...



🔍 Test set metrics:
eval_loss: 0.5478
eval_accuracy: 0.7615
eval_f1: 0.8646
eval_precision: 0.7615
eval_recall: 1.0000
eval_runtime: 8.4948
eval_samples_per_second: 151.0340
eval_steps_per_second: 9.5350
epoch: 2.0000


In [10]:
# ========================================
# 🧠 Inference: classify text as Fake or Real
# ========================================
def extract_text(input_type, value):
    try:
        if input_type == 'url':
            article = newspaper.Article(value)
            article.download()
            article.parse()
            return article.text
        elif input_type == 'pdf':
            with fitz.open(value) as doc:
                return "\n".join([page.get_text() for page in doc])
        elif input_type == 'text':
            return value
        else:
            return "Invalid input type"
    except Exception as e:
        return f"Error processing input: {str(e)}"

def classify_text(text):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    label = "Real" if pred == 0 else "Fake"
    confidence = probs[0][pred].item()
    return label, confidence

def analyze_news(input_type, value):
    print(f"\n📰 Input Type: {input_type}")
    text = extract_text(input_type, value)
    if text.startswith("Error") or text == "Invalid input type":
        print(f"⚠️ {text}")
        return
    print(f"🧾 Extracted Content:\n{text[:400]}...\n")
    label, conf = classify_text(text)
    print(f"🏷️ Prediction: {label} ({conf:.2f} confidence)")
    return label, conf

In [11]:
analyze_news(
    input_type='text',
    value="NASA scientists discover water on Mars, raising possibility of microbial life."
)

analyze_news(
    input_type='text',
    value="The Earth is flat and this is confirmed by new satellite data, claims expert."
)


📰 Input Type: text
🧾 Extracted Content:
NASA scientists discover water on Mars, raising possibility of microbial life....

🏷️ Prediction: Fake (0.74 confidence)

📰 Input Type: text
🧾 Extracted Content:
The Earth is flat and this is confirmed by new satellite data, claims expert....

🏷️ Prediction: Fake (0.74 confidence)


('Fake', 0.735588788986206)

In [12]:
# ========================================
# 💾 Save Model & Tokenizer
# ========================================
model.save_pretrained("./peft_lora_roberta_fake_news")
tokenizer.save_pretrained("./peft_lora_roberta_fake_news")

('./peft_lora_roberta_fake_news/tokenizer_config.json',
 './peft_lora_roberta_fake_news/special_tokens_map.json',
 './peft_lora_roberta_fake_news/vocab.json',
 './peft_lora_roberta_fake_news/merges.txt',
 './peft_lora_roberta_fake_news/added_tokens.json')

In [None]:
# ✅ Zip & Download (Colab)
import shutil
from google.colab import files

shutil.make_archive("peft_lora_roberta_fake_news", 'zip', "./peft_lora_roberta_fake_news")
files.download("peft_lora_roberta_fake_news.zip")