In [None]:
# ==============================
# 1. Install Dependencies & Imports
# ==============================
!pip install torch torchvision transformers scikit-learn pandas tqdm pillow requests gradio seaborn matplotlib

import os
import torch
import random
import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm
from getpass import getpass

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score

from transformers import BertTokenizer, BertModel
from torchvision import transforms
from torchvision.models import resnet50
from PIL import Image

import gradio as gr
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# ==============================
# 2. Configuration & Drive Mount
# ==============================
# Seed every RNG in play (Python, NumPy, PyTorch CPU and all CUDA devices) for reproducibility
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

# Run on the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Default to the current directory; switch to Google Drive only when running in Colab
data_root = "./"
try:
    from google.colab import drive
    drive.mount("/content/drive")
    # ⚡ IMPORTANT: Adjust this path if needed
    data_root = "/content/drive/MyDrive/FakeNewsNet/"
except ImportError:
    # Not in Colab: keep the current-directory default chosen above
    pass

Using device: cuda
Mounted at /content/drive


In [None]:
# ==============================
# 3. Load Data & Define Transforms
# ==============================
# Load CSV files
real = pd.read_csv("https://raw.githubusercontent.com/KaiDMML/FakeNewsNet/master/dataset/gossipcop_real.csv")
fake = pd.read_csv("https://raw.githubusercontent.com/KaiDMML/FakeNewsNet/master/dataset/gossipcop_fake.csv")

# Class ids used throughout the notebook: 0 = REAL, 1 = FAKE
real["label"], fake["label"] = 0, 1
df = pd.concat([real, fake]).reset_index(drop=True)
print(f"Full dataset size: {df.shape}")

# Define tokenizer and image transforms
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Values shared by the training and evaluation pipelines
_IMG_SIZE = (224, 224)
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, light augmentation (random flip / rotation up to 10 degrees), normalize
train_transform = transforms.Compose([
    transforms.Resize(_IMG_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

# Validation / test pipeline: same resize and normalization, no augmentation
val_test_transform = transforms.Compose([
    transforms.Resize(_IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

Full dataset size: (22140, 5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# ==============================
# 4. Dataset and Model Class Definitions
# ==============================
class FakeNewsDataset(Dataset):
    """Paired headline/image dataset built from a FakeNewsNet-style DataFrame.

    Each item combines the tokenized ``title`` of a row, one image looked up
    under ``<img_root_path>/gossipcop/<id>/`` and the row's integer ``label``.
    Rows without a readable image get a black 224x224 placeholder, so a missing
    or corrupt file never aborts an epoch.

    The module-level ``tokenizer`` is used to encode the titles.

    Args:
        df: DataFrame providing ``title``, ``id`` and ``label`` columns.
        transform: Callable applied to the PIL image before it is returned.
        img_root_path: Directory that contains the ``gossipcop`` image folders.
    """

    # File-name suffixes (compared case-insensitively) accepted as images.
    IMAGE_EXTENSIONS = ('png', 'jpg', 'jpeg')

    def __init__(self, df, transform, img_root_path):
        self.df = df
        self.transform = transform
        self.img_root = img_root_path

    def __len__(self):
        """Return the number of rows (samples) in the underlying DataFrame."""
        return len(self.df)

    def _load_image(self, article_id):
        """Return the first readable image stored for ``article_id`` as RGB.

        Candidates are tried in sorted file-name order so the chosen image does
        not depend on the directory listing order of the filesystem. A black
        placeholder is returned when the folder is missing, empty, or holds
        only unreadable files.
        """
        placeholder = Image.new('RGB', (224, 224), 'black')
        img_folder = os.path.join(self.img_root, "gossipcop", str(article_id))
        # isdir (rather than exists) keeps a stray regular file from crashing listdir.
        if not os.path.isdir(img_folder):
            return placeholder
        candidates = sorted(
            name for name in os.listdir(img_folder)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        for name in candidates:
            try:
                # The context manager closes the file handle; convert() returns
                # a separate in-memory image that stays valid afterwards.
                with Image.open(os.path.join(img_folder, name)) as handle:
                    return handle.convert("RGB")
            except Exception:
                # Deliberate best effort: an undecodable file must not stop
                # training, so move on to the next candidate instead.
                continue
        return placeholder

    def __getitem__(self, idx):
        """Build the sample at positional index ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (title padded or
            truncated to 128 tokens), ``image`` (output of ``transform``) and
            ``label`` (``torch.long`` scalar tensor).
        """
        row = self.df.iloc[idx]
        text = str(row["title"])
        enc = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        img = self._load_image(row["id"])
        return {
            # Drop only the leading batch dimension added by return_tensors="pt".
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "image": self.transform(img),
            "label": torch.tensor(row["label"], dtype=torch.long)
        }

class FusionLayer(nn.Module):
    """Fuse text and image features with a single linear projection and ReLU.

    Args:
        input_size: Width of the concatenated (text + image) feature vector.
        output_size: Width of the fused representation.
    """

    def __init__(self, input_size=1024, output_size=512):
        super().__init__()
        # Projects the concatenated features down to `output_size`.
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, text_feat, img_feat):
        """Concatenate both inputs along dim 1, project them, and apply ReLU."""
        joined = torch.cat((text_feat, img_feat), dim=1)
        projected = self.fc(joined)
        return projected.relu()

class CCFM(nn.Module):
    """Two-branch classifier that fuses a BERT text encoder with a ResNet-50 image encoder.

    Both pretrained backbones are frozen except for their last stage (the final
    BERT encoder layer and ResNet ``layer4``). Each branch is projected to 512
    features, the two are fused by ``FusionLayer`` and classified into 2 logits.

    Args:
        dropout_rate: Dropout probability applied before the output layer.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()

        # --- Text branch: frozen BERT, only the last encoder layer stays trainable ---
        self.text_model = BertModel.from_pretrained("bert-base-uncased")
        self.text_model.requires_grad_(False)
        self.text_model.encoder.layer[-1].requires_grad_(True)
        self.text_fc = nn.Linear(768, 512)

        # --- Image branch: frozen ResNet-50, only layer4 stays trainable ---
        backbone = resnet50(weights="IMAGENET1K_V1")
        backbone.requires_grad_(False)
        backbone.layer4.requires_grad_(True)
        # Keep every child module except the last one (the classification head).
        feature_layers = list(backbone.children())[:-1]
        self.img_model = nn.Sequential(*feature_layers)
        self.img_fc = nn.Linear(2048, 512)

        # --- Fusion and classification head ---
        self.fusion = FusionLayer(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 2)

    def forward(self, input_ids, attention_mask, image):
        """Return the 2-class logits for a batch of (headline, image) pairs."""
        bert_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_fc(bert_out.pooler_output).relu()

        pooled_img = self.img_model(image).flatten(1)
        img_feat = self.img_fc(pooled_img).relu()

        fused = self.fusion(text_feat, img_feat)
        hidden = self.fc1(fused).relu()
        return self.fc2(self.dropout(hidden))

In [None]:
# ==============================
# 5. Evaluation Helper Function
# ==============================
def eval_model(model, loader):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    The model is switched to eval mode and left in that mode on return.

    Args:
        model: Network called as ``model(input_ids, attention_mask, image)``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask``,
            ``image`` and ``label`` entries.

    Returns:
        Tuple ``(accuracy, predictions, labels)`` where the last two are flat
        lists collected over all batches.
    """
    model.eval()
    predictions = []
    targets = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            images = batch["image"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask, images)
            # The predicted class is the index of the largest logit.
            batch_preds = logits.argmax(dim=1)

            predictions.extend(batch_preds.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return accuracy_score(targets, predictions), predictions, targets

In [None]:
# ==============================
# 6. 5-Fold Cross-Validation
# ==============================
# Stratified 5-fold cross-validation on 80% of the data. Each fold trains a fresh
# CCFM for 3 epochs and records the validation accuracy measured after the last epoch.
print("--- Starting 5-Fold Cross-Validation ---")
# The 20% remainder of this split is discarded here (assigned to `_`).
cv_df, _ = train_test_split(df, test_size=0.2, random_state=seed, stratify=df.label) # Use 80% for CV
N_SPLITS = 5
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
# split() yields positional indices, which is why the fold frames below use .iloc.
X, y = cv_df.index, cv_df.label
fold_results = []  # one validation accuracy per fold

for fold, (train_ids, val_ids) in enumerate(kfold.split(X, y)):
    print(f"\n{'='*20} FOLD {fold + 1}/{N_SPLITS} {'='*20}")
    train_df_fold, val_df_fold = cv_df.iloc[train_ids], cv_df.iloc[val_ids]
    # Augmenting transform for training, deterministic transform for validation.
    train_ds_fold = FakeNewsDataset(train_df_fold, train_transform, data_root)
    val_ds_fold = FakeNewsDataset(val_df_fold, val_test_transform, data_root)
    train_loader_fold = DataLoader(train_ds_fold, batch_size=32, shuffle=True)
    val_loader_fold = DataLoader(val_ds_fold, batch_size=32)

    # New model per fold so nothing learned on one fold carries over to the next.
    model = CCFM().to(device)
    # Only parameters left trainable by CCFM are handed to the optimizer.
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(3):
        # eval_model() switches to eval mode, so training mode is re-enabled every epoch.
        model.train()
        for batch in tqdm(train_loader_fold, desc=f"Fold {fold+1}, Epoch {epoch+1}", leave=False):
            ids, mask, imgs, y_batch = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["image"].to(device), batch["label"].to(device)
            optimizer.zero_grad()
            out = model(ids, mask, imgs)
            loss = criterion(out, y_batch)
            loss.backward()
            optimizer.step()

    # Evaluate once, after the final epoch of this fold.
    val_acc, _, _ = eval_model(model, val_loader_fold)
    print(f"✅ Fold {fold + 1} Validation Accuracy: {val_acc:.4f}")
    fold_results.append(val_acc)

# Summary across folds (np.std defaults to the population standard deviation).
print(f"\n\n{'='*50}\nCross-Validation Results\n{'='*50}")
print(f"Mean Validation Accuracy: {np.mean(fold_results):.4f}")
print(f"Standard Deviation: {np.std(fold_results):.4f}")

--- Starting 5-Fold Cross-Validation ---



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:02<00:00, 35.2MB/s]


Fold 1, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

✅ Fold 1 Validation Accuracy: 0.8400



Fold 2, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

✅ Fold 2 Validation Accuracy: 0.8419



Fold 3, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

✅ Fold 3 Validation Accuracy: 0.8377



Fold 4, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

✅ Fold 4 Validation Accuracy: 0.8363



Fold 5, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

✅ Fold 5 Validation Accuracy: 0.8413


Cross-Validation Results
Mean Validation Accuracy: 0.8394
Standard Deviation: 0.0022


In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# This code assumes df, data_root, transforms, and the model classes are in memory.
# It also relies on seed, device, tqdm, train_test_split, DataLoader and eval_model
# defined by earlier cells.
#
# Stratified 5-fold cross-validation with per-epoch evaluation: each fold trains a
# fresh CCFM for 5 epochs and keeps the highest validation accuracy seen in any epoch.
# NOTE(review): taking the per-fold maximum over epochs selects on the validation set
# itself, so the reported mean is an optimistic estimate.

# --- 1. Hold out a final test set (20% of data) ---
# final_test_df is only created here; nothing in this cell evaluates on it.
cv_df, final_test_df = train_test_split(df, test_size=0.2, random_state=seed, stratify=df.label)
print(f"Data for Cross-Validation: {len(cv_df)} samples")
print(f"Held-out Final Test Set: {len(final_test_df)} samples")

# --- 2. Set up 5-Fold Cross-Validation ---
N_SPLITS = 5
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
# split() yields positional indices, which is why the fold frames below use .iloc.
X, y = cv_df.index, cv_df.label
fold_best_accuracies = [] # Store the BEST validation accuracy of each fold

# --- 3. The Cross-Validation Loop ---
for fold, (train_ids, val_ids) in enumerate(kfold.split(X, y)):
    print(f"\n{'='*20} FOLD {fold + 1}/{N_SPLITS} {'='*20}")

    # Augmenting transform for training, deterministic transform for validation.
    train_df_fold = cv_df.iloc[train_ids]
    val_df_fold = cv_df.iloc[val_ids]
    train_ds_fold = FakeNewsDataset(train_df_fold, train_transform, data_root)
    val_ds_fold = FakeNewsDataset(val_df_fold, val_test_transform, data_root)
    train_loader_fold = DataLoader(train_ds_fold, batch_size=32, shuffle=True)
    val_loader_fold = DataLoader(val_ds_fold, batch_size=32)

    # Re-initialize the model for each fold
    model = CCFM().to(device)
    # Only parameters left trainable by CCFM are handed to the optimizer.
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    # --- Training Loop FOR THIS FOLD ---
    best_accuracy_in_fold = 0
    # Let's train for a few epochs and find the best one
    for epoch in range(5):
        # eval_model() switches to eval mode, so training mode is re-enabled every epoch.
        model.train()
        progress_bar = tqdm(train_loader_fold, desc=f"Fold {fold+1}, Epoch {epoch+1}", leave=False)
        for batch in progress_bar:
            ids, mask, imgs, y_batch = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["image"].to(device), batch["label"].to(device)
            optimizer.zero_grad()
            out = model(ids, mask, imgs)
            loss = criterion(out, y_batch)
            loss.backward()
            optimizer.step()

        # --- Evaluate on this fold's validation set AFTER EACH EPOCH ---
        current_val_acc, _, _ = eval_model(model, val_loader_fold)
        print(f"Fold {fold+1}, Epoch {epoch+1} | Current Val Acc: {current_val_acc:.4f}")

        # If this epoch is the best so far for this fold, update the score
        # (only the score is tracked; the corresponding weights are not saved)
        if current_val_acc > best_accuracy_in_fold:
            best_accuracy_in_fold = current_val_acc

    # After training for all epochs in this fold, save the BEST accuracy found
    print(f"✅ Fold {fold + 1} Best Validation Accuracy: {best_accuracy_in_fold:.4f}")
    fold_best_accuracies.append(best_accuracy_in_fold)


# --- 4. Final Results ---
# Mean and population standard deviation (np.std default) of the per-fold best scores.
mean_accuracy = np.mean(fold_best_accuracies)
std_deviation = np.std(fold_best_accuracies)

print(f"\n\n{'='*50}\nCross-Validation Results\n{'='*50}")
print(f"Best Validation Accuracies for each fold: {[f'{acc:.4f}' for acc in fold_best_accuracies]}")
print(f"🎯 Mean Validation Accuracy: {mean_accuracy:.4f}")
print(f"📊 Standard Deviation: {std_deviation:.4f}")

Data for Cross-Validation: 17712 samples
Held-out Final Test Set: 4428 samples



Fold 1, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 1 | Current Val Acc: 0.8244


Fold 1, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 2 | Current Val Acc: 0.8318


Fold 1, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 3 | Current Val Acc: 0.8400


Fold 1, Epoch 4:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 4 | Current Val Acc: 0.8445


Fold 1, Epoch 5:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 1, Epoch 5 | Current Val Acc: 0.8383
✅ Fold 1 Best Validation Accuracy: 0.8445



Fold 2, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 1 | Current Val Acc: 0.8134


Fold 2, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 2 | Current Val Acc: 0.8115


Fold 2, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 3 | Current Val Acc: 0.8394


Fold 2, Epoch 4:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 4 | Current Val Acc: 0.8470


Fold 2, Epoch 5:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 2, Epoch 5 | Current Val Acc: 0.8473
✅ Fold 2 Best Validation Accuracy: 0.8473



Fold 3, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 1 | Current Val Acc: 0.8230


Fold 3, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 2 | Current Val Acc: 0.8346


Fold 3, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 3 | Current Val Acc: 0.8447


Fold 3, Epoch 4:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 4 | Current Val Acc: 0.8425


Fold 3, Epoch 5:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 3, Epoch 5 | Current Val Acc: 0.8408
✅ Fold 3 Best Validation Accuracy: 0.8447



Fold 4, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 1 | Current Val Acc: 0.8216


Fold 4, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 2 | Current Val Acc: 0.8278


Fold 4, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 3 | Current Val Acc: 0.8354


Fold 4, Epoch 4:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 4 | Current Val Acc: 0.8405


Fold 4, Epoch 5:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 4, Epoch 5 | Current Val Acc: 0.8470
✅ Fold 4 Best Validation Accuracy: 0.8470



Fold 5, Epoch 1:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 1 | Current Val Acc: 0.8224


Fold 5, Epoch 2:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 2 | Current Val Acc: 0.8204


Fold 5, Epoch 3:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 3 | Current Val Acc: 0.8464


Fold 5, Epoch 4:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 4 | Current Val Acc: 0.8529


Fold 5, Epoch 5:   0%|          | 0/443 [00:00<?, ?it/s]

Fold 5, Epoch 5 | Current Val Acc: 0.8439
✅ Fold 5 Best Validation Accuracy: 0.8529


Cross-Validation Results
Best Validation Accuracies for each fold: ['0.8445', '0.8473', '0.8447', '0.8470', '0.8529']
🎯 Mean Validation Accuracy: 0.8473
📊 Standard Deviation: 0.0030


In [None]:
# ==============================
# 8. External Verification API
# ==============================
def google_fact_check_api(text, api_key, timeout=10):
    """Query the Google Fact Check Tools ``claims:search`` endpoint for ``text``.

    Only the reviews attached to the first returned claim are inspected. The
    claim is reported as false when any review's textual rating contains one of
    the negative keywords (case-insensitive substring match).

    Args:
        text: Claim or headline to search for.
        api_key: Google API key; when empty or None no request is made.
        timeout: Seconds to wait for the HTTP response, so an unresponsive
            service cannot block the caller indefinitely.

    Returns:
        One of the strings ``"VERDICT: API KEY NOT PROVIDED"``,
        ``"VERDICT: FALSE (Source: <publisher>)"``, ``"VERDICT: NOT SURE"`` or
        ``"VERDICT: API ERROR"``. Request and parsing failures are reported
        through the last value; they are not raised.
    """
    if not api_key:
        return "VERDICT: API KEY NOT PROVIDED"
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
    params = {'query': text, 'key': api_key, 'languageCode': 'en'}
    negative_verdicts = ("false", "untrue", "misleading", "incorrect", "pants on fire")
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        claims = data.get('claims')
        if not claims:
            return "VERDICT: NOT SURE"
        # Optional fields are read defensively so one incomplete review does
        # not turn an otherwise usable response into an API error.
        for review in claims[0].get('claimReview') or []:
            verdict = str(review.get('textualRating') or "").lower()
            if any(neg_verdict in verdict for neg_verdict in negative_verdicts):
                publisher = review.get('publisher') or {}
                source = publisher.get('name') or publisher.get('site') or "unknown"
                return f"VERDICT: FALSE (Source: {source})"
        return "VERDICT: NOT SURE"
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Network/HTTP failures, invalid JSON, or an unexpected payload shape.
        return "VERDICT: API ERROR"

In [None]:
# ==============================
# 9. Gradio Interface with API
# ==============================
# Load the fully trained model for the interface.
# No visible cell assigns `final_model_path`, so on a fresh kernel this cell stopped
# with a NameError. Fall back to a default location under `data_root` (adjust it to
# wherever the trained checkpoint was saved) and fail with an explicit message when
# the file is missing.
if "final_model_path" not in globals():
    final_model_path = os.path.join(data_root, "ccfm_final_model.pth")
if not os.path.isfile(final_model_path):
    raise FileNotFoundError(
        f"Trained CCFM checkpoint not found at {final_model_path!r}. "
        "Train the final model, save it with torch.save(model.state_dict(), final_model_path), "
        "then re-run this cell."
    )

model = CCFM().to(device)
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime.
model.load_state_dict(torch.load(final_model_path, map_location=device))
model.eval()

def predict_news_hybrid(text, image, api_key):
    """Classify a headline (and optional image) and cross-check it externally.

    Uses the module-level ``model``, ``tokenizer``, ``val_test_transform`` and
    ``device``, plus ``google_fact_check_api`` for the external lookup.

    Args:
        text: Headline to analyze. None, empty or whitespace-only input is
            rejected with a prompt instead of being scored.
        image: Optional PIL image; a black 224x224 placeholder is used when None.
        api_key: Google API key forwarded to ``google_fact_check_api``.

    Returns:
        Tuple ``(final_verdict, report)``. ``final_verdict`` is "FAKE" or
        "REAL" (or the prompt string for blank input) and ``report`` is a dict
        with the model verdict, both class probabilities, the external result
        and the final verdict (empty dict for blank input).
    """
    # Guard against None as well as blank text: calling .strip() on None would raise.
    if text is None or not text.strip():
        return "Please enter a headline.", {}

    # --- Internal Model Prediction ---
    enc = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    if image is None:
        # Same black placeholder the dataset uses for samples without an image.
        img = Image.new("RGB", (224, 224), "black")
    else:
        img = image.convert("RGB")
    # unsqueeze(0) adds the batch dimension expected by the model.
    img = val_test_transform(img).unsqueeze(0)

    with torch.no_grad():
        out = model(enc["input_ids"].to(device), enc["attention_mask"].to(device), img.to(device))
        probs = torch.softmax(out, dim=1).cpu().numpy()[0]
        # Index 0 is REAL and index 1 is FAKE, matching the training labels.
        internal_verdict = "FAKE" if np.argmax(probs) == 1 else "REAL"

    # --- External API Verification ---
    external_verdict = google_fact_check_api(text, api_key)

    # --- Final Verdict Logic ---
    # An external "FALSE" rating overrides the model; any other API outcome
    # leaves the model's verdict unchanged.
    final_verdict = internal_verdict
    if "FALSE" in external_verdict:
        final_verdict = "FAKE"

    # --- Format Output ---
    report = {
        "Internal Model Verdict": internal_verdict,
        "REAL Probability": f"{float(probs[0]):.4f}",
        "FAKE Probability": f"{float(probs[1]):.4f}",
        "External Fact-Check": external_verdict,
        "Final Verdict": final_verdict
    }
    return final_verdict, report

# --- Build the Interface ---
# Two-column layout: the inputs (headline, optional image, API key) and the submit
# button sit in the wider left column, the verdict and full report in the right one.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 Hybrid Fake News Detector")
    gr.Markdown("This tool uses a locally trained AI model and the Google Fact Check API to analyze news headlines.")
    with gr.Row():
        with gr.Column(scale=2):
            news_input = gr.Textbox(label="Enter News Headline", placeholder="e.g., Shocking new discovery changes everything...")
            # type="pil" hands the upload to the callback as a PIL image (None when left empty).
            img_input = gr.Image(type="pil", label="Optional Image")
            # type="password" masks the key in the UI; it is supplied per request, not stored in the notebook.
            api_key_input = gr.Textbox(label="Enter Google API Key", type="password", placeholder="Paste your API key here for external fact-checking...")
            submit_btn = gr.Button("Analyze", variant="primary")
        with gr.Column(scale=1):
            output_label = gr.Label(label="Final Verdict")
            output_json = gr.JSON(label="Full Analysis Report")

    # predict_news_hybrid returns (verdict, report), mapped in order onto the two outputs.
    submit_btn.click(
        fn=predict_news_hybrid,
        inputs=[news_input, img_input, api_key_input],
        outputs=[output_label, output_json]
    )

print("Launching Gradio Interface...")
# NOTE(review): share=True requests a public share link from Gradio, so anyone with the
# URL can reach this app; debug=True keeps the cell running to surface errors — confirm
# both are intended before sharing the notebook.
demo.launch(share=True, debug=True)

NameError: name 'final_model_path' is not defined