In [3]:
import os

import kagglehub
import pandas as pd
import torch
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader

from src.data_preprocessor import TextPreprocessor, FakeNewsDataset, collate_fn
from src.model_trainer import load_trained_model, get_model_predictions


In [4]:
def evaluate_on_external_df(
        path,
        delimiter=',',
        text_column='text',
        label_column='label',
        true_label='true',
        fake_label='false',
        batch_size=16,
        max_sentences=20,
        max_words_per_sentence=50,
        model_path="best_han_model.pth",
        vocab_path="vocabulary.pkl",
        num_workers=2,
):
    """Evaluate the saved model checkpoint on an external labelled CSV file.

    The file is read with pandas, its labels are translated to the strings
    ``"true"`` / ``"fake"``, the texts are wrapped in a ``FakeNewsDataset``
    and the predictions returned by ``get_model_predictions`` are scored
    with scikit-learn.  Accuracy and the classification report are printed
    and also returned.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file (passed straight to ``pd.read_csv``).
    delimiter : str, default ','
        Field separator of the CSV file.
    text_column, label_column : str
        Names of the columns holding the article text and its label.
        Rows where either one is missing are dropped.
    true_label, fake_label : scalar
        The values in ``label_column`` that mean "true" and "fake".
        When both are strings the label column is read as text, so that
        pandas does not turn e.g. ``true``/``false`` into booleans before
        the comparison.
    batch_size : int, default 16
        Batch size handed to the ``DataLoader``.
    max_sentences, max_words_per_sentence : int
        Forwarded positionally to ``FakeNewsDataset``.
    model_path : str, default "best_han_model.pth"
        Checkpoint file passed to ``load_trained_model``.
    vocab_path : str, default "vocabulary.pkl"
        Vocabulary file passed to ``TextPreprocessor.load_vocabulary``.
    num_workers : int, default 2
        Number of ``DataLoader`` worker processes.

    Returns
    -------
    tuple
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report``.

    Raises
    ------
    ValueError
        If ``true_label`` equals ``fake_label``, if no rows remain after
        dropping missing values, or if the label column contains a value
        that is neither ``true_label`` nor ``fake_label``.
    """
    # Identical keys would collapse the mapping dict to a single entry and
    # silently label every row "fake".
    if true_label == fake_label:
        raise ValueError(
            f"true_label and fake_label must differ, got {true_label!r} for both"
        )

    # ------------------------------------------------------------------
    # 1.  Load the file and keep only the columns we need
    # ------------------------------------------------------------------
    read_kwargs = {}
    if isinstance(true_label, str) and isinstance(fake_label, str):
        # Keep string labels as strings: without this pandas infers bool
        # for true/false columns (and int for "0"/"1"), and the string
        # keys below would never match.
        read_kwargs["dtype"] = {label_column: str}

    df = pd.read_csv(
        path,
        delimiter=delimiter,
        usecols=[text_column, label_column],
        **read_kwargs,
    ).dropna(subset=[text_column, label_column])

    # Fail before the (expensive) model load if there is nothing to score.
    if df.empty:
        raise ValueError(
            f"No usable rows in {path!r}: every row is missing "
            f"{text_column!r} or {label_column!r}, or the file is empty"
        )

    # ------------------------------------------------------------------
    # 2.  Normalise the label values to the exact strings that were
    #     used to fit the original LabelEncoder
    # ------------------------------------------------------------------
    mapping = {true_label: "true", fake_label: "fake"}
    df["label_norm"] = df[label_column].map(mapping)

    unmapped = df["label_norm"].isna()
    if unmapped.any():
        bad_values = df.loc[unmapped, label_column].unique()
        raise ValueError(f"Unmapped label values found: {bad_values}")

    # ------------------------------------------------------------------
    # 3.  Load model + encoder + vocabulary
    # ------------------------------------------------------------------
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The second element returned by load_trained_model is not needed here.
    model, _, ckpt = load_trained_model(model_path, device=device)
    label_encoder = ckpt["label_encoder"]

    preproc = TextPreprocessor()
    preproc.load_vocabulary(vocab_path)

    # ------------------------------------------------------------------
    # 4.  Build the FakeNewsDataset
    # ------------------------------------------------------------------
    texts = df[text_column].tolist()
    labels = label_encoder.transform(df["label_norm"])

    dataset = FakeNewsDataset(
        texts,
        labels,
        preproc,
        max_sentences,
        max_words_per_sentence,
    )
    # shuffle=False keeps the evaluation order deterministic.
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )

    # ------------------------------------------------------------------
    # 5.  Run inference
    # ------------------------------------------------------------------
    results = get_model_predictions(model, loader, label_encoder, device)

    # ------------------------------------------------------------------
    # 6.  Report metrics
    # ------------------------------------------------------------------
    acc = accuracy_score(results["true_labels"], results["predictions"])
    # NOTE(review): target_names assumes every class of the encoder occurs
    # in the evaluated data; a single-class file would need `labels=` too.
    report = classification_report(
        results["true_labels"], results["predictions"],
        target_names=label_encoder.classes_,
        digits=4,
    )

    print(f"Accuracy : {acc:.4f}\n")
    print(report)

    return acc, report


Dataset: https://www.kaggle.com/datasets/aadyasingh55/fake-news-classification

In [5]:
# This CSV is semicolon-delimited and labels true articles 1, fake ones 0.
path = kagglehub.dataset_download("aadyasingh55/fake-news-classification")
evaluate_on_external_df(
    path=os.path.join(path, "train (2).csv"),
    delimiter=';',
    true_label=1,
    fake_label=0,
);  # trailing ';' hides the returned tuple — the report is already printed

Model loaded from best_han_model.pth
Best validation accuracy: 0.9997775800711743


Convertng text to hierarchical format...: 100%|██████████| 24353/24353 [00:27<00:00, 895.03it/s]
Getting predictions: 100%|██████████| 1523/1523 [00:15<00:00, 99.80it/s] 


Accuracy : 0.9813

              precision    recall  f1-score   support

        fake     0.9614    0.9992    0.9799     11107
        true     0.9993    0.9663    0.9825     13246

    accuracy                         0.9813     24353
   macro avg     0.9803    0.9828    0.9812     24353
weighted avg     0.9820    0.9813    0.9813     24353



(0.9813164702500718,
 '              precision    recall  f1-score   support\n\n        fake     0.9614    0.9992    0.9799     11107\n        true     0.9993    0.9663    0.9825     13246\n\n    accuracy                         0.9813     24353\n   macro avg     0.9803    0.9828    0.9812     24353\nweighted avg     0.9820    0.9813    0.9813     24353\n')

Dataset: https://www.kaggle.com/datasets/hassanamin/textdb3?select=fake_or_real_news.csv

In [9]:
# Labels in this file are the strings 'REAL' and 'FAKE'.
path = kagglehub.dataset_download("hassanamin/textdb3")
evaluate_on_external_df(
    path=os.path.join(path, "fake_or_real_news.csv"),
    fake_label='FAKE',
    true_label='REAL',
);  # trailing ';' hides the returned tuple — the report is already printed

Model loaded from best_han_model.pth
Best validation accuracy: 0.9995551601423488


Convertng text to hierarchical format...: 100%|██████████| 6335/6335 [00:08<00:00, 729.97it/s]
Getting predictions: 100%|██████████| 396/396 [00:01<00:00, 242.46it/s]

Accuracy : 0.5165

              precision    recall  f1-score   support

        fake     0.5082    0.9899    0.6716      3164
        true     0.8140    0.0442    0.0838      3171

    accuracy                         0.5165      6335
   macro avg     0.6611    0.5170    0.3777      6335
weighted avg     0.6612    0.5165    0.3774      6335






(0.5164956590370955,
 '              precision    recall  f1-score   support\n\n        fake     0.5082    0.9899    0.6716      3164\n        true     0.8140    0.0442    0.0838      3171\n\n    accuracy                         0.5165      6335\n   macro avg     0.6611    0.5170    0.3777      6335\nweighted avg     0.6612    0.5165    0.3774      6335\n')

Dataset: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification


> **Note:** The data card of this dataset contains an error: the labels are inverted (see the dataset's discussion page). The label mapping used in the code below is correct.

In [6]:
# Here 1 marks fake articles and 0 true ones (the reverse of the data card).
path = kagglehub.dataset_download("saurabhshahane/fake-news-classification")
evaluate_on_external_df(
    path=os.path.join(path, "WELFake_Dataset.csv"),
    fake_label=1,
    true_label=0,
);  # trailing ';' hides the returned tuple — the report is already printed

Model loaded from best_han_model.pth
Best validation accuracy: 0.9995551601423488


Convertng text to hierarchical format...: 100%|██████████| 72095/72095 [01:23<00:00, 866.70it/s]
Getting predictions: 100%|██████████| 4506/4506 [00:17<00:00, 251.92it/s]


Accuracy : 0.8212

              precision    recall  f1-score   support

        fake     0.7431    0.9967    0.8514     37067
        true     0.9945    0.6354    0.7754     35028

    accuracy                         0.8212     72095
   macro avg     0.8688    0.8161    0.8134     72095
weighted avg     0.8653    0.8212    0.8145     72095



(0.8211665164019696,
 '              precision    recall  f1-score   support\n\n        fake     0.7431    0.9967    0.8514     37067\n        true     0.9945    0.6354    0.7754     35028\n\n    accuracy                         0.8212     72095\n   macro avg     0.8688    0.8161    0.8134     72095\nweighted avg     0.8653    0.8212    0.8145     72095\n')