# In [None]:
"""
Pipeline for Fake News Detection Using Transformers.
"""

from pathlib import Path
from datasets import load_dataset

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR


def fetch_datasets() -> None:
    """
    Load the LIAR dataset through HuggingFace `datasets` and persist it locally.

    Creates RAW_DATA_DIR (including any missing parent directories) when it
    does not exist yet, loads the dataset with ``load_dataset("liar")``, and
    writes the result to ``RAW_DATA_DIR / "liar"`` using ``save_to_disk``.

    Only LIAR is handled here; FakeNewsNet is not fetched by this function
    and would have to be added or downloaded manually.
    """
    # The target directory must exist before anything is written beneath it.
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

    liar_dir = RAW_DATA_DIR / "liar"
    # save_to_disk is given a plain string path, hence the explicit str().
    load_dataset("liar").save_to_disk(str(liar_dir))


def run_pipeline():
    """
    Run the fake-news detection pipeline (currently an outline).

    Implemented today:

    * fetch the raw data by calling ``fetch_datasets()``;
    * make sure PROCESSED_DATA_DIR exists;
    * print a completion message.

    Planned steps (not implemented yet, see the TODO in the body):

    1. Download and load LIAR and/or FakeNewsNet.
    2. Standardize label schema (fake / real / mixed / etc.).
    3. Tokenize with a transformer tokenizer (e.g., DistilBERT).
    4. Fine-tune model on classification task.
    5. Build publisher-level reliability memory:
        - aggregate historical predictions per source
        - compute reliability score and use as feature/adjustment.
    6. Export:
        - article-level predictions + probabilities
        - publisher-level reliability metrics for Tableau.
    """
    # Stage 1: raw data. Runs first so a failed download stops the pipeline
    # before any output directory is created.
    fetch_datasets()

    # Output location for the CSV exports planned below.
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

    # TODO: wire up the remaining stages. Intended shape (none of these
    # helpers exist yet):
    #
    #   liar = load_dataset_from_disk(RAW_DATA_DIR / "liar")
    #       (datasets.load_from_disk is the library counterpart of
    #        save_to_disk and could back this helper)
    #   train_ds, val_ds, test_ds = preprocess_liar(liar)
    #   model = train_transformer(train_ds, val_ds)
    #   preds = evaluate_model(model, test_ds)
    #   reliability = build_publisher_reliability(preds)
    #
    #   preds.to_csv(PROCESSED_DATA_DIR / "article_predictions.csv", index=False)
    #   reliability.to_csv(PROCESSED_DATA_DIR / "publisher_reliability.csv", index=False)

    done_message = "Fake news pipeline outline executed."
    print(done_message)


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_pipeline()