In [2]:
import pandas as pd
import os
import re

# clean text
def clean_text(text):
    """Return a lowercase, letters-only, whitespace-normalized copy of ``text``.

    Processing order:
      1. lowercase the input;
      2. replace every run starting at ``http`` or ``www`` (up to the next
         whitespace) with a space;
      3. replace every character that is not ``a``-``z`` or whitespace with a
         space (this drops digits and punctuation);
      4. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN float from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        lowered = text.lower()
        without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
        letters_only = re.sub(r"[^a-z\s]", " ", without_urls)
        return re.sub(r"\s+", " ", letters_only).strip()
    return ""

# normalize label
def normalize_label(label):
    """Map a raw truthfulness label to a binary class.

    The label is converted to ``str``, stripped of surrounding whitespace and
    lower-cased before matching, so values such as ``" True\\n"`` or
    ``"FALSE"`` are recognised.

    Args:
        label: Raw label value (any type; it is passed through ``str``).

    Returns:
        ``0`` for ``"fake"``, ``"false"`` or ``"pants-fire"``;
        ``1`` for ``"real"`` or ``"true"``;
        ``None`` for anything else (e.g. ``"half-true"`` or a missing value),
        so callers can drop rows that have no clear binary class.
    """
    # Strip before matching: stray padding would otherwise silently turn a
    # valid label into None and cause the row to be discarded.
    key = str(label).strip().lower()
    if key in ("fake", "false", "pants-fire"):
        return 0
    if key in ("real", "true"):
        return 1
    return None

# Directory layout: raw datasets are read from below DATA_DIR (a path relative
# to the notebook's working directory); the merged CSV is written to PROCESSED_DIR.
DATA_DIR = "../data"
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
# Create the output folder up front; exist_ok=True makes re-running the cell safe.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# FakeNewsNet: four CSVs (politifact / gossipcop, fake / real). Each frame is
# tagged with its class on load: 0 = fake, 1 = real.
fnn_dir = os.path.join(DATA_DIR, "fakenewsnet_dataset", "dataset")
fnn_fake1 = pd.read_csv(os.path.join(fnn_dir, "politifact_fake.csv")).assign(label=0)
fnn_real1 = pd.read_csv(os.path.join(fnn_dir, "politifact_real.csv")).assign(label=1)
fnn_fake2 = pd.read_csv(os.path.join(fnn_dir, "gossipcop_fake.csv")).assign(label=0)
fnn_real2 = pd.read_csv(os.path.join(fnn_dir, "gossipcop_real.csv")).assign(label=1)

# Stack the four frames in the same order as before and record their origin.
fake_newsnet = pd.concat(
    [fnn_fake1, fnn_real1, fnn_fake2, fnn_real2],
    ignore_index=True,
).assign(source="fakenewsnet")

# When there is no "text" column, fall back to the "title" column as the text.
needs_title_as_text = (
    "text" not in fake_newsnet.columns and "title" in fake_newsnet.columns
)
if needs_title_as_text:
    fake_newsnet = fake_newsnet.rename(columns={"title": "text"})
fake_newsnet["text"] = fake_newsnet["text"].apply(clean_text)

# Kaggle
# NOTE(review): only Fake.csv is read, so every Kaggle row is labelled 0 (fake).
# If the dataset folder also ships real-news articles (e.g. a True.csv), they
# are not included here — confirm this is intended.
kaggle_path = os.path.join(DATA_DIR, "kaggle_dataset", "Fake.csv")
kaggle = pd.read_csv(kaggle_path).assign(label=0)

# When there is no "text" column, copy "title" into it (the title column is kept).
if "text" not in kaggle.columns and "title" in kaggle.columns:
    kaggle = kaggle.assign(text=kaggle["title"])

kaggle = kaggle.assign(source="kaggle")
kaggle = kaggle.assign(text=kaggle["text"].apply(clean_text))

# LIAR: three header-less TSV splits, merged into one frame because only a
# single combined dataset is produced by this cell.
liar_dir = os.path.join(DATA_DIR, "liar_dataset")
liar_train = pd.read_csv(os.path.join(liar_dir, "train.tsv"), sep="\t", header=None)
liar_test = pd.read_csv(os.path.join(liar_dir, "test.tsv"), sep="\t", header=None)
liar_val = pd.read_csv(os.path.join(liar_dir, "valid.tsv"), sep="\t", header=None)
liar = pd.concat([liar_train, liar_test, liar_val], ignore_index=True)

# Keep only the first three positional columns. The explicit .copy() makes the
# frame independent, so the column assignments below never write into a derived
# slice (which can raise SettingWithCopyWarning on some pandas versions).
liar = liar[[0, 1, 2]].copy()
liar.columns = ["id", "label_text", "text"]

# normalize_label returns None for labels without a clear binary class; those
# rows are dropped.
liar["label"] = liar["label_text"].apply(normalize_label)
liar = liar.dropna(subset=["label"]).copy()
# The None placeholders make the column float (0.0 / 1.0). Cast back to int so
# the merged dataset and the saved CSV carry integer labels like the other sources.
liar["label"] = liar["label"].astype(int)

liar["source"] = "liar"
liar["text"] = liar["text"].apply(clean_text)

# Merge the three sources, keeping only the columns they all share.
shared_cols = ["text", "label", "source"]
all_datasets = pd.concat(
    [frame[shared_cols] for frame in (fake_newsnet, kaggle, liar)],
    ignore_index=True,
)
# Sequential 1-based identifier as the first column.
all_datasets.insert(loc=0, column="id", value=range(1, len(all_datasets) + 1))

# Persist the merged dataset (the DataFrame index is not written).
output_file = os.path.join(PROCESSED_DIR, "all_news.csv")
all_datasets.to_csv(output_file, index=False)
print(f"Saved processed dataset to {output_file}")

# Quick sanity summary: (rows, cols), then class balance and per-source counts.
print(all_datasets.shape)
for summary_col in ("label", "source"):
    print(all_datasets[summary_col].value_counts())


Saved processed dataset to ../data/processed/all_news.csv
(52284, 4)
label
0.0    32790
1.0    19494
Name: count, dtype: int64
source
kaggle         23481
fakenewsnet    23196
liar            5607
Name: count, dtype: int64
