In [1]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from collections import Counter

# set directories
DATA_DIR = "../data/processed"
OUTPUT_DIR = "../data/final"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# load dataset
df = pd.read_csv(os.path.join(DATA_DIR, "all_news.csv"))

# replace NaN with empty string
df["text"] = df["text"].fillna("")

# remove HTML tags
def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

df["text"] = df["text"].apply(remove_html)

# clean and tokenize
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["text"] = df["text"].apply(clean_text)

# dataset info
print("Dataset shape:", df.shape)
print("\nLabel distribution:\n", df["label"].value_counts(normalize=True))

# plot label counts
plt.figure(figsize=(6,4))
sns.countplot(x="label", data=df)
plt.title("Fake vs Real News Count")
plt.xticks([0,1], ["Fake", "Real"])
plt.show()

# plot text length distribution
df["text_length"] = df["text"].apply(lambda x: len(x.split()))
plt.figure(figsize=(6,4))
sns.boxplot(x="label", y="text_length", data=df)
plt.title("Text Length Distribution")
plt.xticks([0,1], ["Fake", "Real"])
plt.show()

# balance dataset
min_class_size = df["label"].value_counts().min()
df_balanced = df.groupby("label").sample(n=min_class_size, random_state=42)

print("\nBalanced label distribution:\n", df_balanced["label"].value_counts())

# split dataset
train_df, temp_df = train_test_split(df_balanced, test_size=0.3, stratify=df_balanced["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

print(f"\nTrain size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

# save splits
train_df.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
val_df.to_csv(os.path.join(OUTPUT_DIR, "validation.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)

print(f"Saved train, validation, and test sets to {OUTPUT_DIR}")


Matplotlib is building the font cache; this may take a moment.


TypeError: Incoming markup is of an invalid type: nan. Markup must be a string, a bytestring, or an open filehandle.