# 01 – Data Exploration

This notebook explores the fake news dataset used in the **Misinformation Detection Engine**.

**Goals:**
- Understand dataset structure (columns, size, missing values)
- Inspect label distribution (`real` vs `fake`)
- Analyze basic text statistics (lengths, word counts)
- Note limitations and assumptions for the research report

In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Configure plots: apply seaborn's "whitegrid" style to every figure drawn below.
# `sns.set` is only an alias of `set_theme`, and seaborn documents that the alias
# may be removed in the future, so call the preferred function directly.
sns.set_theme(style="whitegrid")


def find_project_root(start_dir: str) -> str:
    """Locate the repository root by climbing the directory tree.

    Starting at ``start_dir`` (made absolute), each directory is tested for the
    presence of both a ``data`` and a ``src`` sub-directory; the first directory
    that has both is returned.

    Raises:
        FileNotFoundError: the filesystem root was reached without a match.
    """
    required_subdirs = ("data", "src")
    candidate = os.path.abspath(start_dir)

    # Keep climbing while the current candidate lacks one of the marker directories.
    while not all(os.path.isdir(os.path.join(candidate, name)) for name in required_subdirs):
        parent = os.path.dirname(candidate)
        # dirname() of the filesystem root is the root itself: nothing left to try.
        if parent == candidate:
            raise FileNotFoundError(
                "Could not find project root containing 'data/' and 'src/'. "
                "Run Jupyter from inside the 'misinformation-detection-engine' folder."
            )
        candidate = parent

    return candidate


# Anchor every path on the repo root discovered from the current working directory.
PROJECT_ROOT = find_project_root(os.getcwd())

# Dataset locations inside the repo: the processed CSV and the raw CSV.
PROCESSED_PATH = os.path.join(PROJECT_ROOT, "data", "processed", "processed_fake_news.csv")
RAW_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "fake_news.csv")

# Echo the resolved locations so a wrong working directory is obvious at a glance.
for caption, location in (
    ("Project root:", PROJECT_ROOT),
    ("Processed path:", PROCESSED_PATH),
    ("Raw path:", RAW_PATH),
):
    print(caption, location)


def normalize_raw_schema(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Make raw datasets look like our expected schema: columns [label, text].

    The first column found from each candidate list is used. When both a
    title-like and a body-like column exist they are joined with a single space
    (missing pieces are treated as empty strings).

    Args:
        df_raw: Raw frame as loaded from the CSV.

    Returns:
        A new DataFrame with an integer ``label`` column and a ``text`` column.
        Text labels are mapped case-insensitively (surrounding whitespace
        ignored) as fake/false -> 1 and real/true -> 0. Rows whose label is
        missing or is a text value outside that mapping are dropped.

    Raises:
        ValueError: no label column, or neither a text nor a title column, is present.
    """
    possible_text_cols = ["text", "content", "article"]
    possible_title_cols = ["title", "headline"]
    possible_label_cols = ["label", "class", "target"]

    text_col = next((c for c in possible_text_cols if c in df_raw.columns), None)
    title_col = next((c for c in possible_title_cols if c in df_raw.columns), None)
    label_col = next((c for c in possible_label_cols if c in df_raw.columns), None)

    if label_col is None:
        raise ValueError(f"Could not find label column. Expected one of: {possible_label_cols}. Got: {list(df_raw.columns)}")

    if text_col and title_col:
        full_text = (df_raw[title_col].fillna("") + " " + df_raw[text_col].fillna("")).str.strip()
    elif text_col:
        full_text = df_raw[text_col].fillna("")
    elif title_col:
        full_text = df_raw[title_col].fillna("")
    else:
        raise ValueError(
            f"Could not find text column. Expected one of: {possible_text_cols + possible_title_cols}. Got: {list(df_raw.columns)}"
        )

    df = pd.DataFrame({"label": df_raw[label_col], "text": full_text})

    # Normalize labels to {0,1} if needed.
    # Checking only `dtype == "object"` misses pandas' dedicated string dtypes, in which
    # case text labels would skip the mapping and crash in astype(int) below.
    label_dtype = df["label"].dtype
    if pd.api.types.is_object_dtype(label_dtype) or pd.api.types.is_string_dtype(label_dtype):
        df["label"] = (
            df["label"]
            .astype(str)
            .str.strip()  # tolerate stray whitespace such as " fake"
            .str.lower()
            .map({"fake": 1, "false": 1, "real": 0, "true": 0})
        )

    # Unmapped / missing labels are NaN at this point and get removed here.
    df = df.dropna(subset=["text", "label"]).copy()
    df["label"] = df["label"].astype(int)
    return df


# Dataset loading: the processed CSV wins when present; the raw CSV is the fallback.
# Bail out first if neither file is on disk.
if not (os.path.exists(PROCESSED_PATH) or os.path.exists(RAW_PATH)):
    raise FileNotFoundError(
        "No dataset found. Put one of these files in place:\n"
        f"- {PROCESSED_PATH}  (run src/preprocess.py)\n"
        f"- {RAW_PATH}        (download a fake news CSV and place it here)"
    )

if os.path.exists(PROCESSED_PATH):
    df = pd.read_csv(PROCESSED_PATH)
    print("\nLoaded processed dataset.")
else:
    # Raw files can use different column names; reshape them to [label, text].
    df_raw = pd.read_csv(RAW_PATH)
    df = normalize_raw_schema(df_raw)
    print("\nLoaded raw dataset (normalized schema for exploration).")

# Quick structural overview: first rows, column names, worst missing-value counts, shape.
print("\nBasic info:")
display(df.head())

print("\nColumns:", list(df.columns))

print("\nMissing values (top):")
display(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

print("\nDataset size:", df.shape)

In [None]:
# Label distribution: print per-class counts and draw them as a bar chart.

if "label" not in df.columns:
    raise KeyError("Expected a 'label' column after loading/normalizing the dataset.")

# Ensure numeric {0,1}
labels = df["label"].astype(int)
label_map = {0: "real", 1: "fake"}

# sort_index() keeps the classes in numeric order (0 before 1) in both the printout and the plot.
label_counts = labels.value_counts().sort_index()

print("Label counts:")
for label, count in label_counts.items():
    # Fall back to the raw value for any label outside the known mapping.
    print(f"{label_map.get(label, str(label))} ({label}): {count}")

class_names = [label_map.get(i, str(i)) for i in label_counts.index]

fig, ax = plt.subplots(figsize=(5, 4))
# seaborn 0.13 deprecates `palette` without `hue`; colouring by the same variable that is
# on the x axis gives the same picture. `dodge=False` keeps full-width bars on older
# seaborn releases, which would otherwise shrink bars when a hue is set.
sns.barplot(
    x=class_names,
    y=label_counts.values,
    hue=class_names,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# The hue legend (when seaborn draws one) only repeats the x tick labels.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Label Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Text length analysis: character / word counts per document, overall and per class.

# Prefer cleaned text when available
candidate_cols = ["clean_text", "text"]
text_col = next((c for c in candidate_cols if c in df.columns), None)
if text_col is None:
    raise KeyError(f"Expected one of {candidate_cols} for text content. Got: {list(df.columns)}")

# Missing entries must become empty strings. Casting to str first and calling fillna
# afterwards does not work: the cast turns NaN into the literal "nan", which would then be
# counted as 3 characters / 1 word. Mask on the original column's missingness instead.
raw_texts = df[text_col]
texts = raw_texts.astype(str).where(raw_texts.notna(), "")

df["char_len"] = texts.str.len()
df["word_len"] = texts.str.split().apply(len)  # whitespace-separated token count

print("Using text column:", text_col)

print("\nCharacter length stats:")
display(df["char_len"].describe())

print("\nWord length stats:")
display(df["word_len"].describe())

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(df["char_len"], bins=50, ax=axes[0], kde=False)
axes[0].set_title("Character Length Distribution")
axes[0].set_xlabel("Number of characters")

sns.histplot(df["word_len"], bins=50, ax=axes[1], kde=False)
axes[1].set_title("Word Length Distribution")
axes[1].set_xlabel("Number of words")

plt.tight_layout()
plt.show()

# Optional: text length by class
class_names = df["label"].astype(int).map({0: "real", 1: "fake"})

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn 0.13 deprecates `palette` without `hue`; colour by the same variable shown on the
# x axis. `dodge=False` keeps the boxes centred on their ticks on older seaborn releases.
sns.boxplot(
    x=class_names,
    y=df["word_len"],
    hue=class_names,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# The hue legend (when seaborn draws one) only repeats the x tick labels.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Word Length by Class")
ax.set_xlabel("Class")
ax.set_ylabel("Word count")
plt.tight_layout()
plt.show()

## Notes for Report / Paper

Use this space to write **research-style observations** based on the above analysis.

- **Dataset size**: `<fill in>` samples; label balance is `<balanced / imbalanced>`.
- **Label interpretation**: `0 = real`, `1 = fake`.
- **Typical article length**: median of `<X>` words; long tail up to `<Y>`.
- **Potential limitations**:
  - English-only news articles.
  - Labels come from the dataset provider (may contain bias).
  - May not generalize to tweets, memes, or other short/noisy text.

You can refine these bullets once you see the real numbers and plots above.

---

### (Optional) Next checks

- Inspect a few random samples per class to understand writing style.
- Look for duplicates or near-duplicates.
- Note any obvious labeling noise or domain bias.

You can delete this cell later if you don’t need it.

In [None]:
# (Optional) Show a few samples per class

label_map = {0: "real", 1: "fake"}

# Loop-invariant choices, computed once: which column to preview and the integer labels.
txt_col = "clean_text" if "clean_text" in df.columns else "text"
int_labels = df["label"].astype(int)

for cls in [0, 1]:
    class_rows = df[int_labels == cls]
    # Cap the sample size at the class size so tiny or absent classes do not raise;
    # the fixed random_state makes the same rows appear on every run.
    sample = class_rows.sample(n=min(3, len(class_rows)), random_state=42)
    print(f"\n=== Samples for {label_map[cls].upper()} ===")
    for value in sample[txt_col]:
        text = str(value)
        # Only show the ellipsis when the preview actually cut the text short.
        suffix = " ..." if len(text) > 300 else ""
        print(f"- {text[:300]}{suffix}")