# 03 – Feature Extraction (TF-IDF)

This notebook explores **TF-IDF vectorization**: vocabulary size, top n-grams, and how features look before training.

**Goals:**
- Fit TF-IDF on processed text (same config as `src/train.py`).
- Inspect vocabulary and top terms by class.
- Visualize the top keywords per class (by mean TF-IDF) as a simple proxy for feature importance (basic explainability).

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display

def find_project_root(start_dir):
    """Return the nearest ancestor of ``start_dir`` that looks like the project root.

    A directory qualifies when it contains both a ``data`` and a ``src``
    subdirectory. The search begins at ``start_dir`` itself (made absolute)
    and moves one parent up at a time.

    Raises:
        FileNotFoundError: if the top of the filesystem is reached without
            finding a qualifying directory.
    """
    candidate = os.path.abspath(start_dir)
    while not all(os.path.isdir(os.path.join(candidate, name)) for name in ("data", "src")):
        up_one = os.path.dirname(candidate)
        # dirname() of the filesystem root is the root itself: nothing left to try.
        if up_one == candidate:
            raise FileNotFoundError("Run Jupyter from inside misinformation-detection-engine.")
        candidate = up_one
    return candidate

# Resolve the project root by walking up from the current working directory,
# then build the path to the processed dataset underneath it.
PROJECT_ROOT = find_project_root(os.getcwd())
PROCESSED_PATH = os.path.join(PROJECT_ROOT, "data", "processed", "processed_fake_news.csv")
df = pd.read_csv(PROCESSED_PATH)
# Prefer the cleaned text column when it exists; otherwise fall back to "text".
tcol = "clean_text" if "clean_text" in df.columns else "text"
# read_csv loads empty cells as NaN, and astype(str) alone would turn them into
# the literal string "nan", which TF-IDF would then count as a real term.
# Replace missing text with an empty document before casting.
X = df[tcol].fillna("").astype(str)
y = df["label"].astype(int)
print("Loaded processed data. Shape:", df.shape)

## Fit TF-IDF (same config as `src/train.py`)

- max_features=10000, ngram_range=(1,2), stop_words="english"

In [None]:
# TF-IDF settings, kept in one place so they are easy to compare against
# src/train.py (the notebook text says they match — verify when either changes).
tfidf_params = {
    "max_features": 10000,
    "ngram_range": (1, 2),  # unigrams and bigrams
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_params)
X_vec = vectorizer.fit_transform(X)
vocab = vectorizer.get_feature_names_out()

# Quick summary of the fitted vocabulary and the resulting document-term matrix.
print("Vocabulary size:", len(vocab))
print("Matrix shape:", X_vec.shape)
print("Sample terms:", list(vocab[:20]))

## Top terms by class (basic explainability)

For each class, we show the terms with the highest average TF-IDF score.

In [None]:
# Mean TF-IDF per feature, per class
def _class_mean_tfidf(row_mask):
    """Return a 1-D array with the mean TF-IDF of each feature over the masked rows."""
    # mean(axis=0) gives a 1 x n_features result; convert it to a flat ndarray.
    return np.asarray(X_vec[row_mask].mean(axis=0)).ravel()


real_mask = (y == 0)
fake_mask = (y == 1)
mean_real = _class_mean_tfidf(real_mask)
mean_fake = _class_mean_tfidf(fake_mask)

# Feature indices ordered from highest to lowest mean weight, truncated to top_n.
top_n = 15
top_real_idx = np.argsort(mean_real)[::-1][:top_n]
top_fake_idx = np.argsort(mean_fake)[::-1][:top_n]

for heading, class_means, ranked_idx in (
    ("Top terms (avg TF-IDF) for REAL (0):", mean_real, top_real_idx),
    ("\nTop terms (avg TF-IDF) for FAKE (1):", mean_fake, top_fake_idx),
):
    print(heading)
    for term_idx in ranked_idx:
        print(f"  {vocab[term_idx]}: {class_means[term_idx]:.4f}")

In [None]:
# Bar plot: top 10 terms per class
n_show = 10
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One panel per class: (title, per-feature means, ranked feature indices, bar color).
panels = (
    ("Top terms — REAL", mean_real, top_real_idx, "steelblue"),
    ("Top terms — FAKE", mean_fake, top_fake_idx, "coral"),
)
for ax, (title, class_means, ranked_idx, color) in zip(axes, panels):
    # Size the bars from the actual slice so a vocabulary with fewer than
    # n_show terms does not cause a shape mismatch between positions and values.
    shown_idx = ranked_idx[:n_show]
    positions = range(len(shown_idx))
    ax.barh(positions, class_means[shown_idx], color=color)
    ax.set_yticks(positions)
    ax.set_yticklabels([vocab[i] for i in shown_idx])
    ax.set_xlabel("Mean TF-IDF")
    ax.set_title(title)
    # Position 0 holds the highest-ranked term; flip the axis so it appears on top.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()