In [1]:
# ─────────────────────────────────────────────────────────────────────────────
# notebooks/03_feature_engineering.ipynb
# ─────────────────────────────────────────────────────────────────────────────

# 0. Allow importing from src/
%run setup.py

# 1. Imports
import pandas as pd
import numpy as np
from pathlib import Path
from src.features import build_features

# ─────────────────────────────────────────────────────────────────────────────
# 2. Load preprocessing artefacts
# ─────────────────────────────────────────────────────────────────────────────

# Per-question metadata table (columns: question, clean, len, words).
# Its row position (0..n-1) doubles as the numeric question ID.
meta = pd.read_csv("../data/processed/question_meta.csv")

# Cleaned question strings, indexed by the same numeric ID (= row in `meta`).
# NOTE(security): allow_pickle=True unpickles the file's contents, which can
# execute arbitrary code. Only load artefacts written by this project's own
# preprocessing step, never files from an untrusted source.
clean = np.load("../data/processed/clean_questions.npy", allow_pickle=True)

# Fail fast if the two artefacts are out of sync: question IDs are row
# positions in `meta` and are also used to index `clean`, so a length mismatch
# (e.g. one file regenerated without the other) would otherwise pair questions
# with the wrong cleaned text without any visible error.
if len(meta) != len(clean):
    raise ValueError(
        "question_meta.csv and clean_questions.npy are misaligned: "
        f"{len(meta):,} meta rows vs {len(clean):,} cleaned questions"
    )

# Sentence embeddings (noted as 384-d MiniLM vectors, indexed by the same ID).
# Only the path is stored here; it is passed to build_features further down.
embedding_path = "../data/processed/question_embeddings.npy"

# ─────────────────────────────────────────────────────────────────────────────
# 3. Build a lookup from original question text → numeric ID
#    so we can assign `qid1`/`qid2` in the pairs DataFrame
# ─────────────────────────────────────────────────────────────────────────────

# Original question text -> question ID, where the ID is the row position
# (0..n-1) of that text in `meta`. If the same text appears in several rows,
# the position of its last occurrence is the one kept.
qid_lookup = dict(zip(meta["question"].tolist(), range(len(meta))))

# ─────────────────────────────────────────────────────────────────────────────
# 4. Load the train split and map qid1/qid2
# ─────────────────────────────────────────────────────────────────────────────

# Read the training pairs, discarding rows that lack either question text.
train = pd.read_csv("../data/splits/train.csv").dropna(
    subset=["question1", "question2"]
)

# Translate each question's original text into its numeric ID from `meta`.
# Texts that are absent from the lookup come back as NaN.
train = train.assign(
    qid1=train["question1"].map(qid_lookup),
    qid2=train["question2"].map(qid_lookup),
)

# Sanity check: every pair must resolve to two known question IDs.
unmapped = train["qid1"].isna() | train["qid2"].isna()
if unmapped.any():
    missing_qs = train[unmapped]
    raise ValueError(
        "Some questions in train.csv did not appear in question_meta.csv:\n"
        f"{missing_qs.head(5)}"
    )

# No NaNs remain at this point, so both ID columns can be cast to integers.
train = train.astype({"qid1": int, "qid2": int})

print(f"Total pairs to process: {len(train):,}")

# ─────────────────────────────────────────────────────────────────────────────
# 5. Build the full feature matrix for train
# ─────────────────────────────────────────────────────────────────────────────

# Call contract, as noted for src.features.build_features:
#   build_features(pair_df, clean_questions, meta_df,
#                  embedding_path, cache_dir="models") -> np.ndarray
X_train = build_features(
    pair_df=train,
    # Plain Python list of the cleaned questions, one entry per question ID.
    clean_questions=clean.tolist(),
    meta_df=meta,
    embedding_path=embedding_path,
    # Overrides the default "models" so the cache sits one level above the
    # notebook directory; presumably holds fitted TF-IDF artefacts and similar
    # -- TODO confirm against src/features.
    cache_dir="../models",
)

print(f"Feature matrix built. Shape = {X_train.shape}")

# ─────────────────────────────────────────────────────────────────────────────
# 6. Save the feature matrix
# ─────────────────────────────────────────────────────────────────────────────

# Persist the training feature matrix alongside the other processed artefacts,
# creating the output directory first if it does not exist yet.
OUT = Path("../data/processed")
OUT.mkdir(parents=True, exist_ok=True)
features_file = OUT / "X_train.npy"
np.save(features_file, X_train)

print("Saved: ../data/processed/X_train.npy")

# ─────────────────────────────────────────────────────────────────────────────
# End of notebook.
# ─────────────────────────────────────────────────────────────────────────────

Total pairs to process: 323,554
Feature matrix built. Shape = (323554, 1559)
Saved: ../data/processed/X_train.npy
