In [1]:
# ────────────────────────────────────────────────────────────────
# notebooks/03_feature_engineering.ipynb ─ Cell 1
# ===============================================

# 0) Ensure that src/ is on PYTHONPATH
%run setup.py

# 1) Imports
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# 2) Paths to per-question artefacts & splits
PROCESSED_DIR    = Path("../data/processed")
meta_df          = pd.read_csv(PROCESSED_DIR / "question_meta.csv")
# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary objects; only
# load this file when it comes from a trusted source.
clean_questions  = np.load(PROCESSED_DIR / "clean_questions.npy", allow_pickle=True).tolist()
# Question text -> row position in question_meta.csv. If the same text occurs
# more than once, the last occurrence wins (later dict keys overwrite earlier).
qid_lookup       = {q: idx for idx, q in enumerate(meta_df["question"])}

# 3) Load & map TRAIN pairs -> qid1, qid2
train_df = (
    pd.read_csv("../data/splits/train.csv")
    .dropna(subset=["question1","question2"])
)
qid1_raw = train_df["question1"].map(qid_lookup)
qid2_raw = train_df["question2"].map(qid_lookup)

# Sanity check: no NaNs.
# This must run BEFORE the integer cast: Series.map() yields NaN for questions
# missing from the lookup, and .astype(int) on NaN raises an opaque casting
# error, so a check placed after the cast could never fire.
unmapped = qid1_raw.isna() | qid2_raw.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum()):,} pair(s) in train.csv contain a question "
        "that could not be mapped to question_meta.csv."
    )

train_df = train_df.assign(qid1=qid1_raw.astype(int), qid2=qid2_raw.astype(int))

print(f"Number of training pairs: {len(train_df):,}")
# Every distinct question id that appears on either side of a TRAIN pair.
train_qids = np.unique(np.concatenate([train_df.qid1.values, train_df.qid2.values]))
print(f"-> {len(train_qids):,} unique questions in TRAIN split")

# 4) Subset of cleaned questions that actually appear in train
#    (train_qids index directly into clean_questions).
train_corpus = [clean_questions[i] for i in train_qids]

# 5) Fit TF-IDF (word & char) ON TRAIN ONLY
#    Word model: unigrams + bigrams, terms seen in >= 3 documents, log-scaled tf.
#    Char model: 3- to 5-character n-grams seen in >= 10 documents.
vec_w = TfidfVectorizer(ngram_range=(1,2), min_df=3, sublinear_tf=True)
vec_c = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=10)

print("Fitting TF-IDF (word ngrams) on TRAIN...", end=" ")
vec_w.fit(train_corpus)
print("done!")
print("Fitting TF-IDF (char ngrams) on TRAIN...", end=" ")
vec_c.fit(train_corpus)
print("done!")

# 6) Persist both vectorisers
MODEL_DIR = Path("../models/features/")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
tfidf_w_path = MODEL_DIR / "tfidf_w.pkl"
tfidf_c_path = MODEL_DIR / "tfidf_c.pkl"
joblib.dump(vec_w, tfidf_w_path)
joblib.dump(vec_c, tfidf_c_path)
# Report the real destinations; the previous message named models/*.pkl while
# the files are actually written under MODEL_DIR (models/features/).
print(f"Saved TF-IDF vectorisers -> {tfidf_w_path}  &  {tfidf_c_path}")

# 7) Fit SVD ON TRAIN TF-IDF matrices (word & char)
Z_w = vec_w.transform(train_corpus)  # sparse (n_train_q, V_w)
Z_c = vec_c.transform(train_corpus)  # sparse (n_train_q, V_c)

# Fixed random_state keeps the randomized SVD solver reproducible across runs.
svd_w = TruncatedSVD(n_components=150, random_state=42).fit(Z_w)
svd_c = TruncatedSVD(n_components=100, random_state=42).fit(Z_c)

# 8) Persist the **SVD models** themselves (not just projections)
svd_w_path = MODEL_DIR / "svd_w_150.pkl"
svd_c_path = MODEL_DIR / "svd_c_100.pkl"
joblib.dump(svd_w, svd_w_path)
joblib.dump(svd_c, svd_c_path)
# Report the real destinations; the previous message named models/*.pkl while
# the files are actually written under MODEL_DIR.
print(f"Saved SVD models -> {svd_w_path}  &  {svd_c_path}")

print("TF-IDF & SVD have been fitted on TRAIN ONLY.")

Number of training pairs: 323,554
-> 442,917 unique questions in TRAIN split
Fitting TF-IDF (word ngrams) on TRAIN... done!
Fitting TF-IDF (char ngrams) on TRAIN... done!
Saved TF-IDF vectorisers -> models/tfidf_w.pkl  &  models/tfidf_c.pkl
Saved SVD models -> models/svd_w_150.pkl  &  models/svd_c_100.pkl
TF-IDF & SVD have been fitted on TRAIN ONLY.


In [2]:
# ─────────────────────────────────────────────────────────────────────────────
# notebooks/03_feature_engineering.ipynb ― Cell 2
# ===============================================

# 0) Ensure that src/ is on PYTHONPATH
%run setup.py

# 1) Import
from src.features import build_features

# 2) Paths
PROCESSED_DIR = Path("../data/processed")
MODEL_DIR     = Path("../models/features")     # where TF-IDF/SVD/PCA live
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# 3) Load per-question artefacts
meta_df         = pd.read_csv(PROCESSED_DIR / "question_meta.csv")
# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary objects; only
# load this file when it comes from a trusted source.
clean_questions = np.load(PROCESSED_DIR / "clean_questions.npy", allow_pickle=True).tolist()
# Question text -> row position in question_meta.csv (last occurrence wins on duplicates).
qid_lookup      = {q: idx for idx, q in enumerate(meta_df["question"])}


def _load_pairs_with_qids(split):
    """Load one split's question pairs and attach integer ``qid1``/``qid2`` columns.

    Parameters
    ----------
    split : str
        Split name; the pairs are read from ``../data/splits/<split>.csv``.

    Returns
    -------
    pandas.DataFrame
        The pairs with rows lacking ``question1`` or ``question2`` dropped and
        ``qid1``/``qid2`` appended, looked up through ``qid_lookup``.

    Raises
    ------
    ValueError
        If any question text is absent from ``qid_lookup``.
    """
    pairs = (
        pd.read_csv(f"../data/splits/{split}.csv")
          .dropna(subset=["question1","question2"])
    )
    qid1 = pairs["question1"].map(qid_lookup)
    qid2 = pairs["question2"].map(qid_lookup)
    # Validate BEFORE casting: Series.map() yields NaN for unmapped questions,
    # and .astype(int) on NaN raises an opaque casting error, which previously
    # made this check unreachable.
    if qid1.isna().any() or qid2.isna().any():
        raise ValueError(f"Some questions in {split}.csv could not be mapped to question_meta.csv.")
    return pairs.assign(qid1=qid1.astype(int), qid2=qid2.astype(int))


def _build_and_save_features(pair_df, split, fit_pca):
    """Run ``build_features`` for one split and save the result as ``X_<split>.npy``.

    Parameters
    ----------
    pair_df : pandas.DataFrame
        Pairs carrying ``qid1``/``qid2`` columns.
    split : str
        Split name; selects the cross-score cache and the output file name.
    fit_pca : bool
        Forwarded to ``build_features``. Per the notebook's intent, True fits
        and saves the PCA (TRAIN) and False reuses the saved one.

    Returns
    -------
    The feature matrix returned by ``build_features``.
    """
    features = build_features(
        pair_df         = pair_df,
        clean_questions = clean_questions,
        meta_df         = meta_df,
        embedding_path  = "../data/processed/question_embeddings.npy",
        cache_dir       = str(MODEL_DIR),
        cross_cache     = f"../data/processed/{split}_cross_scores.npy",
        fit_pca         = fit_pca,
    )
    np.save(PROCESSED_DIR / f"X_{split}.npy", features)
    return features


# ─────────────────────────────────────────────────────────────────────────────
# 4) BUILD & SAVE PCA-REDUCED FEATURES FOR TRAIN SPLIT
# ─────────────────────────────────────────────────────────────────────────────
train_df = _load_pairs_with_qids("train")

# Build + fit IncrementalPCA (two passes) on TRAIN
# NOTE(review): the recorded run kept only 3 components for 95% variance —
# worth confirming that build_features scales its inputs before PCA.
print("* Building TRAIN features + fitting IncrementalPCA (95% variance)…")
X_train = _build_and_save_features(train_df, "train", fit_pca=True)    # ← fit & save PCA on TRAIN
print(f"* Saved X_train.npy with shape {X_train.shape} (d_reduced ≪ 3598)")

# ─────────────────────────────────────────────────────────────────────────────
# 5) BUILD & SAVE PCA-REDUCED FEATURES FOR VALID SPLIT
# ─────────────────────────────────────────────────────────────────────────────
valid_df = _load_pairs_with_qids("valid")

print("- Building VALID features + applying saved PCA…")
X_valid = _build_and_save_features(valid_df, "valid", fit_pca=False)   # ← just transform with saved PCA
print(f"* Saved X_valid.npy with shape {X_valid.shape}")

# ─────────────────────────────────────────────────────────────────────────────
# 6) BUILD & SAVE PCA-REDUCED FEATURES FOR TEST SPLIT (optional)
# ─────────────────────────────────────────────────────────────────────────────
test_df = _load_pairs_with_qids("test")

print("* Building TEST features + applying saved PCA…")
X_test = _build_and_save_features(test_df, "test", fit_pca=False)      # ← just transform with saved PCA
print(f"* Saved X_test.npy with shape {X_test.shape}")

print("\nFeature‐engineering + IncrementalPCA complete. Files now in data/processed:")
print("  * X_train.npy (TRAIN reduced → 95%)")
print("  * X_valid.npy (VALID reduced)")
print("  * X_test.npy  (TEST reduced)")

* Building TRAIN features + fitting IncrementalPCA (95% variance)…
  [features.py]  1st pass of IncrementalPCA(n_components=None) to compute variance…
  [features.py]  #components → 3 to retain 95% variance
  [features.py]  2nd pass of IncrementalPCA(n_components=k95) to fit_transform…
* Saved X_train.npy with shape (323554, 3) (d_reduced ≪ 3598)
- Building VALID features + applying saved PCA…
* Saved X_valid.npy with shape (40087, 3)
* Building TEST features + applying saved PCA…
* Saved X_test.npy with shape (40646, 3)

Feature‐engineering + IncrementalPCA complete. Files now in data/processed:
  * X_train.npy (TRAIN reduced → 95%)
  * X_valid.npy (VALID reduced)
  * X_test.npy  (TEST reduced)
