In [None]:
# ────────────────────────────────────────────────────────────────
# notebooks/03_feature_engineering.ipynb ─ Cell 1
# ===============================================

# 0) Ensure that src/ is on PYTHONPATH
%run setup.py

# 1) Imports
import pandas as pd
import numpy as np
from pathlib import Path

# 2) Locations of the per-question artifacts and of the split CSVs
PROCESSED_DIR = Path("../data/processed")
split_dir     = Path("../data/splits")
meta_fp       = PROCESSED_DIR.joinpath("question_meta.csv")
clean_fp      = PROCESSED_DIR.joinpath("clean_questions.npy")

# 3) Read the question metadata table, then the cleaned question list.
#    Each artifact is checked for existence right before it is read, so the
#    first missing file is the one reported.
if meta_fp.exists():
    meta_df = pd.read_csv(meta_fp)
else:
    raise FileNotFoundError(f"Missing '{meta_fp}'. Run preprocessing to generate question_meta.csv.")

if clean_fp.exists():
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # files produced by this project's own preprocessing step.
    clean_questions = np.load(clean_fp, allow_pickle=True).tolist()
else:
    raise FileNotFoundError(f"Missing '{clean_fp}'. Run preprocessing to generate clean_questions.npy.")

# Both artifacts must have one entry per question.
if len(clean_questions) != len(meta_df):
    raise RuntimeError(
        f"Length mismatch: clean_questions.npy has {len(clean_questions)} entries, "
        f"but question_meta.csv has {len(meta_df)} rows."
    )

# 4) Question text -> integer ID, where the ID is the row position in
#    question_meta.csv. If a text occurs more than once, the last row wins.
qid_lookup = {
    question: row
    for row, question in enumerate(meta_df["question"].astype(str).tolist())
}

# 5) Load & map TRAIN/VALID/TEST pairs -> add qid1/qid2
def _load_and_map_split(split_name: str) -> pd.DataFrame:
    """
    Load one split's question pairs and attach integer question IDs.

    Reads "{split_name}.csv" from the notebook-level ``split_dir``, drops rows
    where either question is NA, and adds two integer columns, ``qid1`` and
    ``qid2``, by looking each question string up in the notebook-level
    ``qid_lookup`` dictionary.

    Parameters
    ----------
    split_name : str
        Base name of the split file without extension (e.g. "train").

    Returns
    -------
    pd.DataFrame
        The split's rows (minus NA pairs) with ``qid1`` / ``qid2`` added.

    Raises
    ------
    FileNotFoundError
        If the split CSV does not exist.
    ValueError
        If any question string is not a key of ``qid_lookup``.
    """
    fp = split_dir / f"{split_name}.csv"
    if not fp.exists():
        raise FileNotFoundError(f"Missing split file: {fp}")

    df = pd.read_csv(fp).dropna(subset=["question1", "question2"])

    # Map first and validate BEFORE casting: unmapped strings come back as NaN,
    # and casting NaN to int raises pandas' own generic error, which would
    # pre-empt the descriptive check below.
    qid1 = df["question1"].map(qid_lookup)
    qid2 = df["question2"].map(qid_lookup)
    if qid1.isna().any() or qid2.isna().any():
        raise ValueError(f"Some questions in {split_name.upper()} cannot be mapped to question_meta.csv.")

    # Safe now: every entry is a real ID, so the integer cast cannot fail on NaN.
    return df.assign(qid1=qid1.astype(int), qid2=qid2.astype(int))

# Load the three splits one after another, each with qid1/qid2 attached.
train_df = _load_and_map_split("train")
valid_df = _load_and_map_split("valid")
test_df  = _load_and_map_split("test")

# Report how many pairs each split contains, one line per split.
print(
    f"Number of TRAIN pairs: {len(train_df):,}",
    f"Number of VALID pairs: {len(valid_df):,}",
    f"Number of TEST  pairs: {len(test_df):,}",
    sep="\n",
)

Number of TRAIN pairs: 323,613
Number of VALID pairs: 40,710
Number of TEST  pairs: 39,964


In [2]:
# ────────────────────────────────────────────────────────────────
# notebooks/03_feature_engineering.ipynb ─ Cell 2 (updated)
# ===============================================

# 0) Ensure that src/ is on PYTHONPATH (again, if you restarted kernel)
%run setup.py

# 1) Import build_features
from src.features import build_features

# 2) Common settings
from pathlib import Path
import numpy as np

PROCESSED_DIR = Path("../data/processed")
DIM_LIST      = [384, 768]

# 3) UMAP target dimension (adjust as desired)
n_components_umap = 10


def _load_or_build_features(step, split, pair_df, dim, reduction, n_components,
                            fit, emb_fp, model_dir):
    """
    Return the reduced feature matrix for one (split, dim, reduction) combination.

    If the output file "X_{split}_{dim}_{tag}.npy" already exists under
    PROCESSED_DIR it is opened read-only via memory mapping and returned.
    Otherwise build_features(...) is called, its result is saved to that file,
    and the result is returned.

    Reads ``clean_questions`` and ``meta_df`` from the notebook namespace
    (both are defined in Cell 1).

    Parameters
    ----------
    step : int
        Step number used only as the prefix of the progress messages.
    split : str
        "train", "valid" or "test"; selects the output / cache file names.
    pair_df : pd.DataFrame
        Question-pair frame for this split (from Cell 1).
    dim : int
        Embedding track; part of every file name.
    reduction : str
        "ipca" or "umap"; forwarded to build_features.
    n_components : int or None
        Forwarded to build_features. For a non-"ipca" reduction it is also
        appended to the output file tag (e.g. "umap10").
    fit : bool
        Forwarded as ``fit_pca``. Per the original notes, True fits the
        reducer on this split and False reuses previously saved reducers
        (TODO confirm against src.features.build_features).
    emb_fp : Path
        Path to the question-embedding file for this track.
    model_dir : Path
        Directory passed to build_features as ``cache_dir``.

    Returns
    -------
    The feature matrix: a read-only memmap when loaded from disk, otherwise
    whatever build_features returned.
    """
    tag = "ipca" if reduction == "ipca" else f"umap{n_components}"
    fname = f"X_{split}_{dim}_{tag}.npy"
    out_fp = PROCESSED_DIR / fname

    # Cached artifact: reuse it instead of recomputing.
    if out_fp.exists():
        X = np.load(out_fp, mmap_mode="r")
        print(f"{step}) [SKIP] {fname} already exists (shape={X.shape})")
        return X

    label, where = reduction.upper(), split.upper()
    if fit:
        print(f"{step}) Fitting {label}-only features on {where}…")
    else:
        print(f"{step}) Applying saved {label} on {where}…")

    X = build_features(
        pair_df         = pair_df,
        clean_questions = clean_questions,
        meta_df         = meta_df,
        embedding_path  = str(emb_fp),
        cache_dir       = str(model_dir),
        cross_cache     = str(PROCESSED_DIR / f"{split}_cross_{dim}.npy"),
        fit_pca         = fit,
        features_cache  = str(PROCESSED_DIR / f"{split}_raw_{dim}.npy"),
        reduction       = reduction,
        n_components    = n_components,
    )
    np.save(out_fp, X)
    print(f"   -> Saved {fname} (shape={X.shape})")
    return X


# 4) Loop over each SBERT track (384 & 768)
# NOTE(review): every artifact is skipped independently when its file exists.
# If a TRAIN artifact is rebuilt while VALID/TEST artifacts are already on
# disk, those older files are reused as-is and may come from an earlier fit.
# Confirm whether they should be invalidated when TRAIN is refit.
for dim in DIM_LIST:
    model_dir = Path(f"../models/features_{dim}")
    model_dir.mkdir(parents=True, exist_ok=True)

    emb_fp = PROCESSED_DIR / f"question_embeddings_{dim}.npy"
    if not emb_fp.exists():
        raise FileNotFoundError(f"Missing embeddings for dim={dim}: {emb_fp}")

    print(f"\n=== TRACK dim={dim} ===")

    # 1-3) IPCA-only track: fit on TRAIN, then apply to VALID and TEST.
    #      n_components=None lets build_features choose the size (the original
    #      notes call this "auto k95").
    X_tr_ipca = _load_or_build_features(
        1, "train", train_df, dim, "ipca", None, True, emb_fp, model_dir
    )
    X_val_ipca = _load_or_build_features(
        2, "valid", valid_df, dim, "ipca", None, False, emb_fp, model_dir
    )
    X_te_ipca = _load_or_build_features(
        3, "test", test_df, dim, "ipca", None, False, emb_fp, model_dir
    )

    # 4-6) UMAP-only track: fit on TRAIN, then apply to VALID and TEST.
    X_tr_umap = _load_or_build_features(
        4, "train", train_df, dim, "umap", n_components_umap, True, emb_fp, model_dir
    )
    X_val_umap = _load_or_build_features(
        5, "valid", valid_df, dim, "umap", n_components_umap, False, emb_fp, model_dir
    )
    X_te_umap = _load_or_build_features(
        6, "test", test_df, dim, "umap", n_components_umap, False, emb_fp, model_dir
    )

    print(f"=== Completed IPCA & UMAP for dim={dim} ===\n")

print("All feature-engineering tracks (IPCA / UMAP) complete.")

E0000 00:00:1749000421.989491   32316 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749000421.992036   32316 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749000421.998423   32316 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749000421.998434   32316 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749000421.998435   32316 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749000421.998436   32316 computation_placer.cc:177] computation placer already registered. Please check linka

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


=== TRACK dim=384 ===
1) [SKIP] X_train_384_ipca.npy already exists (shape=(323613, 3))
2) [SKIP] X_valid_384_ipca.npy already exists (shape=(40710, 3))
3) [SKIP] X_test_384_ipca.npy already exists (shape=(39964, 3))
4) [SKIP] X_train_384_umap10.npy already exists (shape=(323613, 10))
5) [SKIP] X_valid_384_umap10.npy already exists (shape=(40710, 10))
6) [SKIP] X_test_384_umap10.npy already exists (shape=(39964, 10))
=== Completed IPCA & UMAP for dim=384 ===


=== TRACK dim=768 ===
1) Fitting IPCA-only features on TRAIN…
   -> Saved X_train_768_ipca.npy (shape=(323613, 3))
2) Applying saved IPCA on VALID…
   -> Saved X_valid_768_ipca.npy (shape=(40710, 3))
3) Applying saved IPCA on TEST…
   -> Saved X_test_768_ipca.npy (shape=(39964, 3))
4) Fitting UMAP-only features on TRAIN…




   -> Saved X_train_768_umap10.npy (shape=(323613, 10))
5) Applying saved UMAP on VALID…




   -> Saved X_valid_768_umap10.npy (shape=(40710, 10))
6) Applying saved UMAP on TEST…




   -> Saved X_test_768_umap10.npy (shape=(39964, 10))
=== Completed IPCA & UMAP for dim=768 ===

All feature-engineering tracks (IPCA / UMAP) complete.
