In [1]:
# ────────────────────────────────────────────────────────────────
# notebooks/02_preprocessing.ipynb
# ----------------------------------------------------------------
# 0) Enable src/ on PYTHONPATH
%run setup.py

# 1) Imports
import pandas as pd
import numpy as np
import os, glob
from pathlib import Path
from src.preprocessing import clean_and_stats, build_st_embeddings

# 2) Load the three split CSVs, discarding rows where either question is missing
SPLIT_DIR = Path("../data/splits")
train_df, valid_df, test_df = (
    pd.read_csv(SPLIT_DIR / csv_name).dropna(subset=["question1", "question2"])
    for csv_name in ("train.csv", "valid.csv", "test.csv")
)

# -----------------------------------------------------------------
# IMPORTANT  ❗
# Questions from ALL three splits are pooled and de-duplicated by text,
# so every question appearing in *any* split later gets
# (1) a cleaned string and (2) an SBERT embedding row.
# Author's rationale: this is NOT data-leakage because the encoder is
# PRETRAINED + FROZEN.
# -----------------------------------------------------------------
question_frames = [
    split[["question1", "question2"]] for split in (train_df, valid_df, test_df)
]
pooled = pd.concat(question_frames, axis=0)
# stack() flattens both question columns into a single Series;
# unique() then returns the distinct values in order of first appearance.
all_questions = pooled.stack().unique()

print(f"Unique questions across all splits: {len(all_questions):,}")

# 3) Clean texts & stats
#    Each clean_and_stats() result is unpacked as a 3-tuple; judging by the
#    variable names these are (cleaned text, char length, word count) —
#    TODO confirm against src.preprocessing.
stats = [clean_and_stats(question) for question in all_questions]
cleaned = [text for text, _, _ in stats]
char_len = [n_chars for _, n_chars, _ in stats]
word_cnt = [n_words for _, _, n_words in stats]

# 4) Write the per-question artefacts to disk
OUT_DIR = Path("../data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One row per unique question: raw text, cleaned text and the two stats.
question_meta = pd.DataFrame(
    {
        "question": all_questions,
        "clean": cleaned,
        "len": char_len,
        "words": word_cnt,
    }
)
question_meta.to_csv(OUT_DIR / "question_meta.csv", index=False)

# Stored as an object array; NumPy pickles object arrays on save, so
# readers must call np.load(..., allow_pickle=True).
clean_arr = np.array(cleaned, dtype=object)
np.save(OUT_DIR / "clean_questions.npy", clean_arr)

# 5) SBERT embeddings (Quora-DistilBERT, 768-d)
#    – The canonical file is always rebuilt: any leftover copy is removed
#      first to avoid 384-d confusion
SBERT_MODEL = "sentence-transformers/distilbert-base-nli-stsb-quora-ranking"
emb_fp = OUT_DIR / "question_embeddings.npy"

stale_file_present = emb_fp.exists()
if stale_file_present:
    emb_fp.unlink()
    print("Deleted stale question_embeddings.npy")

emb = build_st_embeddings(
    corpus=cleaned,
    model_name=SBERT_MODEL,
    cache_dir=OUT_DIR,  # hashed cache lives here
    batch_size=512,
    out_fp=emb_fp,  # canonical file for downstream code
)

# Expected shape: (N_q, 768)
print("SBERT embedding matrix shape:", emb.shape)

# Final summary: list the non-hidden entries of OUT_DIR in sorted order
print("\nPre-processing complete. Files now in data/processed/:")
produced = sorted(glob.glob(str(OUT_DIR / "*")))
for entry in produced:
    print(" *", Path(entry).name)

Unique questions across all splits: 537,359
SBERT embedding matrix shape: (537359, 768)

Pre-processing complete. Files now in data/processed/:
 * clean_questions.npy
 * question_embeddings.npy
 * question_meta.csv
 * st_1253c25d.npy
