In [None]:
# ────────────────────────────────────────────────────────────────
# notebooks/2_preprocessing.ipynb
# ----------------------------------------------------------------

# 0) Enable src/ on PYTHONPATH
# ─────────────────────────────────────────────────────────────────────────────
# `%run` is an IPython magic, so this cell only executes inside Jupyter/IPython
# and is a syntax error under a plain `python` interpreter.  The relative path
# `setup.py` is looked up from the kernel's current working directory.
# NOTE(review): presumably setup.py extends sys.path so that the `src` package
# imported in the next section resolves — confirm against setup.py itself.
%run setup.py

# 1) Imports
# ─────────────────────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from src.preprocessing import clean_and_stats, build_st_embeddings

# 2) Load the train / valid / test split CSVs
# ─────────────────────────────────────────────────────────────────────────────
SPLIT_DIR = Path("../data/splits")


def _load_split(csv_name):
    """Read ``SPLIT_DIR / csv_name`` and drop rows missing either question.

    Rows with a NaN in ``question1`` or ``question2`` are removed so that every
    remaining row carries two usable question values.
    """
    frame = pd.read_csv(SPLIT_DIR / csv_name)
    return frame.dropna(subset=["question1", "question2"])


train_df = _load_split("train.csv")
valid_df = _load_split("valid.csv")
test_df = _load_split("test.csv")

# -----------------------------------------------------------------
# POOL EVERY DISTINCT QUESTION STRING
#
# The two question columns of train/valid/test are concatenated,
# flattened into a single Series with stack(), and de-duplicated
# with unique(), so each distinct text appears exactly once.
#
# Only the question columns are selected here, never a label column.
# Author's rationale: the SBERT models are pretrained and frozen,
# so pooling text across splits does NOT leak label information.
# -----------------------------------------------------------------
question_frames = [
    split[["question1", "question2"]]
    for split in (train_df, valid_df, test_df)
]
all_questions = pd.concat(question_frames, axis=0).stack().unique()

print(f"Unique questions across all splits: {len(all_questions):,}")

# 3) Clean every question and collect its char / word statistics
# ─────────────────────────────────────────────────────────────────────────────
# clean_and_stats() yields a 3-item result per question, unpacked below as
# (cleaned text, character length, word count).  The three lists stay aligned
# index-for-index with `all_questions`.
cleaned = []
char_len = []
word_cnt = []
for clean_text, n_chars, n_words in map(clean_and_stats, all_questions):
    cleaned.append(clean_text)
    char_len.append(n_chars)
    word_cnt.append(n_words)

# 4) Write the per-question artefacts to disk
# ─────────────────────────────────────────────────────────────────────────────
OUT_DIR = Path("../data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create on first run, no-op afterwards

# One row per unique question: raw text, cleaned text, and the two statistics.
question_meta = pd.DataFrame(
    {
        "question": all_questions,
        "clean": cleaned,
        "len": char_len,
        "words": word_cnt,
    }
)
question_meta.to_csv(OUT_DIR / "question_meta.csv", index=False)

# The cleaned strings are also stored on their own as an object-dtype array.
# NumPy pickles object arrays, so reading this back needs
# np.load(..., allow_pickle=True).
clean_array = np.array(cleaned, dtype=object)
np.save(OUT_DIR / "clean_questions.npy", clean_array)

# 5) SBERT embeddings (768-dim & 384-dim), on-disk cache + logging
# ─────────────────────────────────────────────────────────────────────────────

# Canonical output files for the two embedding matrices.
emb_768_fp = OUT_DIR / "question_embeddings_768.npy"
emb_384_fp = OUT_DIR / "question_embeddings_384.npy"

# Drop any copy left over from an earlier run so the files written below are
# always fresh.  Only these two paths are removed; nothing else in OUT_DIR is
# touched.
for stale_fp, notice in (
    (emb_768_fp, "Deleted stale question_embeddings_768.npy"),
    (emb_384_fp, "Deleted stale question_embeddings_384.npy"),
):
    if not stale_fp.exists():
        continue
    stale_fp.unlink()
    print(notice)



# 5a) 768-dim embeddings: build (or load) and report the matrix shape
# cache_dir : directory holding the hashed SBERT cache (per the original note)
# save_path : canonical file consumed by downstream steps
emb_768 = build_st_embeddings(
    texts=cleaned,
    target_dim=768,
    batch_size=512,
    cache_dir=OUT_DIR,
    save_path=emb_768_fp,
)
shape_768 = emb_768.shape
print("768-dim SBERT embedding matrix shape:", shape_768)


# 5b) 384-dim embeddings: build (or load) and report the matrix shape
# Same texts, cache directory and batch size as the 768-dim run; only the
# target dimension and the output file differ.
emb_384 = build_st_embeddings(
    texts=cleaned,
    target_dim=384,
    batch_size=512,
    cache_dir=OUT_DIR,
    save_path=emb_384_fp,
)
shape_384 = emb_384.shape
print("384-dim SBERT embedding matrix shape:", shape_384)


# Final summary: list what is now in OUT_DIR, sorted by full path string.
# glob.glob is used (not Path.glob), so dot-prefixed entries are not listed.
print("\nPre-processing complete. Files now in data/processed/:")
listing_pattern = str(OUT_DIR / '*')
for entry in sorted(glob.glob(listing_pattern)):
    entry_name = Path(entry).name
    print(" *", entry_name)

Unique questions across all splits: 537,359


Batches:   0%|          | 0/1050 [00:00<?, ?it/s]

768-dim SBERT embedding matrix shape: (537359, 768)


Batches:   0%|          | 0/1050 [00:00<?, ?it/s]

384-dim SBERT embedding matrix shape: (537359, 384)

Pre-processing complete. Files now in data/processed/:
 * clean_questions.npy
 * question_embeddings_384.npy
 * question_embeddings_768.npy
 * question_meta.csv
 * st_d2389aac.npy
 * st_d46f7701.npy
