In [1]:
# ────────────────────────────────────────────────────────────────
# notebooks/02_preprocessing.ipynb
# ===============================================

# 0) Enable our src/ folder on the PYTHONPATH
%run setup.py

# 1) Imports
import pandas as pd
import numpy as np
from pathlib import Path

from src.preprocessing import clean_and_stats, build_st_embeddings

# 2) Load the **three** pre-split CSVs from SPLIT_DIR.
#    Rows with a missing value in either question column are dropped right
#    after reading, so later steps only ever see complete question pairs.
SPLIT_DIR = Path("../data/splits")
train_df, valid_df, test_df = (
    pd.read_csv(SPLIT_DIR / csv_name).dropna(subset=["question1", "question2"])
    for csv_name in ("train.csv", "valid.csv", "test.csv")
)

# 3) Gather **every distinct** raw question string from train ∪ valid ∪ test.
#    Only the two question columns of each split are kept; the splits are
#    concatenated row-wise, stacked into one long Series, and de-duplicated.
question_frames = [
    split_df[["question1", "question2"]]
    for split_df in (train_df, valid_df, test_df)
]
stacked_questions = pd.concat(question_frames, axis=0).stack()
all_questions = stacked_questions.unique()

print(f"Unique questions across all splits: {len(all_questions)}")

# 4) Run clean_and_stats once per unique question, then split the returned
#    3-tuples into three parallel lists aligned with all_questions.
#    NOTE(review): the tuple is presumably (cleaned text, char length,
#    word count), per the original notes — confirm in src.preprocessing.
question_stats = [clean_and_stats(raw_question) for raw_question in all_questions]
cleaned = [clean_text for clean_text, _, _ in question_stats]
char_len = [n_chars for _, n_chars, _ in question_stats]
word_cnt = [n_words for _, _, n_words in question_stats]

# 5) Persist the per-question artifacts under OUT_DIR (created if missing)
OUT_DIR = Path("../data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 5a) question_meta.csv → one row per unique question,
#     columns: ["question", "clean", "len", "words"]
question_meta = pd.DataFrame(
    {
        "question": all_questions,
        "clean": cleaned,
        "len": char_len,
        "words": word_cnt,
    }
)
question_meta.to_csv(OUT_DIR / "question_meta.csv", index=False)

# 5b) clean_questions.npy → the cleaned strings as a dtype=object array.
#     Object arrays are pickled by np.save, so reading this file back
#     requires np.load(..., allow_pickle=True).
clean_array = np.array(cleaned, dtype=object)
np.save(OUT_DIR / "clean_questions.npy", clean_array)

# 6) Pre-compute MiniLM-L6 embeddings for every cleaned question and store
#    them next to the other artifacts (bi-encoder caching).
#    NOTE(review): per the original notes, the first run downloads the model
#    and later runs load it from cache_dir — confirm in build_st_embeddings.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedding_path = OUT_DIR / "question_embeddings.npy"

emb = build_st_embeddings(
    corpus=cleaned,
    model_name=EMBEDDING_MODEL,
    cache_dir="models",
    batch_size=512,
)
np.save(embedding_path, emb)

# Report the output directory and everything it currently contains
print("Preprocessing complete. Written to:", OUT_DIR)
written_files = list(OUT_DIR.iterdir())
print(written_files)

Unique questions across all splits: 537359
Preprocessing complete. Written to: ../data/processed
[PosixPath('../data/processed/clean_questions.npy'), PosixPath('../data/processed/X_train.npy'), PosixPath('../data/processed/question_meta.csv'), PosixPath('../data/processed/train_cross_scores.npy'), PosixPath('../data/processed/question_embeddings.npy')]
