In [1]:
# notebooks/02_preprocessing.ipynb

%run setup.py

import pandas as pd
import numpy as np
from pathlib import Path
from src.preprocessing import clean_and_stats, build_st_embeddings

# Input split and the directory that receives this notebook's artefacts.
RAW = "../data/splits/train.csv"
OUT = Path("../data/processed")
OUT.mkdir(parents=True, exist_ok=True)  # re-runnable: creates parents, tolerates an existing dir

# 1. Load the *train* split and pool both question columns into one array
#    of distinct values (first-seen order, as returned by Series.unique()).
# NOTE(review): unique() keeps a missing value (NaN) as an entry if either
# column contains one -- confirm clean_and_stats copes with that input.
train_df = pd.read_csv(RAW)
pooled = pd.concat([train_df.question1, train_df.question2])
questions = pooled.unique()

# 2. Clean every question once, then split the per-question triples into
#    three parallel lists (presumably cleaned text, length, word count --
#    verify against src.preprocessing.clean_and_stats).
per_question = [clean_and_stats(q) for q in questions]
cleaned = [text for text, _, _ in per_question]
q_len = [length for _, length, _ in per_question]
q_words = [count for _, _, count in per_question]

# 3. Embed the cleaned strings (Sentence Transformers, per the helper's name);
#    input order is the order of `cleaned`.
embeddings = build_st_embeddings(cleaned)

# 4. Persist the artefacts under OUT.

# a) Cleaned strings, in the same order as `questions`, so the position in
#    the array can serve as a numeric question ID.
#    The array has object dtype, so np.save pickles it; reading it back
#    requires np.load(..., allow_pickle=True).
clean_array = np.array(cleaned, dtype=object)
np.save(OUT / "clean_questions.npy", clean_array)

# b) One row per unique question: raw text, cleaned text and the two stats,
#    written without the index column for quick joins later.
meta_df = pd.DataFrame(
    {
        "question": questions,
        "clean": cleaned,
        "len": q_len,
        "words": q_words,
    }
)
meta_df.to_csv(OUT / "question_meta.csv", index=False)

# c) Embeddings as returned by build_st_embeddings.
np.save(OUT / "question_embeddings.npy", embeddings)

# Show everything currently in the output directory as a sanity check.
print("Saved:", list(OUT.iterdir()))

Saved: [PosixPath('../data/processed/clean_questions.npy'), PosixPath('../data/processed/question_meta.csv'), PosixPath('../data/processed/question_embeddings.npy')]
