In [None]:
# 0) Setup + robust CSV loading (EDA-safe)
import os, re, unicodedata, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory that holds all input CSV files.
BASE = "/content"

# Paths of the individual CSV inputs, all resolved under BASE.
train_path  = os.path.join(BASE, "train.csv")
test_path   = os.path.join(BASE, "test.csv")
desc_path   = os.path.join(BASE, "product_descriptions.csv")
attr_path   = os.path.join(BASE, "attributes.csv")
tlabels_path = os.path.join(BASE, "testLabels.csv")

NROWS = 200_000  # EDA sample size (row cap passed to the train/test/descs/attrs reads below)

def read_csv_robust(path, nrows=None):
    """Read a CSV file, degrading gracefully on encoding or parsing problems.

    Strategies, in order:
      1. UTF-8 with undecodable bytes replaced, so no rows are dropped.
      2. latin1, for pandas versions whose ``read_csv`` rejects the
         ``encoding_errors`` keyword (signalled by ``TypeError``).
      3. latin1 with the Python engine, skipping malformed lines. A
         ``RuntimeWarning`` is emitted because rows may be lost.

    Args:
        path: Location of the CSV file.
        nrows: Optional cap on the number of rows to read; ``None`` reads all.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        OSError: If the file is missing or unreadable. These errors propagate
            immediately instead of triggering a pointless lenient retry.
    """
    import warnings

    try:
        return pd.read_csv(path, nrows=nrows, encoding="utf-8", encoding_errors="replace")
    except TypeError:
        # older pandas without encoding_errors
        return pd.read_csv(path, nrows=nrows, encoding="latin1")
    except (UnicodeError, pd.errors.ParserError) as err:
        # Only genuine decoding/tokenizing failures justify the lossy fallback;
        # make the data loss visible rather than silent.
        warnings.warn(
            f"read_csv_robust: strict parse of {path!r} failed ({err}); "
            "retrying with the python engine and skipping malformed lines",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.read_csv(path, nrows=nrows, encoding="latin1", engine="python", on_bad_lines="skip")

# Load each table, capped at NROWS rows.
train = read_csv_robust(train_path, nrows=NROWS)
test  = read_csv_robust(test_path,  nrows=NROWS)
descs = read_csv_robust(desc_path,  nrows=NROWS)
attrs = read_csv_robust(attr_path,  nrows=NROWS)

print("train:", train.shape, "test:", test.shape)
print("descs:", descs.shape, "attrs:", attrs.shape)
print("\ntrain head:\n", train.head(2))
print("\ndescs head:\n", descs.head(2))

# Merge descriptions (needed for query<->description work)
# Left joins keep every train/test row; a product_uid that is absent from the
# row-capped descs sample yields a NaN product_description.
train_m = train.merge(descs, on="product_uid", how="left")
test_m  = test.merge(descs, on="product_uid", how="left")
print("\ntrain_m:", train_m.shape, "missing desc rate:", float(train_m["product_description"].isna().mean()))

# 1) Label distribution + basic stats
plt.figure()
train["relevance"].hist(bins=30)
plt.title("Relevance distribution (train sample)")
plt.xlabel("relevance"); plt.ylabel("count")
plt.show()

print("\nrelevance min/max:", float(train["relevance"].min()), float(train["relevance"].max()))
print(train["relevance"].describe())
# 2) Length plots + quantiles
def s_len(x):
    """Return the character length of ``str(x)``; missing values (per ``pd.isna``) count as 0."""
    if pd.isna(x):
        return 0
    return len(str(x))

# Add a "<col>_charlen" column for every text column present in train_m.
for col in ["search_term", "product_title", "product_description"]:
    if col in train_m.columns:
        train_m[col+"_charlen"] = train_m[col].map(s_len)

# One histogram per text column.
plt.figure()
plt.hist(train_m["search_term_charlen"], bins=50)
plt.title("search_term char length")
plt.xlabel("chars"); plt.ylabel("count")
plt.show()

plt.figure()
plt.hist(train_m["product_title_charlen"], bins=50)
plt.title("product_title char length")
plt.xlabel("chars"); plt.ylabel("count")
plt.show()

plt.figure()
plt.hist(train_m["product_description_charlen"], bins=80)
plt.title("product_description char length (sample)")
plt.xlabel("chars"); plt.ylabel("count")
plt.show()

# Median and upper-tail quantiles of each length column.
for c in ["search_term_charlen","product_title_charlen","product_description_charlen"]:
    q = train_m[c].quantile([0.5,0.8,0.9,0.95,0.99]).to_dict()
    print(c, q)

# 3) Tokenizer for Q2 (word/char-combination tokens)
# Characters matched as single "unicode fraction" tokens.
FRACTIONS = "¼½¾⅓⅔⅛⅜⅝⅞"
# One alternation of five capturing groups, tried left to right; each
# alternative is exactly one group, so a match sets exactly one group.
# NOTE(review): letters outside A-Z/a-z and a bare "_" match no alternative,
# so finditer skips them silently — confirm this is intended.
token_re = re.compile(
    rf"(#[A-Za-z0-9_]+)|"          # hashtags
    rf"(\d+(?:\.\d+)?°?)|"         # numbers, maybe degree
    rf"([{FRACTIONS}])|"           # unicode fractions
    rf"([A-Za-z]+[A-Za-z0-9_-]*)|" # words (handles e.g. wire-backed, pull-out)
    rf"([^\w\s])"                  # leftover punctuation/symbols
)

def normalize_text(s: str) -> str:
    """Return ``s`` as NFKC-normalized, lower-cased text ready for tokenization.

    ``None`` and float NaN map to the empty string; any other value is
    converted with ``str`` first.

    The vulgar-fraction characters (¼ ½ ¾ ⅓ ⅔ ⅛ ⅜ ⅝ ⅞) are deliberately left
    untouched. NFKC would decompose "½" into "1⁄2", which turns "1½" into the
    misleading "11⁄2" and prevents a single-character fraction pattern from
    ever matching the normalized text.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""

    fractions = "¼½¾⅓⅔⅛⅜⅝⅞"
    keep = frozenset(fractions)
    # The capturing group makes re.split return each fraction as its own
    # piece, so everything else can be normalized segment by segment.
    pieces = re.split(f"([{fractions}])", str(s))
    normalized = "".join(
        piece if piece in keep else unicodedata.normalize("NFKC", piece)
        for piece in pieces
    )
    return normalized.lower()

def tokenize(s: str):
    """Split ``s`` into tokens: normalize it, then collect every ``token_re`` match.

    Each match contributes the text of its matched capturing group, stripped of
    surrounding whitespace; empty results are discarded.
    """
    text = normalize_text(s)
    # Every alternative of token_re is a single group, so the last matched
    # group index is the one (and only) group that participated in the match.
    pieces = (m.group(m.lastindex).strip() for m in token_re.finditer(text))
    return [piece for piece in pieces if piece]

# Show tokenization examples (good for the report)
# Fixed random_state keeps the eight printed examples reproducible.
ex = train_m.sample(8, random_state=1)[["search_term","product_title","relevance"]].copy()
for _, r in ex.iterrows():
    print("\n---")
    print("search_term:", r["search_term"])
    print("tokens:", tokenize(r["search_term"])[:40])  # show at most 40 tokens
    print("title:", r["product_title"])
    print("tokens:", tokenize(r["product_title"])[:40])
    print("relevance:", r["relevance"])

# 4) Overlap/Jaccard quick signal plots
def jaccard(a, b):
    """Jaccard similarity of two token collections: |A ∩ B| / |A ∪ B|, or 0.0 when both are empty."""
    left, right = set(a), set(b)
    union = left | right
    if not union:
        return 0.0
    return len(left & right) / len(union)

def overlap_count(a, b):
    """Number of distinct items that occur in both ``a`` and ``b``."""
    return len(set(a).intersection(b))

# tokenize sample columns
train_m["q_toks"] = train_m["search_term"].map(tokenize)
train_m["d_toks"] = train_m["product_description"].map(tokenize)

# Row-wise set similarity between query tokens and description tokens.
train_m["jacc_q_desc"] = [jaccard(a,b) for a,b in zip(train_m["q_toks"], train_m["d_toks"])]
train_m["ovl_q_desc"]  = [overlap_count(a,b) for a,b in zip(train_m["q_toks"], train_m["d_toks"])]

plt.figure()
plt.scatter(train_m["jacc_q_desc"], train_m["relevance"], s=5, alpha=0.25)
plt.title("Relevance vs Jaccard(query, description) [sample]")
plt.xlabel("jaccard"); plt.ylabel("relevance")
plt.show()

plt.figure()
plt.scatter(train_m["ovl_q_desc"], train_m["relevance"], s=5, alpha=0.25)
plt.title("Relevance vs overlap-count(query, description) [sample]")
plt.xlabel("overlap count"); plt.ylabel("relevance")
plt.show()

print("\nCorrelation (sample):")
print(train_m[["jacc_q_desc","ovl_q_desc","relevance"]].corr(numeric_only=True))

# 5) Find “special tokens” (°, fractions, hashtags) for report examples
# NOTE(review): the pattern below covers the degree sign and the fraction
# characters only; "#" is not part of it, so hashtag queries are not counted.
special_pat = re.compile(r"[°¼½¾⅓⅔⅛⅜⅝⅞]")
mask = train_m["search_term"].astype(str).str.contains(special_pat)
hits = train_m.loc[mask, "search_term"]
print("\nSpecial-token query hits in sample:", len(hits))
if len(hits):
    # Print up to 20 matching queries, reproducibly sampled.
    print(hits.sample(min(20, len(hits)), random_state=0).tolist())

# 6) Proper train/val split for modeling (group by product_uid to avoid leakage)
from sklearn.model_selection import GroupShuffleSplit

# Keep only rows where every modeling field is present.
df = train_m.dropna(subset=["product_description", "search_term", "product_title", "relevance"]).copy()

# Single 80/20 split with product_uid as the group key, so all rows of a
# product land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df, groups=df["product_uid"].values))

# The split yields positional indices, hence iloc.
tr = df.iloc[train_idx].reset_index(drop=True)
va = df.iloc[val_idx].reset_index(drop=True)

print("\nSplit sizes:", tr.shape, va.shape)
print("Shared products (must be 0):", len(set(tr.product_uid) & set(va.product_uid)))

print("\nTrain sample rows:")
print(tr[["product_uid","search_term","product_title","relevance"]].head(2))

# 7) Load testLabels (for later evaluation only)
# nrows=None: read the whole file rather than the NROWS sample.
test_labels = read_csv_robust(tlabels_path, nrows=None)
print("\nTestLabels Usage counts:")
if "Usage" in test_labels.columns:
    print(test_labels["Usage"].value_counts())
else:
    print("No 'Usage' column found. Columns:", list(test_labels.columns))

# Keep these objects for next steps:
# tr, va, test_m, test_labels
def eval_feature_extractor_on_test(
    model_name,
    best_path,
    test_m,
    test_labels,
    batch_size_embed=256,
    batch_size_pred=1024
):
    """Score the labelled test rows with a SentenceTransformer + MLP pipeline.

    Queries and "title [SEP] description" documents from ``test_m`` are embedded
    with ``SentenceTransformer(model_name)``, turned into pair features with
    ``build_pair_features`` and scored by an ``MLPRegressor`` whose weights are
    loaded from ``best_path``. Predictions are inner-joined with
    ``test_labels`` on ``id``; when a ``Usage`` column exists only "Public" and
    "Private" rows are kept.

    Args:
        model_name: Name handed to ``SentenceTransformer``.
        best_path: Path of the saved ``MLPRegressor`` state dict.
        test_m: Test frame with ``id``, ``search_term``, ``product_title`` and
            ``product_description`` columns.
        test_labels: Frame with ``id`` and ``relevance`` (optionally ``Usage``).
        batch_size_embed: Batch size used while embedding.
        batch_size_pred: Batch size used while predicting.

    Returns:
        Tuple ``(rmse, mae, n_rows)`` over the joined rows.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Text inputs: raw query, and title + description joined by a separator.
    queries = test_m["search_term"].astype(str).tolist()
    titles = test_m["product_title"].astype(str)
    descriptions = test_m["product_description"].astype(str)
    documents = (titles + " [SEP] " + descriptions).tolist()

    # Embed both sides with identical settings.
    encoder = SentenceTransformer(model_name, device=device)
    encode_options = dict(
        batch_size=batch_size_embed,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    query_emb = encoder.encode(queries, **encode_options)
    doc_emb = encoder.encode(documents, **encode_options)

    features = build_pair_features(query_emb, doc_emb)

    # Restore the trained regressor; dropout=0.0 since it only runs in eval mode here.
    regressor = MLPRegressor(features.shape[1], dropout=0.0).to(device)
    regressor.load_state_dict(torch.load(best_path, map_location=device))
    regressor.eval()

    batches = DataLoader(
        torch.tensor(features, dtype=torch.float32),
        batch_size=batch_size_pred,
        shuffle=False,
    )
    with torch.no_grad():
        chunks = [regressor(batch.to(device)).cpu().numpy() for batch in batches]
    preds = np.concatenate(chunks)

    # Keep only rows that carry a usable label.
    labels = test_labels.copy()
    if "Usage" in labels.columns:
        labels = labels[labels["Usage"].isin(["Public", "Private"])].copy()

    # Join on id so that predictions and labels are aligned explicitly.
    predictions = pd.DataFrame({"id": test_m["id"].values, "pred": preds})
    scored = labels.merge(predictions, on="id", how="inner")

    y_true = scored["relevance"].astype(np.float32).values
    y_pred = scored["pred"].astype(np.float32).values

    return rmse(y_true, y_pred), mae(y_true, y_pred), len(scored)



In [2]:
!pip -q install wandb gensim

import os, re, unicodedata, time, math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnavians[0m ([33mnavians-ben-gurion-university-of-the-negev[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

Q2c — Word2Vec + Siamese BiLSTM regressor

In [None]:
!pip -q install wandb gensim

import os, re, unicodedata, time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
from gensim.models import Word2Vec

# Q2c — Word2Vec + Siamese BiLSTM regressor


# Tokenizer (word / char-combination)
# Characters matched as single "unicode fraction" tokens.
FRACTIONS = "¼½¾⅓⅔⅛⅜⅝⅞"
# One alternation of five capturing groups, tried left to right; each
# alternative is exactly one group, so a match sets exactly one group.
# NOTE(review): letters outside A-Z/a-z and a bare "_" match no alternative,
# so finditer skips them silently — confirm this is intended.
token_re = re.compile(
    rf"(#[A-Za-z0-9_]+)|"          # hashtags
    rf"(\d+(?:\.\d+)?°?)|"         # numbers, maybe degree
    rf"([{FRACTIONS}])|"           # unicode fractions
    rf"([A-Za-z]+[A-Za-z0-9_-]*)|" # words (wire-backed / pull-out)
    rf"([^\w\s])"                  # punctuation/symbols
)

def normalize_text(s: str) -> str:
    """Return ``s`` as NFKC-normalized, lower-cased text ready for tokenization.

    ``None`` and float NaN map to the empty string; any other value is
    converted with ``str`` first.

    The vulgar-fraction characters (¼ ½ ¾ ⅓ ⅔ ⅛ ⅜ ⅝ ⅞) are deliberately left
    untouched. NFKC would decompose "½" into "1⁄2", which turns "1½" into the
    misleading "11⁄2" and prevents a single-character fraction pattern from
    ever matching the normalized text.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""

    fractions = "¼½¾⅓⅔⅛⅜⅝⅞"
    keep = frozenset(fractions)
    # The capturing group makes re.split return each fraction as its own
    # piece, so everything else can be normalized segment by segment.
    pieces = re.split(f"([{fractions}])", str(s))
    normalized = "".join(
        piece if piece in keep else unicodedata.normalize("NFKC", piece)
        for piece in pieces
    )
    return normalized.lower()

def tokenize(s: str):
    """Split ``s`` into tokens: normalize it, then collect every ``token_re`` match.

    Each match contributes the text of its matched capturing group, stripped of
    surrounding whitespace; empty results are discarded.
    """
    text = normalize_text(s)
    # Every alternative of token_re is a single group, so the last matched
    # group index is the one (and only) group that participated in the match.
    pieces = (m.group(m.lastindex).strip() for m in token_re.finditer(text))
    return [piece for piece in pieces if piece]

# Hyperparams for truncation
Q_MAX_TOK = 16   # query tokens kept (and the padded query length)
D_MAX_TOK = 256  # model input length for doc tokens

# When True, build_pair_tokens keeps up to DOC_EXTRA_CAP doc tokens instead of
# D_MAX_TOK; the model input is still cut to D_MAX_TOK when ids are built.
USE_FULL_DESCR_FOR_DOC = True
DOC_EXTRA_CAP = 1024  # only affects Word2Vec corpus building / token pool

def build_pair_tokens(df: pd.DataFrame):
    """Tokenize the query and document side of every row in ``df``.

    Queries come from ``search_term`` and are cut to ``Q_MAX_TOK`` tokens.
    Documents are "title [SEP] description" strings, cut to ``DOC_EXTRA_CAP``
    tokens when ``USE_FULL_DESCR_FOR_DOC`` is set and to ``D_MAX_TOK`` otherwise.

    Returns:
        ``(query_tokens, doc_tokens)``: two lists of token lists, one entry per row.
    """
    doc_cap = DOC_EXTRA_CAP if USE_FULL_DESCR_FOR_DOC else D_MAX_TOK

    query_tokens = [tokenize(term)[:Q_MAX_TOK] for term in df["search_term"]]

    titles = df["product_title"].astype(str)
    descriptions = df["product_description"].astype(str)
    doc_tokens = [tokenize(text)[:doc_cap] for text in titles + " [SEP] " + descriptions]

    return query_tokens, doc_tokens

# Train Word2Vec ONLY on tr (no leakage)
tr_q, tr_d = build_pair_tokens(tr)
# Each query and each document is one "sentence" of the Word2Vec corpus.
corpus = tr_q + tr_d

W2V_DIM = 200
w2v = Word2Vec(
    sentences=corpus,
    vector_size=W2V_DIM,
    window=5,
    min_count=2,
    workers=2,
    sg=1,
    negative=10,
    epochs=5
)

PAD = "<PAD>"
UNK = "<UNK>"

# Ids 0 and 1 are reserved for PAD and UNK; Word2Vec tokens follow in
# index_to_key order.
vocab = [PAD, UNK] + list(w2v.wv.index_to_key)
stoi = {t: i for i, t in enumerate(vocab)}
pad_idx = stoi[PAD]
unk_idx = stoi[UNK]

# Embedding matrix: the PAD row stays all-zero, UNK gets small random noise
# (no seed is set here, so it differs between runs), every other row is the
# trained Word2Vec vector.
emb = np.zeros((len(vocab), W2V_DIM), dtype=np.float32)
emb[unk_idx] = np.random.normal(0, 0.02, size=(W2V_DIM,)).astype(np.float32)
for token in w2v.wv.index_to_key:
    emb[stoi[token]] = w2v.wv[token]
emb_t = torch.tensor(emb)

def to_ids(tokens, max_len):
    """Map ``tokens`` to vocabulary ids, truncated and right-padded to ``max_len``.

    Tokens missing from ``stoi`` become ``unk_idx``; padding uses ``pad_idx``.
    """
    ids = [stoi.get(tok, unk_idx) for tok in tokens[:max_len]]
    # A non-positive multiplier yields an empty list, so full-length inputs are untouched.
    ids.extend([pad_idx] * (max_len - len(ids)))
    return ids

# Dataset
class RelevancePairs(Dataset):
    """Dataset of (query ids, document ids, relevance) triples built from a DataFrame.

    Rows are tokenized once in the constructor via ``build_pair_tokens``;
    conversion to fixed-length id tensors happens lazily per item.
    """

    def __init__(self, df: pd.DataFrame):
        frame = df.reset_index(drop=True)
        query_tokens, doc_tokens = build_pair_tokens(frame)
        self.df = frame
        self.q_toks = query_tokens
        self.d_toks = doc_tokens
        self.y = frame["relevance"].astype(np.float32).values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Query and document are padded/truncated to their own fixed lengths.
        query_ids = torch.tensor(to_ids(self.q_toks[idx], Q_MAX_TOK), dtype=torch.long)
        doc_ids = torch.tensor(to_ids(self.d_toks[idx], D_MAX_TOK), dtype=torch.long)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        return query_ids, doc_ids, target

# Datasets over the grouped train/validation split.
train_ds = RelevancePairs(tr)
val_ds   = RelevancePairs(va)

# Only the training loader shuffles; validation keeps row order.
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=256, shuffle=False, num_workers=2, pin_memory=True)

# Model (pad-aware masking)
class SeqEncoder(nn.Module):
    """Embedding + bidirectional LSTM encoder with pad-aware mean pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding width (also the LSTM input size).
        hidden: LSTM hidden size per direction; the output width is ``2 * hidden``.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout on the pooled vector, and between LSTM layers when
            ``num_layers`` is not 1.
        pad_idx: Token id treated as padding (zero embedding, excluded from pooling).
        emb_weights: Optional tensor copied into the embedding table.
    """

    def __init__(self, vocab_size, emb_dim, hidden=128, num_layers=1, dropout=0.1, pad_idx=0, emb_weights=None):
        super().__init__()
        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        if emb_weights is not None:
            self.embedding.weight.data.copy_(emb_weights)

        # Inter-layer dropout is only requested when there is more than one layer.
        inter_layer_dropout = dropout if num_layers != 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Encode id tensor ``x`` of shape [B, T] into pooled vectors of shape [B, 2H]."""
        states, _ = self.lstm(self.embedding(x))          # [B, T, 2H]

        # 1.0 at real tokens, 0.0 at padding; broadcast over the feature axis.
        keep = (x != self.pad_idx).float().unsqueeze(-1)   # [B, T, 1]
        # Clamp so an all-padding row divides by 1 instead of 0.
        real_count = keep.sum(dim=1).clamp(min=1.0)
        mean_state = (states * keep).sum(dim=1) / real_count
        return self.dropout(mean_state)

class SiameseRegressor(nn.Module):
    """Siamese relevance regressor: one shared ``SeqEncoder`` plus an MLP head.

    Query and document pass through the same encoder. The head receives
    ``[q, d, |q - d|, q * d]`` and produces one score per pair.
    """

    def __init__(self, vocab_size, emb_dim, hidden=128, dropout=0.2, pad_idx=0, emb_weights=None):
        super().__init__()
        self.enc = SeqEncoder(
            vocab_size,
            emb_dim,
            hidden=hidden,
            dropout=dropout,
            pad_idx=pad_idx,
            emb_weights=emb_weights,
        )

        # Encoder output is bidirectional (2 * hidden); four such vectors are concatenated.
        encoded_width = 2 * hidden
        self.head = nn.Sequential(
            nn.Linear(4 * encoded_width, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, q, d):
        """Return one score per (query, document) pair as a 1-D tensor."""
        query_vec = self.enc(q)
        doc_vec = self.enc(d)
        interaction = [query_vec, doc_vec, torch.abs(query_vec - doc_vec), query_vec * doc_vec]
        scores = self.head(torch.cat(interaction, dim=1))
        return scores.squeeze(1)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# The embedding table is initialised from the Word2Vec matrix (emb_t).
model = SiameseRegressor(
    len(vocab), W2V_DIM,
    hidden=128,
    dropout=0.2,
    pad_idx=pad_idx,
    emb_weights=emb_t
).to(device)

# Metrics + eval
def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions, as a Python float."""
    squared_errors = np.square(y_true - y_pred)
    return float(np.sqrt(np.mean(squared_errors)))

def mae(y_true, y_pred):
    """Mean absolute error between targets and predictions, as a Python float."""
    absolute_errors = np.abs(y_true - y_pred)
    return float(np.mean(absolute_errors))

@torch.no_grad()
def evaluate(model, loader):
    """Run ``model`` over ``loader`` in eval mode and return ``(rmse, mae)``.

    Targets are collected alongside predictions batch by batch, so the result
    does not depend on the loader's iteration order.
    """
    model.eval()
    targets, outputs = [], []
    for query_ids, doc_ids, y in loader:
        scores = model(query_ids.to(device), doc_ids.to(device))
        outputs.append(scores.detach().cpu().numpy())
        targets.append(y.numpy())
    y_true = np.concatenate(targets)
    y_pred = np.concatenate(outputs)
    return rmse(y_true, y_pred), mae(y_true, y_pred)

# Correct TEST eval for Siamese model
@torch.no_grad()
def eval_siamese_on_test(best_path, test_m, test_labels, batch_size=256):
    """
    Evaluate the module-level SiameseRegressor ``model`` on the labelled test rows.

    The weights stored at ``best_path`` are loaded into ``model`` (which is
    therefore modified). When ``test_labels`` has a ``Usage`` column, only
    "Public" and "Private" rows are scored. ``test_m`` must include ``id``,
    ``search_term``, ``product_title`` and ``product_description``.

    Returns ``(rmse, mae, n_rows)`` over the rows matched on ``id``.
    """
    # RelevancePairs requires a relevance column; the dummy values are never used for scoring.
    frame = test_m.copy()
    frame["relevance"] = 0.0
    ids = frame["id"].values

    loader = DataLoader(
        RelevancePairs(frame),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
    )

    model.load_state_dict(torch.load(best_path, map_location=device))
    model.eval()

    chunks = [
        model(q.to(device), d.to(device)).detach().cpu().numpy()
        for q, d, _ in loader
    ]
    preds = np.concatenate(chunks)

    labels = test_labels.copy()
    if "Usage" in labels.columns:
        labels = labels[labels["Usage"].isin(["Public", "Private"])].copy()

    # Inner join on id aligns predictions with labels.
    scored = labels.merge(pd.DataFrame({"id": ids, "pred": preds}), on="id", how="inner")

    y_true = scored["relevance"].astype(np.float32).values
    y_pred = scored["pred"].astype(np.float32).values

    return rmse(y_true, y_pred), mae(y_true, y_pred), len(scored)

# Train with W&B (saves BEST checkpoint)
run = wandb.init(
    project="dl_relevance",
    name="Q2_word2vec_siamese_bilstm_fixed",
    config={
        "Q_MAX_TOK": Q_MAX_TOK,
        "D_MAX_TOK": D_MAX_TOK,
        "W2V_DIM": W2V_DIM,
        "hidden": 128,
        "batch_size": 256,
        "lr": 2e-3,
        "epochs": 6,
        "USE_FULL_DESCR_FOR_DOC": USE_FULL_DESCR_FOR_DOC,
        "DOC_EXTRA_CAP": DOC_EXTRA_CAP,
    }
)

# Learning rate and epoch count are read back from the run config.
opt = torch.optim.AdamW(model.parameters(), lr=run.config.lr)
loss_fn = nn.MSELoss()

best_val_rmse = 1e9  # sentinel: any real validation RMSE is lower
best_path = "/content/best_q2_word_siamese_fixed.pt"
t0 = time.time()

for epoch in range(1, run.config.epochs + 1):
    model.train()
    running_loss = 0.0

    for step, (q, d, y) in enumerate(train_loader, 1):
        q, d, y = q.to(device), d.to(device), y.to(device)

        opt.zero_grad(set_to_none=True)
        pred = model(q, d)
        loss = loss_fn(pred, y)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        running_loss += float(loss.item())
        if step % 100 == 0:
            # Logged value is the running mean of the loss within this epoch.
            wandb.log({"train/loss_step": running_loss / step, "epoch": epoch, "step": step})

    # Full passes in eval mode (evaluate() switches the model to eval).
    train_rmse, train_mae = evaluate(model, train_loader)
    val_rmse, val_mae     = evaluate(model, val_loader)

    wandb.log({
        "epoch": epoch,
        "train/loss_epoch": running_loss / max(1, len(train_loader)),
        "train/rmse": train_rmse,
        "train/mae": train_mae,
        "val/rmse": val_rmse,
        "val/mae": val_mae
    })

    print(f"Epoch {epoch:02d} | train RMSE {train_rmse:.4f} MAE {train_mae:.4f} | val RMSE {val_rmse:.4f} MAE {val_mae:.4f}")

    # Checkpoint whenever validation RMSE improves.
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), best_path)
        wandb.save(best_path)
        print("saved best:", best_path)

runtime_sec = time.time() - t0
wandb.log({"runtime_sec": runtime_sec})
print("Total runtime (sec):", int(runtime_sec))


# Final best-checkpoint metrics (Train/Val/Test)
model.load_state_dict(torch.load(best_path, map_location=device))
train_rmse, train_mae = evaluate(model, train_loader)
val_rmse, val_mae     = evaluate(model, val_loader)

test_rmse, test_mae, n = eval_siamese_on_test(
    best_path=best_path,
    test_m=test_m,
    test_labels=test_labels,
    batch_size=256
)

print("\nFINAL METRICS (best checkpoint)")
print(f"Train | RMSE: {train_rmse:.4f} | MAE: {train_mae:.4f}")
print(f"Val   | RMSE: {val_rmse:.4f} | MAE: {val_mae:.4f}")
print(f"Test  | RMSE: {test_rmse:.4f} | MAE: {test_mae:.4f} | n={n}")

wandb.log({
    "final/train_rmse": train_rmse, "final/train_mae": train_mae,
    "final/val_rmse": val_rmse,     "final/val_mae": val_mae,
    "final/test_rmse": test_rmse,   "final/test_mae": test_mae,
    "final/test_n": n
})
wandb.finish()


Q2d — Feature extractor (Sentence-Transformers) + MLP regressor


In [None]:
# Q2d — Feature extractor (Sentence-Transformers) + MLP regressor


!pip -q install sentence-transformers

import time, numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
from sentence_transformers import SentenceTransformer

# Build the "doc" text (title + description)
def make_doc_text(df):
    """Return one "title [SEP] description" string per row of ``df``, as a list."""
    titles = df["product_title"].astype(str)
    descriptions = df["product_description"].astype(str)
    combined = titles + " [SEP] " + descriptions
    return combined.tolist()

# Raw query strings and "title [SEP] description" documents for both splits.
tr_q = tr["search_term"].astype(str).tolist()
va_q = va["search_term"].astype(str).tolist()
tr_doc = make_doc_text(tr)
va_doc = make_doc_text(va)

# Regression targets as float32 arrays.
y_tr = tr["relevance"].astype(np.float32).values
y_va = va["relevance"].astype(np.float32).values

print(len(tr_q), len(va_q))
# Sanity check: first query next to the first 120 characters of its document.
print("Example:", tr_q[0], "|", tr_doc[0][:120], "...")

def embed_texts(model, texts, batch_size=256):
    """Encode ``texts`` with ``model.encode`` and return its result.

    The call requests NumPy output, normalized embeddings and a progress bar.
    """
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    return model.encode(texts, **encode_options)

def build_pair_features(q_emb, d_emb):
    """Build float32 pair features ``[q, d, |q - d|, q * d, sum(q * d)]`` row by row.

    The last column is the row-wise dot product, which equals the cosine
    similarity when both inputs are L2-normalized.
    """
    product = q_emb * d_emb
    blocks = [
        q_emb,
        d_emb,
        np.abs(q_emb - d_emb),
        product,
        np.sum(product, axis=1, keepdims=True),
    ]
    return np.concatenate(blocks, axis=1).astype(np.float32)

def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions, as a Python float."""
    squared_errors = np.square(y_true - y_pred)
    return float(np.sqrt(np.mean(squared_errors)))

def mae(y_true, y_pred):
    """Mean absolute error between targets and predictions, as a Python float."""
    absolute_errors = np.abs(y_true - y_pred)
    return float(np.mean(absolute_errors))

class FeatDataset(Dataset):
    """Dataset over a precomputed feature matrix ``X`` and targets ``y``, both stored as float32 tensors."""

    def __init__(self, X, y):
        self.X, self.y = (torch.tensor(values, dtype=torch.float32) for values in (X, y))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

class MLPRegressor(nn.Module):
    """Feed-forward regressor ``in_dim -> 512 -> 128 -> 1`` with ReLU and dropout after each hidden layer."""

    def __init__(self, in_dim, dropout=0.2):
        super().__init__()
        widths = (in_dim, 512, 128)
        layers = []
        # Hidden blocks: Linear -> ReLU -> Dropout, built in order so the
        # positions inside ``net`` stay 0..6.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per input row as a 1-D tensor."""
        scores = self.net(x)
        return scores.squeeze(1)

def _predict_in_batches(model, loader, device):
    """Return ``model``'s predictions over ``loader`` as one NumPy array (eval mode, no gradients)."""
    model.eval()
    with torch.no_grad():
        chunks = [model(Xb.to(device)).cpu().numpy() for Xb, _ in loader]
    return np.concatenate(chunks)


def run_feature_extractor(
    model_name,
    batch_size_embed=256,
    batch_size_train=512,
    epochs=10,
    lr=2e-3
):
    """Train and evaluate an MLP regressor on frozen SentenceTransformer embeddings.

    Steps: embed the module-level train/validation queries and documents with
    ``SentenceTransformer(model_name)``, build pair features, train an
    ``MLPRegressor`` for ``epochs`` epochs while logging to W&B, keep the
    checkpoint with the lowest validation RMSE, then report train/val/test
    metrics for that checkpoint.

    Args:
        model_name: Name handed to ``SentenceTransformer``.
        batch_size_embed: Batch size used while embedding.
        batch_size_train: Batch size of the training and evaluation loaders.
        epochs: Number of training epochs.
        lr: AdamW learning rate.

    Returns:
        Dict with the model description, total runtime, train/val/test RMSE
        and MAE of the best checkpoint, and the checkpoint path.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    st = SentenceTransformer(model_name, device=device)

    # Embeddings (timed as a whole; the generator is fully consumed by the unpacking)
    embed_start = time.time()
    tr_q_emb, tr_doc_emb, va_q_emb, va_doc_emb = (
        embed_texts(st, texts, batch_size=batch_size_embed)
        for texts in (tr_q, tr_doc, va_q, va_doc)
    )
    embed_time = time.time() - embed_start

    X_tr = build_pair_features(tr_q_emb, tr_doc_emb)
    X_va = build_pair_features(va_q_emb, va_doc_emb)

    # Shuffled loader for optimisation, ordered loaders for metric computation.
    train_loader      = DataLoader(FeatDataset(X_tr, y_tr), batch_size=batch_size_train, shuffle=True)
    train_loader_eval = DataLoader(FeatDataset(X_tr, y_tr), batch_size=batch_size_train, shuffle=False)
    val_loader_eval   = DataLoader(FeatDataset(X_va, y_va), batch_size=batch_size_train, shuffle=False)

    model = MLPRegressor(X_tr.shape[1], dropout=0.2).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    safe_name = model_name.replace('/', '_')
    wandb.init(
        project="dl_relevance",
        name=f"Q2d_feat_{safe_name}_fixed",
        config={
            "model_name": model_name,
            "embed_batch": batch_size_embed,
            "train_batch": batch_size_train,
            "epochs": epochs,
            "lr": lr,
            "embed_time_sec": embed_time,
            "feat_dim": int(X_tr.shape[1])
        }
    )

    best_val = 1e9  # sentinel: any real validation RMSE is lower
    best_path = f"/content/best_q2d_{safe_name}_fixed.pt"
    train_start = time.time()

    for ep in range(1, epochs + 1):
        # Train
        model.train()
        total = 0.0
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            loss = loss_fn(model(Xb), yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            total += float(loss.item())

        # Eval (NO shuffle, so predictions line up with y_tr / y_va)
        tr_pred = _predict_in_batches(model, train_loader_eval, device)
        va_pred = _predict_in_batches(model, val_loader_eval, device)

        tr_rmse, tr_mae = rmse(y_tr, tr_pred), mae(y_tr, tr_pred)
        va_rmse, va_mae = rmse(y_va, va_pred), mae(y_va, va_pred)

        wandb.log({
            "epoch": ep,
            "train/loss_epoch": total / max(1, len(train_loader)),
            "train/rmse": tr_rmse,
            "train/mae": tr_mae,
            "val/rmse": va_rmse,
            "val/mae": va_mae
        })

        print(f"{model_name} | Ep {ep:02d} | train RMSE {tr_rmse:.4f} MAE {tr_mae:.4f} | val RMSE {va_rmse:.4f} MAE {va_mae:.4f}")

        # Checkpoint whenever validation RMSE improves.
        if va_rmse < best_val:
            best_val = va_rmse
            torch.save(model.state_dict(), best_path)
            wandb.save(best_path)
            print(" saved best:", best_path)

    train_time = time.time() - train_start
    total_time = embed_time + train_time
    wandb.log({"runtime_sec": total_time, "embed_time_sec": embed_time, "train_time_sec": train_time})

    # Load BEST and compute Train/Val final
    model.load_state_dict(torch.load(best_path, map_location=device))
    tr_pred = _predict_in_batches(model, train_loader_eval, device)
    va_pred = _predict_in_batches(model, val_loader_eval, device)

    final_tr_rmse, final_tr_mae = rmse(y_tr, tr_pred), mae(y_tr, tr_pred)
    final_va_rmse, final_va_mae = rmse(y_va, va_pred), mae(y_va, va_pred)

    # TEST
    test_rmse, test_mae, n_test = eval_feature_extractor_on_test(
        model_name=model_name,
        best_path=best_path,
        test_m=test_m,
        test_labels=test_labels,
        batch_size_embed=batch_size_embed
    )

    wandb.log({
        "best/train_rmse": final_tr_rmse,
        "best/train_mae": final_tr_mae,
        "best/val_rmse": final_va_rmse,
        "best/val_mae": final_va_mae,
        "test/rmse": test_rmse,
        "test/mae": test_mae,
        "test/n_rows": n_test,
    })

    print("\nFINAL BEST METRICS")
    print("train RMSE:", final_tr_rmse)
    print("val   RMSE:", final_va_rmse)
    print("test  RMSE:", test_rmse)
    print("train MAE :", final_tr_mae)
    print("val   MAE :", final_va_mae)
    print("test  MAE :", test_mae)
    print("runtime_sec:", int(total_time))

    wandb.finish()

    return {
        "model_type": f"FeatExt(word) + MLP | {model_name}",
        "runtime_sec": float(total_time),
        "train_rmse": float(final_tr_rmse),
        "val_rmse": float(final_va_rmse),
        "test_rmse": float(test_rmse),
        "train_mae": float(final_tr_mae),
        "val_mae": float(final_va_mae),
        "test_mae": float(test_mae),
        "best_path": best_path
    }

# Run (2 models)
# Each call trains its own MLP and opens/closes its own W&B run; the returned
# metric dicts are collected for comparison.
results = []
for name in [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2"
]:
    results.append(run_feature_extractor(name, epochs=10))

# Bare expression: shows the collected results as the notebook cell output.
results


Character-level Siamese (CharCNN+BiLSTM) + Train/Val/Test metrics

In [None]:
!pip -q install wandb

import os, time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb

from sklearn.model_selection import GroupShuffleSplit

# Character-level Siamese (CharCNN+BiLSTM) + Train/Val/Test metrics


# 0) Load + merge (descriptions)
# Directory that holds all input CSV files, and the paths resolved under it.
BASE = "/content"
train_path   = os.path.join(BASE, "train.csv")
test_path    = os.path.join(BASE, "test.csv")
desc_path    = os.path.join(BASE, "product_descriptions.csv")
tlabels_path = os.path.join(BASE, "testLabels.csv")

def read_csv_robust(path):
    """Read a whole CSV file, degrading gracefully on encoding or parsing problems.

    Strategies, in order:
      1. UTF-8 with undecodable bytes replaced, so no rows are dropped.
      2. latin1, for pandas versions whose ``read_csv`` rejects the
         ``encoding_errors`` keyword (signalled by ``TypeError``).
      3. latin1 with the Python engine, skipping malformed lines. A
         ``RuntimeWarning`` is emitted because rows may be lost.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        OSError: If the file is missing or unreadable. These errors propagate
            immediately instead of triggering a pointless lenient retry.
    """
    import warnings

    try:
        return pd.read_csv(path, encoding="utf-8", encoding_errors="replace")
    except TypeError:
        # older pandas without encoding_errors
        return pd.read_csv(path, encoding="latin1")
    except (UnicodeError, pd.errors.ParserError) as err:
        # Only genuine decoding/tokenizing failures justify the lossy fallback;
        # make the data loss visible rather than silent.
        warnings.warn(
            f"read_csv_robust: strict parse of {path!r} failed ({err}); "
            "retrying with the python engine and skipping malformed lines",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.read_csv(path, encoding="latin1", engine="python", on_bad_lines="skip")

# Full reads (no row cap in this cell).
train = read_csv_robust(train_path)
test  = read_csv_robust(test_path)
descs = read_csv_robust(desc_path)
test_labels = read_csv_robust(tlabels_path)

# Left joins keep every train/test row even when no description matches.
train_m = train.merge(descs, on="product_uid", how="left")
test_m  = test.merge(descs, on="product_uid", how="left")

# drop missing essentials
df = train_m.dropna(subset=["search_term", "product_title", "product_description", "relevance", "product_uid"]).copy()

# 1) Split tr/va
# Single 80/20 split with product_uid as the group key, so all rows of a
# product land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df, groups=df["product_uid"].values))
tr = df.iloc[train_idx].reset_index(drop=True)
va = df.iloc[val_idx].reset_index(drop=True)

print("Split sizes:", tr.shape, va.shape)
print("Shared products:", len(set(tr.product_uid) & set(va.product_uid)))

# 2) Text builders
def make_doc_text(df):
    """Return one "title [SEP] description" string per row of ``df``, as a list."""
    titles = df["product_title"].astype(str)
    descriptions = df["product_description"].astype(str)
    combined = titles + " [SEP] " + descriptions
    return combined.tolist()

# Raw query strings for train / validation / test.
tr_q = tr["search_term"].astype(str).tolist()
va_q = va["search_term"].astype(str).tolist()
te_q = test_m["search_term"].astype(str).tolist()

# Matching "title [SEP] description" documents.
# NOTE(review): test_m was not filtered with dropna, and astype(str) renders a
# missing description as the literal text "nan" — confirm this is acceptable.
tr_doc = make_doc_text(tr)
va_doc = make_doc_text(va)
te_doc = make_doc_text(test_m)

# Regression targets as float32 arrays.
y_tr = tr["relevance"].astype(np.float32).values
y_va = va["relevance"].astype(np.float32).values

# 3) Build char vocab on TRAIN ONLY
def build_char_vocab(texts, min_freq=2):
    """Build a character vocabulary from ``texts``.

    Characters seen at least ``min_freq`` times are kept in sorted order, after
    the two reserved entries "<PAD>" (id 0) and "<UNK>" (id 1).

    Returns:
        ``(stoi, itos)``: symbol-to-id dict and the id-ordered symbol list.
    """
    from collections import Counter

    frequency = Counter(ch for text in texts for ch in text)
    kept = sorted(ch for ch, count in frequency.items() if count >= min_freq)
    itos = ["<PAD>", "<UNK>", *kept]
    stoi = {symbol: index for index, symbol in enumerate(itos)}
    return stoi, itos

# The vocabulary is built from training queries and documents only.
stoi, itos = build_char_vocab(tr_q + tr_doc, min_freq=2)
PAD_IDX = stoi["<PAD>"]
UNK_IDX = stoi["<UNK>"]
print("Char vocab size:", len(itos))

# 4) Char dataset
Q_MAX_CH = 64    # fixed query length in characters (truncate / pad)
D_MAX_CH = 1500  # fixed document length in characters (truncate / pad)

def to_char_ids(s, max_len):
    """Encode ``str(s)`` as exactly ``max_len`` character ids.

    The text is truncated to ``max_len`` characters, characters missing from
    ``stoi`` map to ``UNK_IDX``, and the tail is filled with ``PAD_IDX``.
    """
    clipped = str(s)[:max_len]
    encoded = [stoi.get(ch, UNK_IDX) for ch in clipped]
    # A non-positive multiplier yields an empty list, so full-length inputs are untouched.
    padding = [PAD_IDX] * (max_len - len(encoded))
    return encoded + padding

class CharPairs(Dataset):
    """Dataset of character-id tensors for (query, document) pairs.

    With targets it yields ``(q_ids, d_ids, y)``; without them (``y=None``,
    e.g. for unlabeled data) it yields ``(q_ids, d_ids)``.
    """

    def __init__(self, q_texts, d_texts, y=None):
        self.q = q_texts
        self.d = d_texts
        self.y = y.astype(np.float32) if y is not None else None

    def __len__(self):
        return len(self.q)

    def __getitem__(self, i):
        pair = (
            torch.tensor(to_char_ids(self.q[i], Q_MAX_CH), dtype=torch.long),
            torch.tensor(to_char_ids(self.d[i], D_MAX_CH), dtype=torch.long),
        )
        if self.y is None:
            return pair
        return (*pair, torch.tensor(self.y[i], dtype=torch.float32))

# Labelled datasets for the train and validation splits.
train_ds = CharPairs(tr_q, tr_doc, y_tr)
val_ds   = CharPairs(va_q, va_doc, y_va)

# Only the training loader shuffles; validation keeps row order.
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

# 5) Model
class CharEncoder(nn.Module):
    """Encode padded character-id sequences into fixed-size vectors.

    Pipeline: embedding -> three parallel 1-D convolutions (kernel sizes 3, 5
    and 7 with length-preserving padding) -> ReLU -> dropout -> single-layer
    bidirectional LSTM -> mean over non-padding positions -> dropout.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Character embedding size.
        conv_channels: Output channels of each convolution branch.
        lstm_hidden: Hidden size per LSTM direction; the returned vector has
            ``2 * lstm_hidden`` features.
        dropout: Dropout probability applied after the convolutions and
            after pooling.
        pad_idx: Id of the padding character. It is passed to
            ``nn.Embedding(padding_idx=...)`` and its positions are excluded
            from the pooled mean.
    """

    def __init__(self, vocab_size, emb_dim=64, conv_channels=128, lstm_hidden=128, dropout=0.2, pad_idx=0):
        super().__init__()
        # Keep the padding id on the module so forward() masks with the same
        # id the embedding was built with, instead of a module-level global.
        self.pad_idx = pad_idx
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)

        self.conv3 = nn.Conv1d(emb_dim, conv_channels, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(emb_dim, conv_channels, kernel_size=5, padding=2)
        self.conv7 = nn.Conv1d(emb_dim, conv_channels, kernel_size=7, padding=3)

        self.act = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        self.lstm = nn.LSTM(
            input_size=conv_channels * 3,
            hidden_size=lstm_hidden,
            batch_first=True,
            bidirectional=True,
            num_layers=1
        )

    def forward(self, x):
        """Return one pooled vector per row of the id tensor ``x`` ([B, T])."""
        e = self.emb(x)           # [B, T, E]
        e = e.transpose(1, 2)     # [B, E, T]  (Conv1d wants channels first)

        c3 = self.act(self.conv3(e))
        c5 = self.act(self.conv5(e))
        c7 = self.act(self.conv7(e))

        c = torch.cat([c3, c5, c7], dim=1)  # [B, 3C, T]
        c = self.dropout(c)
        c = c.transpose(1, 2)               # [B, T, 3C]

        # NOTE: the LSTM still runs over padded time steps; only the pooling
        # below ignores them.
        out, _ = self.lstm(c)               # [B, T, 2H]

        # Masked mean: zero out padded positions and divide by the number of
        # real characters (clamped so an all-padding row yields zeros, not NaN).
        mask = (x != self.pad_idx).float().unsqueeze(-1)
        out = out * mask
        denom = mask.sum(dim=1).clamp(min=1.0)
        pooled = out.sum(dim=1) / denom
        return self.dropout(pooled)

class SiameseCharRegressor(nn.Module):
    """Score a (query, document) pair with one shared ``CharEncoder``.

    Both inputs go through the same encoder; the two vectors, their absolute
    difference and their element-wise product are concatenated and fed to a
    three-layer MLP that outputs one value per pair.
    """

    def __init__(self, vocab_size, emb_dim=64, conv_channels=128, lstm_hidden=128, dropout=0.25):
        super().__init__()
        self.enc = CharEncoder(
            vocab_size,
            emb_dim,
            conv_channels,
            lstm_hidden,
            dropout,
            pad_idx=PAD_IDX,
        )
        # The encoder is bidirectional, hence two hidden states per position.
        enc_dim = 2 * lstm_hidden
        head_layers = [
            nn.Linear(4 * enc_dim, 256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 64), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(64, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, q, d):
        q_vec = self.enc(q)
        d_vec = self.enc(d)
        gap = (q_vec - d_vec).abs()
        interaction = q_vec * d_vec
        pair = torch.cat((q_vec, d_vec, gap, interaction), dim=1)
        scores = self.head(pair)
        return scores.squeeze(1)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SiameseCharRegressor(len(itos))
model = model.to(device)

# 6) Metrics + eval
def rmse(y_true, y_pred):
    """Return the root-mean-squared error as a Python float."""
    err = y_true - y_pred
    return float(np.sqrt(np.mean(np.square(err))))

def mae(y_true, y_pred):
    """Return the mean absolute error as a Python float."""
    abs_err = np.abs(y_true - y_pred)
    return float(np.mean(abs_err))

@torch.no_grad()
def eval_model(model, loader):
    """Compute ``(rmse, mae)`` of ``model`` over a labeled loader.

    Targets are collected from the loader next to the predictions, so the
    result does not depend on whether the loader shuffles. Batches are moved
    to the module-level ``device``.
    """
    model.eval()
    targets, outputs = [], []
    for q_ids, d_ids, y_batch in loader:
        scores = model(q_ids.to(device), d_ids.to(device))
        targets.append(y_batch.numpy())
        outputs.append(scores.detach().cpu().numpy())
    y_true, y_pred = np.concatenate(targets), np.concatenate(outputs)
    return rmse(y_true, y_pred), mae(y_true, y_pred)

@torch.no_grad()
def predict_test(model, te_q_texts, te_doc_texts, batch_size=128):
    """Predict scores for unlabeled (query, document) text pairs.

    The pairs are wrapped in an unshuffled loader, so the returned 1-D array
    follows the order of the input lists.
    """
    model.eval()
    loader = DataLoader(
        CharPairs(te_q_texts, te_doc_texts, y=None),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
    )
    chunks = [
        model(q_ids.to(device), d_ids.to(device)).detach().cpu().numpy()
        for q_ids, d_ids in loader
    ]
    return np.concatenate(chunks)

def eval_on_testlabels_char(model, test_m, test_labels):
    """Score the char model on the labeled part of the test set.

    Args:
        model: Trained pair regressor accepted by ``predict_test``.
        test_m: Test frame with ``id``, ``search_term``, ``product_title``
            and ``product_description`` columns.
        test_labels: Frame with ``id`` and ``relevance`` columns; when it has
            a ``Usage`` column only ``Public`` / ``Private`` rows are kept.

    Returns:
        ``(rmse, mae, n_rows)`` where ``n_rows`` is the number of labeled
        rows that matched a prediction by ``id``.
    """
    # Build the inputs from the frame that was passed in, so predictions are
    # always aligned with test_m["id"] (previously the texts came from the
    # module-level te_q / te_doc regardless of the argument).
    q_texts = test_m["search_term"].astype(str).tolist()
    doc_texts = make_doc_text(test_m)

    preds = predict_test(model, q_texts, doc_texts, batch_size=128)
    pred_df = pd.DataFrame({"id": test_m["id"].values, "pred": preds})

    tl = test_labels.copy()
    if "Usage" in tl.columns:
        tl = tl[tl["Usage"].isin(["Public", "Private"])].copy()

    # Inner join: rows without a label or without a prediction are dropped.
    merged = tl.merge(pred_df, on="id", how="inner")
    y_true = merged["relevance"].astype(np.float32).values
    y_pred = merged["pred"].astype(np.float32).values

    return rmse(y_true, y_pred), mae(y_true, y_pred), len(merged)

# 7) Train with W&B
# Start the W&B run; lr and epochs are read back from run.config below.
# NOTE(review): "batch" is only recorded here; the loaders built earlier
# hard-code batch_size=128, so the two have to be kept in sync by hand.
run = wandb.init(
    project="dl_relevance",
    name="Q1_char_siamese_charcnn_bilstm",
    config={
        "Q_MAX_CH": Q_MAX_CH,
        "D_MAX_CH": D_MAX_CH,
        "batch": 128,
        "lr": 2e-3,
        "epochs": 6
    }
)

opt = torch.optim.AdamW(model.parameters(), lr=run.config.lr)
loss_fn = nn.MSELoss()

# Large sentinel so the first validation RMSE below it triggers a checkpoint.
best_val = 1e9
best_path = "/content/best_char_siamese.pt"
t0 = time.time()

for ep in range(1, run.config.epochs + 1):
    model.train()
    total = 0.0  # running sum of per-batch losses for this epoch

    for step, (q, d, y) in enumerate(train_loader, 1):
        q, d, y = q.to(device), d.to(device), y.to(device)
        opt.zero_grad(set_to_none=True)
        pred = model(q, d)
        loss = loss_fn(pred, y)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total += float(loss.item())

        # Every 200 steps log the mean loss over the epoch so far.
        if step % 200 == 0:
            wandb.log({"train/loss_step": total / step, "epoch": ep, "step": step})

    # eval_model switches the model to eval mode; model.train() at the top of
    # the loop switches it back for the next epoch.
    tr_rmse, tr_mae = eval_model(model, train_loader)
    va_rmse, va_mae = eval_model(model, val_loader)

    wandb.log({
        "epoch": ep,
        "train/loss_epoch": total / max(1, len(train_loader)),
        "train/rmse": tr_rmse, "train/mae": tr_mae,
        "val/rmse": va_rmse,   "val/mae": va_mae
    })

    print(f"Ep {ep:02d} | train RMSE {tr_rmse:.4f} MAE {tr_mae:.4f} | val RMSE {va_rmse:.4f} MAE {va_mae:.4f}")

    # Keep only the weights with the lowest validation RMSE seen so far.
    if va_rmse < best_val:
        best_val = va_rmse
        torch.save(model.state_dict(), best_path)
        wandb.save(best_path)
        print("saved best:", best_path)

# Wall-clock time of the whole loop, including the per-epoch evaluations.
runtime = time.time() - t0
wandb.log({"runtime_sec": runtime})

# 8) FINAL: load best + compute Train/Val/Test (for Q3 table)
# NOTE(review): best_path is only written when some epoch improves on
# best_val; if that never happens this load fails.
model.load_state_dict(torch.load(best_path, map_location=device))

final_tr_rmse, final_tr_mae = eval_model(model, train_loader)
final_va_rmse, final_va_mae = eval_model(model, val_loader)
test_rmse, test_mae, n_test = eval_on_testlabels_char(model, test_m, test_labels)

print("\nFINAL BEST METRICS (Char Siamese)")
print(f"Train RMSE: {final_tr_rmse:.4f} | Train MAE: {final_tr_mae:.4f}")
print(f"Val   RMSE: {final_va_rmse:.4f} | Val   MAE: {final_va_mae:.4f}")
print(f"Test  RMSE: {test_rmse:.4f} | Test  MAE: {test_mae:.4f} | n={n_test}")
print("runtime_sec:", int(runtime))

# Log the best-checkpoint metrics, then close the W&B run.
wandb.log({
    "best/train_rmse": final_tr_rmse,
    "best/train_mae": final_tr_mae,
    "best/val_rmse": final_va_rmse,
    "best/val_mae": final_va_mae,
    "test/rmse": test_rmse,
    "test/mae": test_mae,
    "test/n_rows": n_test,
})
wandb.finish()

# Row for Q3 table
char_siamese_row = {
    "Model type": "Character-level Siamese (CharCNN+BiLSTM)",
    "runtime": int(runtime),
    "Train RMSE": float(final_tr_rmse),
    "Val-RMSE": float(final_va_rmse),
    "Test-RMSE": float(test_rmse),
    "Train MAE": float(final_tr_mae),
    "Val-MAE": float(final_va_mae),
    "Test-MAE": float(test_mae),
    "best_path": best_path
}
# Bare expression: shown as the cell output when run in a notebook.
char_siamese_row


Char feature extractors

In [None]:
!pip -q install transformers accelerate

import os, time, numpy as np, pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
from transformers import AutoTokenizer, AutoModel, T5EncoderModel

# Build texts
def make_doc_text(df):
    """Return one ``"<title> [SEP] <description>"`` string per row of ``df``.

    Both columns are converted with ``astype(str)`` before joining.
    """
    titles = df["product_title"].astype(str)
    descriptions = df["product_description"].astype(str)
    joined = titles + " [SEP] " + descriptions
    return joined.tolist()

# Query strings, document strings and float32 targets for train / validation.
tr_q, va_q = (frame["search_term"].astype(str).tolist() for frame in (tr, va))
tr_doc, va_doc = (make_doc_text(frame) for frame in (tr, va))
y_tr, y_va = (frame["relevance"].astype(np.float32).values for frame in (tr, va))

print("Train/Val:", len(tr_q), len(va_q))

# Encoder loading + pooling
def mean_pool(last_hidden, attn_mask):
    """Average ``last_hidden`` over the positions where ``attn_mask`` is set.

    The divisor is clamped at ``1e-6`` so a row with an all-zero mask yields
    zeros instead of dividing by zero.
    """
    weights = attn_mask.unsqueeze(-1).float()
    summed = (last_hidden * weights).sum(dim=1)
    counts = weights.sum(dim=1).clamp(min=1e-6)
    return summed / counts

def load_encoder(model_name, device):
    """Load the tokenizer and encoder for ``model_name`` onto ``device``.

    Names containing "byt5" (case-insensitive) are loaded with
    ``T5EncoderModel``; every other name goes through ``AutoModel``.

    Returns:
        ``(tokenizer, model)``.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    model_cls = T5EncoderModel if "byt5" in model_name.lower() else AutoModel
    mdl = model_cls.from_pretrained(model_name).to(device)
    return tok, mdl

@torch.no_grad()
def encode_texts(model_name, texts, max_len=256, batch_size=32, cache_path=None):
    """Embed ``texts`` with a pretrained encoder, optionally via an .npy cache.

    Each batch is tokenized (padded, truncated to ``max_len``), run through
    the encoder, mean-pooled over the attention mask and L2-normalized.

    Args:
        model_name: Checkpoint name passed to ``load_encoder``.
        texts: List of strings to embed.
        max_len: Tokenizer truncation length.
        batch_size: Number of texts encoded per forward pass.
        cache_path: Optional ``.npy`` path. An existing file is reused only
            if it holds one row per text; otherwise it is recomputed and
            overwritten.

    Returns:
        float32 array with one row per text.

    Raises:
        ValueError: If ``texts`` is empty and no usable cache exists.
    """
    if cache_path and os.path.exists(cache_path):
        cached = np.load(cache_path)
        # A cache written for another split or sample size would silently
        # misalign features and targets, so only trust a matching row count.
        if cached.shape[0] == len(texts):
            return cached
        print(
            f"Stale cache {cache_path}: {cached.shape[0]} rows cached, "
            f"{len(texts)} texts given; re-encoding."
        )

    if len(texts) == 0:
        # Fail before loading the model; np.concatenate would raise the same
        # exception type further down, only with a less helpful message.
        raise ValueError("encode_texts: `texts` is empty, nothing to encode")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tok, mdl = load_encoder(model_name, device)
    mdl.eval()

    outs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        enc = {k: v.to(device) for k, v in enc.items()}
        out = mdl(**enc)
        pooled = mean_pool(out.last_hidden_state, enc["attention_mask"])
        # Unit-length rows, so a dot product between two embeddings is cosine.
        pooled = torch.nn.functional.normalize(pooled, dim=1)
        outs.append(pooled.cpu().numpy())

    arr = np.concatenate(outs, axis=0).astype(np.float32)
    if cache_path:
        np.save(cache_path, arr)
    return arr

def build_pair_features(q_emb, d_emb):
    """Concatenate pair features: ``[q, d, |q - d|, q * d, sum(q * d)]``.

    The last column is the row-wise dot product, which equals the cosine
    similarity when both inputs are L2-normalized. The result is float32.
    """
    gap = np.abs(q_emb - d_emb)
    interaction = q_emb * d_emb
    dot = interaction.sum(axis=1, keepdims=True)
    stacked = np.concatenate((q_emb, d_emb, gap, interaction, dot), axis=1)
    return stacked.astype(np.float32)

# Dataset + MLP
class FeatDataset(Dataset):
    """In-memory dataset of ``(feature_row, target)`` float32 tensor pairs."""

    def __init__(self, X, y):
        # Both arrays are copied into float32 tensors once, up front.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        features, target = self.X[i], self.y[i]
        return features, target

class MLPRegressor(nn.Module):
    """MLP ``in_dim -> 512 -> 128 -> 1`` with ReLU and dropout between layers.

    ``forward`` returns one value per input row (shape ``[B]``).
    """

    def __init__(self, in_dim, dropout=0.2):
        super().__init__()
        widths = (in_dim, 512, 128)
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        layers.append(nn.Linear(widths[-1], 1))
        # Same module order as before, so saved state_dict keys still match.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(1)

def rmse(y, p):
    """Return the root-mean-squared error between ``y`` and ``p`` as a float."""
    residual = y - p
    return float(np.sqrt(np.mean(np.square(residual))))
def mae(y, p):
    """Return the mean absolute error between ``y`` and ``p`` as a float."""
    abs_residual = np.abs(y - p)
    return float(np.mean(abs_residual))

@torch.no_grad()
def predict_mlp(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode and return all predictions.

    The loader yields ``(features, target)`` batches; targets are ignored.
    Predictions are concatenated in loader order into one NumPy array.
    """
    model.eval()
    chunks = [model(features.to(device)).cpu().numpy() for features, _ in loader]
    return np.concatenate(chunks)

# Test evaluation
def eval_feature_model_on_test(model_name, best_path, max_len_q, max_len_d, embed_bs, test_m, test_labels):
    """Evaluate a saved feature-extractor MLP on the labeled test rows.

    Test queries and documents are embedded with ``encode_texts`` (cached
    under ``/content``), turned into pair features and scored by an
    ``MLPRegressor`` loaded from ``best_path``. Predictions are joined to
    ``test_labels`` on ``id``; when a ``Usage`` column exists only
    ``Public`` / ``Private`` rows are kept.

    Returns:
        ``(rmse, mae, n_rows)`` over the joined rows.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    safe = model_name.replace("/", "_")

    # Same "<title> [SEP] <description>" layout as the training documents.
    query_texts = test_m["search_term"].astype(str).tolist()
    titles = test_m["product_title"].astype(str)
    descriptions = test_m["product_description"].astype(str)
    doc_texts = (titles + " [SEP] " + descriptions).tolist()

    qte = encode_texts(
        model_name, query_texts,
        max_len=max_len_q, batch_size=embed_bs,
        cache_path=f"/content/cache_{safe}_qte.npy",
    )
    dte = encode_texts(
        model_name, doc_texts,
        max_len=max_len_d, batch_size=embed_bs,
        cache_path=f"/content/cache_{safe}_dte.npy",
    )
    Xte = build_pair_features(qte, dte)

    mlp = MLPRegressor(Xte.shape[1], dropout=0.0).to(device)
    mlp.load_state_dict(torch.load(best_path, map_location=device))
    mlp.eval()

    # The dataset needs targets; zeros are placeholders that are never read.
    placeholder_y = np.zeros(len(Xte), dtype=np.float32)
    te_loader = DataLoader(FeatDataset(Xte, placeholder_y), batch_size=1024, shuffle=False)
    preds = predict_mlp(mlp, te_loader, device)

    labels = test_labels.copy()
    if "Usage" in labels.columns:
        labels = labels[labels["Usage"].isin(["Public", "Private"])].copy()

    pred_df = pd.DataFrame({"id": test_m["id"].values, "pred": preds})
    scored = labels.merge(pred_df, on="id", how="inner")
    y_true = scored["relevance"].astype(np.float32).values
    y_pred = scored["pred"].astype(np.float32).values
    return rmse(y_true, y_pred), mae(y_true, y_pred), len(scored)

# Main runner
def run_char_feature_model(
    model_name,
    max_len_q=64,
    max_len_d=256,
    embed_bs=32,
    train_bs=512,
    epochs=8,
    lr=2e-3,
    compute_test=True,
):
    """Train an MLP on frozen encoder embeddings and report its metrics.

    Embeds the module-level train/validation texts (``tr_q``, ``tr_doc``,
    ``va_q``, ``va_doc``) with ``encode_texts``, builds pair features, trains
    an ``MLPRegressor`` against ``y_tr`` with MSE loss, checkpoints the
    weights with the lowest validation RMSE and logs everything to W&B.

    Args:
        model_name: Encoder checkpoint name; also used (with "/" replaced by
            "_") in cache, checkpoint and run names.
        max_len_q: Tokenizer truncation length for queries.
        max_len_d: Tokenizer truncation length for documents.
        embed_bs: Batch size used while embedding.
        train_bs: Batch size of the MLP training and evaluation loaders.
        epochs: Number of training epochs.
        lr: AdamW learning rate.
        compute_test: When true, also evaluate the best checkpoint on the
            module-level ``test_m`` / ``test_labels``.

    Returns:
        Dict with the model label, runtime, train/val/test RMSE and MAE
        (test entries are ``None`` when ``compute_test`` is false), the
        checkpoint path and the two max lengths.
    """
    safe = model_name.replace("/", "_")

    # cache paths
    # NOTE(review): the file names do not include max_len_q / max_len_d, so
    # changing a length reuses whatever was cached under the old value.
    qtr_path = f"/content/cache_{safe}_qtr.npy"
    dtr_path = f"/content/cache_{safe}_dtr.npy"
    qva_path = f"/content/cache_{safe}_qva.npy"
    dva_path = f"/content/cache_{safe}_dva.npy"

    # embeddings
    # embed_time covers all four calls, whether they encode or hit the cache.
    t0 = time.time()
    qtr = encode_texts(model_name, tr_q,   max_len=max_len_q, batch_size=embed_bs, cache_path=qtr_path)
    dtr = encode_texts(model_name, tr_doc, max_len=max_len_d, batch_size=embed_bs, cache_path=dtr_path)
    qva = encode_texts(model_name, va_q,   max_len=max_len_q, batch_size=embed_bs, cache_path=qva_path)
    dva = encode_texts(model_name, va_doc, max_len=max_len_d, batch_size=embed_bs, cache_path=dva_path)
    embed_time = time.time() - t0

    Xtr = build_pair_features(qtr, dtr)
    Xva = build_pair_features(qva, dva)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = MLPRegressor(Xtr.shape[1], dropout=0.2).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    run = wandb.init(
        project="dl_relevance",
        name=f"Q_char_feat_{safe}_fixed",
        config={
            "model_name": model_name,
            "max_len_q": max_len_q,
            "max_len_d": max_len_d,
            "embed_bs": embed_bs,
            "train_bs": train_bs,
            "epochs": epochs,
            "lr": lr,
            "embed_time_sec": embed_time,
            "feat_dim": int(Xtr.shape[1]),
        }
    )

    # Shuffled loader for optimisation; unshuffled copies for metrics so the
    # predictions line up row-for-row with y_tr / y_va.
    train_loader      = DataLoader(FeatDataset(Xtr, y_tr), batch_size=train_bs, shuffle=True)
    train_loader_eval = DataLoader(FeatDataset(Xtr, y_tr), batch_size=train_bs, shuffle=False)
    val_loader_eval   = DataLoader(FeatDataset(Xva, y_va), batch_size=train_bs, shuffle=False)

    # Large sentinel so the first validation RMSE below it is checkpointed.
    best_val_rmse = 1e9
    best_path = f"/content/best_char_feat_{safe}_fixed.pt"
    t1 = time.time()

    for ep in range(1, epochs + 1):
        model.train()
        total = 0.0  # running sum of per-batch losses for this epoch
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            pred = model(Xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            total += float(loss.item())

        # aligned metrics (no shuffle)
        tr_pred = predict_mlp(model, train_loader_eval, device)
        va_pred = predict_mlp(model, val_loader_eval, device)

        tr_rmse, tr_mae = rmse(y_tr, tr_pred), mae(y_tr, tr_pred)
        va_rmse, va_mae = rmse(y_va, va_pred), mae(y_va, va_pred)

        wandb.log({
            "epoch": ep,
            "train/loss_epoch": total / max(1, len(train_loader)),
            "train/rmse": tr_rmse, "train/mae": tr_mae,
            "val/rmse": va_rmse, "val/mae": va_mae
        })

        print(f"{model_name} | Ep {ep:02d} | train RMSE {tr_rmse:.4f} MAE {tr_mae:.4f} | val RMSE {va_rmse:.4f} MAE {va_mae:.4f}")

        # Keep only the weights with the lowest validation RMSE seen so far.
        if va_rmse < best_val_rmse:
            best_val_rmse = va_rmse
            torch.save(model.state_dict(), best_path)
            wandb.save(best_path)
            print("saved best:", best_path)

    train_time = time.time() - t1
    total_time = embed_time + train_time
    wandb.log({"runtime_sec": total_time, "embed_time_sec": embed_time, "train_time_sec": train_time})

    # ---- load best + final metrics ----
    # NOTE(review): best_path is only written when some epoch improves on
    # best_val_rmse; if that never happens this load fails.
    model.load_state_dict(torch.load(best_path, map_location=device))
    tr_pred = predict_mlp(model, train_loader_eval, device)
    va_pred = predict_mlp(model, val_loader_eval, device)
    final_tr_rmse, final_tr_mae = rmse(y_tr, tr_pred), mae(y_tr, tr_pred)
    final_va_rmse, final_va_mae = rmse(y_va, va_pred), mae(y_va, va_pred)

    # Test metrics stay None / 0 unless compute_test is set.
    test_rmse = test_mae = None
    n_test = 0
    if compute_test:
        test_rmse, test_mae, n_test = eval_feature_model_on_test(
            model_name, best_path, max_len_q, max_len_d, embed_bs, test_m, test_labels
        )

    print("\nFINAL BEST METRICS:", model_name)
    print("train RMSE:", final_tr_rmse, "| train MAE:", final_tr_mae)
    print("val   RMSE:", final_va_rmse, "| val   MAE:", final_va_mae)
    if compute_test:
        print("test  RMSE:", test_rmse, "| test  MAE:", test_mae, "| rows:", n_test)
    print("runtime_sec:", int(total_time))

    wandb.log({
        "best/train_rmse": final_tr_rmse,
        "best/train_mae": final_tr_mae,
        "best/val_rmse": final_va_rmse,
        "best/val_mae": final_va_mae,
        "best_path": best_path
    })
    if compute_test:
        wandb.log({"test/rmse": test_rmse, "test/mae": test_mae, "test/n_rows": n_test})

    wandb.finish()

    # Summary row returned to the caller (runtime = embedding + training time).
    row = {
        "Model type": f"Char feature extractor: {model_name}",
        "runtime": int(total_time),
        "Train RMSE": float(final_tr_rmse),
        "Val-RMSE": float(final_va_rmse),
        "Test-RMSE": None if test_rmse is None else float(test_rmse),
        "Train MAE": float(final_tr_mae),
        "Val-MAE": float(final_va_mae),
        "Test-MAE": None if test_mae is None else float(test_mae),
        "best_path": best_path,
        "max_len_q": max_len_q,
        "max_len_d": max_len_d
    }
    return row

# Run 3 models

# (checkpoint name, query max length, document max length) per encoder.
models_to_run = [
    ("google/byt5-small", 64, 256),
    ("google/byt5-base", 64, 256),
    ("google/canine-s", 64, 256),
]

# One result row per encoder, appended as each run finishes.
char_feat_rows = []
for name, mq, md in models_to_run:
    # Checkpoints with "base" in the name are embedded in smaller batches.
    row = run_char_feature_model(
        name,
        max_len_q=mq,
        max_len_d=md,
        embed_bs=16 if "base" in name else 32,
        train_bs=512,
        epochs=8,
        compute_test=True,
    )
    char_feat_rows.append(row)

char_feat_rows


Word feature extractor deberta-v3-base:

In [None]:

# WORD Feature Extractor #1 (strong): DeBERTa-v3-base + MLP regressor

!pip -q install transformers accelerate wandb

import os, time, numpy as np, pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
from transformers import AutoTokenizer, AutoModel

# -------------------------
# Text builders
# -------------------------
def make_doc_text(df):
    """Join title and description of every row as ``"<title> [SEP] <description>"``.

    Both columns go through ``astype(str)`` first; the result is a list.
    """
    title_col = df["product_title"].astype(str)
    desc_col = df["product_description"].astype(str)
    combined = title_col + " [SEP] " + desc_col
    return combined.tolist()

# Query strings, document strings and float32 targets for train / validation.
tr_q, va_q = (frame["search_term"].astype(str).tolist() for frame in (tr, va))
tr_doc, va_doc = (make_doc_text(frame) for frame in (tr, va))
y_tr, y_va = (frame["relevance"].astype(np.float32).values for frame in (tr, va))

print("Train/Val:", len(tr_q), len(va_q))
# Show the first query next to the first 90 characters of its document.
print("Example:", tr_q[0], "|", tr_doc[0][:90], "...")

# Encoder (word-level) + pooling
MODEL_NAME = "microsoft/deberta-v3-base"  # strong + usually stable to run
# File-name-safe variant of the checkpoint name (used in cache/checkpoint paths).
SAFE = MODEL_NAME.replace("/", "_")

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
enc_model = AutoModel.from_pretrained(MODEL_NAME)
enc_model = enc_model.to(device)
# The encoder is only used for inference here, so switch it to eval mode.
enc_model.eval()

def mean_pool(last_hidden, attn_mask):
    """Masked mean over the sequence axis.

    ``last_hidden`` is ``[B, T, H]`` and ``attn_mask`` is ``[B, T]``. The
    divisor is clamped at ``1e-6`` so an all-zero mask row returns zeros.
    """
    keep = attn_mask.unsqueeze(-1).float()
    total = (last_hidden * keep).sum(dim=1)
    n_kept = keep.sum(dim=1).clamp(min=1e-6)
    return total / n_kept

@torch.no_grad()
def encode_texts_cached(texts, max_len, batch_size, cache_path):
    """Embed ``texts`` with the module-level encoder, using an .npy cache.

    Each batch is tokenized with ``tok`` (padded, truncated to ``max_len``),
    run through ``enc_model``, mean-pooled over the attention mask and
    L2-normalized.

    Args:
        texts: List of strings to embed.
        max_len: Tokenizer truncation length.
        batch_size: Number of texts encoded per forward pass.
        cache_path: ``.npy`` path, or a falsy value to disable caching. An
            existing file is reused only if it holds one row per text;
            otherwise it is recomputed and overwritten.

    Returns:
        float32 array with one row per text.

    Raises:
        ValueError: If ``texts`` is empty and no usable cache exists.
    """
    if cache_path and os.path.exists(cache_path):
        cached = np.load(cache_path)
        # A cache written for another split or sample size would silently
        # misalign features and targets, so only trust a matching row count.
        if cached.shape[0] == len(texts):
            return cached
        print(
            f"Stale cache {cache_path}: {cached.shape[0]} rows cached, "
            f"{len(texts)} texts given; re-encoding."
        )

    if len(texts) == 0:
        # np.concatenate below would raise the same exception type, only with
        # a less helpful message.
        raise ValueError("encode_texts_cached: `texts` is empty, nothing to encode")

    outs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_tok = tok(
            batch,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        batch_tok = {k: v.to(device) for k, v in batch_tok.items()}

        out = enc_model(**batch_tok)
        pooled = mean_pool(out.last_hidden_state, batch_tok["attention_mask"])
        # Unit-length rows, so a dot product between two embeddings is cosine.
        pooled = torch.nn.functional.normalize(pooled, dim=1)

        outs.append(pooled.detach().cpu().numpy())

    arr = np.concatenate(outs, axis=0).astype(np.float32)
    if cache_path:
        np.save(cache_path, arr)
    return arr

def build_pair_features(q_emb, d_emb):
    """Build float32 pair features ``[q, d, |q - d|, q * d, dot(q, d)]``.

    The trailing dot-product column equals the cosine similarity when both
    embedding matrices are L2-normalized.
    """
    pieces = [q_emb, d_emb, np.abs(q_emb - d_emb)]
    elementwise = q_emb * d_emb
    pieces.append(elementwise)
    pieces.append(elementwise.sum(axis=1, keepdims=True))
    return np.concatenate(pieces, axis=1).astype(np.float32)

# MLP + helpers
class FeatDataset(Dataset):
    """Feature matrix and target vector exposed as float32 tensor pairs."""

    def __init__(self, X, y):
        # Convert once at construction time; indexing then returns tensor rows.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        row, target = self.X[i], self.y[i]
        return row, target

class MLPRegressor(nn.Module):
    """MLP ``in_dim -> 1024 -> 256 -> 64 -> 1`` with ReLU and dropout.

    ``forward`` returns one value per input row (shape ``[B]``).
    """

    def __init__(self, in_dim, dropout=0.25):
        super().__init__()
        widths = (in_dim, 1024, 256, 64)
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        layers.append(nn.Linear(widths[-1], 1))
        # Same module order as before, so saved state_dict keys still match.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(1)

def rmse(y_true, y_pred):
    """Root-mean-squared error between two arrays, as a Python float."""
    squared_error = np.square(y_true - y_pred)
    return float(np.sqrt(squared_error.mean()))

def mae(y_true, y_pred):
    """Mean absolute error between two arrays, as a Python float."""
    absolute_error = np.abs(y_true - y_pred)
    return float(absolute_error.mean())

@torch.no_grad()
def predict_mlp(model, loader, device):
    """Collect the model's predictions over ``loader`` as one NumPy array.

    The model is put in eval mode first. Each batch is a ``(features,
    target)`` pair; the target is ignored and loader order is preserved.
    """
    model.eval()
    collected = []
    for batch in loader:
        features = batch[0].to(device)
        collected.append(model(features).cpu().numpy())
    return np.concatenate(collected)

# Hyperparams
# Tokenizer lengths, batch sizes and optimisation settings for this run.
MAX_LEN_Q, MAX_LEN_D = 32, 256
EMBED_BS, TRAIN_BS = 32, 512
EPOCHS, LR = 10, 2e-3

# Build cached embeddings + features
t0 = time.time()

# Cache files are keyed by encoder name, split and max length.
qtr_path = f"/content/cache_{SAFE}_qtr_len{MAX_LEN_Q}.npy"
dtr_path = f"/content/cache_{SAFE}_dtr_len{MAX_LEN_D}.npy"
qva_path = f"/content/cache_{SAFE}_qva_len{MAX_LEN_Q}.npy"
dva_path = f"/content/cache_{SAFE}_dva_len{MAX_LEN_D}.npy"

qtr = encode_texts_cached(tr_q, MAX_LEN_Q, EMBED_BS, qtr_path)
dtr = encode_texts_cached(tr_doc, MAX_LEN_D, EMBED_BS, dtr_path)
qva = encode_texts_cached(va_q, MAX_LEN_Q, EMBED_BS, qva_path)
dva = encode_texts_cached(va_doc, MAX_LEN_D, EMBED_BS, dva_path)

# Covers all four calls above, whether they encoded or hit the cache.
embed_time = time.time() - t0
print("Embed time (sec):", int(embed_time), "| dim:", qtr.shape[1])

Xtr, Xva = build_pair_features(qtr, dtr), build_pair_features(qva, dva)
print("Feature shape:", Xtr.shape)


# Train MLP (best by Val RMSE)
# Device for the MLP; kept separate from the encoder's `device` variable.
device_t = "cuda" if torch.cuda.is_available() else "cpu"
mlp = MLPRegressor(Xtr.shape[1], dropout=0.25).to(device_t)
opt = torch.optim.AdamW(mlp.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# Shuffled loader for optimisation; unshuffled copies for metrics so the
# predictions line up row-for-row with y_tr / y_va.
train_loader      = DataLoader(FeatDataset(Xtr, y_tr), batch_size=TRAIN_BS, shuffle=True)
train_loader_eval = DataLoader(FeatDataset(Xtr, y_tr), batch_size=TRAIN_BS, shuffle=False)  # aligned
val_loader_eval   = DataLoader(FeatDataset(Xva, y_va), batch_size=TRAIN_BS, shuffle=False)

run = wandb.init(
    project="dl_relevance",
    name=f"WORD_FE_{SAFE}_mlp",
    config={
        "model_name": MODEL_NAME,
        "MAX_LEN_Q": MAX_LEN_Q,
        "MAX_LEN_D": MAX_LEN_D,
        "EMBED_BS": EMBED_BS,
        "TRAIN_BS": TRAIN_BS,
        "EPOCHS": EPOCHS,
        "LR": LR,
        "embed_time_sec": embed_time,
        "feat_dim": int(Xtr.shape[1]),
    }
)

# Large sentinel so the first validation RMSE below it is checkpointed.
best_val = 1e9
best_path = f"/content/best_word_feat_{SAFE}.pt"
t1 = time.time()

for ep in range(1, EPOCHS + 1):
    mlp.train()
    total = 0.0  # running sum of per-batch losses for this epoch
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device_t), yb.to(device_t)
        opt.zero_grad(set_to_none=True)
        pred = mlp(Xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        nn.utils.clip_grad_norm_(mlp.parameters(), 1.0)
        opt.step()
        total += float(loss.item())

    # predict_mlp switches the MLP to eval mode; mlp.train() at the top of
    # the loop switches it back for the next epoch.
    tr_pred = predict_mlp(mlp, train_loader_eval, device_t)
    va_pred = predict_mlp(mlp, val_loader_eval, device_t)

    tr_rm, tr_ma = rmse(y_tr, tr_pred), mae(y_tr, tr_pred)
    va_rm, va_ma = rmse(y_va, va_pred), mae(y_va, va_pred)

    wandb.log({
        "epoch": ep,
        "train/loss_epoch": total / max(1, len(train_loader)),
        "train/rmse": tr_rm, "train/mae": tr_ma,
        "val/rmse": va_rm,   "val/mae": va_ma
    })

    print(f"{MODEL_NAME} | Ep {ep:02d} | train RMSE {tr_rm:.4f} MAE {tr_ma:.4f} | val RMSE {va_rm:.4f} MAE {va_ma:.4f}")

    # Keep only the weights with the lowest validation RMSE seen so far.
    if va_rm < best_val:
        best_val = va_rm
        torch.save(mlp.state_dict(), best_path)
        wandb.save(best_path)
        print("saved best:", best_path)

# Reported runtime = embedding time + MLP training time.
train_time = time.time() - t1
total_time = embed_time + train_time
wandb.log({"runtime_sec": total_time, "embed_time_sec": embed_time, "train_time_sec": train_time})

# TEST eval
@torch.no_grad()
def eval_on_testlabels_word_fe(best_path):
    """Evaluate the saved word-feature MLP on the labeled test rows.

    Test queries and documents from the module-level ``test_m`` are embedded
    with ``encode_texts_cached``, turned into pair features and scored by an
    ``MLPRegressor`` loaded from ``best_path``. Predictions are joined to
    ``test_labels`` on ``id``; when a ``Usage`` column exists only
    ``Public`` / ``Private`` rows are kept.

    Returns:
        ``(rmse, mae, n_rows)`` over the joined rows.
    """
    # Same "<title> [SEP] <description>" layout as the training documents.
    query_texts = test_m["search_term"].astype(str).tolist()
    titles = test_m["product_title"].astype(str)
    descriptions = test_m["product_description"].astype(str)
    doc_texts = (titles + " [SEP] " + descriptions).tolist()

    qte = encode_texts_cached(
        query_texts,
        max_len=MAX_LEN_Q,
        batch_size=EMBED_BS,
        cache_path=f"/content/cache_{SAFE}_qte_len{MAX_LEN_Q}.npy",
    )
    dte = encode_texts_cached(
        doc_texts,
        max_len=MAX_LEN_D,
        batch_size=EMBED_BS,
        cache_path=f"/content/cache_{SAFE}_dte_len{MAX_LEN_D}.npy",
    )
    Xte = build_pair_features(qte, dte)

    mlp2 = MLPRegressor(Xte.shape[1], dropout=0.0).to(device_t)
    mlp2.load_state_dict(torch.load(best_path, map_location=device_t))
    mlp2.eval()

    # The dataset needs targets; zeros are placeholders that are never read.
    placeholder_y = np.zeros(len(Xte), dtype=np.float32)
    te_loader = DataLoader(FeatDataset(Xte, placeholder_y), batch_size=1024, shuffle=False)
    preds = predict_mlp(mlp2, te_loader, device_t)

    labels = test_labels.copy()
    if "Usage" in labels.columns:
        labels = labels[labels["Usage"].isin(["Public", "Private"])].copy()

    pred_df = pd.DataFrame({"id": test_m["id"].values, "pred": preds})
    scored = labels.merge(pred_df, on="id", how="inner")
    y_true = scored["relevance"].astype(np.float32).values
    y_pred = scored["pred"].astype(np.float32).values
    return rmse(y_true, y_pred), mae(y_true, y_pred), len(scored)


# Final metrics (best checkpoint)
# Restore the checkpoint with the lowest validation RMSE before reporting.
# NOTE(review): best_path is only written when some epoch improves on
# best_val; if that never happens this load fails.
mlp.load_state_dict(torch.load(best_path, map_location=device_t))

# Unshuffled loaders, so predictions line up with y_tr / y_va.
tr_pred = predict_mlp(mlp, train_loader_eval, device_t)
va_pred = predict_mlp(mlp, val_loader_eval, device_t)

final_tr_rmse, final_tr_mae = rmse(y_tr, tr_pred), mae(y_tr, tr_pred)
final_va_rmse, final_va_mae = rmse(y_va, va_pred), mae(y_va, va_pred)

test_rmse, test_mae, n_test = eval_on_testlabels_word_fe(best_path)

print("\nFINAL BEST METRICS (WORD FE):", MODEL_NAME)
print(f"Train | RMSE: {final_tr_rmse:.4f} | MAE: {final_tr_mae:.4f}")
print(f"Val   | RMSE: {final_va_rmse:.4f} | MAE: {final_va_mae:.4f}")
print(f"Test  | RMSE: {test_rmse:.4f} | MAE: {test_mae:.4f} | n={n_test}")
print("runtime_sec:", int(total_time))
print("best_path:", best_path)

# Log the best-checkpoint metrics, then close the W&B run.
wandb.log({
    "best/train_rmse": final_tr_rmse,
    "best/train_mae": final_tr_mae,
    "best/val_rmse": final_va_rmse,
    "best/val_mae": final_va_mae,
    "test/rmse": test_rmse,
    "test/mae": test_mae,
    "test/n_rows": n_test,
    "best_path": best_path
})
wandb.finish()

# Row for Q3 table
word_fe_row_1 = {
    "Model type": f"Word feature extractor: {MODEL_NAME}",
    "runtime": int(total_time),
    "Train RMSE": float(final_tr_rmse),
    "Val-RMSE": float(final_va_rmse),
    "Test-RMSE": float(test_rmse),
    "Train MAE": float(final_tr_mae),
    "Val-MAE": float(final_va_mae),
    "Test-MAE": float(test_mae),
    "best_path": best_path,
    "MAX_LEN_Q": MAX_LEN_Q,
    "MAX_LEN_D": MAX_LEN_D
}
# Bare expression: shown as the cell output when run in a notebook.
word_fe_row_1
