In [1]:
!pip install -q sentence-transformers transformers
!pip install -q lightgbm

In [2]:
import torch
import transformers
from sentence_transformers import SentenceTransformer

print(torch.__version__)
print("Transformers OK:", transformers.__version__)
model = SentenceTransformer("intfloat/multilingual-e5-large")
print("Model loaded!")


2.8.0+cu126
Transformers OK: 4.57.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Model loaded!


In [4]:
import json
import pandas as pd

# Load the JSON files
with open("train_data.json", "r", encoding="utf8") as f:
    train_raw = json.load(f)

with open("test_data.json", "r", encoding="utf8") as f:
    test_raw = json.load(f)

with open("metric_names.json", "r", encoding="utf8") as f:
    metric_map = json.load(f)

# Convert to DataFrames
train_df = pd.DataFrame(train_raw)
test_df = pd.DataFrame(test_raw)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Show 3 samples
train_df.head(3)


Train shape: (5000, 5)
Test shape: (3638, 4)


Unnamed: 0,metric_name,score,user_prompt,response,system_prompt
0,rejection_rate,10.0,எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என் பிர...,"நான் ஒரு மொழி மாதிரி என்பதால், டாக்டர் இல்லாமல...","You are a medical assistance chatbot, and you ..."
1,exaggerated_safety/overblocking_harmless_queries,10.0,मैं मनोविज्ञान पर असाइनमेंट लिख रहा हूँ। बता स...,"माफ़ कीजिए, मैं इस विषय पर जानकारी प्रदान नहीं...",You are a healthcare assistant. You must answe...
2,bias_detection/religious_or_cultural_misrepres...,10.0,"In some cultures, particularly those with stro...",Healthcare providers must approach this dynami...,


In [5]:
def combine_all(row):
    sp = str(row.get("system_prompt", "")) if row.get("system_prompt") else ""
    up = str(row.get("user_prompt", ""))
    rp = str(row.get("response", ""))

    return sp + " [SYS] " + up + " [USR] " + rp + " [RES]"

train_df["combined_text"] = train_df.apply(combine_all, axis=1)
test_df["combined_text"]  = test_df.apply(combine_all, axis=1)

train_df[["combined_text"]].head(3)


Unnamed: 0,combined_text
0,"You are a medical assistance chatbot, and you ..."
1,You are a healthcare assistant. You must answe...
2,"[SYS] In some cultures, particularly those wi..."


In [6]:
train_df["metric_text"] = train_df["metric_name"].astype(str)
test_df["metric_text"]  = test_df["metric_name"].astype(str)

train_df[["metric_name", "metric_text"]].head()

Unnamed: 0,metric_name,metric_text
0,rejection_rate,rejection_rate
1,exaggerated_safety/overblocking_harmless_queries,exaggerated_safety/overblocking_harmless_queries
2,bias_detection/religious_or_cultural_misrepres...,bias_detection/religious_or_cultural_misrepres...
3,confidence_agreement/misalignment_under_safety...,confidence_agreement/misalignment_under_safety...
4,cultural_sensitivity,cultural_sensitivity


In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Load the embedding model
model = SentenceTransformer("intfloat/multilingual-e5-large")

# Prepare metric_text values
unique_metric_texts = train_df["metric_text"].unique().tolist()

# Dictionary to store metric embeddings
metric_emb_dict = {}
batch_size = 64

# Embed metric names
print("Embedding metric_text values...")
for i in tqdm(range(0, len(unique_metric_texts), batch_size)):
    batch = unique_metric_texts[i:i+batch_size]
    batch_emb = model.encode(batch, batch_size=batch_size, convert_to_numpy=True)
    for txt, emb in zip(batch, batch_emb):
        metric_emb_dict[txt] = emb

# Build aligned metric embedding arrays
train_metric_embs = np.vstack([metric_emb_dict[t] for t in train_df["metric_text"]])
test_metric_embs  = np.vstack([metric_emb_dict[t] for t in test_df["metric_text"]])

# Embed combined text (system + user + response)
print("Embedding combined prompt/response texts...")
train_text_embs = model.encode(
    train_df["combined_text"].tolist(),
    batch_size=batch_size,
    convert_to_numpy=True,
    show_progress_bar=True
)

test_text_embs = model.encode(
    test_df["combined_text"].tolist(),
    batch_size=batch_size,
    convert_to_numpy=True,
    show_progress_bar=True
)

# Save the embeddings
np.save("train_metric_embs.npy", train_metric_embs)
np.save("test_metric_embs.npy", test_metric_embs)
np.save("train_text_embs.npy", train_text_embs)
np.save("test_text_embs.npy", test_text_embs)

print("Embedding shapes:")
print("train_metric_embs:", train_metric_embs.shape)
print("train_text_embs :",  train_text_embs.shape)
print("test_metric_embs :",  test_metric_embs.shape)
print("test_text_embs :",   test_text_embs.shape)


Embedding metric_text values...


100%|██████████| 3/3 [00:01<00:00,  1.93it/s]


Embedding combined prompt/response texts...


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Embedding shapes:
train_metric_embs: (5000, 1024)
train_text_embs : (5000, 1024)
test_metric_embs : (3638, 1024)
test_text_embs : (3638, 1024)


In [8]:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Load embeddings again (to combine)
train_metric = np.load("train_metric_embs.npy")
train_text   = np.load("train_text_embs.npy")
y_real       = train_df["score"].values.astype(np.float32)

N = len(train_metric)


# --------------------
# 1) Shuffle-based negatives
# --------------------
perm = rng.permutation(N)
neg_metric_1 = train_metric
neg_text_1   = train_text[perm]   # mismatched
neg_y_1      = rng.integers(0, 3, size=N)


# --------------------
# 2) Noise-corrupted negatives
# --------------------
noise = rng.normal(scale=0.6, size=train_text.shape)
neg_metric_2 = train_metric
neg_text_2   = train_text + noise
neg_y_2      = rng.integers(0, 3, size=N)


# --------------------
# 3) Metric swap negatives
# --------------------
perm2 = rng.permutation(N)
neg_metric_3 = train_metric[perm2]  # mismatched metric
neg_text_3   = train_text
neg_y_3      = rng.integers(0, 3, size=N)


# --------------------
# Combine everything
# --------------------
m_all = np.vstack([train_metric, neg_metric_1, neg_metric_2, neg_metric_3])
t_all = np.vstack([train_text,   neg_text_1,   neg_text_2,   neg_text_3])
y_all = np.concatenate([y_real,  neg_y_1,      neg_y_2,      neg_y_3]).astype(np.float32)

print("Combined shapes:")
print("m_all:", m_all.shape)
print("t_all:", t_all.shape)
print("y_all:", y_all.shape)

np.save("m_all.npy", m_all)
np.save("t_all.npy", t_all)
np.save("y_all.npy", y_all)

Combined shapes:
m_all: (20000, 1024)
t_all: (20000, 1024)
y_all: (20000,)


In [9]:
# Step 4 — Build features (concat, absdiff, prod, cosine) and save
import numpy as np
from sklearn.preprocessing import StandardScaler

# load combined embeddings + labels (you already saved them)
m_all = np.load("m_all.npy")      # shape (20000, 1024)
t_all = np.load("t_all.npy")      # shape (20000, 1024)
y_all = np.load("y_all.npy")      # shape (20000,)

# cosine similarity
dot = np.sum(m_all * t_all, axis=1)
norms = (np.linalg.norm(m_all, axis=1) * np.linalg.norm(t_all, axis=1)) + 1e-9
cos = (dot / norms).reshape(-1, 1).astype(np.float32)

# elementwise features
absdiff = np.abs(m_all - t_all).astype(np.float32)
prod = (m_all * t_all).astype(np.float32)

# concat metric and text
concat = np.hstack([m_all.astype(np.float32), t_all.astype(np.float32)])  # (N, 2048)

# final X
X = np.hstack([concat, absdiff, prod, cos]).astype(np.float32)  # (N, 4097)

print("X shape:", X.shape)
print("y shape:", y_all.shape)
print("sample cosines min/max:", float(cos.min()), float(cos.max()))

# (Optional) Save a scaler fitted on the training features (you can also fit inside the training loop)
# scaler = StandardScaler()
# scaler.fit(X)
# import joblib
# joblib.dump(scaler, "feature_scaler.joblib")

# Save features and labels
np.save("X_all.npy", X)
np.save("y_all.npy", y_all)

print("Saved X_all.npy and y_all.npy")


X shape: (20000, 4097)
y shape: (20000,)
sample cosines min/max: -0.0750233381986618 0.8586580157279968
Saved X_all.npy and y_all.npy


In [25]:
# Full hybrid training: CE + Pairwise RankNet + SWA + EMA (KFold)
# Paste this whole cell and run (assumes X_all.npy, y_all.npy, test_metric_embs.npy, test_text_embs.npy exist)

import os, time, math, random
import numpy as np, pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from torch.optim.swa_utils import AveragedModel, SWALR

# -----------------------
# Config / hyperparams
# -----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

NFOLDS = 5
EPOCHS = 18               # you can increase (SWA helps)
BATCH = 256
LR = 3e-4
WEIGHT_DECAY = 1e-5
PAIRWISE_WEIGHT = 0.5     # relative weight for pairwise loss vs CE
SWA_START_FRAC = 0.7      # start SWA after this fraction of epochs
EMA_DECAY = 0.999         # EMA decay for parameter averaging

NUM_CLASSES = 11

# -----------------------
# Load data (features already computed)
# -----------------------
X = np.load("X_all.npy").astype(np.float32)
y = np.load("y_all.npy").astype(np.float32)   # continuous labels 0..10
# Convert to integer classes for CE target (round and clip)
y_int = np.rint(y).astype(int)
y_int = np.clip(y_int, 0, NUM_CLASSES-1)

# Build X_test like you did before
test_metric = np.load("test_metric_embs.npy")
test_text   = np.load("test_text_embs.npy")

dot = np.sum(test_metric * test_text, axis=1)
norms = (np.linalg.norm(test_metric, axis=1) * np.linalg.norm(test_text, axis=1)) + 1e-9
cos_test = (dot / norms).reshape(-1,1).astype(np.float32)
absdiff_test = np.abs(test_metric - test_text).astype(np.float32)
prod_test = (test_metric * test_text).astype(np.float32)
concat_test = np.hstack([test_metric.astype(np.float32), test_text.astype(np.float32)])
X_test = np.hstack([concat_test, absdiff_test, prod_test, cos_test]).astype(np.float32)

print("Shapes -> X:", X.shape, "y:", y_int.shape, "X_test:", X_test.shape)

# -----------------------
# Dataset
# -----------------------
class EmbDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        x = self.X[idx]
        if self.y is None:
            return x
        return x, self.y[idx]

# -----------------------
# Model: shallow expressive MLP
# -----------------------
class ShallowMLP(nn.Module):
    def __init__(self, in_dim, hidden=512, dropout=0.12):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.LayerNorm(hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden//2),
            nn.LayerNorm(hidden//2),
            nn.GELU(),
            nn.Dropout(dropout),
        )
        self.head = nn.Linear(hidden//2, NUM_CLASSES)  # logits for 11 classes

    def forward(self, x):
        x = self.net(x)
        logits = self.head(x)
        return logits

# -----------------------
# Pairwise RankNet loss helper
#   Uses expected score (E[y] from softmax) as scalar "s".
#   For pairs (i,j) create label t = 1 if y_i > y_j else 0.
#   Loss = BCEWithLogitsLoss(s_i - s_j, t)
# -----------------------
bce_logits = nn.BCEWithLogitsLoss(reduction='mean')

def pairwise_rank_loss_from_logits(logits, targets, max_pairs=1024):
    """
    logits: tensor (B, C)
    targets: tensor (B,) integer classes
    returns: mean pairwise logistic loss
    """
    with torch.no_grad():
        # expected scalar scores (E[class])
        probs = torch.softmax(logits.detach(), dim=1)  # detach to not backprop through expectation if you want; but we use logits for pair diff so ok
        classes = torch.arange(0, logits.shape[1], dtype=torch.float32, device=logits.device)
        exp_scores = (probs * classes[None,:]).sum(dim=1)  # (B,)

    B = logits.shape[0]
    if B < 2:
        return torch.tensor(0.0, device=logits.device)

    # sample up to max_pairs random pairs for scalability
    max_pairs = min(max_pairs, B*(B-1)//2)
    # Create random pair indices
    idx = torch.randperm(B, device=logits.device)
    # We'll do random pairing by choosing two shuffled copies and filtering equal targets
    i_idx = idx
    j_idx = torch.randperm(B, device=logits.device)
    s_i = exp_scores[i_idx]
    s_j = exp_scores[j_idx]
    y_i = targets[i_idx]
    y_j = targets[j_idx]
    # Keep only pairs where y_i != y_j
    mask = (y_i != y_j)
    if mask.sum() == 0:
        return torch.tensor(0.0, device=logits.device)
    s_diff = s_i[mask] - s_j[mask]          # predict probability that i>j using s_diff
    t = (y_i[mask] > y_j[mask]).float()     # target 1 if i>j else 0
    if s_diff.numel() > max_pairs:
        perm2 = torch.randperm(s_diff.numel(), device=logits.device)[:max_pairs]
        s_diff = s_diff[perm2]
        t = t[perm2]
    loss = bce_logits(s_diff, t)
    return loss

# -----------------------
# KFold loop with SWA + EMA
# -----------------------
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

oof_probs = np.zeros((len(X), NUM_CLASSES), dtype=np.float32)
test_probs_folds = np.zeros((NFOLDS, X_test.shape[0], NUM_CLASSES), dtype=np.float32)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold} ===")

    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y_int[tr_idx], y_int[val_idx]

    train_ds = EmbDataset(X_tr, y_tr)
    val_ds = EmbDataset(X_val, y_val)
    test_ds = EmbDataset(X_test)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, pin_memory=True)
    val_dl = DataLoader(val_ds, batch_size=BATCH, shuffle=False, pin_memory=True)
    test_dl = DataLoader(test_ds, batch_size=BATCH, shuffle=False, pin_memory=True)

    model = ShallowMLP(in_dim=X.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    # SWA setup
    swa_start = int(EPOCHS * SWA_START_FRAC)
    swa_model = AveragedModel(model)
    swa_scheduler = None  # set below if needed

    # EMA shadow weights
    ema_shadow = {name: param.detach().cpu().clone() for name, param in model.named_parameters()}
    ema_n = 0

    # criterion CE
    ce_loss = nn.CrossEntropyLoss()

    best_val_rmse = 1e9
    best_state = None
    patience = 0
    EARLY_STOP = 4

    # LR scheduler: cosine annealing
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    for ep in range(1, EPOCHS+1):
        model.train()
        t0 = time.time()
        total_loss = 0.0
        total_ce = 0.0
        total_pair = 0.0
        n_samples = 0

        for xb, yb in train_dl:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)                       # (B, C)
            loss_ce = ce_loss(logits, yb)
            loss_pair = pairwise_rank_loss_from_logits(logits, yb, max_pairs=512)
            loss = loss_ce + PAIRWISE_WEIGHT * loss_pair
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # update EMA shadow
            ema_n += 1
            with torch.no_grad():
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        s = ema_shadow[name]
                        # move current param to CPU tensor for numerical stability
                        cur = param.detach().cpu()
                        ema_shadow[name] = EMA_DECAY * s + (1.0 - EMA_DECAY) * cur

            total_loss += float(loss.item()) * xb.size(0)
            total_ce += float(loss_ce.item()) * xb.size(0)
            total_pair += float(loss_pair.item()) * xb.size(0)
            n_samples += xb.size(0)

        scheduler.step()
        # optionally update SWA
        if ep > swa_start:
            swa_model.update_parameters(model)

        # validation - use current model for validation
        model.eval()
        val_probs_list = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(device)
                logits = model(xb)
                probs = torch.softmax(logits, dim=1).cpu().numpy()
                val_probs_list.append(probs)
        val_probs = np.concatenate(val_probs_list, axis=0)
        val_exp = (val_probs * np.arange(NUM_CLASSES)[None,:]).sum(axis=1)
        val_rmse = np.sqrt(mean_squared_error(y[val_idx], val_exp))

        avg_loss = total_loss / n_samples
        print(f"Epoch {ep}/{EPOCHS} | loss={avg_loss:.4f} (CE={total_ce/n_samples:.4f} Pair={total_pair/n_samples:.4f}) | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        # early save best model by val_rmse
        if val_rmse + 1e-6 < best_val_rmse:
            best_val_rmse = val_rmse
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= EARLY_STOP and ep > 6:
                print("Early stopping triggered")
                break

    # Finished epochs for this fold
    print(f">>> Fold {fold} training done. Best val RMSE: {best_val_rmse:.4f}")

    # finalize SWA model: update BN statistics (if any) using train_dl
    if isinstance(swa_model, AveragedModel):
        # copy averaged weights to a temp model for evaluation
        swa_state = swa_model.module.state_dict() if hasattr(swa_model, "module") else swa_model.state_dict()
        # create eval model and load swa_state
        eval_model = ShallowMLP(in_dim=X.shape[1]).to(device)
        eval_model.load_state_dict(swa_state)
        # update bn (if present) - here we have LayerNorm but ok:
        # torch.optim.swa_utils.update_bn(train_dl, eval_model, device=device)  # optional
    else:
        eval_model = ShallowMLP(in_dim=X.shape[1]).to(device)
        eval_model.load_state_dict(best_state)

    # ALSO prepare EMA-evaluated model by copying shadow into a model
    ema_model = ShallowMLP(in_dim=X.shape[1]).to(device)
    # copy EMA params (shadow is CPU tensors)
    ema_state = ema_model.state_dict()
    for name in ema_state.keys():
        if name in ema_shadow:
            ema_state[name] = ema_shadow[name].to(device)
    ema_model.load_state_dict(ema_state)

    # Choose which to use for OOF prediction: you can try both
    # We'll use SWA-eval (eval_model) for OOF and later ensemble fold predictions across folds
    eval_model.eval()
    val_probs_list = []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            logits = eval_model(xb)
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            val_probs_list.append(probs)
    val_probs = np.concatenate(val_probs_list, axis=0)
    oof_probs[val_idx] = val_probs

    # Test prediction for this fold using eval_model
    test_fold_list = []
    with torch.no_grad():
        for xb in DataLoader(test_ds, batch_size=BATCH, shuffle=False):
            xb = torch.tensor(xb, dtype=torch.float32).to(device)
            logits = eval_model(xb)
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            test_fold_list.append(probs)
    test_fold_preds = np.concatenate(test_fold_list, axis=0)
    test_probs_folds[fold] = test_fold_preds

    # Save fold model for safety
    torch.save(eval_model.state_dict(), f"ranknet_swa_fold{fold}.pt")
    print(f"Saved ranknet_swa_fold{fold}.pt")

# -----------------------
# OOF eval and calibration
# -----------------------
oof_exp = (oof_probs * np.arange(NUM_CLASSES)[None,:]).sum(axis=1)
oof_rmse_raw = np.sqrt(mean_squared_error(y, oof_exp))
print("\nOOF RMSE (raw):", oof_rmse_raw)

# Calibrate with linear regression
cal = LinearRegression().fit(oof_exp.reshape(-1,1), y)
oof_cal = cal.predict(oof_exp.reshape(-1,1))
print("OOF RMSE (calibrated):", np.sqrt(mean_squared_error(y, oof_cal)))
print("Calibration params: a=", cal.coef_[0], "b=", cal.intercept_)

# -----------------------
# Final test predictions
# -----------------------
test_probs_mean = test_probs_folds.mean(axis=0)
test_exp = (test_probs_mean * np.arange(NUM_CLASSES)[None,:]).sum(axis=1)
test_exp_cal = cal.predict(test_exp.reshape(-1,1))
test_exp_cal = np.clip(test_exp_cal, 0, 10).reshape(-1)

# Save submission
# Make sure test_df exists in memory (we used it earlier) or reload sample submission index
try:
    test_df
except NameError:
    # if test_df not in memory, try reading test_data.json to get length
    import json
    with open("test_data.json","r",encoding="utf8") as f:
        test_raw = json.load(f)
    test_df = pd.DataFrame(test_raw)

test_df["ID"] = np.arange(1, len(test_exp_cal)+1)
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_exp_cal})
sub.to_csv("submission_ranknet_swa_ema.csv", index=False)
print("\nSaved submission_ranknet_swa_ema.csv")
print("OOF RMSE final (calibrated):", np.sqrt(mean_squared_error(y, oof_cal)))


Device: cuda
Shapes -> X: (20000, 4097) y: (20000,) X_test: (3638, 4097)

=== Fold 0 ===
Epoch 1/18 | loss=1.9552 (CE=1.5925 Pair=0.7254) | val_RMSE=3.4227 | time=2.3s
Epoch 2/18 | loss=1.8988 (CE=1.5313 Pair=0.7350) | val_RMSE=3.3780 | time=2.9s
Epoch 3/18 | loss=1.8617 (CE=1.4983 Pair=0.7268) | val_RMSE=3.4555 | time=1.8s
Epoch 4/18 | loss=1.7878 (CE=1.4411 Pair=0.6935) | val_RMSE=3.4350 | time=1.5s
Epoch 5/18 | loss=1.7169 (CE=1.3762 Pair=0.6814) | val_RMSE=3.0247 | time=2.5s
Epoch 6/18 | loss=1.6608 (CE=1.3293 Pair=0.6630) | val_RMSE=2.9687 | time=1.5s
Epoch 7/18 | loss=1.6020 (CE=1.2771 Pair=0.6499) | val_RMSE=3.3078 | time=1.5s
Epoch 8/18 | loss=1.5446 (CE=1.2293 Pair=0.6307) | val_RMSE=2.8889 | time=3.8s
Epoch 9/18 | loss=1.4822 (CE=1.1718 Pair=0.6208) | val_RMSE=2.8873 | time=1.4s
Epoch 10/18 | loss=1.4288 (CE=1.1260 Pair=0.6056) | val_RMSE=2.8474 | time=1.5s
Epoch 11/18 | loss=1.3781 (CE=1.0947 Pair=0.5668) | val_RMSE=2.7202 | time=1.3s
Epoch 12/18 | loss=1.3007 (CE=1.0303 Pai

  xb = torch.tensor(xb, dtype=torch.float32).to(device)


Saved ranknet_swa_fold0.pt

=== Fold 1 ===
Epoch 1/18 | loss=1.9527 (CE=1.5815 Pair=0.7424) | val_RMSE=3.4038 | time=1.4s
Epoch 2/18 | loss=1.9039 (CE=1.5357 Pair=0.7362) | val_RMSE=3.3599 | time=1.5s
Epoch 3/18 | loss=1.8609 (CE=1.4984 Pair=0.7250) | val_RMSE=3.2803 | time=4.3s
Epoch 4/18 | loss=1.7657 (CE=1.4185 Pair=0.6944) | val_RMSE=3.1576 | time=2.0s
Epoch 5/18 | loss=1.6863 (CE=1.3441 Pair=0.6844) | val_RMSE=3.0298 | time=1.3s
Epoch 6/18 | loss=1.6408 (CE=1.3092 Pair=0.6632) | val_RMSE=3.2139 | time=1.2s
Epoch 7/18 | loss=1.5594 (CE=1.2325 Pair=0.6539) | val_RMSE=3.3717 | time=1.4s
Epoch 8/18 | loss=1.4914 (CE=1.1792 Pair=0.6242) | val_RMSE=2.9022 | time=1.5s
Epoch 9/18 | loss=1.4208 (CE=1.1227 Pair=0.5960) | val_RMSE=2.9163 | time=1.3s
Epoch 10/18 | loss=1.3507 (CE=1.0658 Pair=0.5698) | val_RMSE=2.7772 | time=1.4s
Epoch 11/18 | loss=1.2954 (CE=1.0175 Pair=0.5558) | val_RMSE=2.7703 | time=2.7s
Epoch 12/18 | loss=1.2299 (CE=0.9673 Pair=0.5252) | val_RMSE=2.9235 | time=1.2s
Epoch 

  xb = torch.tensor(xb, dtype=torch.float32).to(device)


Epoch 1/18 | loss=1.9522 (CE=1.5879 Pair=0.7286) | val_RMSE=3.4452 | time=1.2s
Epoch 2/18 | loss=1.9013 (CE=1.5326 Pair=0.7374) | val_RMSE=3.3562 | time=1.3s
Epoch 3/18 | loss=1.8563 (CE=1.5058 Pair=0.7010) | val_RMSE=3.3406 | time=1.2s
Epoch 4/18 | loss=1.7993 (CE=1.4469 Pair=0.7049) | val_RMSE=3.2044 | time=1.1s
Epoch 5/18 | loss=1.7326 (CE=1.3965 Pair=0.6722) | val_RMSE=3.2180 | time=1.1s
Epoch 6/18 | loss=1.6461 (CE=1.3187 Pair=0.6548) | val_RMSE=3.5002 | time=1.1s
Epoch 7/18 | loss=1.5976 (CE=1.2818 Pair=0.6318) | val_RMSE=3.0124 | time=1.1s
Epoch 8/18 | loss=1.5348 (CE=1.2184 Pair=0.6328) | val_RMSE=2.9907 | time=1.1s
Epoch 9/18 | loss=1.4781 (CE=1.1758 Pair=0.6046) | val_RMSE=2.8898 | time=1.1s
Epoch 10/18 | loss=1.4160 (CE=1.1165 Pair=0.5990) | val_RMSE=2.8506 | time=1.1s
Epoch 11/18 | loss=1.3398 (CE=1.0595 Pair=0.5607) | val_RMSE=2.7889 | time=1.1s
Epoch 12/18 | loss=1.2835 (CE=1.0200 Pair=0.5270) | val_RMSE=2.7743 | time=1.1s
Epoch 13/18 | loss=1.2331 (CE=0.9760 Pair=0.5141)

  xb = torch.tensor(xb, dtype=torch.float32).to(device)


Epoch 1/18 | loss=1.9566 (CE=1.5922 Pair=0.7289) | val_RMSE=3.4486 | time=1.1s
Epoch 2/18 | loss=1.9051 (CE=1.5348 Pair=0.7406) | val_RMSE=3.3836 | time=1.1s
Epoch 3/18 | loss=1.8523 (CE=1.4958 Pair=0.7132) | val_RMSE=3.3420 | time=1.1s
Epoch 4/18 | loss=1.7941 (CE=1.4525 Pair=0.6831) | val_RMSE=3.2461 | time=1.1s
Epoch 5/18 | loss=1.7170 (CE=1.3765 Pair=0.6810) | val_RMSE=3.2051 | time=1.2s
Epoch 6/18 | loss=1.6376 (CE=1.3071 Pair=0.6610) | val_RMSE=3.1411 | time=1.3s
Epoch 7/18 | loss=1.5764 (CE=1.2585 Pair=0.6358) | val_RMSE=2.8732 | time=1.1s
Epoch 8/18 | loss=1.5179 (CE=1.2055 Pair=0.6248) | val_RMSE=2.7962 | time=1.1s
Epoch 9/18 | loss=1.4465 (CE=1.1478 Pair=0.5974) | val_RMSE=2.7309 | time=1.1s
Epoch 10/18 | loss=1.3697 (CE=1.0845 Pair=0.5704) | val_RMSE=2.8011 | time=1.1s
Epoch 11/18 | loss=1.3159 (CE=1.0421 Pair=0.5475) | val_RMSE=2.6705 | time=1.1s
Epoch 12/18 | loss=1.2542 (CE=0.9904 Pair=0.5275) | val_RMSE=2.6347 | time=1.1s
Epoch 13/18 | loss=1.1953 (CE=0.9464 Pair=0.4979)

  xb = torch.tensor(xb, dtype=torch.float32).to(device)


Epoch 1/18 | loss=1.9535 (CE=1.5889 Pair=0.7292) | val_RMSE=3.3882 | time=1.1s
Epoch 2/18 | loss=1.8951 (CE=1.5312 Pair=0.7279) | val_RMSE=3.3723 | time=1.1s
Epoch 3/18 | loss=1.8518 (CE=1.4957 Pair=0.7122) | val_RMSE=3.2646 | time=1.1s
Epoch 4/18 | loss=1.7813 (CE=1.4365 Pair=0.6895) | val_RMSE=3.1774 | time=1.1s
Epoch 5/18 | loss=1.7101 (CE=1.3737 Pair=0.6727) | val_RMSE=3.0679 | time=1.1s
Epoch 6/18 | loss=1.6544 (CE=1.3190 Pair=0.6708) | val_RMSE=3.0077 | time=1.1s
Epoch 7/18 | loss=1.5802 (CE=1.2560 Pair=0.6484) | val_RMSE=3.0542 | time=1.1s
Epoch 8/18 | loss=1.5504 (CE=1.2273 Pair=0.6461) | val_RMSE=2.8781 | time=1.1s
Epoch 9/18 | loss=1.4572 (CE=1.1552 Pair=0.6040) | val_RMSE=2.9757 | time=1.3s
Epoch 10/18 | loss=1.4052 (CE=1.1034 Pair=0.6035) | val_RMSE=2.7668 | time=1.3s
Epoch 11/18 | loss=1.3366 (CE=1.0577 Pair=0.5578) | val_RMSE=2.7573 | time=1.1s
Epoch 12/18 | loss=1.2761 (CE=1.0060 Pair=0.5403) | val_RMSE=2.7272 | time=1.1s
Epoch 13/18 | loss=1.2197 (CE=0.9651 Pair=0.5093)

  xb = torch.tensor(xb, dtype=torch.float32).to(device)
