In [3]:
# ======================================================
# DATASET VISUALIZATION SCRIPT
# ======================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import os
import json

# Create the output folder for every dataset figure produced by this notebook.
FIG_DIR = "figures_dataset"
os.makedirs(FIG_DIR, exist_ok=True)
print("Saving figures to:", FIG_DIR)

# ======================================================
# 1. Load files
# ======================================================

# metric embeddings: one row per metric name
metric_emb = np.load("metric_name_embeddings.npy")

# metric names (JSON list). The encoding is explicit so that non-ASCII names
# decode the same way on every platform (the default encoding is locale-dependent).
with open("metric_names.json", "r", encoding="utf-8") as f:
    metric_names = json.load(f)

# The plots below label row i of `metric_emb` with `metric_names[i]`, so the two
# must have the same length; fail here rather than mislabel points later.
if len(metric_names) != metric_emb.shape[0]:
    raise ValueError(
        f"metric_names has {len(metric_names)} entries but metric_emb has "
        f"{metric_emb.shape[0]} rows"
    )

# train & test (JSON array, NOT jsonl)
train_df = pd.read_json("train_data.json")
test_df = pd.read_json("test_data.json")

print("metric embeddings:", metric_emb.shape)
print("metric names:", len(metric_names))
print("train:", train_df.shape)
print("test:", test_df.shape)



Saving figures to: figures_dataset
metric embeddings: (145, 768)
metric names: 145
train: (5000, 5)
test: (3638, 4)


In [6]:
# ======================================================
# 2. Visualize metric embeddings
# ======================================================
from sklearn.metrics.pairwise import cosine_similarity


def _save_labelled_scatter(coords, title, out_path):
    """Scatter the 2-D `coords`, annotate each point with its metric name, save to `out_path`."""
    fig, ax = plt.subplots(figsize=(10,8))
    ax.scatter(coords[:,0], coords[:,1], s=60)
    for row, label in enumerate(metric_names):
        # small fixed offset so the text does not sit exactly on the marker
        ax.text(coords[row,0]+0.01, coords[row,1]+0.01, label, fontsize=7)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)


# --- PCA projection to two components ---
pca = PCA(n_components=2)
metric_pca = pca.fit_transform(metric_emb)
_save_labelled_scatter(
    metric_pca,
    "Metric Embeddings - PCA",
    f"{FIG_DIR}/metric_embeddings_pca.png",
)

# --- t-SNE projection (perplexity 20, seeded so the layout is repeatable) ---
tsne = TSNE(n_components=2, perplexity=20, random_state=42)
metric_tsne = tsne.fit_transform(metric_emb)
_save_labelled_scatter(
    metric_tsne,
    "Metric Embeddings - tSNE",
    f"{FIG_DIR}/metric_embeddings_tsne.png",
)

# --- L2 norm of every embedding row, one bar per metric ---
norms = np.linalg.norm(metric_emb, axis=1)
bar_positions = range(len(metric_names))
fig, ax = plt.subplots(figsize=(12,5))
ax.bar(bar_positions, norms)
ax.set_xticks(bar_positions)
ax.set_xticklabels(metric_names, rotation=90, fontsize=6)
ax.set_title("Embedding Norms per Metric")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/metric_embedding_norms.png")
plt.close(fig)

# --- Pairwise cosine similarity between all metric embeddings ---
cos_sim = cosine_similarity(metric_emb)

plt.figure(figsize=(14,12))
sns.heatmap(cos_sim, xticklabels=metric_names, yticklabels=metric_names, cmap="viridis", cbar=True)
plt.xticks(rotation=90, fontsize=6)
plt.yticks(fontsize=6)
plt.title("Cosine Similarity between Metric Embeddings")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/metric_embeddings_cosine_heatmap.png")
plt.close()


In [9]:
# List the training columns; the plotting cells below read "score",
# "user_prompt", "response" and "metric_name" from this frame.
print(train_df.columns)


Index(['metric_name', 'score', 'user_prompt', 'response', 'system_prompt'], dtype='object')


In [10]:
# ======================================================
# 3. Training Data Visualization
# ======================================================


def _n_chars(value):
    """Number of characters in the string form of `value`."""
    return len(str(value))


# 1. Score histogram (with KDE overlay)
fig, ax = plt.subplots(figsize=(7,5))
sns.histplot(train_df["score"], bins=40, kde=True, color="blue", alpha=0.7, ax=ax)
ax.set_xlabel("Score")
ax.set_ylabel("Count")
ax.set_title("Train Score Distribution")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/train_score_hist.png")
plt.close(fig)

# 2. Score boxplot
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(x=train_df["score"], color="lightblue", ax=ax)
ax.set_title("Score Boxplot")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/train_score_box.png")
plt.close(fig)

# 3. Prompt text length -- stored on train_df because the train-vs-test cell reuses it
train_df["user_len"] = train_df["user_prompt"].apply(_n_chars)
fig, ax = plt.subplots(figsize=(7,5))
sns.histplot(train_df["user_len"], bins=40, kde=True, ax=ax)
ax.set_title("User Prompt Text Length (Characters)")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/train_user_prompt_length.png")
plt.close(fig)

# 4. Response text length -- also stored on train_df for the comparison cell
train_df["response_len"] = train_df["response"].apply(_n_chars)
fig, ax = plt.subplots(figsize=(7,5))
sns.histplot(train_df["response_len"], bins=40, kde=True, color="orange", ax=ax)
ax.set_title("Response Text Length (Characters)")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/train_response_length.png")
plt.close(fig)

# 5. Metric name counts, most frequent first (value_counts order)
fig, ax = plt.subplots(figsize=(10,5))
train_metric_counts = train_df["metric_name"].value_counts()
train_metric_counts.plot(kind="bar", ax=ax)
ax.set_title("Metric Name Frequency in Training Data")
ax.set_xlabel("Metric Name")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/train_metric_name_distribution.png")
plt.close(fig)


  plt.tight_layout()


In [11]:
# ======================================================
# 4. Test Data Visualization
# ======================================================


def _system_prompt_length(value):
    """Character length of a system prompt, counting a missing one as 0.

    `len(str(None))` is 4 and `len(str(float("nan")))` is 3, so stringifying
    missing prompts would report them as short non-empty texts.
    """
    if value is None or (isinstance(value, float) and value != value):
        return 0
    return len(str(value))


# Prompt length (same definition as the training column so the two stay comparable)
test_df["user_len"] = test_df["user_prompt"].apply(lambda x: len(str(x)))
plt.figure(figsize=(7,5))
sns.histplot(test_df["user_len"], bins=40, kde=True, color="orange")
plt.title("Test User Prompt Length Distribution")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/test_user_prompt_length.png")
plt.close()

# Response length (same definition as the training column)
test_df["response_len"] = test_df["response"].apply(lambda x: len(str(x)))
plt.figure(figsize=(7,5))
sns.histplot(test_df["response_len"], bins=40, kde=True, color="green")
plt.title("Test Response Length Distribution")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/test_response_length.png")
plt.close()

# System prompt length; rows without a system prompt contribute length 0
test_df["system_len"] = test_df["system_prompt"].apply(_system_prompt_length)
plt.figure(figsize=(7,5))
sns.histplot(test_df["system_len"], bins=40, kde=True, color="purple")
plt.title("Test System Prompt Length Distribution")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/test_system_prompt_length.png")
plt.close()

# Metric name counts, most frequent first (value_counts order)
plt.figure(figsize=(10,5))
test_df["metric_name"].value_counts().plot(kind="bar", color="orange")
plt.title("Metric Name Frequency in Test Data")
plt.xlabel("Metric Name")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/test_metric_name_distribution.png")
plt.close()


  plt.tight_layout()


In [15]:
# ======================================================
# 5. Compare Train vs Test
# ======================================================
# Requires the "user_len" / "response_len" columns added by the train and test
# visualization cells.


def _shared_bin_edges(train_values, test_values, n_bins=40):
    """Bin edges computed over both samples pooled together.

    Letting each histogram pick its own 40 bins gives different bin widths for
    train and test, so overlaid bar heights cannot be compared.
    """
    pooled = np.concatenate([np.asarray(train_values), np.asarray(test_values)])
    return np.histogram_bin_edges(pooled, bins=n_bins)


# ---- 1 & 2. Compare text lengths on a common set of bins ----
for column, title, filename in [
    ("user_len", "User Prompt Length: Train vs Test", "train_vs_test_user_prompt_length.png"),
    ("response_len", "Response Length: Train vs Test", "train_vs_test_response_length.png"),
]:
    edges = _shared_bin_edges(train_df[column], test_df[column])
    fig, ax = plt.subplots(figsize=(8,5))
    ax.hist(train_df[column], bins=edges, alpha=0.5, label="Train", color="blue")
    ax.hist(test_df[column], bins=edges, alpha=0.5, label="Test", color="orange")
    ax.set_title(title)
    ax.set_xlabel("Length (characters)")
    ax.set_ylabel("Count")
    ax.legend()
    fig.tight_layout()
    fig.savefig(f"{FIG_DIR}/{filename}")
    plt.close(fig)

# ---- 3. Compare metric_name distribution ----
# Two separate Series bar plots are positioned by rank, not by label, so a metric
# present in only one split shifts every later bar. Aligning both counts on the
# union of metric names keeps each pair of bars under the right label.
metric_counts = (
    pd.DataFrame({
        "Train": train_df["metric_name"].value_counts(),
        "Test": test_df["metric_name"].value_counts(),
    })
    .fillna(0)          # metric absent from one split -> count 0
    .astype(int)
    .sort_index()
)
fig, ax = plt.subplots(figsize=(10,5))
metric_counts.plot(kind="bar", ax=ax, color=["blue", "orange"], alpha=0.5, width=0.8)
ax.set_title("Metric Distribution: Train vs Test")
ax.set_xlabel("Metric Name")
ax.set_ylabel("Count")
ax.legend()
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/train_vs_test_metric_distribution.png")
plt.close(fig)


  plt.tight_layout()


In [None]:
# ================================================================
# RESNET-MLP pipeline (full): MSE + KL(hist) + SWA + EMA + per-fold quantile mapping
# ================================================================
import os, time, math, random
import numpy as np, pandas as pd
from tqdm import tqdm

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from torch.optim.swa_utils import AveragedModel

# ---------------- CONFIG ----------------
# Seed Python, NumPy and PyTorch from one value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Train on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

# Cross-validation / optimisation schedule
NFOLDS       = 5
EPOCHS       = 24
BATCH        = 256
LR           = 2.5e-4
WEIGHT_DECAY = 1e-5

# Soft-histogram KL term: bin count, Gaussian kernel width, loss weight
NUM_BINS  = 200
SIGMA_BIN = 0.15
LAMBDA_KL = 0.08   # try slightly larger than before

# Weight averaging: fraction of epochs before SWA starts, per-step EMA decay
SWA_START_FRAC = 0.7
EMA_DECAY      = 0.999

# ResNet-MLP hyperparams
INPUT_DIM  = None   # filled after loading X
HIDDEN_DIM = 1024   # base hidden dimension (can tune: 512, 768, 1024)
NUM_BLOCKS = 8      # depth: 6-12 typical
MLP_EXPAND = 4      # inner expansion factor inside block
DROPOUT    = 0.12

# ---------------- Load features ----------------
# Precomputed training features and targets.
X = np.load("X_all.npy").astype(np.float32)
y = np.load("y_all.npy").astype(np.float32)
if len(X) != len(y):
    raise ValueError(f"X_all.npy has {len(X)} rows but y_all.npy has {len(y)} targets")

# Test-side embeddings; the feature matrix is rebuilt from them below.
test_metric = np.load("test_metric_embs.npy")
test_text   = np.load("test_text_embs.npy")
# Every feature below combines the two arrays element-wise; a shape mismatch
# would otherwise either fail obscurely or silently broadcast.
if test_metric.shape != test_text.shape:
    raise ValueError(
        f"test_metric_embs {test_metric.shape} and test_text_embs {test_text.shape} differ in shape"
    )

# Row-wise cosine similarity (epsilon guards against zero-norm rows).
dot = np.sum(test_metric * test_text, axis=1)
norms = (np.linalg.norm(test_metric, axis=1) * np.linalg.norm(test_text, axis=1)) + 1e-9
cos_test = (dot / norms).reshape(-1,1).astype(np.float32)
# Element-wise interaction features.
absdiff_test = np.abs(test_metric - test_text).astype(np.float32)
prod_test    = (test_metric * test_text).astype(np.float32)
concat_test  = np.hstack([test_metric.astype(np.float32), test_text.astype(np.float32)])
# Column layout: [metric | text | |metric - text| | metric * text | cosine]
# NOTE(review): X_all.npy is assumed to use this same layout -- confirm against
# the code that produced it; only the width is checked here.
X_test = np.hstack([concat_test, absdiff_test, prod_test, cos_test]).astype(np.float32)

print("X", X.shape, "y", y.shape, "X_test", X_test.shape)
# The model is built for X's width and later applied to X_test, so catch a
# mismatch now instead of after the first fold has finished training.
if X_test.shape[1] != X.shape[1]:
    raise ValueError(
        f"X_test has {X_test.shape[1]} features but X has {X.shape[1]}"
    )
INPUT_DIM = X.shape[1]

# ---------------- target pdf ----------------
def build_target_pdf(num_bins=NUM_BINS):
    """Build a discretised target score distribution on [0, 10].

    The density is a sum of three Gaussian bumps evaluated at the bin centres:
    weight 1 at 0.9 (sd 0.25), weight 1 at 8.6 (sd 0.6) and weight 0.08 at
    4.0 (sd 1.3). It is floored at 1e-12 and normalised to sum to 1.

    Returns:
        (centers, pdf): two float32 arrays of length `num_bins`.
    """
    edges = np.linspace(0,10,num_bins+1)
    centers = 0.5*(edges[:-1] + edges[1:])

    def bump(mean, sd):
        # unnormalised Gaussian evaluated at every bin centre
        return np.exp(-0.5*((centers-mean)/sd)**2)

    mixture = bump(0.9, 0.25) + 1.0 * bump(8.6, 0.6) + 0.08 * bump(4.0, 1.3)
    mixture = np.maximum(mixture, 1e-12)   # keep every bin strictly positive
    mixture = mixture / mixture.sum()
    return centers.astype(np.float32), mixture.astype(np.float32)

# NumPy copies of the bin centres / target pdf, plus float32 tensor copies on
# the training device for use inside the loss.
BIN_CENTERS, TARGET_PDF = build_target_pdf(NUM_BINS)
BIN_CENTERS_T, TARGET_PDF_T = (
    torch.tensor(values, dtype=torch.float32, device=DEVICE)
    for values in (BIN_CENTERS, TARGET_PDF)
)

# ---------------- soft histogram & KL ----------------
def soft_histogram_torch(preds, bin_centers_t, sigma=SIGMA_BIN):
    """Differentiable histogram of `preds` over `bin_centers_t`.

    Each prediction spreads a Gaussian weight of width `sigma` over all bins;
    the per-bin totals are normalised to sum to 1.

    Args:
        preds: tensor of shape (B,).
        bin_centers_t: tensor of shape (M,).
        sigma: kernel width, in the same units as the predictions.

    Returns:
        Tensor of shape (M,) holding the normalised soft counts.
    """
    offsets = preds[:, None] - bin_centers_t[None]      # (B, M)
    kernel = torch.exp(-0.5 * (offsets / sigma)**2)     # (B, M)
    mass = kernel.sum(dim=0)                            # (M,)
    return mass / (mass.sum() + 1e-12)

def kl_hist_loss_torch(preds, target_pdf_t=TARGET_PDF_T, bin_centers_t=BIN_CENTERS_T, sigma=SIGMA_BIN):
    """KL penalty between the target pdf and the soft histogram of `preds`.

    `F.kl_div` receives the log of the (epsilon-shifted) soft histogram as its
    input and `target_pdf_t` as its target.

    NOTE(review): both tensors are 1-D over bins, so reduction='batchmean'
    divides the summed divergence by the number of bins, not by a batch size.
    LAMBDA_KL is applied to this scaled value; switching the reduction would
    change the effective loss weight.
    """
    log_hist = (soft_histogram_torch(preds, bin_centers_t, sigma) + 1e-12).log()
    return F.kl_div(log_hist, target_pdf_t, reduction='batchmean')

# ---------------- Dataset ----------------
class EmbDataset(Dataset):
    """Indexable wrapper around a feature array and an optional target array.

    With targets, items are `(features, target)` pairs; without them (e.g. the
    test set), items are the features alone.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        return features if self.y is None else (features, self.y[idx])

# ---------------- Residual MLP Block ----------------
class ResMLPBlock(nn.Module):
    """Pre-norm residual MLP block: x + Drop(FC2(Drop(GELU(FC1(LayerNorm(x)))))).

    Args:
        d_model: width of the block's input and output.
        mlp_expansion: the inner layer has `d_model * mlp_expansion` units.
        drop: dropout probability, applied after the activation and after FC2.
    """

    def __init__(self, d_model, mlp_expansion=MLP_EXPAND, drop=DROPOUT):
        super().__init__()
        inner_dim = d_model * mlp_expansion
        # Attribute names are the state-dict keys of saved checkpoints.
        self.norm1 = nn.LayerNorm(d_model)
        self.fc1 = nn.Linear(d_model, inner_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(inner_dim, d_model)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        # Normalise first, then expand -> activate -> project back.
        branch = self.drop(self.act(self.fc1(self.norm1(x))))
        branch = self.drop(self.fc2(branch))
        # Skip connection around the whole branch.
        return x + branch

# ---------------- ResNet-MLP model ----------------
class ResNetMLP(nn.Module):
    """Regression network: input projection, a stack of ResMLPBlocks, scalar head.

    Args:
        in_dim: number of input features.
        d_model: hidden width used throughout the residual stack.
        num_blocks: number of ResMLPBlock layers.
        mlp_expansion: expansion factor passed to every block.
        drop: dropout probability for the projection and every block.
    """

    def __init__(self, in_dim, d_model=HIDDEN_DIM, num_blocks=NUM_BLOCKS, mlp_expansion=MLP_EXPAND, drop=DROPOUT):
        super().__init__()
        # Lift the raw features to the model width.
        self.input_proj = nn.Sequential(
            nn.Linear(in_dim, d_model),
            nn.LayerNorm(d_model),
            nn.GELU(),
            nn.Dropout(drop),
        )
        # Residual stack, all blocks sharing the same configuration.
        self.blocks = nn.ModuleList(
            ResMLPBlock(d_model, mlp_expansion, drop) for _ in range(num_blocks)
        )
        # Final normalisation followed by a single-output linear layer.
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 1),
        )

    def forward(self, x):
        hidden = self.input_proj(x)
        for block in self.blocks:
            hidden = block(hidden)
        # Drop the trailing singleton dimension so the output is one value per row.
        return self.head(hidden).squeeze(-1)

# ---------------- Training loop with SWA + EMA ----------------
# K-fold cross-validation. For every fold a fresh ResNetMLP is trained with
# MSE + LAMBDA_KL * histogram-KL, while two averaged copies of the weights are
# maintained: an EMA updated after every optimizer step and an SWA average
# updated once per epoch late in training. After training, one model is chosen
# (SWA if it received any update, else the best-validation checkpoint) and used
# for the fold's out-of-fold and test predictions.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
# oof[i]: prediction for training row i from the fold in which it was held out.
oof = np.zeros(len(X), dtype=np.float32)
# test_preds_folds[k]: test-set predictions of fold k's evaluation model.
test_preds_folds = np.zeros((NFOLDS, X_test.shape[0]), dtype=np.float32)

fold = 0
for tr_idx, val_idx in kf.split(X):
    print(f"\n========== Fold {fold} ==========")
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # Only the training loader is shuffled; val/test keep row order so the
    # concatenated predictions line up with val_idx / the test rows.
    train_dl = DataLoader(EmbDataset(X_tr, y_tr), batch_size=BATCH, shuffle=True, pin_memory=True)
    val_dl   = DataLoader(EmbDataset(X_val, y_val), batch_size=BATCH, shuffle=False, pin_memory=True)
    test_dl  = DataLoader(EmbDataset(X_test), batch_size=BATCH, shuffle=False, pin_memory=True)

    model = ResNetMLP(INPUT_DIM, d_model=HIDDEN_DIM, num_blocks=NUM_BLOCKS, mlp_expansion=MLP_EXPAND, drop=DROPOUT).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    # Cosine schedule over the full epoch budget; stepped once per epoch below.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)

    # SWA averaging starts on the first epoch strictly after swa_start.
    swa_start = int(EPOCHS * SWA_START_FRAC)
    swa_model = AveragedModel(model)
    swa_count = 0

    # EMA shadow params (store on CPU); initialised from the untrained weights.
    # Only named parameters are tracked, not buffers.
    ema_shadow = {name: param.detach().cpu().clone() for name, param in model.named_parameters()}
    # NOTE(review): ema_n is assigned here but never read in this block.
    ema_n = 0

    mse_loss = nn.MSELoss()
    best_rmse = 1e9
    best_state = None
    patience = 0

    for ep in range(1, EPOCHS+1):
        model.train()
        # Sample-weighted running sums for the epoch's average losses.
        total_loss = total_mse = total_kl = ns = 0
        t0 = time.time()
        for xb, yb in train_dl:
            xb = xb.to(DEVICE); yb = yb.to(DEVICE)
            opt.zero_grad()
            preds = model(xb)
            loss_mse = mse_loss(preds, yb)
            # Distribution-matching term computed on this batch's predictions.
            loss_kl  = kl_hist_loss_torch(preds)
            loss = loss_mse + LAMBDA_KL * loss_kl
            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

            # EMA update (on CPU): runs after every optimizer step and copies
            # each parameter to the CPU to blend it into its shadow.
            with torch.no_grad():
                for name, param in model.named_parameters():
                    ema_shadow[name] = EMA_DECAY * ema_shadow[name] + (1 - EMA_DECAY) * param.detach().cpu()

            n_b = xb.size(0)
            total_loss += float(loss.item()) * n_b
            total_mse += float(loss_mse.item()) * n_b
            total_kl  += float(loss_kl.item()) * n_b
            ns += n_b

        scheduler.step()
        # One SWA update per epoch once past the start epoch.
        if ep > swa_start:
            swa_model.update_parameters(model)
            swa_count += 1

        # validation -- scored with the live model, not the SWA or EMA weights
        model.eval()
        val_preds_list = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(DEVICE)
                p = model(xb).detach().cpu().numpy()
                val_preds_list.append(p)
        val_preds = np.concatenate(val_preds_list, axis=0)
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

        avg_loss = total_loss / (ns + 1e-12)
        print(f"Ep {ep}/{EPOCHS} | loss={avg_loss:.5f} mse={total_mse/(ns+1e-12):.5f} kl={total_kl/(ns+1e-12):.5f} | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            # Snapshot the best live-model weights on the CPU.
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            # Stop after 4 consecutive non-improving epochs, but never before epoch 9.
            if patience >= 4 and ep > 8:
                print("Early stopping triggered")
                break

    print(f"Fold {fold} finished. best_val_rmse={best_rmse:.4f} swa_count={swa_count}")

    # choose final model: SWA if available
    # SWA is taken whenever it received at least one update, without comparing
    # it against best_state; best_state is only the fallback (e.g. when early
    # stopping fired before SWA started).
    if swa_count > 0:
        eval_model = ResNetMLP(INPUT_DIM, d_model=HIDDEN_DIM, num_blocks=NUM_BLOCKS, mlp_expansion=MLP_EXPAND, drop=DROPOUT).to(DEVICE)
        # Read the averaged weights from the wrapped module so they can be
        # loaded into a plain ResNetMLP.
        swa_state = swa_model.module.state_dict() if hasattr(swa_model, "module") else swa_model.state_dict()
        eval_model.load_state_dict(swa_state)
    else:
        eval_model = ResNetMLP(INPUT_DIM, d_model=HIDDEN_DIM, num_blocks=NUM_BLOCKS, mlp_expansion=MLP_EXPAND, drop=DROPOUT).to(DEVICE)
        eval_model.load_state_dict(best_state)

    # build EMA model and save both
    # Start from a freshly initialised model and overwrite every state-dict
    # entry that has an EMA shadow. The EMA model is only written to disk; the
    # predictions below come from eval_model.
    ema_model = ResNetMLP(INPUT_DIM, d_model=HIDDEN_DIM, num_blocks=NUM_BLOCKS, mlp_expansion=MLP_EXPAND, drop=DROPOUT).to(DEVICE)
    ema_state = ema_model.state_dict()
    for n in ema_state.keys():
        if n in ema_shadow:
            ema_state[n] = ema_shadow[n].to(DEVICE)
    ema_model.load_state_dict(ema_state)

    torch.save(eval_model.state_dict(), f"resnetmlp_fold{fold}.pt")
    torch.save(ema_model.state_dict(), f"resnetmlp_ema_fold{fold}.pt")
    print(f"Saved resnetmlp_fold{fold}.pt and resnetmlp_ema_fold{fold}.pt")

    # OOF preds (eval_model), written back at the held-out row positions
    eval_model.eval()
    val_preds_list = []
    with torch.no_grad():
        for xb, yb in val_dl:
            p = eval_model(xb.to(DEVICE)).cpu().numpy()
            val_preds_list.append(p)
    oof[val_idx] = np.concatenate(val_preds_list)

    # test preds from the same evaluation model, one row per fold
    test_fold_list = []
    with torch.no_grad():
        for xb in test_dl:
            p = eval_model(xb.to(DEVICE)).cpu().numpy()
            test_fold_list.append(p)
    test_preds_folds[fold] = np.concatenate(test_fold_list)

    fold += 1

# ---------------- OOF eval + linear calibration ----------------
# Score the raw out-of-fold predictions against the true targets.
oof_rmse_raw = np.sqrt(mean_squared_error(y, oof))
print("\nOOF RMSE (raw):", oof_rmse_raw)

# Fit y ~ a * oof + b. The calibrated RMSE is measured on the same rows the
# line was fitted on.
oof_column = oof.reshape(-1,1)
lr_cal = LinearRegression()
lr_cal.fit(oof_column, y)
oof_cal = lr_cal.predict(oof_column)
oof_rmse_cal = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_rmse_cal)
cal_slope, cal_intercept = lr_cal.coef_[0], lr_cal.intercept_
print("Calibration params: a=", cal_slope, "b=", cal_intercept)

# ---------------- PER-FOLD quantile mapping ----------------
# Rebuild the splitter with the training loop's settings and keep only the
# validation indices of each fold.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
fold_val_idxs = [held_out for _train_part, held_out in kf.split(X)]
# Large sample of bin centres drawn according to the target pdf; its empirical
# quantiles are the values predictions get mapped onto.
sampled_vals = np.random.choice(BIN_CENTERS, size=200_000, p=TARGET_PDF)

def quantile_match_array(preds_raw, sampled_vals):
    """Replace each prediction by the quantile of `sampled_vals` at its rank.

    The smallest prediction maps to quantile 0 and the largest to quantile 1
    (up to the 1e-12 guard in the denominator); the rest are spaced evenly by
    rank, so only the ordering of `preds_raw` affects the result.

    Returns:
        Array with one mapped value per element of `preds_raw`, in input order.
    """
    n_preds = len(preds_raw)
    # argsort of a permutation is its inverse, i.e. the rank of every element
    ranks = np.argsort(np.argsort(preds_raw))
    quantile_levels = ranks.astype(np.float32) / (n_preds - 1 + 1e-12)
    return np.quantile(sampled_vals, quantile_levels)

# Map every fold's test predictions onto the target distribution by rank, then
# average the folds, apply the linear calibration and clip to the score range.
mapped_test_folds = np.zeros_like(test_preds_folds)
for f in range(NFOLDS):
    print("Quantile mapping fold", f)
    # The mapping uses only the rank order of this fold's test predictions.
    # The fold's validation predictions were previously loaded and NaN-cleaned
    # here but never used; that dead computation has been removed.
    mapped_test_folds[f] = quantile_match_array(test_preds_folds[f], sampled_vals)

test_mean_mapped = mapped_test_folds.mean(axis=0)
# NOTE(review): lr_cal was fitted on the raw OOF predictions, but is applied
# here to quantile-mapped values, which live on a different scale. Confirm this
# is intended; otherwise fit the calibration on mapped OOF predictions or skip it.
test_calibrated = lr_cal.predict(test_mean_mapped.reshape(-1,1)).reshape(-1)
test_final = np.clip(test_calibrated, 0.0, 10.0)

# ---------------- Save submission ----------------
# Fall back to reading the raw test file when the notebook's earlier cells
# (which define test_df) have not been run in this kernel.
if "test_df" not in globals():
    import json
    with open("test_data.json","r",encoding="utf8") as test_file:
        test_raw = json.load(test_file)
    test_df = pd.DataFrame(test_raw)

# IDs are 1-based row positions; the column is also stored on test_df.
n_test_rows = len(test_final)
test_df["ID"] = np.arange(1, n_test_rows+1)
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_final})
sub.to_csv("submission_resnetmlp_histKL_swa_ema_perfold.csv", index=False)
print("\nSaved submission_resnetmlp_histKL_swa_ema_perfold.csv")

print("\nFINAL OOF RMSE (raw):", oof_rmse_raw, "OOF RMSE (cal):", oof_rmse_cal)


In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ===========================
# Load submission file
# ===========================
sub = pd.read_csv("submission_final02.csv")

scores = sub["score"].values
print("Loaded predictions:", scores.shape)
# Every plot below needs at least one prediction; fail with a clear message
# instead of producing NaN-filled figures.
if scores.size == 0:
    raise ValueError("submission_final02.csv contains no predictions")

# Create folder
# NOTE: this rebinds FIG_DIR, which the dataset-visualization cells also use;
# re-running those cells afterwards writes their figures into this folder.
FIG_DIR = "figures_submission"
os.makedirs(FIG_DIR, exist_ok=True)
print("Saving figures to:", FIG_DIR)


def _save_and_close(filename):
    """Tighten the layout of the current figure, save it under FIG_DIR and close it."""
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/{filename}")
    plt.close()


# ===========================
# 1. Histogram
# ===========================
plt.figure(figsize=(8,5))
sns.histplot(scores, bins=40, kde=True, color="purple")
plt.title("Final Prediction Score Distribution")
plt.xlabel("Predicted Score")
plt.ylabel("Count")
_save_and_close("histogram_scores.png")

# ===========================
# 2. Boxplot
# ===========================
plt.figure(figsize=(6,4))
sns.boxplot(x=scores, color="orange")
plt.title("Prediction Score Boxplot")
_save_and_close("boxplot_scores.png")

# ===========================
# 3. Violin plot
# ===========================
plt.figure(figsize=(6,4))
sns.violinplot(x=scores, color="green")
plt.title("Prediction Score Violin Plot")
_save_and_close("violin_scores.png")

# ===========================
# 4. Sorted predictions scatter
# ===========================
sorted_scores = np.sort(scores)

plt.figure(figsize=(8,5))
plt.scatter(np.arange(len(scores)), sorted_scores, s=8, alpha=0.7)
plt.title("Sorted Predictions Curve")
plt.xlabel("Sample Index")
plt.ylabel("Predicted Score")
_save_and_close("sorted_scores.png")

# ===========================
# 5. CDF (cumulative distribution)
# ===========================
# Rank / (n - 1) runs from 0 to 1; max(..., 1) avoids 0/0 for a single prediction.
cdf = np.arange(len(scores)) / max(len(scores) - 1, 1)

plt.figure(figsize=(8,5))
plt.plot(sorted_scores, cdf, linewidth=2)
plt.title("Cumulative Distribution Function of Predictions")
plt.xlabel("Score")
plt.ylabel("CDF")
plt.grid(alpha=0.4)
_save_and_close("cdf_scores.png")

# ===========================
# 6. 2D heatmap (score frequency in ranges)
# ===========================
# 20 equal-width bins over [0, 10]; scores outside that range are not counted.
bins = np.linspace(0, 10, 21)
hist, edges = np.histogram(scores, bins=bins)

plt.figure(figsize=(10,4))
sns.heatmap(hist.reshape(1,-1), annot=True, fmt="d", cmap="magma")
plt.yticks([], [])
plt.xticks(np.arange(len(edges)-1)+0.5, [f"{edges[i]:.1f}-{edges[i+1]:.1f}" for i in range(len(edges)-1)], rotation=90)
plt.title("Heatmap of Score Frequencies")
_save_and_close("heatmap_score_bins.png")

# ===========================
# 7. Outlier detection (z-score)
# ===========================
# When every prediction is identical the standard deviation is 0; report
# z = 0 for all rows instead of dividing by zero.
score_std = scores.std()
if score_std > 0:
    z = (scores - scores.mean()) / score_std
else:
    z = np.zeros(len(scores), dtype=float)

plt.figure(figsize=(8,5))
plt.scatter(np.arange(len(z)), z, s=10)
# Conventional +/-3 sigma reference lines.
plt.axhline(+3, color='red', linestyle='--')
plt.axhline(-3, color='red', linestyle='--')
plt.title("Outlier Detection via Z-score")
plt.xlabel("Sample Index")
plt.ylabel("Z-score")
_save_and_close("outlier_detection.png")

print("All visualizations saved!")


Loaded predictions: (3638,)
Saving figures to: figures_submission
All visualizations saved!
