In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
############################################################
# 0. IMPORTS
############################################################
import numpy as np
import pandas as pd
import os, json, random
from tqdm.auto import tqdm
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedKFold

############################################################
# 1. LOAD DATA
############################################################
train_df = pd.read_json("/kaggle/input/da5401-2025-data-challenge/train_data.json", lines=False)
test_df  = pd.read_json("/kaggle/input/da5401-2025-data-challenge/test_data.json", lines=False)

with open("/kaggle/input/da5401-2025-data-challenge/metric_names.json") as f:
    metric_list = json.load(f)

metric_emb = np.load("/kaggle/input/da5401-2025-data-challenge/metric_name_embeddings.npy")

train_df["score"] = train_df["score"].astype(float)
train_df.fillna("", inplace=True)
test_df.fillna("", inplace=True)

print("Step 1 done")
############################################################
# 2. BUILD FULL TEXT
############################################################
def build_full_text(df):
    return (
        df["system_prompt"].fillna("") + " " +
        df["user_prompt"].fillna("") + " " +
        df["response"].fillna("")
    )

train_df["full_text"] = build_full_text(train_df)
test_df["full_text"]  = build_full_text(test_df)

print("Step 2 done")
############################################################
# 3. SBERT (LABSE) EMBEDDINGS + CACHING
############################################################
train_cache = "/kaggle/working/labse_train.npy"
test_cache  = "/kaggle/working/labse_test.npy"

if os.path.exists(train_cache) and os.path.exists(test_cache):
    print("Loading cached embeddings...")
    X_train_sbert = np.load(train_cache)
    X_test_sbert  = np.load(test_cache)
else:
    print("Computing LaBSE embeddings...")
    model = SentenceTransformer("sentence-transformers/LaBSE")

    X_train_sbert = model.encode(
        train_df["full_text"].tolist(),
        batch_size=32,
        show_progress_bar=True
    )
    X_test_sbert = model.encode(
        test_df["full_text"].tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    np.save(train_cache, X_train_sbert)
    np.save(test_cache, X_test_sbert)

print("Train shape:", X_train_sbert.shape)
print("Test shape :", X_test_sbert.shape)

############################################################
# 4. NEGATIVE SAMPLING (WRONG METRIC → SCORE 0)
############################################################
def negative_sampling(df, metric_list, k=1):
    high = df[df["score"] >= 8]
    new_rows = []

    for _, row in tqdm(high.iterrows(), total=len(high)):
        current_metric = row["metric_name"]
        candidates = [m for m in metric_list if m != current_metric]

        chosen = random.sample(candidates, k)

        for cm in chosen:
            new_row = row.copy()
            new_row["metric_name"] = cm
            new_row["score"] = 0.0
            new_rows.append(new_row)

    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

train_aug = negative_sampling(train_df, metric_list, k=1)
train_aug = train_aug.sample(frac=1).reset_index(drop=True)
print("Step 4 done")
############################################################
# 5. ALIGN EMBEDDINGS WITH AUGMENTED DATA
############################################################
orig_len = len(train_df)

row_map = []
for idx, row in train_aug.iterrows():
    if row["score"] == 0:
        row_map.append(random.randint(0, orig_len - 1))
    else:
        row_map.append(idx)

X_train_final = X_train_sbert[row_map]
y = train_aug["score"].astype(int).values
print("Step 5 done")
############################################################
# 6. TOPIC DIVERGENCE
############################################################
def compute_topic_div(train_df, test_df):
    all_text = pd.concat([
        train_df["user_prompt"], train_df["response"],
        test_df["user_prompt"], test_df["response"]
    ]).fillna("")

    tfidf = TfidfVectorizer(max_features=4000, min_df=3, stop_words="english")
    X = tfidf.fit_transform(all_text)

    nmf = NMF(n_components=20, init="nndsvda", random_state=42)
    topics = nmf.fit_transform(X)

    n_tr = len(train_df)
    n_ts = len(test_df)

    P_tr = topics[:n_tr]
    R_tr = topics[n_tr:n_tr*2]

    P_ts = topics[n_tr*2:n_tr*2+n_ts]
    R_ts = topics[n_tr*2+n_tr*2-n_ts : n_tr*2+n_ts]

    div_tr = 1 - np.diag(cosine_similarity(P_tr, R_tr))
    div_ts = 1 - np.diag(cosine_similarity(P_ts, R_ts))

    return div_tr, div_ts

topic_div_train, topic_div_test = compute_topic_div(train_aug, test_df)

############################################################
# 7. MODEL A — MULTICLASS CLASSIFIER (EMBEDDINGS)
############################################################
params_cls = {
    "objective": "multiclass",
    "num_class": 11,
    "metric": "multi_logloss",
    "learning_rate": 0.04,
    "num_leaves": 48,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "seed": 42,
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_cls = np.zeros((len(train_aug), 11))
test_cls = np.zeros((len(test_df), 11))

for tr, va in skf.split(X_train_final, y):
    dtr = lgb.Dataset(X_train_final[tr], label=y[tr])
    dval = lgb.Dataset(X_train_final[va], label=y[va])

    model = lgb.train(params_cls, dtr, 600,
                      valid_sets=[dval],
                      verbose_eval=False,
                      early_stopping_rounds=50)

    oof_cls[va] = model.predict(X_train_final[va])
    test_cls += model.predict(X_test_sbert) / 5

pred_cls = (oof_cls * np.arange(11)).sum(axis=1)
pred_cls_test = (test_cls * np.arange(11)).sum(axis=1)
print("STEP 7 DONE")
############################################################
# 8. MODEL B — TOPIC DIVERGENCE REGRESSOR
############################################################
params_div = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.04,
    "num_leaves": 20,
    "seed": 42,
}

oof_div = np.zeros(len(train_aug))
test_div = np.zeros(len(test_df))

for tr, va in skf.split(topic_div_train.reshape(-1,1), y):
    dtr = lgb.Dataset(topic_div_train[tr].reshape(-1,1), label=y[tr])
    dval = lgb.Dataset(topic_div_train[va].reshape(-1,1), label=y[va])

    model = lgb.train(params_div, dtr, 300,
                      valid_sets=[dval],
                      verbose_eval=False,
                      early_stopping_rounds=50)

    oof_div[va] = model.predict(topic_div_train[va].reshape(-1,1))
    test_div += model.predict(topic_div_test.reshape(-1,1)) / 5

oof_div_scaled  = 10 * (1 - oof_div / oof_div.max())
test_div_scaled = 10 * (1 - test_div / test_div.max())
print("Syep 8 done")
############################################################
# 9. ENSEMBLE
############################################################
final_oof  = 0.65 * pred_cls + 0.35 * oof_div_scaled
final_test = 0.65 * pred_cls_test + 0.35 * test_div_scaled
final_test = np.clip(final_test, 0, 10)

############################################################
# 10. SUBMISSION
############################################################
submission = pd.DataFrame({
    "ID": np.arange(1, len(test_df)+1),
    "score": final_test
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


Step 1 done
Step 2 done
Computing LaBSE embeddings...


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Train shape: (5000, 768)
Test shape : (3638, 768)


  0%|          | 0/4825 [00:00<?, ?it/s]

Step 4 done


IndexError: index 5003 is out of bounds for axis 0 with size 5000

In [6]:
# ============================================
# CELL 1 — Imports, Setup, Seed
# ============================================

import os
import json
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

import torch
from sentence_transformers import SentenceTransformer

# Ensure reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Cell 1 complete — imports loaded.")


Cell 1 complete — imports loaded.


In [7]:
# ============================================
# CELL 2 — Load Data
# ============================================

TRAIN_PATH = "/kaggle/input/da5401-2025-data-challenge/train_data.json"
TEST_PATH  = "/kaggle/input/da5401-2025-data-challenge/test_data.json"
METRIC_EMB_PATH = "/kaggle/input/da5401-2025-data-challenge/metric_name_embeddings.npy"
METRIC_NAMES_PATH = "/kaggle/input/da5401-2025-data-challenge/metric_names.json"

print("Loading JSON files...")

train_df = pd.read_json(TRAIN_PATH, lines=False)
test_df  = pd.read_json(TEST_PATH,  lines=False)

with open(METRIC_NAMES_PATH, "r") as f:
    metric_list = json.load(f)

metric_embeddings = np.load(METRIC_EMB_PATH)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("Number of metric names:", len(metric_list))

# Add original index for safe back-referencing
train_df["orig_index"] = np.arange(len(train_df))

print("Cell 2 complete — data loaded.")


Loading JSON files...
Train shape: (5000, 5)
Test shape : (3638, 4)
Number of metric names: 145
Cell 2 complete — data loaded.


In [8]:
# ============================================
# CELL 3 — Build FULL TEXT for Embedding
# ============================================

def build_full_text(df):
    return (
        df["system_prompt"].fillna("") + " " +
        df["user_prompt"].fillna("") + " " +
        df["response"].fillna("")
    )

train_df["full_text"] = build_full_text(train_df)
test_df["full_text"]  = build_full_text(test_df)

print("Example full_text:")
print(train_df["full_text"].iloc[0][:200], "...")

print("Cell 3 complete — full_text constructed.")


Example full_text:
You are a medical assistance chatbot, and you must reject any unsafe, illegal, or unethical requests, while clearly explaining the reason for the rejection. எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என்  ...
Cell 3 complete — full_text constructed.


In [9]:
# ============================================
# CELL 4 — LaBSE Embeddings (with caching)
# ============================================

CACHE_TRAIN = "/kaggle/working/labse_train_embeddings.npy"
CACHE_TEST  = "/kaggle/working/labse_test_embeddings.npy"

model = SentenceTransformer("sentence-transformers/LaBSE")

if os.path.exists(CACHE_TRAIN) and os.path.exists(CACHE_TEST):
    print("Loading cached embeddings...")
    X_train_labse = np.load(CACHE_TRAIN)
    X_test_labse  = np.load(CACHE_TEST)

else:
    print("Computing LaBSE embeddings...")
    X_train_labse = model.encode(
        train_df["full_text"].tolist(),
        batch_size=32,
        show_progress_bar=True
    )
    np.save(CACHE_TRAIN, X_train_labse)

    X_test_labse = model.encode(
        test_df["full_text"].tolist(),
        batch_size=32,
        show_progress_bar=True
    )
    np.save(CACHE_TEST, X_test_labse)

print("LaBSE shapes:", X_train_labse.shape, X_test_labse.shape)
print("Cell 4 complete — embeddings ready.")


Computing LaBSE embeddings...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

LaBSE shapes: (5000, 768) (3638, 768)
Cell 4 complete — embeddings ready.


In [17]:
# ============================================
# CELL 3 — Load LaBSE (fixed name)
# ============================================

from sentence_transformers import SentenceTransformer

labse_model = SentenceTransformer("sentence-transformers/LaBSE")

print("LaBSE model loaded as labse_model")


LaBSE model loaded as labse_model


In [18]:
# ============================================
# CELL 4 — Compute Topic Divergence Function
# ============================================

def compute_divergence(df, col1="user_prompt", col2="response"):
    """
    Compute topic divergence = 1 - cosine_similarity
    using LaBSE embeddings for (user_prompt, response).
    """
    print(f"Encoding {col1}...")
    emb1 = labse_model.encode(
        df[col1].fillna("").tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    print(f"Encoding {col2}...")
    emb2 = labse_model.encode(
        df[col2].fillna("").tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    cos = cosine_similarity(emb1, emb2).diagonal()
    divergence = 1 - cos

    return divergence


In [19]:
# ============================================
# CELL 5 — Generate Train/Test Divergence
# ============================================

print("Computing divergence for train...")
div_train = compute_divergence(train_df)

print("Computing divergence for test...")
div_test = compute_divergence(test_df)

X_div_train = div_train.reshape(-1, 1)
y_div_train = train_df["score"].astype(float).values

X_div_test = div_test.reshape(-1, 1)

print("Train divergence shape:", X_div_train.shape)
print("Test divergence shape :", X_div_test.shape)


Computing divergence for train...
Encoding user_prompt...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Encoding response...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Computing divergence for test...
Encoding user_prompt...


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Encoding response...


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Train divergence shape: (5000, 1)
Test divergence shape : (3638, 1)


In [20]:
# ============================================
# CELL 6 — Train LightGBM Divergence Regressor
# ============================================

params_div = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 16,
    "min_data_in_leaf": 25,
    "verbosity": -1,
    "seed": 42,
}

gkf = GroupKFold(n_splits=5)
groups = train_df["metric_name"].values

oof_div = np.zeros_like(y_div_train)
test_div_pred = np.zeros(len(test_df))

print("Training divergence regressor...")

for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_div_train, y_div_train, groups)):
    print(f"\nFold {fold}")

    train_set = lgb.Dataset(X_div_train[tr_idx], y_div_train[tr_idx])
    val_set   = lgb.Dataset(X_div_train[val_idx], y_div_train[val_idx])

    model_div = lgb.train(
        params_div,
        train_set,
        num_boost_round=500,
        valid_sets=[val_set],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(50)
        ]
    )

    oof_div[val_idx] = model_div.predict(X_div_train[val_idx])
    test_div_pred += model_div.predict(X_div_test) / gkf.n_splits

rmse_div = mean_squared_error(y_div_train, oof_div, squared=False)
print("\nDivergence-only RMSE:", rmse_div)


Training divergence regressor...

Fold 0
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.914709
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.909861

Fold 1
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.959689
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.955632

Fold 2
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.983316
Early stopping, best iteration is:
[5]	valid_0's rmse: 0.980331

Fold 3
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.846234
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.843561

Fold 4
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 1.02515
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.01526

Divergence-only RMSE: 0.9427769128674659


In [21]:
# ============================================
# CELL 7 — Submission
# ============================================

submission = pd.DataFrame({
    "ID": np.arange(1, len(test_df) + 1),
    "score": np.clip(test_div_pred, 0, 10)
})

save_path = "/kaggle/working/submission_divergence_only.csv"
submission.to_csv(save_path, index=False)

print("Saved:", save_path)
print(submission.head())


Saved: /kaggle/working/submission_divergence_only.csv
   ID     score
0   1  9.122885
1   2  9.122717
2   3  9.119098
3   4  9.100678
4   5  9.122392


In [23]:
submission.describe()

Unnamed: 0,ID,score
count,3638.0,3638.0
mean,1819.5,9.119384
std,1050.344467,0.006934
min,1.0,9.089155
25%,910.25,9.11806
50%,1819.5,9.120066
75%,2728.75,9.122717
max,3638.0,9.133104


In [24]:
print("Topic Divergence — Test Set")
print("----------------------------------")
print("Shape:", div_test.shape)

print("\nFirst 20 values:")
print(div_test[:20])

print("\nSummary statistics:")
print(pd.Series(div_test).describe())

print("\nHistogram (rounded):")
print(pd.Series(np.round(div_test, 3)).value_counts().sort_index())

div_test.describe()


Topic Divergence — Test Set
----------------------------------
Shape: (3638,)

First 20 values:
[0.30729687 0.31880486 0.31376982 0.3927331  0.4595837  0.707551
 0.30869532 0.60051966 0.5009735  0.51938814 0.54840076 0.4415053
 0.3096547  0.5254268  0.49331784 0.54024184 0.3991598  0.40014017
 0.37033117 0.26876163]

Summary statistics:
count    3.638000e+03
mean     4.216523e-01
std      1.327039e-01
min     -2.384186e-07
25%      3.342464e-01
50%      4.025587e-01
75%      4.913008e-01
max      1.107813e+00
dtype: float64

Histogram (rounded):
-0.000    3
 0.060    1
 0.082    1
 0.090    1
 0.104    1
         ..
 1.033    1
 1.036    1
 1.041    1
 1.071    1
 1.108    1
Name: count, Length: 616, dtype: int64


AttributeError: 'numpy.ndarray' object has no attribute 'describe'

In [25]:
import numpy as np
import pandas as pd

# -------------------------------------------------------
# 1. YOU ALREADY HAVE div_test FROM PREVIOUS CELLS
#    ≤ just print first few to confirm
# -------------------------------------------------------

print("Example divergence values:", div_test[:20])
print("Total test rows:", len(div_test))

# -------------------------------------------------------
# 2. Partition divergence into 11 brackets (0–10 classes)
# -------------------------------------------------------

num_classes = 11
percentiles = np.linspace(0, 100, num_classes + 1)  # 12 cut points
cuts = np.percentile(div_test, percentiles)

print("\nClass bin borders (percentiles):")
for i in range(len(cuts) - 1):
    print(f"Class {i}: {cuts[i]:.4f} → {cuts[i+1]:.4f}")

# -------------------------------------------------------
# 3. Assign each test sample a class label based on divergence
# -------------------------------------------------------

class_labels = np.digitize(div_test, cuts[1:-1], right=True)   # returns 0..10

print("\nRaw class distribution:")
print(pd.Series(class_labels).value_counts().sort_index())

# -------------------------------------------------------
# 4. OPTIONAL: Smooth distribution so each class has similar freq
# -------------------------------------------------------
# Why? divergence tends to cluster — so we re-quantize evenly

def enforce_even_bins(values, n_bins=11):
    """
    Sorts values and assigns evenly-distributed class labels.
    Ensures each class has ~len(values)/11 samples.
    """
    sorted_idx = np.argsort(values)
    labels = np.zeros(len(values), dtype=int)

    per_bin = len(values) // n_bins

    for cls in range(n_bins):
        start = cls * per_bin
        end = (cls + 1) * per_bin if cls < n_bins - 1 else len(values)
        labels[sorted_idx[start:end]] = cls
    
    return labels

balanced_labels = enforce_even_bins(div_test, num_classes)

print("\nBalanced class distribution:")
print(pd.Series(balanced_labels).value_counts().sort_index())

# -------------------------------------------------------
# 5. Convert class label to score (0–10)
# -------------------------------------------------------

final_scores = balanced_labels.astype(float)

# -------------------------------------------------------
# 6. Save submission.csv
# -------------------------------------------------------

submission = pd.DataFrame({
    "ID": np.arange(1, len(final_scores) + 1),
    "score": final_scores
})

submission.to_csv("submission.csv", index=False)

print("\nCreated submission.csv with balanced divergence-based scores!")
print(submission.head())


Example divergence values: [0.30729687 0.31880486 0.31376982 0.3927331  0.4595837  0.707551
 0.30869532 0.60051966 0.5009735  0.51938814 0.54840076 0.4415053
 0.3096547  0.5254268  0.49331784 0.54024184 0.3991598  0.40014017
 0.37033117 0.26876163]
Total test rows: 3638

Class bin borders (percentiles):
Class 0: -0.0000 → 0.2686
Class 1: 0.2686 → 0.3094
Class 2: 0.3094 → 0.3422
Class 3: 0.3422 → 0.3657
Class 4: 0.3657 → 0.3915
Class 5: 0.3915 → 0.4153
Class 6: 0.4153 → 0.4448
Class 7: 0.4448 → 0.4819
Class 8: 0.4819 → 0.5281
Class 9: 0.5281 → 0.6067
Class 10: 0.6067 → 1.1078

Raw class distribution:
0     331
1     331
2     330
3     331
4     331
5     330
6     331
7     331
8     330
9     331
10    331
Name: count, dtype: int64

Balanced class distribution:
0     330
1     330
2     330
3     330
4     330
5     330
6     330
7     330
8     330
9     330
10    338
Name: count, dtype: int64

Created submission.csv with balanced divergence-based scores!
   ID  score
0   1    1.0
1 

In [26]:
submission.describe()

Unnamed: 0,ID,score
count,3638.0,3638.0
mean,1819.5,5.010995
std,1050.344467,3.167905
min,1.0,0.0
25%,910.25,2.0
50%,1819.5,5.0
75%,2728.75,8.0
max,3638.0,10.0


In [10]:
# ============================================
# CELL 5 — Negative Sampling
# ============================================

def apply_random_metric_shuffling(df, metric_list, num_negatives=1):
    print(f"Generating {num_negatives} negative samples per row...")
    new_rows = []
    
    high_df = df[df["score"] >= 8.0]  # only high quality for negative sampling
    
    for _, row in tqdm(high_df.iterrows(), total=len(high_df)):
        current_metric = row["metric_name"]
        valid_metrics = [m for m in metric_list if m != current_metric]

        if not valid_metrics:
            continue

        selected = random.sample(valid_metrics, num_negatives)

        for m in selected:
            nr = row.copy()
            nr["metric_name"] = m

            # KEY TRICK: wrong metric → score 0
            nr["score"] = 0.0

            # keep original index for embedding
            nr["orig_index"] = row["orig_index"]

            new_rows.append(nr)

    df_aug = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    df_aug = df_aug.sample(frac=1, random_state=SEED).reset_index(drop=True)

    print("Negative samples generated:", len(new_rows))
    return df_aug


train_aug = apply_random_metric_shuffling(train_df, metric_list, num_negatives=1)

print("Augmented shape:", train_aug.shape)
print(train_aug["score"].value_counts().sort_index())

print("Cell 5 complete — augmented dataset created.")


Generating 1 negative samples per row...


  0%|          | 0/4825 [00:00<?, ?it/s]

Negative samples generated: 4825
Augmented shape: (9825, 7)
score
0.0     4838
1.0        6
2.0        5
3.0        7
4.0        3
5.0        1
6.0       45
7.0       95
8.0      259
9.0     3123
9.5        1
10.0    1442
Name: count, dtype: int64
Cell 5 complete — augmented dataset created.


In [11]:
# ============================================
# CELL 6 — Re-align LaBSE Embeddings using orig_index
# ============================================

# Extract safe mapping
row_map = train_aug["orig_index"].values

print("Max orig_index:", row_map.max())
print("Embedding matrix rows:", X_train_labse.shape[0])

# SAFE RE-INDEXING
X_train_final = X_train_labse[row_map]
y_final = train_aug["score"].astype(int).values
metric_names_final = train_aug["metric_name"].values

print("Final embedding shape:", X_train_final.shape)
print("Final labels shape:", y_final.shape)

print("Cell 6 complete — safe embedding re-alignment successful.")


Max orig_index: 4999
Embedding matrix rows: 5000
Final embedding shape: (9825, 768)
Final labels shape: (9825,)
Cell 6 complete — safe embedding re-alignment successful.


In [12]:
# ============================================
# CELL 7 — Topic Divergence Feature
# ============================================

def compute_topic_divergence(df, text_col1="user_prompt", text_col2="response"):
    print("Computing divergence embeddings...")
    
    t1 = model.encode(
        df[text_col1].fillna("").tolist(),
        batch_size=32,
        show_progress_bar=True
    )
    t2 = model.encode(
        df[text_col2].fillna("").tolist(),
        batch_size=32,
        show_progress_bar=True
    )

    # Cosine similarity → divergence = 1 - cos
    cos = cosine_similarity(t1, t2).diagonal()
    div = 1 - cos

    return div

# Divergence for augmented train and test
topic_div_train = compute_topic_divergence(train_aug)
topic_div_test  = compute_topic_divergence(test_df)

print("Divergence examples:", topic_div_train[:10])
print("Cell 7 complete — topic divergence ready.")


Computing divergence embeddings...


Batches:   0%|          | 0/308 [00:00<?, ?it/s]

Batches:   0%|          | 0/308 [00:00<?, ?it/s]

Computing divergence embeddings...


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Divergence examples: [0.5457946  0.44151962 0.5220934  0.30910146 0.5443378  0.42945468
 0.15131378 0.47859347 0.21403229 0.62286836]
Cell 7 complete — topic divergence ready.


In [13]:
# ============================================
# CELL 8 — Prepare Final Model Inputs
# ============================================

# Map metric_name → embedding index
metric_name_to_idx = {name: i for i, name in enumerate(metric_list)}

train_metric_idx = np.array([
    metric_name_to_idx[m] for m in train_aug["metric_name"]
])
test_metric_idx = np.array([
    metric_name_to_idx[m] for m in test_df["metric_name"]
])

metric_emb_train = metric_embeddings[train_metric_idx]
metric_emb_test  = metric_embeddings[test_metric_idx]

# Final feature concatenation: [LaBSE | metric_emb | divergence]
X_train_combo = np.hstack([X_train_final, metric_emb_train, topic_div_train.reshape(-1,1)])
X_test_combo  = np.hstack([X_test_labse, metric_emb_test, topic_div_test.reshape(-1,1)])

print("X_train_combo shape:", X_train_combo.shape)
print("X_test_combo  shape:", X_test_combo.shape)

print("Cell 8 complete — final model inputs prepared.")


X_train_combo shape: (9825, 1537)
X_test_combo  shape: (3638, 1537)
Cell 8 complete — final model inputs prepared.


In [16]:
# =====================================================
# TOPIC DIVERGENCE ONLY MODEL
# =====================================================

print("Preparing Topic Divergence–only model...")

# 1) Compute divergence (user_prompt vs response)
def compute_div(df):
    e1 = model.encode(df["user_prompt"].fillna("").tolist(),
                      batch_size=32, show_progress_bar=True)
    e2 = model.encode(df["response"].fillna("").tolist(),
                      batch_size=32, show_progress_bar=True)
    sim = cosine_similarity(e1, e2).diagonal()
    return 1 - sim   # divergence

print("Computing divergence for train/test...")
div_train = compute_div(train_df)
div_test  = compute_div(test_df)

# 2) Prepare train matrix
X_div_train = div_train.reshape(-1, 1)
y_div_train = train_df["score"].astype(float).values

X_div_test = div_test.reshape(-1, 1)

print("Train divergence shape:", X_div_train.shape)
print("Test divergence shape:", X_div_test.shape)

# 3) Train simple LGBM regression
params_div = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 16,
    "min_data_in_leaf": 20,
    "feature_fraction": 1.0,
    "verbosity": -1,
    "seed": 42
}

print("Training divergence regressor...")

gkf = GroupKFold(n_splits=5)
groups = train_df["metric_name"].values

oof_div = np.zeros_like(y_div_train)
test_div_pred = np.zeros((len(X_div_test),))

for fold, (tr, val) in enumerate(gkf.split(X_div_train, y_div_train, groups)):
    print(f"\nFold {fold}")
    
    train_set = lgb.Dataset(X_div_train[tr], y_div_train[tr])
    val_set   = lgb.Dataset(X_div_train[val], y_div_train[val])

    m = lgb.train(
        params_div,
        train_set,
        num_boost_round=500,
        valid_sets=[val_set],
        callbacks=[lgb.early_stopping(50),
                   lgb.log_evaluation(50)]
    )

    oof_div[val] = m.predict(X_div_train[val])
    test_div_pred += m.predict(X_div_test) / gkf.n_splits

# Clip to valid range, round to 1 decimal
oof_div = np.clip(oof_div, 0, 10)
test_div_pred = np.clip(test_div_pred, 0, 10)

rmse_div = mean_squared_error(y_div_train, oof_div, squared=False)
print("\nTopic Divergence–only RMSE:", rmse_div)

# 4) Create submission file
sub_div = pd.DataFrame({
    "ID": np.arange(1, len(test_div_pred) + 1),
    "score": test_div_pred
})

sub_div_path = "/kaggle/working/submission_divergence_only.csv"
sub_div.to_csv(sub_div_path, index=False)

print("\nSubmission saved to:", sub_div_path)
print(sub_div.head())


Preparing Topic Divergence–only model...
Computing divergence for train/test...


AttributeError: 'Booster' object has no attribute 'encode'

In [15]:
# ============================================
# CELL 9 — LightGBM Multiclass Classification (FIXED)
# ============================================

classes = np.arange(0, 11)
num_classes = len(classes)

# Class counts (after augmentation)
class_counts = np.bincount(y_final, minlength=num_classes)

# Inverse frequency weights
weights = {i: 1 / (class_counts[i] + 1e-6) for i in range(num_classes)}

print("Class counts:", class_counts)
print("Class weights:", weights)

params = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": num_classes,
    "learning_rate": 0.04,
    "num_leaves": 64,
    "feature_fraction": 0.75,
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "min_data_in_leaf": 25,
    "lambda_l2": 0.1,
    "seed": SEED,
    "verbosity": -1
}

gkf = GroupKFold(n_splits=5)

oof_preds = np.zeros((len(y_final), num_classes))
test_preds = np.zeros((X_test_combo.shape[0], num_classes))
groups = metric_names_final

fold_rmse = []

for fold, (tr, val) in enumerate(gkf.split(X_train_combo, y_final, groups)):
    print(f"\n=== Fold {fold} ===")

    X_tr, X_val = X_train_combo[tr], X_train_combo[val]
    y_tr, y_val = y_final[tr], y_final[val]

    # ---------------------------
    # FIX: Apply weights per sample
    # ---------------------------
    sample_weights = np.array([weights[y] for y in y_tr])

    train_set = lgb.Dataset(X_tr, y_tr, weight=sample_weights)
    val_set   = lgb.Dataset(X_val, y_val)

    model = lgb.train(
        params,
        train_set,
        num_boost_round=600,
        valid_sets=[val_set],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(50)
        ]
    )

    # Store predictions
    oof_preds[val] = model.predict(X_val)
    test_preds += model.predict(X_test_combo) / gkf.n_splits

    # Decode expected value for RMSE
    val_pred_scores = (oof_preds[val] * classes).sum(axis=1)
    rmse = mean_squared_error(y_val, val_pred_scores, squared=False)
    fold_rmse.append(rmse)

    print(f"Fold {fold} RMSE: {rmse:.4f}")

print("\nAverage CV RMSE:", np.mean(fold_rmse))
print("Cell 9 complete — LGBM classifier trained with sample weights.")


Class counts: [4838    6    5    7    3    1   45   95  259 3124 1442]
Class weights: {0: 0.00020669698218133588, 1: 0.16666663888889352, 2: 0.199999960000008, 3: 0.1428571224489825, 4: 0.33333322222225925, 5: 0.9999990000010001, 6: 0.022222221728395074, 7: 0.01052631567867036, 8: 0.00386100384609651, 9: 0.00032010243267602354, 10: 0.0006934812755246316}

=== Fold 0 ===
Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 1.63774
[100]	valid_0's multi_logloss: 1.43446
[150]	valid_0's multi_logloss: 1.33806
[200]	valid_0's multi_logloss: 1.28454
[250]	valid_0's multi_logloss: 1.25578
[300]	valid_0's multi_logloss: 1.23914
[350]	valid_0's multi_logloss: 1.22996
[400]	valid_0's multi_logloss: 1.22447
[450]	valid_0's multi_logloss: 1.22108
[500]	valid_0's multi_logloss: 1.22064
Early stopping, best iteration is:
[463]	valid_0's multi_logloss: 1.22055
Fold 0 RMSE: 4.6972

=== Fold 1 ===
Training until validation scores don't improve for 50 rounds
[50]	v

KeyboardInterrupt: 

In [None]:
# ============================================
# CELL 10 — Topic Divergence Regression Model
# ============================================

params_reg = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.04,
    "num_leaves": 31,
    "min_data_in_leaf": 20,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbosity": -1,
    "seed": SEED
}

oof_reg = np.zeros(len(y_final))
test_reg = np.zeros(topic_div_test.shape[0])

for fold, (tr, val) in enumerate(gkf.split(topic_div_train, y_final, groups)):
    print(f"\n[Reg Fold {fold}]")

    X_tr, X_val = topic_div_train[tr].reshape(-1,1), topic_div_train[val].reshape(-1,1)
    y_tr, y_val = y_final[tr], y_final[val]

    dtrain = lgb.Dataset(X_tr, y_tr)
    dval   = lgb.Dataset(X_val, y_val)

    mreg = lgb.train(
        params_reg,
        dtrain,
        num_boost_round=500,
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)]
    )

    oof_reg[val] = mreg.predict(X_val)
    test_reg += mreg.predict(topic_div_test.reshape(-1,1)) / gkf.n_splits

rmse_reg = mean_squared_error(y_final, oof_reg, squared=False)
print("\nRegressor OOF RMSE:", rmse_reg)
print("Cell 10 complete — divergence regressor trained.")


In [None]:
# ============================================
# CELL 11 — Ensemble final predictions
# ============================================

# Classifier expected-value prediction
test_pred_scores_cls = (test_preds * classes).sum(axis=1)

# Ensemble weights
W_CLS = 0.85
W_REG = 0.15

test_pred_final = (
    W_CLS * test_pred_scores_cls
    + W_REG * test_reg
)

# Clip to valid range
test_pred_final = np.clip(test_pred_final, 0, 10)

print("Ensemble preview:", test_pred_final[:20])
print("Cell 11 complete — ensemble done.")


In [None]:
# ============================================
# CELL 12 — OOF Evaluation
# ============================================

# OOF classifier expected values
oof_cls_scores = (oof_preds * classes).sum(axis=1)

# Final OOF ensemble
oof_final = W_CLS * oof_cls_scores + W_REG * oof_reg
oof_final = np.clip(oof_final, 0, 10)

oof_rmse = mean_squared_error(y_final, oof_final, squared=False)

print(f"Final OOF RMSE (ensemble): {oof_rmse:.4f}")
print("Cell 12 complete — OOF computed.")


In [None]:
# ============================================
# CELL 13 — Create submission.csv
# ============================================

sub = pd.DataFrame({
    "ID": np.arange(1, len(test_pred_final) + 1),
    "score": test_pred_final
})

# Round to 1 decimal to match expected format
sub["score"] = sub["score"].round(1)

SUB_PATH = "/kaggle/working/submission.csv"
sub.to_csv(SUB_PATH, index=False)

print("Submission file written to:", SUB_PATH)
print(sub.head())
print("Cell 13 complete — submission generated.")


In [None]:
# ============================================
# CELL 14 — Final Summary
# ============================================

print("==== SUMMARY ====")
print("Train rows (augmented):", train_aug.shape[0])
print("Original embeddings:", X_train_labse.shape)
print("Final training features:", X_train_combo.shape)
print("Test rows:", X_test_combo.shape[0])

print("\nModel components:")
print("- LaBSE Encoder")
print("- Metric Embedding")
print("- Topic Divergence Feature")
print("- LGBM Multiclass Classifier")
print("- LGBM Divergence Regressor")
print("- Weighted Ensemble")

print("\nFinal OOF RMSE:", oof_rmse)
print("Submission located at:", SUB_PATH)

print("\nNotebook complete.")


In [None]:
# ============================================================
# IMPORTS
# ============================================================
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from sklearn.model_selection import train_test_split


# ============================================================
# 1. LOAD DATA (JSON)
# ============================================================
train_path = "/kaggle/input/da5401-2025-data-challenge/train_data.json"
test_path  = "/kaggle/input/da5401-2025-data-challenge/test_data.json"

train_df = pd.read_json(train_path, lines=True)
test_df  = pd.read_json(test_path,  lines=True)

# Expected columns:
# system_prompt, user_prompt, response, metric_name, score


# ============================================================
# 2. LOAD METRIC NAME EMBEDDINGS (given)
# ============================================================
metric_emb = np.load("/kaggle/input/da5401-2025-data-challenge/metric_name_embeddings.npy")
# shape (145, 300)

with open("/kaggle/input/da5401-2025-data-challenge/metric_names.json") as f:
    metric_list = json.load(f)

metric_to_id = {m: i for i, m in enumerate(metric_list)}

train_metric_idx = train_df["metric_name"].map(metric_to_id).values
test_metric_idx  = test_df["metric_name"].map(metric_to_id).values

train_metric_vec = metric_emb[train_metric_idx]   # (n_train, 300)
test_metric_vec  = metric_emb[test_metric_idx]    # (n_test, 300)


# ============================================================
# 3. LOAD LaBSE (for system / user / response)
# ============================================================
labse = SentenceTransformer("sentence-transformers/LaBSE")

def embed(texts):
    return labse.encode(texts, batch_size=64, show_progress_bar=True)


# Encode train
train_df["system_prompt"]  = train_df["system_prompt"].fillna("")
train_df["user_prompt"]    = train_df["user_prompt"].fillna("")
train_df["response"]       = train_df["response"].fillna("")
test_df["system_prompt"]   = test_df["system_prompt"].fillna("")
test_df["user_prompt"]     = test_df["user_prompt"].fillna("")
test_df["response"]        = test_df["response"].fillna("")

# Now embed safely
train_sys_vec  = embed(train_df["system_prompt"].tolist())
train_usr_vec  = embed(train_df["user_prompt"].tolist())
train_resp_vec = embed(train_df["response"].tolist())

test_sys_vec  = embed(test_df["system_prompt"].tolist())
test_usr_vec  = embed(test_df["user_prompt"].tolist())
test_resp_vec = embed(test_df["response"].tolist())


# Encode test
#test_sys_vec  = embed(test_df["system_prompt"].tolist())
#test_usr_vec  = embed(test_df["user_prompt"].tolist())
#test_resp_vec = embed(test_df["response"].tolist())


# ============================================================
# 4. SYNTHETIC NEGATIVES VIA METRIC-SHUFFLING
# ============================================================
def generate_negative(df, metric_vec):
    neg_df = df.copy()
    shuffled = np.random.permutation(len(df))
    neg_metric_vec = metric_vec[shuffled]
    neg_df["score"] = np.random.randint(0, 4, size=len(df))  # 0–3 low scores
    return neg_df, neg_metric_vec

neg_df, neg_metric_vec = generate_negative(train_df, train_metric_vec)

# combine real + synthetic negatives
train_all_df = pd.concat([train_df, neg_df], ignore_index=True)

train_metric_full = np.vstack([train_metric_vec, neg_metric_vec])
train_sys_full    = np.vstack([train_sys_vec, train_sys_vec])
train_usr_full    = np.vstack([train_usr_vec, train_usr_vec])
train_resp_full   = np.vstack([train_resp_vec, train_resp_vec])

y_train = np.concatenate([train_df["score"].values, neg_df["score"].values])


# ============================================================
# 5. COSINE SIMILARITY FEATURES
# ============================================================
def cos_sim(a, b):
    return np.sum(a*b, axis=1) / (np.linalg.norm(a,axis=1) * np.linalg.norm(b,axis=1))

train_sim_resp = cos_sim(train_resp_full, train_metric_full).reshape(-1,1)
train_sim_usr  = cos_sim(train_usr_full,  train_metric_full).reshape(-1,1)
train_sim_sys  = cos_sim(train_sys_full,  train_metric_full).reshape(-1,1)

test_sim_resp = cos_sim(test_resp_vec, test_metric_vec).reshape(-1,1)
test_sim_usr  = cos_sim(test_usr_vec,  test_metric_vec).reshape(-1,1)
test_sim_sys  = cos_sim(test_sys_vec,  test_metric_vec).reshape(-1,1)


# ============================================================
# 6. BUILD FINAL FEATURE MATRICES
# ============================================================
# Dimensions:
# metric 300d + LaBSE system 768 + user 768 + resp 768 + 3 sims = 2607 dims

X_train = np.hstack([
    train_metric_full,
    train_sys_full,
    train_usr_full,
    train_resp_full,
    train_sim_resp,
    train_sim_usr,
    train_sim_sys
])

X_test = np.hstack([
    test_metric_vec,
    test_sys_vec,
    test_usr_vec,
    test_resp_vec,
    test_sim_resp,
    test_sim_usr,
    test_sim_sys
])


# ============================================================
# 7. LIGHTGBM TRAINING
# ============================================================
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 63,
    "max_depth": 8,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbose": -1,
}

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.1,
    random_state=42
)

train_lgb = lgb.Dataset(X_tr, label=y_tr)
val_lgb   = lgb.Dataset(X_val, label=y_val)

model = lgb.train(
    params,
    train_lgb,
    num_boost_round=2500,
    valid_sets=[train_lgb, val_lgb],
    valid_names=["train", "val"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(150)
    ]
)


# ============================================================
# 8. PREDICTION + SAVE
# ============================================================
preds = model.predict(X_test, num_iteration=model.best_iteration)
preds = preds.clip(0, 10)

out = pd.DataFrame({
    "ID": test_df["ID"],
    "score": preds
})

out.to_csv("submission_labse_metricembed.csv", index=False)
print("Saved submission_labse_metricembed.csv")
