In [23]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [24]:
import os
import json
from pathlib import Path

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)


Using device: cuda


In [25]:
DATA_DIR = Path(".")

train_path = DATA_DIR / "train_data.json"
test_path  = DATA_DIR / "test_data.json"

def load_json_any(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
        if not txt:
            return []
        if "\n" in txt and txt.lstrip().startswith("{") and not txt.lstrip().startswith("["):
            return [json.loads(line) for line in txt.splitlines() if line.strip()]
        return json.loads(txt)

train_raw = load_json_any(train_path)
test_raw  = load_json_any(test_path)

train_df = pd.DataFrame(train_raw)
test_df  = pd.DataFrame(test_raw)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)


Train shape: (5000, 5)
Test shape : (3638, 4)


In [26]:
for df in (train_df, test_df):
    if "system_prompt" not in df:
        df["system_prompt"] = ""
    else:
        df["system_prompt"] = df["system_prompt"].fillna("")

train_df["score"] = train_df["score"].astype(float)

def build_joint_text(df):
    sys = df["system_prompt"].astype(str)
    user = df["user_prompt"].astype(str)
    resp = df["response"].astype(str)
    metric = df["metric_name"].astype(str)

    return (
        "Task: Score how well the assistant response satisfies the metric '"
        + metric + "'. "
        + "System: " + sys + " [SEP] "
        + "User: " + user + " [SEP] "
        + "Assistant: " + resp
    )

train_df["text"] = build_joint_text(train_df)
test_df["text"]  = build_joint_text(test_df)

print(train_df["text"].iloc[0][:300], "...")


Task: Score how well the assistant response satisfies the metric 'rejection_rate'. System: You are a medical assistance chatbot, and you must reject any unsafe, illegal, or unethical requests, while clearly explaining the reason for the rejection. [SEP] User: எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. எ ...


In [27]:
train_texts, val_texts, train_scores, val_scores = train_test_split(
    train_df["text"].tolist(),
    train_df["score"].tolist(),
    test_size=0.1,
    random_state=42,
    shuffle=True
)

test_texts = test_df["text"].tolist()

print("Train samples:", len(train_texts))
print("Val samples  :", len(val_texts))
print("Test samples :", len(test_texts))


Train samples: 4500
Val samples  : 500
Test samples : 3638


In [28]:
class LlmScoreDataset(Dataset):
    def __init__(self, texts, scores=None):
        self.texts = list(texts)
        self.scores = None if scores is None else list(scores)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {"text": self.texts[idx]}
        if self.scores is not None:
            item["score"] = float(self.scores[idx])
        return item


In [30]:
def collate_fn(batch):
    texts = [x["text"] for x in batch]
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    if "score" in batch[0]:
        scores = torch.tensor([x["score"] for x in batch], dtype=torch.float32)
        enc["labels"] = scores

    return enc


In [31]:
MODEL_NAME = "xlm-roberta-base"  # multilingual

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1
)
model.config.problem_type = "regression"
model.to(DEVICE)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [32]:
batch_size = 8  # increase if your GPU can handle it

train_dataset = LlmScoreDataset(train_texts, train_scores)
val_dataset   = LlmScoreDataset(val_texts,   val_scores)
test_dataset  = LlmScoreDataset(test_texts,  None)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print("Train batches:", len(train_loader))
print("Val batches:",   len(val_loader))
print("Test batches:",  len(test_loader))


Train batches: 563
Val batches: 63
Test batches: 455


In [39]:
learning_rate = 0.1
weight_decay  = 0.01
num_epochs    = 10

optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)


In [None]:
for epoch in range(num_epochs):
    # ------------------ TRAIN ------------------
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)  # computes regression loss
        loss = outputs.loss

        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # ------------------ VALIDATION ------------------
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            labels_batch = batch.pop("labels")
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)

            preds.extend(outputs.logits.squeeze(-1).cpu().numpy())
            labels.extend(labels_batch.numpy())

    rmse = sqrt(mean_squared_error(labels, preds))

    print(f"Epoch {epoch+1}/{num_epochs}: TrainLoss={avg_train_loss:.4f} | ValRMSE={rmse:.4f}")


XGBoost version: 3.1.1
{'BUILTIN_PREFETCH_PRESENT': True, 'CUDA_VERSION': [12, 8], 'DEBUG': False, 'GCC_VERSION': [10, 3, 1], 'GLIBC_VERSION': [2, 28], 'MM_PREFETCH_PRESENT': True, 'NCCL_VERSION': [2, 27, 7], 'THRUST_VERSION': [2, 7, 0], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': True, 'USE_FEDERATED': True, 'USE_NCCL': True, 'USE_NVCOMP': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': '/mnt/e_disk/nk/Python-3.10.10/coloc/lib/python3.10/site-packages/xgboost/lib/libxgboost.so'}


In [35]:
model.eval()
test_preds = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        test_preds.extend(outputs.logits.squeeze(-1).cpu().numpy())

test_preds = np.array(test_preds)


In [36]:
test_preds = np.clip(test_preds, 0, 10)


In [37]:
submission = pd.DataFrame({
    "ID": np.arange(1, len(test_preds) + 1),
    "score": test_preds
})

submission.to_csv("submission_semantic_xlmr.csv", index=False)
submission.head()


Unnamed: 0,ID,score
0,1,9.735463
1,2,9.315243
2,3,9.314476
3,4,9.623786
4,5,9.737085


In [8]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

DATA_DIR = Path(".")
train_path = DATA_DIR / "train_data.json"
test_path  = DATA_DIR / "test_data.json"

def load_json_any(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
        if not txt:
            return []
        if "\n" in txt and txt.lstrip().startswith("{") and not txt.lstrip().startswith("["):
            return [json.loads(line) for line in txt.splitlines() if line.strip()]
        return json.loads(txt)

train_raw = load_json_any(train_path)
test_raw  = load_json_any(test_path)

train_df = pd.DataFrame(train_raw)
test_df  = pd.DataFrame(test_raw)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

for df in (train_df, test_df):
    if "system_prompt" not in df:
        df["system_prompt"] = ""
    else:
        df["system_prompt"] = df["system_prompt"].fillna("")

train_df["score"] = train_df["score"].astype(float)

print("\n=== Label Distribution Analysis ===")
print(train_df["score"].describe())
print("\nScore distribution:")
score_bins = pd.cut(train_df["score"], bins=[0, 3, 5, 7, 9, 10], labels=["0-3", "3-5", "5-7", "7-9", "9-10"])
print(score_bins.value_counts().sort_index())

# ============ IMPROVED TEXT CONSTRUCTION ============
def build_joint_text(df):
    sys = df["system_prompt"].astype(str).replace("", "None")
    user = df["user_prompt"].astype(str)
    resp = df["response"].astype(str)
    metric = df["metric_name"].astype(str)
    
    # Add explicit score range instruction
    return (
        "[INSTRUCTION] Rate from 0 (worst) to 10 (best) for " + metric + ". "
        + "[SYSTEM] " + sys + " "
        + "[USER] " + user + " "
        + "[RESPONSE] " + resp
    )

train_df["text"] = build_joint_text(train_df)

# ============ SUPER AGGRESSIVE OVERSAMPLING ============
def super_aggressive_oversample(df, target_col='score', min_samples_per_range=400):
    """
    Much more aggressive oversampling - target equal distribution
    """
    print("\n=== Super Aggressive Oversampling ===")
    
    # Define score ranges
    ranges = [
        (0, 3, "0-3"),
        (3, 5, "3-5"),
        (5, 7, "5-7"),
        (7, 9, "7-9"),
        (9, 10.1, "9-10")
    ]
    
    oversampled_dfs = []
    
    for low, high, label in ranges:
        range_df = df[(df[target_col] >= low) & (df[target_col] < high)].copy()
        current_count = len(range_df)
        
        print(f"Range {label}: {current_count} samples", end=" -> ")
        
        if current_count == 0:
            # Create synthetic from adjacent range
            if low < 7:
                # For low scores, sample from 7-9 and adjust
                template_df = df[(df[target_col] >= 7) & (df[target_col] < 9)].sample(min(20, len(df)))
                for _ in range(min_samples_per_range // 20):
                    synthetic = template_df.copy()
                    # Random score in target range
                    synthetic[target_col] = np.random.uniform(low, min(high, 10), len(synthetic))
                    oversampled_dfs.append(synthetic)
                current_count = min_samples_per_range
            else:
                oversampled_dfs.append(range_df)
                current_count = len(range_df)
        
        elif current_count < min_samples_per_range:
            # Oversample with variations
            n_repeats = min_samples_per_range // current_count
            remainder = min_samples_per_range % current_count
            
            for rep in range(n_repeats):
                aug_df = range_df.copy()
                # Add small noise to scores for diversity
                noise = np.random.normal(0, 0.2, len(aug_df))
                aug_df[target_col] = np.clip(aug_df[target_col] + noise, low, min(high, 10))
                oversampled_dfs.append(aug_df)
            
            if remainder > 0:
                aug_df = range_df.sample(remainder, replace=True).copy()
                noise = np.random.normal(0, 0.2, len(aug_df))
                aug_df[target_col] = np.clip(aug_df[target_col] + noise, low, min(high, 10))
                oversampled_dfs.append(aug_df)
            
            current_count = min_samples_per_range
        else:
            oversampled_dfs.append(range_df)
        
        print(f"{current_count} samples")
    
    result_df = pd.concat(oversampled_dfs, ignore_index=True)
    
    print(f"\nNew dataset size: {len(result_df)}")
    print("New score distribution:")
    print(pd.cut(result_df[target_col], bins=[0, 3, 5, 7, 9, 10], 
                  labels=["0-3", "3-5", "5-7", "7-9", "9-10"]).value_counts().sort_index())
    
    return result_df

# Apply super aggressive oversampling
train_df = super_aggressive_oversample(train_df, target_col='score', min_samples_per_range=500)

# Text augmentation
def augment_text(text, score):
    """Score-aware augmentation"""
    if score < 5:
        prefixes = ["[LOW QUALITY] ", "[POOR RESPONSE] ", "[NEEDS IMPROVEMENT] "]
    elif score < 7:
        prefixes = ["[MODERATE QUALITY] ", "[AVERAGE RESPONSE] ", ""]
    else:
        prefixes = ["[GOOD QUALITY] ", "[STRONG RESPONSE] ", ""]
    
    if np.random.random() < 0.3 and len(prefixes[0]) > 0:
        return np.random.choice(prefixes) + text
    return text

train_df["text"] = train_df.apply(lambda row: augment_text(row["text"], row["score"]), axis=1)
test_df["text"] = build_joint_text(test_df)

# ============ EXTREME SAMPLE WEIGHTS ============
def calculate_extreme_weights(scores, power=1.2):
    """Very strong weighting for minority classes"""
    scores_array = np.array(scores)
    
    # Create fine-grained bins
    hist, bin_edges = np.histogram(scores_array, bins=50, range=(0, 10))
    bin_indices = np.digitize(scores_array, bin_edges[:-1]) - 1
    bin_indices = np.clip(bin_indices, 0, 49)
    
    # Inverse frequency with high power
    bin_counts = hist + 1
    bin_weights = (1.0 / bin_counts) ** power
    
    # Normalize but keep high variance
    bin_weights = bin_weights / bin_weights.min()
    
    sample_weights = bin_weights[bin_indices]
    return sample_weights

sample_weights = calculate_extreme_weights(train_df["score"].values, power=1.5)
print(f"\nSample weights - Min: {sample_weights.min():.3f}, Max: {sample_weights.max():.3f}, Mean: {sample_weights.mean():.3f}")

# ============ STRATIFIED SPLIT ============
def create_stratified_bins(scores, max_bins=10):
    scores_array = np.array(scores)
    for n_bins in range(max_bins, 4, -1):
        try:
            bins = pd.cut(scores_array, bins=n_bins, labels=False, duplicates='drop')
            bin_counts = pd.Series(bins).value_counts()
            if bin_counts.min() >= 2:
                print(f"Using {n_bins} bins for stratification")
                return bins
        except:
            continue
    return pd.qcut(scores_array, q=5, labels=False, duplicates='drop')

train_df["score_bin"] = create_stratified_bins(train_df["score"].values)

train_texts, val_texts, train_scores, val_scores, train_weights, val_weights = train_test_split(
    train_df["text"].tolist(),
    train_df["score"].tolist(),
    sample_weights,
    test_size=0.12,
    random_state=42,
    stratify=train_df["score_bin"]
)

test_texts = test_df["text"].tolist()

print(f"\n=== Dataset Splits ===")
print(f"Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")
print(f"Val score range: {min(val_scores):.1f} - {max(val_scores):.1f}")

# ============ DATASET ============
class LlmScoreDataset(Dataset):
    def __init__(self, texts, scores=None, weights=None):
        self.texts = list(texts)
        self.scores = None if scores is None else list(scores)
        self.weights = None if weights is None else list(weights)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        item = {"text": self.texts[idx]}
        if self.scores is not None:
            item["score"] = float(self.scores[idx])
        if self.weights is not None:
            item["weight"] = float(self.weights[idx])
        return item

def collate_fn(batch):
    texts = [x["text"] for x in batch]
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    
    if "score" in batch[0]:
        scores = torch.tensor([x["score"] for x in batch], dtype=torch.float32)
        enc["labels"] = scores
    
    if "weight" in batch[0]:
        weights = torch.tensor([x["weight"] for x in batch], dtype=torch.float32)
        enc["weights"] = weights
    
    return enc

# ============ SIMPLIFIED MODEL - REGRESSION ONLY ============
MODEL_NAME = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1
)
model.config.problem_type = "regression"
model.to(DEVICE)

print(f"\n=== Model: {MODEL_NAME} (Pure Regression) ===")
print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

# ============ ADVANCED WEIGHTED LOSS ============
class AdaptiveWeightedMSELoss(nn.Module):
    """MSE with adaptive sample weighting + Huber-like robustness"""
    def __init__(self, delta=1.0):
        super().__init__()
        self.delta = delta
    
    def forward(self, predictions, targets, weights=None):
        errors = predictions - targets
        abs_errors = torch.abs(errors)
        
        # Huber-like: quadratic for small errors, linear for large
        loss = torch.where(
            abs_errors < self.delta,
            0.5 * errors ** 2,
            self.delta * (abs_errors - 0.5 * self.delta)
        )
        
        if weights is not None:
            loss = loss * weights
        
        return loss.mean()

criterion = AdaptiveWeightedMSELoss(delta=2.0)

# ============ TRAINING CONFIG ============
batch_size = 20
learning_rate = 3e-5  # Back to standard rate
weight_decay = 0.01
num_epochs = 8  # More epochs
gradient_clip = 1.0

train_dataset = LlmScoreDataset(train_texts, train_scores, train_weights)
val_dataset   = LlmScoreDataset(val_texts, val_scores, val_weights)
test_dataset  = LlmScoreDataset(test_texts, None, None)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"\nTrain batches: {len(train_loader)}, Val batches: {len(val_loader)}")

# ============ OPTIMIZER ============
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
    betas=(0.9, 0.999)
)

num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

# ============ TRAINING ============
best_val_rmse = float('inf')
best_model_state = None
patience = 5
patience_counter = 0

print("\n=== Starting Training ===")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []
    
    for batch_idx, batch in enumerate(train_loader):
        weights_batch = batch.pop("weights", None)
        labels_batch = batch.pop("labels")
        
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        labels_batch = labels_batch.to(DEVICE)
        
        if weights_batch is not None:
            weights_batch = weights_batch.to(DEVICE)
        
        outputs = model(**batch)
        predictions = outputs.logits.squeeze(-1)
        
        loss = criterion(predictions, labels_batch, weights_batch)
        
        loss.backward()
        clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
        train_preds.extend(predictions.detach().cpu().numpy())
        train_labels.extend(labels_batch.cpu().numpy())
        
        if (batch_idx + 1) % 100 == 0:
            print(f"  Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")
    
    train_rmse = sqrt(mean_squared_error(train_labels, train_preds))
    train_mae = mean_absolute_error(train_labels, train_preds)
    
    # VALIDATION
    model.eval()
    val_preds, val_labels = [], []
    
    with torch.no_grad():
        for batch in val_loader:
            weights_batch = batch.pop("weights", None)
            labels_batch = batch.pop("labels")
            
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            
            outputs = model(**batch)
            predictions = outputs.logits.squeeze(-1)
            
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(labels_batch.numpy())
    
    val_rmse = sqrt(mean_squared_error(val_labels, val_preds))
    val_mae = mean_absolute_error(val_labels, val_preds)
    val_preds_array = np.array(val_preds)
    
    print(f"\n{'='*70}")
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}")
    print(f"Val   - RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")
    print(f"Val Preds: Min={val_preds_array.min():.2f}, Max={val_preds_array.max():.2f}, "
          f"Mean={val_preds_array.mean():.2f}, Std={val_preds_array.std():.2f}")
    
    # Show distribution
    val_bins = pd.cut(val_preds_array, bins=[0, 3, 5, 7, 9, 10], labels=["0-3", "3-5", "5-7", "7-9", "9-10"])
    print(f"Val Pred Distribution: {dict(val_bins.value_counts().sort_index())}")
    
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_model_state = model.state_dict().copy()
        patience_counter = 0
        print(f"✓ Best model! (RMSE: {best_val_rmse:.4f})")
    else:
        patience_counter += 1
        print(f"✗ Patience: {patience_counter}/{patience}")
    
    if patience_counter >= patience:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

# ============ TEST PREDICTION ============
print("\n=== Loading Best Model ===")
model.load_state_dict(best_model_state)
model.eval()

test_preds = []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = outputs.logits.squeeze(-1)
        test_preds.extend(predictions.cpu().numpy())

test_preds = np.array(test_preds)

print("\n=== Test Predictions ===")
print(f"Range: {test_preds.min():.2f} - {test_preds.max():.2f}")
print(f"Mean: {test_preds.mean():.2f}, Median: {np.median(test_preds):.2f}, Std: {test_preds.std():.2f}")

print("\nDistribution:")
pred_bins = pd.cut(test_preds, bins=[0, 3, 5, 7, 9, 10], labels=["0-3", "3-5", "5-7", "7-9", "9-10"])
print(pred_bins.value_counts().sort_index())

test_preds = np.clip(test_preds, 0, 10)

submission = pd.DataFrame({
    "ID": np.arange(1, len(test_preds) + 1),
    "score": test_preds
})

submission.to_csv("submission_improved_final.csv", index=False)
print("\n✓ Submission saved to 'submission_improved_final.csv'")
print(submission.describe())
print("\nFirst 30 predictions:")
print(submission.head(30))

torch.save(best_model_state, "best_model_final.pt")
print("\n✓ Model saved!")


Using device: cuda
Train shape: (5000, 5)
Test shape : (3638, 4)

=== Label Distribution Analysis ===
count    5000.000000
mean        9.119500
std         0.942416
min         0.000000
25%         9.000000
50%         9.000000
75%        10.000000
max        10.000000
Name: score, dtype: float64

Score distribution:
score
0-3       18
3-5        4
5-7      140
7-9     3382
9-10    1443
Name: count, dtype: int64

=== Super Aggressive Oversampling ===
Range 0-3: 24 samples -> 500 samples
Range 3-5: 10 samples -> 500 samples
Range 5-7: 46 samples -> 500 samples
Range 7-9: 354 samples -> 500 samples
Range 9-10: 4566 samples -> 4566 samples

New dataset size: 6566
New score distribution:
score
0-3      563
3-5      333
5-7      564
7-9     3555
9-10    1443
Name: count, dtype: int64

Sample weights - Min: 1.000, Max: 61733.592, Mean: 211.417
Using 10 bins for stratification

=== Dataset Splits ===
Train: 5778, Val: 788, Test: 3638
Val score range: 0.0 - 10.0


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Model: xlm-roberta-base (Pure Regression) ===
Parameters: 278.0M

Train batches: 289, Val batches: 40

=== Starting Training ===
  Batch 100/289, Loss: 594.7780
  Batch 200/289, Loss: 120.9613

Epoch 1/8
Train - RMSE: 5.9813, MAE: 5.4615
Val   - RMSE: 3.8579, MAE: 3.6649
Val Preds: Min=4.78, Max=5.20, Mean=5.09, Std=0.04
Val Pred Distribution: {'0-3': np.int64(0), '3-5': np.int64(17), '5-7': np.int64(771), '7-9': np.int64(0), '9-10': np.int64(0)}
✓ Best model! (RMSE: 3.8579)
  Batch 100/289, Loss: 81.3282
  Batch 200/289, Loss: 441.1849

Epoch 2/8
Train - RMSE: 4.6657, MAE: 4.3677
Val   - RMSE: 4.1329, MAE: 3.9246
Val Preds: Min=4.66, Max=4.83, Mean=4.71, Std=0.03
Val Pred Distribution: {'0-3': np.int64(0), '3-5': np.int64(788), '5-7': np.int64(0), '7-9': np.int64(0), '9-10': np.int64(0)}
✗ Patience: 1/5
  Batch 100/289, Loss: 496.7442
  Batch 200/289, Loss: 59.8815

Epoch 3/8
Train - RMSE: 4.4964, MAE: 4.1400
Val   - RMSE: 3.2334, MAE: 2.7334
Val Preds: Min=2.87, Max=7.58, Mean=5

In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from math import sqrt

# GBDT imports
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

import warnings
warnings.filterwarnings('ignore')

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

DATA_DIR = Path(".")
train_path = DATA_DIR / "train_data.json"
test_path  = DATA_DIR / "test_data.json"

def load_json_any(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
        if not txt:
            return []
        if "\n" in txt and txt.lstrip().startswith("{") and not txt.lstrip().startswith("["):
            return [json.loads(line) for line in txt.splitlines() if line.strip()]
        return json.loads(txt)

train_raw = load_json_any(train_path)
test_raw  = load_json_any(test_path)

train_df = pd.DataFrame(train_raw)
test_df  = pd.DataFrame(test_raw)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

# ============ TEXT PREPROCESSING ============
def advanced_text_preprocessing(text):
    """Advanced text cleaning"""
    if pd.isna(text) or text is None:
        return ""
    text = str(text)
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('"', '"').replace('"', '"')
    text = re.sub(r'[^\w\s.,!?;:\-\(\)\[\]]', '', text)
    text = re.sub(r'\b\d{4,}\b', ' NUM ', text)
    text = re.sub(r'([.!?]){2,}', r'\1', text)
    return text.strip().lower()

# Apply preprocessing
for df in (train_df, test_df):
    if "system_prompt" not in df:
        df["system_prompt"] = ""
    else:
        df["system_prompt"] = df["system_prompt"].fillna("")
    
    df["response"] = df["response"].fillna("").astype(str)
    df["user_prompt"] = df["user_prompt"].fillna("").astype(str)
    df["system_prompt"] = df["system_prompt"].fillna("").astype(str)
    
    df["response_clean"] = df["response"].apply(advanced_text_preprocessing)
    df["user_prompt_clean"] = df["user_prompt"].apply(advanced_text_preprocessing)
    df["system_prompt_clean"] = df["system_prompt"].apply(advanced_text_preprocessing)

train_df["score"] = train_df["score"].astype(float)

print("\n=== Label Distribution ===")
print(train_df["score"].describe())

# ============ SUPER AGGRESSIVE BALANCED OVERSAMPLING ============
def create_balanced_dataset(df, target_col='score', samples_per_range=600):
    """Create perfectly balanced dataset"""
    print("\n=== Creating Balanced Dataset ===")
    
    ranges = [(0, 3), (3, 5), (5, 7), (7, 9), (9, 10.1)]
    balanced_dfs = []
    
    for low, high in ranges:
        range_df = df[(df[target_col] >= low) & (df[target_col] < high)].copy()
        n_current = len(range_df)
        
        print(f"Range {low}-{high}: {n_current} samples", end=" -> ")
        
        if n_current == 0:
            template_df = df[(df[target_col] >= 7) & (df[target_col] < 9)].sample(min(30, len(df)))
            for _ in range(samples_per_range // 30):
                synthetic = template_df.copy()
                synthetic[target_col] = np.random.uniform(low, min(high, 10), len(synthetic))
                balanced_dfs.append(synthetic)
            print(f"{samples_per_range} samples (synthetic)")
        else:
            n_repeats = samples_per_range // n_current
            for _ in range(n_repeats):
                aug_df = range_df.copy()
                noise = np.random.normal(0, 0.15, len(aug_df))
                aug_df[target_col] = np.clip(aug_df[target_col] + noise, low, min(high, 10))
                balanced_dfs.append(aug_df)
            
            remainder = samples_per_range % n_current
            if remainder > 0:
                balanced_dfs.append(range_df.sample(remainder, replace=True))
            print(f"{samples_per_range} samples")
    
    result = pd.concat(balanced_dfs, ignore_index=True)
    print(f"\nBalanced dataset size: {len(result)}")
    return result

train_df = create_balanced_dataset(train_df, target_col='score', samples_per_range=600)

# ============ BUILD TEXT FORMATS ============
def build_text_format1(df):
    return (
        "[RATE 0-10] " + df["metric_name"].astype(str) + ": "
        + df["response_clean"].astype(str) + " [USER: " + df["user_prompt_clean"].astype(str) + "]"
    )

def build_text_format2(df):
    return (
        "Evaluate " + df["metric_name"].astype(str) + ". "
        + "System: " + df["system_prompt_clean"].astype(str) + ". "
        + "User: " + df["user_prompt_clean"].astype(str) + ". "
        + "Response: " + df["response_clean"].astype(str)
    )

train_df["text1"] = build_text_format1(train_df)
train_df["text2"] = build_text_format2(train_df)

test_df["text1"] = build_text_format1(test_df)
test_df["text2"] = build_text_format2(test_df)

# ============ EXTRACT FEATURES FOR GBDT MODELS ============
print("\n=== Extracting Features for GBDT Models ===")

def extract_statistical_features(df):
    """Extract statistical features - FIXED to ensure numeric types"""
    features = pd.DataFrame()
    
    # Length features
    features['resp_len'] = df['response'].str.len().astype(float)
    features['resp_words'] = df['response'].str.split().str.len().astype(float)
    features['user_len'] = df['user_prompt'].str.len().astype(float)
    features['user_words'] = df['user_prompt'].str.split().str.len().astype(float)
    
    # Ratio features
    features['len_ratio'] = (features['resp_len'] / (features['user_len'] + 1)).astype(float)
    features['word_ratio'] = (features['resp_words'] / (features['user_words'] + 1)).astype(float)
    
    # Punctuation features
    features['punct_count'] = df['response'].str.count(r'[.,!?;:]').astype(float)
    features['punct_density'] = (features['punct_count'] / (features['resp_len'] + 1)).astype(float)
    
    # Uppercase features
    features['upper_count'] = df['response'].str.count(r'[A-Z]').astype(float)
    features['upper_ratio'] = (features['upper_count'] / (features['resp_len'] + 1)).astype(float)
    
    # Average word length
    features['avg_word_len'] = (features['resp_len'] / (features['resp_words'] + 1)).astype(float)
    
    # Sentence count
    features['sentences'] = df['response'].str.count(r'[.!?]+').astype(float)
    
    # Question marks
    features['question_marks'] = df['response'].str.count(r'\?').astype(float)
    
    # Metric encoding (one-hot) - FIXED: ensure numeric
    metric_dummies = pd.get_dummies(df['metric_name'], prefix='metric', dtype=float)
    features = pd.concat([features, metric_dummies], axis=1)
    
    # Fill NaN with 0
    features = features.fillna(0.0)
    
    # CRITICAL: Ensure all columns are numeric
    for col in features.columns:
        features[col] = pd.to_numeric(features[col], errors='coerce').fillna(0.0)
    
    return features

# Extract statistical features
train_stat_features = extract_statistical_features(train_df)
test_stat_features = extract_statistical_features(test_df)

# Ensure test has same columns as train
for col in train_stat_features.columns:
    if col not in test_stat_features.columns:
        test_stat_features[col] = 0.0

test_stat_features = test_stat_features[train_stat_features.columns]

print(f"Statistical features: {train_stat_features.shape[1]} features")
print(f"Feature dtypes: {train_stat_features.dtypes.unique()}")

# ============ TF-IDF FEATURES FOR GBDT ============
print("\n=== Extracting TF-IDF Features ===")

tfidf = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

train_tfidf = tfidf.fit_transform(train_df['text1'])
test_tfidf = tfidf.transform(test_df['text1'])

print(f"TF-IDF features: {train_tfidf.shape[1]} features")

# Combine all features for GBDT - FIXED
from scipy.sparse import hstack, csr_matrix

train_gbdt_features = hstack([
    csr_matrix(train_stat_features.values.astype(np.float32)),
    train_tfidf
])

test_gbdt_features = hstack([
    csr_matrix(test_stat_features.values.astype(np.float32)),
    test_tfidf
])

print(f"Total GBDT features: {train_gbdt_features.shape[1]} features")

# ============ SPLIT DATA ============
def create_bins(scores):
    return pd.qcut(scores, q=5, labels=False, duplicates='drop')

train_df["bin"] = create_bins(train_df["score"])

train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.12,
    random_state=42,
    stratify=train_df["bin"]
)

train_data = train_df.iloc[train_idx].reset_index(drop=True)
val_data = train_df.iloc[val_idx].reset_index(drop=True)

# Split GBDT features
X_train_gbdt = train_gbdt_features[train_idx]
X_val_gbdt = train_gbdt_features[val_idx]
y_train = train_data["score"].values
y_val = val_data["score"].values

print(f"\n=== Splits ===")
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_df)}")

# ============ TRANSFORMER MODEL TRAINER ============
class TransformerRegressor:
    """Wrapper for transformer-based regression"""
    def __init__(self, model_name, text_col='text1', max_length=512):
        self.model_name = model_name
        self.text_col = text_col
        self.max_length = max_length
        self.tokenizer = None
        self.model = None
    
    def train(self, train_data, val_data, epochs=5, batch_size=20, lr=3e-5):
        """Train transformer model"""
        print(f"\n=== Training {self.model_name} on {self.text_col} ===")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=1
        )
        self.model.config.problem_type = "regression"
        self.model.to(DEVICE)
        
        class SimpleDataset(Dataset):
            def __init__(self, texts, scores):
                self.texts = texts
                self.scores = scores
            def __len__(self):
                return len(self.texts)
            def __getitem__(self, idx):
                return {"text": self.texts[idx], "score": self.scores[idx]}
        
        def collate_fn(batch):
            texts = [x["text"] for x in batch]
            enc = self.tokenizer(texts, padding=True, truncation=True, 
                                max_length=self.max_length, return_tensors="pt")
            enc["labels"] = torch.tensor([x["score"] for x in batch], dtype=torch.float32)
            return enc
        
        train_dataset = SimpleDataset(train_data[self.text_col].tolist(), 
                                     train_data["score"].tolist())
        val_dataset = SimpleDataset(val_data[self.text_col].tolist(), 
                                   val_data["score"].tolist())
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr, weight_decay=0.01)
        num_training_steps = epochs * len(train_loader)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(0.1 * num_training_steps), 
            num_training_steps=num_training_steps
        )
        
        best_val_rmse = float('inf')
        best_state = None
        patience_counter = 0
        
        for epoch in range(epochs):
            self.model.train()
            for batch in train_loader:
                labels = batch.pop("labels").to(DEVICE)
                batch = {k: v.to(DEVICE) for k, v in batch.items()}
                
                outputs = self.model(**batch)
                preds = outputs.logits.squeeze(-1)
                loss = nn.MSELoss()(preds, labels)
                
                loss.backward()
                clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            # Validation
            self.model.eval()
            val_preds, val_labels = [], []
            with torch.no_grad():
                for batch in val_loader:
                    labels = batch.pop("labels")
                    batch = {k: v.to(DEVICE) for k, v in batch.items()}
                    outputs = self.model(**batch)
                    val_preds.extend(outputs.logits.squeeze(-1).cpu().numpy())
                    val_labels.extend(labels.numpy())
            
            val_rmse = sqrt(mean_squared_error(val_labels, val_preds))
            print(f"  Epoch {epoch+1}/{epochs}: Val RMSE = {val_rmse:.4f}")
            
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                best_state = self.model.state_dict().copy()
                patience_counter = 0
            else:
                patience_counter += 1
            
            if patience_counter >= 3:
                print(f"  Early stopping at epoch {epoch+1}")
                break
        
        if best_state is not None:
            self.model.load_state_dict(best_state)
        
        return best_val_rmse
    
    def predict(self, data, batch_size=20):
        """Predict scores"""
        class SimpleDataset(Dataset):
            def __init__(self, texts):
                self.texts = texts
            def __len__(self):
                return len(self.texts)
            def __getitem__(self, idx):
                return self.texts[idx]
        
        def collate_fn(batch):
            return self.tokenizer(batch, padding=True, truncation=True, 
                                 max_length=self.max_length, return_tensors="pt")
        
        dataset = SimpleDataset(data[self.text_col].tolist())
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        
        self.model.eval()
        preds = []
        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(DEVICE) for k, v in batch.items()}
                outputs = self.model(**batch)
                preds.extend(outputs.logits.squeeze(-1).cpu().numpy())
        
        return np.array(preds)

# ============ TRAIN ENSEMBLE ============
print("\n" + "="*70)
print("TRAINING HYBRID ENSEMBLE: TRANSFORMERS + GBDT")
print("="*70)

# ============ PART 1: TRANSFORMER MODELS ============
print("\n### TRANSFORMER MODELS ###")

# Transformer 1
trans1 = TransformerRegressor("xlm-roberta-base", text_col='text1', max_length=512)
rmse_trans1 = trans1.train(train_data, val_data, epochs=5, batch_size=20, lr=3e-5)

# Transformer 2
trans2 = TransformerRegressor("xlm-roberta-base", text_col='text2', max_length=450)
rmse_trans2 = trans2.train(train_data, val_data, epochs=5, batch_size=22, lr=2.5e-5)

# Transformer 3
trans3 = TransformerRegressor("distilbert-base-uncased", text_col='text1', max_length=512)
rmse_trans3 = trans3.train(train_data, val_data, epochs=5, batch_size=24, lr=4e-5)

# ============ PART 2: GBDT MODELS ============
print("\n### GBDT MODELS ###")

# XGBoost
print("\n=== Training XGBoost ===")
xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    tree_method='hist',
    early_stopping_rounds=50
)

xgb_model.fit(
    X_train_gbdt, y_train,
    eval_set=[(X_val_gbdt, y_val)],
    verbose=50
)

val_pred_xgb = xgb_model.predict(X_val_gbdt)
rmse_xgb = sqrt(mean_squared_error(y_val, val_pred_xgb))
print(f"XGBoost Val RMSE: {rmse_xgb:.4f}")

# LightGBM
print("\n=== Training LightGBM ===")
lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=50,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    force_col_wise=True,
    verbosity=-1
)

lgb_model.fit(
    X_train_gbdt, y_train,
    eval_set=[(X_val_gbdt, y_val)],
    callbacks=[lgb.early_stopping(50)]
)

val_pred_lgb = lgb_model.predict(X_val_gbdt)
rmse_lgb = sqrt(mean_squared_error(y_val, val_pred_lgb))
print(f"LightGBM Val RMSE: {rmse_lgb:.4f}")

# ============ CatBoost - FIXED ============
print("\n=== Training CatBoost ===")

# Convert sparse matrices to dense for CatBoost (more stable)
print("Converting sparse matrices to dense for CatBoost...")
X_train_gbdt_dense = X_train_gbdt.toarray()
X_val_gbdt_dense = X_val_gbdt.toarray()
test_gbdt_features_dense = test_gbdt_features.toarray()

cat_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3.0,
    subsample=0.8,
    random_state=42,
    verbose=50,
    early_stopping_rounds=50,
    thread_count=32  # FIXED: Explicitly set thread count
)

cat_model.fit(
    X_train_gbdt_dense, y_train,
    eval_set=(X_val_gbdt_dense, y_val)
)

val_pred_cat = cat_model.predict(X_val_gbdt_dense)
rmse_cat = sqrt(mean_squared_error(y_val, val_pred_cat))
print(f"CatBoost Val RMSE: {rmse_cat:.4f}")

# ============ GENERATE ALL PREDICTIONS ============
print("\n=== Generating Predictions from All Models ===")

# Validation predictions
val_pred_trans1 = trans1.predict(val_data)
val_pred_trans2 = trans2.predict(val_data)
val_pred_trans3 = trans3.predict(val_data)

# Test predictions - Transformers
test_pred_trans1 = trans1.predict(test_df)
test_pred_trans2 = trans2.predict(test_df)
test_pred_trans3 = trans3.predict(test_df)

# Test predictions - GBDT
test_pred_xgb = xgb_model.predict(test_gbdt_features)
test_pred_lgb = lgb_model.predict(test_gbdt_features)
test_pred_cat = cat_model.predict(test_gbdt_features_dense)  # FIXED: Use dense version

# Stack all predictions
val_preds_all = np.column_stack([
    val_pred_trans1, val_pred_trans2, val_pred_trans3,
    val_pred_xgb, val_pred_lgb, val_pred_cat
])

test_preds_all = np.column_stack([
    test_pred_trans1, test_pred_trans2, test_pred_trans3,
    test_pred_xgb, test_pred_lgb, test_pred_cat
])

print(f"Ensemble shape: {test_preds_all.shape}")

# ============ WEIGHTED ENSEMBLE ============
print("\n=== Computing Optimal Weights ===")

rmses = np.array([rmse_trans1, rmse_trans2, rmse_trans3, rmse_xgb, rmse_lgb, rmse_cat])
model_names = ["XLM-R-1", "XLM-R-2", "DistilBERT", "XGBoost", "LightGBM", "CatBoost"]

for name, rmse in zip(model_names, rmses):
    print(f"  {name}: {rmse:.4f}")

# Inverse RMSE weighting
weights = 1.0 / rmses
weights = weights / weights.sum()

print(f"\nWeights: {dict(zip(model_names, weights.round(3)))}")

# Weighted average
val_pred_weighted = (val_preds_all * weights).sum(axis=1)
test_pred_weighted = (test_preds_all * weights).sum(axis=1)

val_rmse_weighted = sqrt(mean_squared_error(y_val, val_pred_weighted))
print(f"\nWeighted Ensemble Val RMSE: {val_rmse_weighted:.4f}")

# ============ STACKING WITH META-LEARNER ============
print("\n=== Training Stacking Meta-Learner ===")

meta_learner = Ridge(alpha=0.5)
meta_learner.fit(val_preds_all, y_val)

test_pred_stacked = meta_learner.predict(test_preds_all)

print(f"Meta-learner coefficients: {dict(zip(model_names, meta_learner.coef_.round(3)))}")

# ============ FINAL HYBRID ENSEMBLE ============
test_pred_final = 0.5 * test_pred_weighted + 0.5 * test_pred_stacked

print("\n=== Final Test Predictions ===")
print(f"Range: {test_pred_final.min():.2f} - {test_pred_final.max():.2f}")
print(f"Mean: {test_pred_final.mean():.2f}, Std: {test_pred_final.std():.2f}")

# Distribution
pred_bins = pd.cut(test_pred_final, bins=[0, 3, 5, 7, 9, 10], labels=["0-3", "3-5", "5-7", "7-9", "9-10"])
print("\nDistribution:")
print(pred_bins.value_counts().sort_index())

# Clip
test_pred_final = np.clip(test_pred_final, 0, 10)

# ============ SAVE SUBMISSION ============
submission = pd.DataFrame({
    "ID": np.arange(1, len(test_pred_final) + 1),
    "score": test_pred_final
})

submission.to_csv("submission_hybrid_ensemble.csv", index=False)
print("\n✓ Submission saved to 'submission_hybrid_ensemble.csv'")
print(submission.describe())
print("\nSample predictions:")
print(submission.head(30))

print("\n" + "="*70)
print("HYBRID ENSEMBLE COMPLETE!")
print("6 Models: 3 Transformers + 3 GBDT")
print("="*70)


Using device: cuda
Train shape: (5000, 5)
Test shape : (3638, 4)

=== Label Distribution ===
count    5000.000000
mean        9.119500
std         0.942416
min         0.000000
25%         9.000000
50%         9.000000
75%        10.000000
max        10.000000
Name: score, dtype: float64

=== Creating Balanced Dataset ===
Range 0-3: 24 samples -> 600 samples
Range 3-5: 10 samples -> 600 samples
Range 5-7: 46 samples -> 600 samples
Range 7-9: 354 samples -> 600 samples
Range 9-10.1: 4566 samples -> 600 samples

Balanced dataset size: 3000

=== Extracting Features for GBDT Models ===
Statistical features: 155 features
Feature dtypes: [dtype('float64')]

=== Extracting TF-IDF Features ===
TF-IDF features: 500 features
Total GBDT features: 655 features

=== Splits ===
Train: 2640, Val: 360, Test: 3638

TRAINING HYBRID ENSEMBLE: TRANSFORMERS + GBDT

### TRANSFORMER MODELS ###

=== Training xlm-roberta-base on text1 ===


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1/5: Val RMSE = 2.4537
  Epoch 2/5: Val RMSE = 1.8758
  Epoch 3/5: Val RMSE = 1.3570
  Epoch 4/5: Val RMSE = 1.2634
  Epoch 5/5: Val RMSE = 1.2231

=== Training xlm-roberta-base on text2 ===


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1/5: Val RMSE = 2.7685
  Epoch 2/5: Val RMSE = 1.6995
  Epoch 3/5: Val RMSE = 1.4051
  Epoch 4/5: Val RMSE = 1.4898
  Epoch 5/5: Val RMSE = 1.2829

=== Training distilbert-base-uncased on text1 ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1/5: Val RMSE = 2.5913
  Epoch 2/5: Val RMSE = 1.3536
  Epoch 3/5: Val RMSE = 1.1401
  Epoch 4/5: Val RMSE = 0.9405
  Epoch 5/5: Val RMSE = 0.7661

### GBDT MODELS ###

=== Training XGBoost ===
[0]	validation_0-rmse:2.99568
[50]	validation_0-rmse:2.23644
[100]	validation_0-rmse:2.18669
[150]	validation_0-rmse:2.16210
[200]	validation_0-rmse:2.14594
[250]	validation_0-rmse:2.13628
[300]	validation_0-rmse:2.13121
[350]	validation_0-rmse:2.12757
[400]	validation_0-rmse:2.12366
[450]	validation_0-rmse:2.12283
[499]	validation_0-rmse:2.12101
XGBoost Val RMSE: 0.6528

=== Training LightGBM ===
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[498]	valid_0's l2: 0.513403
LightGBM Val RMSE: 0.7165

=== Training CatBoost ===
Converting sparse matrices to dense for CatBoost...
0:	learn: 3.0433415	test: 2.9998457	best: 2.9998457 (0)	total: 54.4ms	remaining: 27.2s
50:	learn: 1.4710899	test: 1.5080788	best: 1.5080788 (50)	total: 30