In [None]:
# --- Hugging Face authentication ---
# Logs in to the Hugging Face Hub with a token stored as a Kaggle secret.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the secret named "HF_TOKEN" from the Kaggle secrets store.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

# Log in to Hugging Face with that token.
login(token=hf_token)
# --- End of authentication ---

In [None]:
# Provisional device name. A later cell reassigns `device` to a torch.device
# chosen with torch.cuda.is_available(), so this string is only a placeholder.
device='cuda'

In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
import json
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sentence_transformers import SentenceTransformer

# Suppress all Python warnings for the rest of the session
# (this also hides deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")

# --- Device Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Load Data ---
# Single place to change the competition data location.
DATA_DIR = '/kaggle/input/da5401-2025-data-challenge'

print("Loading data...")
train_df = pd.read_json(f'{DATA_DIR}/train_data.json')
test_df = pd.read_json(f'{DATA_DIR}/test_data.json')

# Metric names are listed in the same order as the rows of the embedding matrix
# loaded below; metric_to_index maps a name to its row.
with open(f'{DATA_DIR}/metric_names.json', 'r', encoding='utf-8') as f:
    metric_names = json.load(f)
metric_to_index = {name: i for i, name in enumerate(metric_names)}

metric_embeddings = np.load(f'{DATA_DIR}/metric_name_embeddings.npy')

# --- Initialize Embedding Model ---
print("Initializing embedding model...")
embedding_model = SentenceTransformer("google/embeddinggemma-300m").to(device)

In [None]:
# Show how many training rows carry each distinct score value.
train_df['score'].value_counts()

In [None]:
# Fold three score levels into a neighbouring level: 4.0 -> 3.0, 5.0 -> 6.0, 9.5 -> 10.0.
train_df['score'] = train_df['score'].replace(to_replace={4.0: 3.0, 5.0: 6.0, 9.5: 10.0})


In [None]:
# Re-check the score distribution after the replacement above.
train_df['score'].value_counts()

In [None]:
# Collapse the two lowest listed levels, 1.0 and 2.0, into a single level of 1.5.
train_df['score'] = train_df['score'].replace(to_replace={1.0: 1.5, 2.0: 1.5})


In [None]:
# Final look at the score distribution that the models below are trained on.
train_df['score'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
import pandas as pd
# --- Preprocessing Function ---
def preprocess_for_attention_model(df, is_train=True):
    """Embed prompts and responses and look up the metric embedding for every row.

    Uses the module-level ``embedding_model``, ``metric_embeddings`` and
    ``metric_to_index`` objects. The input frame is only read, never modified,
    so calling this function repeatedly on the same frame is safe.

    Args:
        df: DataFrame with ``system_prompt``, ``user_prompt``, ``response`` and
            ``metric_name`` columns, plus ``score`` when ``is_train`` is true.
        is_train: When true, the ``score`` column is also returned as targets.

    Returns:
        A dict with the keys ``metric_embedding``, ``prompt_embedding`` and
        ``response_embedding`` (one row per row of ``df``). When ``is_train``
        is true, a ``(data_dict, targets)`` tuple where ``targets`` holds the
        scores as floats.

    Raises:
        KeyError: If a metric name in ``df`` is missing from ``metric_to_index``.
    """
    # Missing text becomes an empty string. A NaN would otherwise survive the
    # string concatenation below and be handed to the encoder as a float.
    system_prompts = df['system_prompt'].fillna('')
    user_prompts = df['user_prompt'].fillna('')
    responses = df['response'].fillna('')
    combined_prompts = "SYSTEM PROMPT: " + system_prompts + " USER PROMPT: " + user_prompts

    print(f"Generating embeddings for {len(df)} rows...")

    # Encode texts
    prompt_embeds = embedding_model.encode(combined_prompts.tolist(), show_progress_bar=True, batch_size=64)
    response_embeds = embedding_model.encode(responses.tolist(), show_progress_bar=True, batch_size=64)

    # Normalise to CPU numpy arrays in case the encoder returned tensors.
    if isinstance(prompt_embeds, torch.Tensor):
        prompt_embeds = prompt_embeds.cpu().numpy()
    if isinstance(response_embeds, torch.Tensor):
        response_embeds = response_embeds.cpu().numpy()
    torch.cuda.empty_cache()

    # Pick the embedding row of each sample's metric name.
    metric_rows = np.asarray([metric_to_index[name] for name in df['metric_name']], dtype=np.intp)
    metric_embeds = metric_embeddings[metric_rows]

    data_dict = {
        'metric_embedding': metric_embeds,
        'prompt_embedding': prompt_embeds,
        'response_embedding': response_embeds
    }

    if is_train:
        targets = df['score'].astype(float).values
        return data_dict, targets
    return data_dict

# --- Process Data ---
print("Processing Training Data...")
train_data_dict, y_train_all = preprocess_for_attention_model(train_df, is_train=True)
print("Processing Test Data...")
test_data_dict = preprocess_for_attention_model(test_df, is_train=False)


# --- Part 3: Convert to Tensors and Perform a Stratified Split ---

print("\n--- Creating Full Dataset Tensors ---")
# Convert all NumPy arrays to float32 PyTorch tensors.
full_metric_tensors = torch.tensor(train_data_dict['metric_embedding'], dtype=torch.float32)
full_prompt_tensors = torch.tensor(train_data_dict['prompt_embedding'], dtype=torch.float32)
full_response_tensors = torch.tensor(train_data_dict['response_embedding'], dtype=torch.float32)
# Targets are reshaped to one column so they line up with the model's (batch, 1) output.
full_target_tensors = torch.tensor(y_train_all, dtype=torch.float32).view(-1, 1)

# Full dataset; each item is (metric, prompt, response, target).
full_dataset = TensorDataset(full_metric_tensors, full_prompt_tensors, full_response_tensors, full_target_tensors)

# --- Stratified Splitting ---
print("\n--- Performing Stratified Split to Create Train/Val Datasets ---")

# Step 1: Bin the target scores into equal-width intervals; the bin id is the
# stratification label, so train and val get a similar score distribution.
# NOTE(review): stratified splitting needs more than one row per bin — confirm
# that no bin ends up with a single sample for this data.
num_bins = 15 # Number of equal-width bins passed to pd.cut
score_bins = pd.cut(y_train_all, bins=num_bins, labels=False)

# Step 2: Split row indices (0 to len-1) rather than the data itself, so the
# result can be used to build Subset views of full_dataset.
dataset_indices = list(range(len(full_dataset)))
train_indices, val_indices = train_test_split(
    dataset_indices,
    test_size=0.2,       # 80/20 split
    stratify=score_bins, # Stratify on the bin ids computed above
    random_state=42
)

# Step 3: Create PyTorch Subsets from the full dataset using the stratified indices
train_dataset = Subset(full_dataset, train_indices)
val_dataset = Subset(full_dataset, val_indices)

print(f"Stratified split complete. Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")


# --- Verification Step ---
# Compare the share of each score value in the full data, the train split and
# the val split. Item index 3 of a dataset row is the target tensor.
print("\n--- Verifying Score Distributions ---")
original_dist = pd.Series(y_train_all).value_counts(normalize=True).sort_index()
train_targets = [full_dataset[i][3].item() for i in train_indices]
train_dist = pd.Series(train_targets).value_counts(normalize=True).sort_index()
val_targets = [full_dataset[i][3].item() for i in val_indices]
val_dist = pd.Series(val_targets).value_counts(normalize=True).sort_index()

# Scores absent from a split show up as 0 after fillna.
dist_df = pd.DataFrame({
    'Original %': original_dist * 100,
    'Train Split %': train_dist * 100,
    'Val Split %': val_dist * 100
}).fillna(0).round(2)

print(dist_df)


# --- Create DataLoaders ---
# Training batches are shuffled each epoch; validation keeps a fixed order.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"\nDataLoaders created successfully!")
print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import optuna
from optuna.trial import TrialState

# =====================================================================================
# SECTION 1: MODEL DEFINITION
# =====================================================================================
# The HierarchicalAttentionScorer class remains the same as before.
class HierarchicalAttentionScorer(nn.Module):
    """Two-stage attention regressor over metric, prompt and response embeddings.

    Stage one lets the prompt attend to the response; stage two lets the
    metric attend to that prompt representation. Both stages add a residual
    connection followed by layer normalisation, and a small MLP turns the
    final vector into one score per sample.

    Args:
        embed_dim: Size of every input embedding.
        num_heads: Number of attention heads in both attention layers.
        dropout: Dropout rate used in the attention layers and the MLP head.
    """

    def __init__(self, embed_dim=768, num_heads=8, dropout=0.1):
        super().__init__()
        attention_options = dict(dropout=dropout, batch_first=True)
        self.prompt_response_attention = nn.MultiheadAttention(embed_dim, num_heads, **attention_options)
        self.metric_context_attention = nn.MultiheadAttention(embed_dim, num_heads, **attention_options)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        hidden_dim = embed_dim // 2
        self.prediction_head = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, metric_embedding, prompt_embedding, response_embedding):
        """Return one predicted score per sample, shaped like the inputs minus the embedding axis plus 1."""
        # Insert a sequence axis of length one at position 1 for each input.
        prompt_seq = prompt_embedding.unsqueeze(1)
        response_seq = response_embedding.unsqueeze(1)
        metric_seq = metric_embedding.unsqueeze(1)

        # Stage 1: prompt (query) attends to the response (key/value).
        attended_prompt, _ = self.prompt_response_attention(
            query=prompt_seq, key=response_seq, value=response_seq
        )
        prompt_context = self.ln1(attended_prompt + prompt_seq)

        # Stage 2: metric (query) attends to the prompt representation.
        attended_metric, _ = self.metric_context_attention(
            query=metric_seq, key=prompt_context, value=prompt_context
        )
        scored_repr = self.ln2(attended_metric + metric_seq)

        # Drop the sequence axis again before the regression head.
        return self.prediction_head(scored_repr.squeeze(1))

# =====================================================================================
# SECTION 2: OPTUNA OBJECTIVE FUNCTION (NOW TUNES WEIGHTS)
# =====================================================================================

def objective(trial):
    """Optuna objective: train one scorer and return its best weighted validation RMSE.

    Samples model/optimizer hyperparameters and per-tier loss weights from
    ``trial``, trains a ``HierarchicalAttentionScorer`` on the module-level
    ``train_loader`` for a fixed number of epochs, and evaluates on the
    module-level ``val_loader`` after every epoch.

    Args:
        trial: The Optuna trial used to suggest parameters, report
            intermediate values and query the pruner.

    Returns:
        The lowest weighted validation RMSE observed over all epochs.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    # --- 1. Suggest Hyperparameters for Model and Optimizer ---
    num_heads = trial.suggest_categorical('num_heads', [4, 8, 16])
    dropout = trial.suggest_float('dropout', 0.1, 0.4)
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)

    # --- 2. Suggest Hyperparameters for Loss Weights ---
    # One weight per score tier, searched on a log scale.
    weight_rare = trial.suggest_float('weight_rare', 1000.0, 3000.0, log=True)
    weight_mid_rare = trial.suggest_float('weight_mid_rare', 80.0, 500.0, log=True)
    weight_8 = trial.suggest_float('weight_8', 10.0, 100.0, log=True)
    # The common tier is the fixed baseline. It must be 1.0 here because the
    # final training step also uses 1.0 for this tier; tuning against a
    # different value would optimise weights for a loss that is never used.
    weight_common = 1.0

    # --- 3. Define the Weighted Loss Function for this specific trial ---
    def calculate_loss_weights_trial(targets, device):
        """Return a tensor shaped like ``targets`` holding each sample's tier weight."""
        weights = torch.ones_like(targets).to(device)
        mask_common = targets > 8.0
        mask_8 = targets == 8.0
        mask_mid_rare = (targets < 8.0) & (targets >= 6.0)
        mask_rare = targets < 6.0
        weights[mask_rare] = weight_rare
        weights[mask_mid_rare] = weight_mid_rare
        weights[mask_8] = weight_8
        weights[mask_common] = weight_common
        return weights

    # --- 4. Initialize Model and Optimizer with Suggested Params ---
    model = HierarchicalAttentionScorer(num_heads=num_heads, dropout=dropout).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Per-sample losses are needed so they can be multiplied by the tier weights.
    criterion = nn.MSELoss(reduction='none')

    # --- 5. Training and Validation Loop ---
    EPOCHS = 100
    best_val_rmse = float('inf')

    for epoch in range(EPOCHS):
        model.train()
        for m_emb, p_emb, r_emb, target in train_loader:
            m_emb, p_emb, r_emb, target = m_emb.to(device), p_emb.to(device), r_emb.to(device), target.to(device)
            optimizer.zero_grad()
            preds = model(m_emb, p_emb, r_emb)
            loss_per_sample = criterion(preds, target)
            weights = calculate_loss_weights_trial(target, device)
            weighted_loss = (loss_per_sample * weights).mean()
            weighted_loss.backward()
            optimizer.step()

        # NOTE(review): the validation metric is weighted with this trial's own
        # weights, so for identical predictions a trial with smaller weights
        # reports a smaller value. Consider scoring every trial with one fixed
        # set of weights so that trials are directly comparable.
        model.eval()
        weighted_val_mse_sum = 0.0
        with torch.no_grad():
            for m_emb, p_emb, r_emb, target in val_loader:
                m_emb, p_emb, r_emb, target = m_emb.to(device), p_emb.to(device), r_emb.to(device), target.to(device)
                preds = model(m_emb, p_emb, r_emb)
                loss_per_sample = criterion(preds, target)
                weights = calculate_loss_weights_trial(target, device)
                weighted_val_mse_sum += (loss_per_sample * weights).sum().item()

        val_rmse = np.sqrt(weighted_val_mse_sum / len(val_dataset))

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse

        # The pruner sees the best value so far, not the current epoch's value.
        trial.report(best_val_rmse, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return best_val_rmse

# =====================================================================================
# SECTION 3: RUN OPTUNA STUDY
# =====================================================================================
print("\n--- Starting Optuna Hyperparameter and Loss Weight Search ---")
# Lower objective values are better; the median pruner decides when
# trial.should_prune() inside the objective returns true.
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=150) # Increase n_trials for a more thorough search

print("\n--- Optuna Search Complete ---")
# Report the best trial's objective value and every parameter it sampled.
best_trial = study.best_trial
print(f"  Value (Best Weighted Val RMSE): {best_trial.value}")
print("  Best Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value:.6f}")

# =====================================================================================
# SECTION 4: TRAIN FINAL MODEL WITH BEST PARAMS AND BEST WEIGHTS
# =====================================================================================
print("\n--- Training Final Model with Best Hyperparameters and Weights ---")
# --- 1. Get Best Params and Initialize Final Model ---
# Only num_heads and dropout are passed on; embed_dim keeps the class default.
best_params = study.best_params
final_model = HierarchicalAttentionScorer(
    num_heads=best_params['num_heads'],
    dropout=best_params['dropout']
).to(device)

# --- 2. Create the final, optimized weight function ---
def calculate_final_loss_weights(targets, device):
    """Return per-sample loss weights for ``targets`` using the tuned tier weights.

    Reads the tier weights from the module-level ``best_params``. Targets
    above 8.0 keep a weight of 1.0.

    Args:
        targets: Tensor of target scores.
        device: Device the returned weight tensor is moved to.

    Returns:
        A tensor with the same shape as ``targets`` holding one weight per sample.
    """
    weights = torch.ones_like(targets).to(device)
    # (mask, weight) pairs; the four masks never overlap.
    tier_weights = (
        (targets < 6.0, best_params['weight_rare']),
        ((targets < 8.0) & (targets >= 6.0), best_params['weight_mid_rare']),
        (targets == 8.0, best_params['weight_8']),
        (targets > 8.0, 1.0),  # common tier stays at 1.0
    )
    for tier_mask, tier_weight in tier_weights:
        weights[tier_mask] = tier_weight
    return weights

optimizer = torch.optim.AdamW(
    final_model.parameters(),
    lr=best_params['lr'],
    weight_decay=best_params['weight_decay'],
)
# Per-sample losses, so each one can be scaled by its tier weight.
criterion = nn.MSELoss(reduction='none')
EPOCHS = 100
best_val_rmse = float('inf')

# --- 3. Final Training Loop ---
for epoch in range(EPOCHS):
    # Training pass: accumulate the weighted batch losses for logging.
    final_model.train()
    train_loss = 0.0
    for batch in train_loader:
        m_emb, p_emb, r_emb, target = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        preds = final_model(m_emb, p_emb, r_emb)
        loss_per_sample = criterion(preds, target)
        weights = calculate_final_loss_weights(target, device)
        weighted_loss = (loss_per_sample * weights).mean()
        weighted_loss.backward()
        optimizer.step()
        train_loss += weighted_loss.item()

    # Validation pass: sum weighted squared errors over every validation sample.
    final_model.eval()
    weighted_val_mse_sum = 0.0
    with torch.no_grad():
        for batch in val_loader:
            m_emb, p_emb, r_emb, target = (tensor.to(device) for tensor in batch)
            preds = final_model(m_emb, p_emb, r_emb)
            loss_per_sample = criterion(preds, target)
            weights = calculate_final_loss_weights(target, device)
            weighted_val_mse_sum += (loss_per_sample * weights).sum().item()
    val_rmse = np.sqrt(weighted_val_mse_sum / len(val_dataset))

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss/len(train_loader):.4f} | Val Weighted RMSE: {val_rmse:.4f}")
    # Checkpoint whenever the validation metric improves.
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(final_model.state_dict(), "best_attn_model_fully_tuned.pth")
        print(f"   -> New best model saved with RMSE: {best_val_rmse:.4f}")

# --- 4. Inference & Submission ---
print("\nGenerating Submission with Fully Tuned Model...")
# Restore the checkpoint with the best validation metric before predicting.
final_model.load_state_dict(torch.load("best_attn_model_fully_tuned.pth"))
final_model.eval()

# Test embeddings as float32 tensors on the compute device, in model-argument order.
test_metric_tensors, test_prompt_tensors, test_response_tensors = (
    torch.tensor(test_data_dict[embedding_key], dtype=torch.float32).to(device)
    for embedding_key in ('metric_embedding', 'prompt_embedding', 'response_embedding')
)
test_dataset_final = TensorDataset(test_metric_tensors, test_prompt_tensors, test_response_tensors)
test_loader_final = DataLoader(test_dataset_final, batch_size=64, shuffle=False)

all_preds = []
with torch.no_grad():
    for m_emb, p_emb, r_emb in test_loader_final:
        preds = final_model(m_emb, p_emb, r_emb)
        all_preds.extend(preds.cpu().numpy().reshape(-1))

# Keep predictions inside the 0-10 range before writing the file.
final_preds = np.clip(all_preds, 0, 10)
test_df['ID'] = test_df.index + 1
submission_df = pd.DataFrame({'ID': test_df['ID'], 'score': final_preds})
submission_df.to_csv('submission_attn_regression_fully_tuned.csv', index=False)
print("Submission file 'submission_attn_regression_fully_tuned.csv' created successfully!")

In [None]:
import torch

# --- Common Setup Block ---

# 1. Define the device for PyTorch.
# The VQ-VAE training and extraction functions below read this module-level variable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device set to: {device}")

# 2. Confirm that the variables created by the preprocessing cell exist.
# Each bare name lookup raises NameError if the variable was never defined.
# The error is only printed, so execution continues even when something is missing.
try:
    _ = full_metric_tensors
    _ = full_prompt_tensors
    _ = full_response_tensors
    _ = full_target_tensors
    _ = test_data_dict
    _ = y_train_all
    print("Prerequisite tensors and data dictionaries are available.")
except NameError as e:
    print(f"CRITICAL ERROR: A required variable is missing: {e}")
    print("Please ensure the data preprocessing and splitting block has been run successfully before this step.")

In [None]:
!pip install scikit-learn==1.1.3 --quiet
!pip install imbalanced-learn==0.10.1 --quiet

In [None]:
# Smoke test: the import fails here if the imbalanced-learn installation is unusable.
from imblearn.over_sampling import SMOTENC
print("SMOTENC imported successfully!")


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import gc
from imblearn.over_sampling import SMOTENC

# =====================================================================================
# CONTROL PANEL & HYPERPARAMETERS
# =====================================================================================
# --- VQ-VAE settings ---
NUM_EMBEDDINGS = 256     # number of discrete codes in each codebook
EMBEDDING_DIM = 64       # dimensionality of every codebook vector
COMMITMENT_COST = 0.25   # multiplier on the commitment term of the quantizer loss
VQVAE_EPOCHS = 10_000    # training epochs for each VQ-VAE
VQVAE_LR = 1e-5          # learning rate for the VQ-VAE optimizers

# --- XGBoost settings ---
XGB_EPOCHS = 10_000      # used as n_estimators
XGB_LR = 5e-4            # used as learning_rate
# =====================================================================================


# =====================================================================================
# SECTION 1: VQ-VAE MODEL DEFINITION
# =====================================================================================

class VectorQuantizer(nn.Module):
    """Vector-quantization layer: snaps each input vector to its nearest codebook entry.

    Args:
        num_embeddings: Number of vectors in the codebook.
        embedding_dim: Length of each codebook vector; the last axis of the
            input must have this size.
        commitment_cost: Multiplier on the loss term that pulls the encoder
            output towards its (detached) codebook vector.
    """
    def __init__(self, num_embeddings, embedding_dim, commitment_cost):
        super(VectorQuantizer, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.commitment_cost = commitment_cost

        # Codebook, initialised uniformly in [-1/num_embeddings, 1/num_embeddings].
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.embedding.weight.data.uniform_(-1/self.num_embeddings, 1/self.num_embeddings)

    def forward(self, inputs):
        """Quantize ``inputs``.

        Returns:
            A ``(loss, quantized, indices)`` tuple: the scalar quantization
            loss, the quantized tensor with the shape of ``inputs``, and a 1-D
            tensor holding one codebook index per input vector.
        """
        # Flatten to (n_vectors, embedding_dim). reshape also accepts
        # non-contiguous inputs, which view would reject.
        flat_input = inputs.reshape(-1, self.embedding_dim)

        # Squared Euclidean distance from every input vector to every code,
        # expanded as |x|^2 + |e|^2 - 2 x.e
        distances = (torch.sum(flat_input**2, dim=1, keepdim=True)
                    + torch.sum(self.embedding.weight**2, dim=1)
                    - 2 * torch.matmul(flat_input, self.embedding.weight.t()))

        # Nearest code per vector, turned into a one-hot row for the lookup.
        encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1)
        encodings = torch.zeros(encoding_indices.shape[0], self.num_embeddings, device=inputs.device)
        encodings.scatter_(1, encoding_indices, 1)

        # Quantize and restore the input shape.
        quantized = torch.matmul(encodings, self.embedding.weight).view(inputs.shape)

        # Two loss terms: one moves the codebook towards the inputs, the other
        # (scaled by commitment_cost) moves the inputs towards the codebook.
        e_latent_loss = F.mse_loss(quantized.detach(), inputs)
        q_latent_loss = F.mse_loss(quantized, inputs.detach())
        loss = q_latent_loss + self.commitment_cost * e_latent_loss

        # Straight-through estimator: forward value is the quantized vector,
        # gradient flows to `inputs` as if quantization were the identity.
        quantized = inputs + (quantized - inputs).detach()

        # Drop only the helper axis added above. A bare squeeze() would also
        # remove the batch axis when a single vector is passed, returning a
        # 0-d tensor that cannot be concatenated with other batches.
        return loss, quantized, encoding_indices.squeeze(1)

class VQVAE(nn.Module):
    """VQ-VAE: an MLP encoder, a vector-quantization layer and an MLP decoder.

    Args:
        input_dim: Size of the vectors being reconstructed.
        hidden_dim: Width of the hidden layer in encoder and decoder.
        embedding_dim: Size of the latent vectors and codebook entries.
        num_embeddings: Number of codebook entries.
        commitment_cost: Commitment weight passed to the quantization layer.
    """
    def __init__(self, input_dim=768, hidden_dim=256, embedding_dim=EMBEDDING_DIM, num_embeddings=NUM_EMBEDDINGS, commitment_cost=COMMITMENT_COST):
        super().__init__()
        self.encoder = self._two_layer_mlp(input_dim, hidden_dim, embedding_dim)
        self.vq_layer = VectorQuantizer(num_embeddings, embedding_dim, commitment_cost)
        self.decoder = self._two_layer_mlp(embedding_dim, hidden_dim, input_dim)

    @staticmethod
    def _two_layer_mlp(in_features, hidden_features, out_features):
        """Build Linear -> ReLU -> Linear."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Return ``(vq_loss, reconstruction, code_indices)`` for input ``x``."""
        latent = self.encoder(x)
        vq_loss, quantized, indices = self.vq_layer(latent)
        reconstruction = self.decoder(quantized)
        return vq_loss, reconstruction, indices

# =====================================================================================
# SECTION 2: VQ-VAE TRAINING AND LATENT CODE EXTRACTION
# =====================================================================================

def train_vqvae(model, loader, optimizer, epochs=VQVAE_EPOCHS):
    """Train a VQ-VAE on the first tensor of every batch yielded by ``loader``.

    The loss is the reconstruction MSE plus the quantization loss returned by
    the model. Batches are moved to the module-level ``device``.

    Args:
        model: Model returning ``(vq_loss, reconstruction, indices)``.
        loader: Iterable of batches; element 0 of each batch is the input.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of full passes over ``loader``.
    """
    model.train()
    for _ in range(epochs):
        for batch in loader:
            inputs = batch[0].to(device)
            optimizer.zero_grad()
            vq_loss, reconstruction, _codes = model(inputs)
            total_loss = F.mse_loss(reconstruction, inputs) + vq_loss
            total_loss.backward()
            optimizer.step()
    print("Training complete.")

def extract_latent_codes(model, loader):
    """Run ``model`` over ``loader`` and return every discrete code as one 1-D array.

    Args:
        model: Module whose forward returns a 3-tuple with the code indices
            as the last element.
        loader: Iterable of batches; element 0 of each batch is the input.

    Returns:
        A 1-D NumPy array of code indices in loader order. It is empty when
        the loader yields no batches.
    """
    model.eval()
    # Send each batch to wherever the model's weights live, so the function
    # does not depend on a global device variable. A parameter-free model
    # receives the batch unchanged.
    first_param = next(model.parameters(), None)
    codes = []
    with torch.no_grad():
        for data in loader:
            data = data[0]
            if first_param is not None:
                data = data.to(first_param.device)
            _, _, indices = model(data)
            # A batch of one sample can come back as a 0-d array, which
            # np.concatenate rejects; atleast_1d makes it a 1-element array.
            codes.append(np.atleast_1d(indices.cpu().numpy()))
    if not codes:
        return np.empty(0, dtype=np.int64)
    return np.concatenate(codes)

# --- Prepare DataLoaders for each embedding type ---
# The VQ-VAEs are trained on the FULL training dataset (train+val).
# NOTE(review): full_train_dataset is not referenced again in the code visible here.
full_train_dataset = TensorDataset(full_metric_tensors, full_prompt_tensors, full_response_tensors, full_target_tensors)

# One single-tensor dataset per embedding type. No shuffle argument is given,
# and the same loaders are reused further down to extract the latent codes.
metric_loader_train = DataLoader(TensorDataset(full_metric_tensors), batch_size=128)
prompt_loader_train = DataLoader(TensorDataset(full_prompt_tensors), batch_size=128)
response_loader_train = DataLoader(TensorDataset(full_response_tensors), batch_size=128)

# --- Train a VQ-VAE for each embedding type ---
# Each model uses the VQVAE defaults and its own AdamW optimizer at VQVAE_LR.
print("\n--- Training VQ-VAE for Metric Embeddings ---")
vqvae_metric = VQVAE().to(device)
optimizer_metric = torch.optim.AdamW(vqvae_metric.parameters(), lr=VQVAE_LR)
train_vqvae(vqvae_metric, metric_loader_train, optimizer_metric)

print("\n--- Training VQ-VAE for Prompt Embeddings ---")
vqvae_prompt = VQVAE().to(device)
optimizer_prompt = torch.optim.AdamW(vqvae_prompt.parameters(), lr=VQVAE_LR)
train_vqvae(vqvae_prompt, prompt_loader_train, optimizer_prompt)

print("\n--- Training VQ-VAE for Response Embeddings ---")
vqvae_response = VQVAE().to(device)
optimizer_response = torch.optim.AdamW(vqvae_response.parameters(), lr=VQVAE_LR)
train_vqvae(vqvae_response, response_loader_train, optimizer_response)

# --- Extract Latent Codes to create the new dataset for XGBoost ---
print("\n--- Extracting Latent Codes for XGBoost ---")
# For training data
metric_codes_train = extract_latent_codes(vqvae_metric, metric_loader_train)
prompt_codes_train = extract_latent_codes(vqvae_prompt, prompt_loader_train)
response_codes_train = extract_latent_codes(vqvae_response, response_loader_train)

# For test data
test_metric_tensors = torch.tensor(test_data_dict['metric_embedding'], dtype=torch.float32)
test_prompt_tensors = torch.tensor(test_data_dict['prompt_embedding'], dtype=torch.float32)
test_response_tensors = torch.tensor(test_data_dict['response_embedding'], dtype=torch.float32)

metric_loader_test = DataLoader(TensorDataset(test_metric_tensors), batch_size=128)
prompt_loader_test = DataLoader(TensorDataset(test_prompt_tensors), batch_size=128)
response_loader_test = DataLoader(TensorDataset(test_response_tensors), batch_size=128)

metric_codes_test = extract_latent_codes(vqvae_metric, metric_loader_test)
prompt_codes_test = extract_latent_codes(vqvae_prompt, prompt_loader_test)
response_codes_test = extract_latent_codes(vqvae_response, response_loader_test)

# --- Create the final DataFrame with latent codes as features ---
# Train and test must share one categorical dtype per column. Casting each
# frame to 'category' on its own derives the categories from the values that
# happen to occur in that frame, so the same code could get a different
# integer category code in train and in test.
code_dtypes = {
    'metric_code': pd.CategoricalDtype(list(range(vqvae_metric.vq_layer.num_embeddings))),
    'prompt_code': pd.CategoricalDtype(list(range(vqvae_prompt.vq_layer.num_embeddings))),
    'response_code': pd.CategoricalDtype(list(range(vqvae_response.vq_layer.num_embeddings))),
}

X_train_xgb_df = pd.DataFrame({
    'metric_code': metric_codes_train,
    'prompt_code': prompt_codes_train,
    'response_code': response_codes_train
}).astype(code_dtypes) # Treat codes as categorical features

X_test_xgb_df = pd.DataFrame({
    'metric_code': metric_codes_test,
    'prompt_code': prompt_codes_test,
    'response_code': response_codes_test
}).astype(code_dtypes)

y_train_xgb = y_train_all

print(f"XGBoost training features shape: {X_train_xgb_df.shape}")
print("Sample of new features:\n", X_train_xgb_df.head())

# =====================================================================================
# SECTION 3: TIERED OVERSAMPLING WITH SMOTE-NC AND XGBOOST TRAINING
# =====================================================================================

print("\n--- Applying Tiered SMOTE-NC to the Training Data ---")

# --- 1. Prepare Data and Define the Oversampling Strategy ---
# SMOTE-NC needs discrete class labels, so scores are rounded to integers.
# NOTE(review): the rounding is never undone, so the regression targets built
# below are whole numbers even where the original score was fractional.
y_train_classes = np.round(y_train_xgb).astype(int)
class_counts = pd.Series(y_train_classes).value_counts()


def _oversampling_factor(score_class):
    """Return the multiplier for a rounded score class (1 means leave untouched)."""
    if score_class > 8:
        return 1
    if score_class == 8:
        return 4
    if score_class >= 6:
        return 15
    return 80


# Target sample count per class. Only classes that actually occur are listed,
# because naming an absent class makes the sampler reject the strategy.
# Classes with a single sample are skipped: there is no neighbour to
# interpolate with.
sampling_strategy = {}
skipped_classes = []
for score, count in class_counts.items():
    factor = _oversampling_factor(score)
    if factor == 1:
        continue
    if count < 2:
        skipped_classes.append(int(score))
        continue
    sampling_strategy[int(score)] = int(count) * factor

print("Original Class Distribution:\n", class_counts.sort_index())
print("\nTarget Sampling Strategy for SMOTE-NC:")
for k, v in sorted(sampling_strategy.items()):
    print(f"  Score {k}: Target {v} samples")
if skipped_classes:
    print(f"  Not oversampled (fewer than 2 samples): {sorted(skipped_classes)}")

# --- 2. Initialize and Apply SMOTE-NC ---
# k_neighbors must be smaller than the size of the smallest oversampled class.
# Every oversampled class is considered, and the value never drops below 1.
if sampling_strategy:
    smallest_oversampled = int(class_counts.loc[list(sampling_strategy)].min())
    k_neighbors = min(2, smallest_oversampled - 1)
else:
    k_neighbors = 2

# All three feature columns (0, 1, 2) are categorical.
smote_nc = SMOTENC(
    categorical_features=[0, 1, 2],
    sampling_strategy=sampling_strategy,
    random_state=42,
    k_neighbors=k_neighbors
)

# The constructor argument is stored as `k_neighbors`; there is no
# `k_neighbors_` attribute to read.
print(f"\nUsing k_neighbors = {smote_nc.k_neighbors}")
print(f"Original training data shape: {X_train_xgb_df.shape}")

# Apply the resampling. This can be memory-intensive.
X_resampled, y_resampled_classes = smote_nc.fit_resample(X_train_xgb_df, y_train_classes)

# The output labels are integer classes; convert them to float for regression.
y_resampled = y_resampled_classes.astype(float)

print(f"Resampled training data shape: {X_resampled.shape}")
print("\nNew Score Distribution after Tiered SMOTE-NC:")
print(pd.Series(y_resampled).value_counts().sort_index())

# --- 3. Define XGBoost Parameters ---
params_xgb = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=XGB_EPOCHS,
    learning_rate=XGB_LR,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=42,
    n_jobs=-1,
    # GPU histogram method when the torch device is CUDA, CPU histogram otherwise.
    tree_method='gpu_hist' if 'cuda' in device.type else 'hist',
    # Required because the feature columns have a categorical dtype.
    enable_categorical=True,
)

# --- 4. Train the Final XGBoost Model on Resampled Data ---
print("\n--- Training Final XGBoost Regressor on Resampled Latent Codes ---")
xgb_model_smote = xgb.XGBRegressor(**params_xgb)

# Fit on the oversampled data. No sample_weight is passed because the class
# balance was already adjusted by the resampling step.
xgb_model_smote.fit(X_resampled, y_resampled, verbose=False)

# --- 5. Generate Predictions on the Original Test Set ---
print("\nGenerating final predictions with the XGBoost model...")
final_preds_smote = xgb_model_smote.predict(X_test_xgb_df)

# --- 6. Create Submission File ---
# Keep predictions inside the 0-10 range; IDs are the 1-based row positions.
final_preds_smote = np.clip(final_preds_smote, 0, 10)
test_df['ID'] = test_df.index + 1
submission_df_smote = pd.DataFrame({'ID': test_df['ID'], 'score': final_preds_smote})
submission_df_smote.to_csv('submission_vqvae_xgb_tiered_smotenc.csv', index=False)
print("Submission file 'submission_vqvae_xgb_tiered_smotenc.csv' created successfully!")