In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
import json
import os
import gc

# ============================================================================
# CONFIGURATION - MEMORY OPTIMIZED
# ============================================================================
class Config:
    """Competition-grade configuration - Memory Optimized"""
    # Paths
    TRAIN_PATH = "/kaggle/input/datasetllm/train.json"
    VAL_PATH = "/kaggle/input/datasetllm/dev.json"
    OUTPUT_DIR = "/kaggle/working/competition_models"

    # OPTIMIZED MODEL LIST - Smaller models
    BASE_MODELS = [
        'microsoft/deberta-v3-base',      # 184M params (vs 304M for large)
        'roberta-base',                    # 125M params (vs 355M for large)
    ]

    # Training settings - MEMORY OPTIMIZED
    EPOCHS = 10                          # Slightly reduced from 12
    BATCH_SIZE = 1                       # Reduced from 2
    GRADIENT_ACCUMULATION = 16           # Increased from 8 to maintain effective batch
    LEARNING_RATE = 5e-6
    WARMUP_RATIO = 0.15
    MAX_LENGTH = 384                     # Reduced from 512 but not too small
    WEIGHT_DECAY = 0.01

    # Advanced settings
    USE_KFOLD = True
    N_FOLDS = 3                          # Reduced from 5
    USE_FOCAL_LOSS = True
    USE_ORDINAL = True
   
    # Memory optimization flags
    CLEAR_CACHE_FREQUENCY = 1            # Clear cache every N folds

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    SEED = 42

torch.manual_seed(Config.SEED)
np.random.seed(Config.SEED)

print(f"\n{'='*70}")
print(f"WSD Competition System - Memory Optimized for Kaggle")
print(f"Device: {Config.DEVICE}")
print(f"{'='*70}\n")

# ============================================================================
# MEMORY MANAGEMENT UTILITIES
# ============================================================================
def clear_memory():
    """Aggressively clear GPU and CPU memory"""
    gc.collect()
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.synchronize()

def print_gpu_memory():
    """Print current GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"  GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

# ============================================================================
# DATA LOADING
# ============================================================================
def load_json_data(filepath):
    """Load data from JSON file"""
    print(f"Loading: {filepath}")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, dict):
            if all(k.isdigit() for k in data.keys()):
                data = [data[k] for k in sorted(data.keys(), key=int)]
            else:
                data = [data]
        print(f"  ✓ Loaded {len(data)} records")
        return pd.DataFrame(data)
    except json.JSONDecodeError:
        data = []
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    data.append(json.loads(line))
        print(f"  ✓ Loaded {len(data)} records (JSONL)")
        return pd.DataFrame(data)

def process_dataframe(df):
    """Process and validate dataframe"""
    column_mapping = {
        'precontext': ['precontext', 'pre_context', 'context'],
        'ambiguous_sentence': ['ambiguous_sentence', 'ambiguous', 'sentence'],
        'ending': ['ending', 'end', 'conclusion'],
        'homonym': ['homonym', 'target_word', 'word'],
        'sense_definition': ['sense_definition', 'sense', 'judged_meaning'],
    }

    final_mapping = {}
    for target_col, possible_names in column_mapping.items():
        for name in possible_names:
            if name in df.columns:
                final_mapping[name] = target_col
                break

    if final_mapping:
        df = df.rename(columns=final_mapping)

    if 'ending' not in df.columns:
        df['ending'] = ''

    if 'average' in df.columns:
        df['avg_rating'] = df['average'].astype(float)
        df['std_rating'] = df.get('stdev', pd.Series([1.0] * len(df))).astype(float).fillna(1.0)
    elif 'avg_rating' in df.columns:
        df['avg_rating'] = df['avg_rating'].astype(float)
        df['std_rating'] = df.get('std_rating', pd.Series([1.0] * len(df)))
    else:
        df['avg_rating'] = np.nan
        df['std_rating'] = 1.0

    for col in ['precontext', 'ambiguous_sentence', 'ending', 'homonym', 'sense_definition']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    return df

def load_data(train_path, val_path=None):
    """Load and preprocess data"""
    print("\n" + "="*70)
    print("DATA LOADING")
    print("="*70)

    train_df = load_json_data(train_path)
    train_df = process_dataframe(train_df)

    if val_path:
        val_df = load_json_data(val_path)
        val_df = process_dataframe(val_df)
    else:
        from sklearn.model_selection import train_test_split
        train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=Config.SEED)

    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    print(f"\n✓ Training samples: {len(train_df)}")
    print(f"✓ Validation samples: {len(val_df)}")

    return train_df, val_df

# ============================================================================
# FEATURE EXTRACTION - KEEPING BOTH MODELS FOR ACCURACY
# ============================================================================
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

def word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0
    return len(words1 & words2) / len(words1 | words2)

def extract_features(data):
    """Extract semantic features - KEEPING BOTH EMBEDDING MODELS"""
    print("\n" + "="*70)
    print("FEATURE EXTRACTION")
    print("="*70)

    # Use BOTH models for better feature diversity and accuracy
    print("  Loading embedding models...")
    model_mini = SentenceTransformer('all-MiniLM-L6-v2')
    model_mpnet = SentenceTransformer('all-mpnet-base-v2')
    print("  ✓ Both models loaded")

    features = []

    for idx, row in data.iterrows():
        if (idx + 1) % 500 == 0:
            print(f"  Progress: {idx + 1}/{len(data)}")

        precontext = str(row['precontext'])
        ambiguous = str(row['ambiguous_sentence'])
        sense = str(row['sense_definition'])
        homonym = str(row['homonym'])
        ending = str(row.get('ending', '')).strip()

        full_context = f"{precontext} {ambiguous}"
        if ending:
            full_context += f" {ending}"

        # Embeddings from BOTH models (critical for accuracy)
        pre_mini = model_mini.encode(precontext)
        amb_mini = model_mini.encode(ambiguous)
        sense_mini = model_mini.encode(sense)
        full_mini = model_mini.encode(full_context)

        pre_mpnet = model_mpnet.encode(precontext)
        amb_mpnet = model_mpnet.encode(ambiguous)
        sense_mpnet = model_mpnet.encode(sense)
        full_mpnet = model_mpnet.encode(full_context)

        feat = {
            # MiniLM similarities
            'sim_pre_sense_mini': cosine_sim(pre_mini, sense_mini),
            'sim_amb_sense_mini': cosine_sim(amb_mini, sense_mini),
            'sim_full_sense_mini': cosine_sim(full_mini, sense_mini),

            # MPNet similarities  
            'sim_pre_sense_mpnet': cosine_sim(pre_mpnet, sense_mpnet),
            'sim_amb_sense_mpnet': cosine_sim(amb_mpnet, sense_mpnet),
            'sim_full_sense_mpnet': cosine_sim(full_mpnet, sense_mpnet),

            # Text features
            'sense_length': len(sense.split()),
            'context_length': len(full_context.split()),
            'homonym_in_sense': int(homonym.lower() in sense.lower()),
            'has_ending': int(bool(ending)),

            # Word overlap
            'precontext_sense_overlap': word_overlap(precontext, sense),
            'full_sense_overlap': word_overlap(full_context, sense),
        }

        if ending:
            end_mini = model_mini.encode(ending)
            end_mpnet = model_mpnet.encode(ending)
            feat['sim_end_sense_mini'] = cosine_sim(end_mini, sense_mini)
            feat['sim_end_sense_mpnet'] = cosine_sim(end_mpnet, sense_mpnet)
        else:
            feat['sim_end_sense_mini'] = 0
            feat['sim_end_sense_mpnet'] = 0

        features.append(feat)

    print(f"✓ Extracted {len(features)} feature sets with BOTH embedding models\n")
   
    # Clear models from memory after feature extraction
    del model_mini
    del model_mpnet
    clear_memory()
   
    return pd.DataFrame(features)

# ============================================================================
# DATASET
# ============================================================================
class WSDDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        precontext = str(row['precontext']).strip()
        sentence = str(row['ambiguous_sentence']).strip()
        ending = str(row.get('ending', '')).strip()
        homonym = str(row['homonym']).strip()
        sense = str(row['sense_definition']).strip()

        # Enhanced input format
        if ending:
            text = (f"Context: {precontext} Target sentence: {sentence} "
                   f"Continuation: {ending} [SEP] Word: {homonym} [SEP] "
                   f"Meaning: {sense}")
        else:
            text = (f"Context: {precontext} Target sentence: {sentence} "
                   f"[SEP] Word: {homonym} [SEP] Meaning: {sense}")

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if 'avg_rating' in row and pd.notna(row['avg_rating']):
            rating = float(row['avg_rating'])
            std = float(row.get('std_rating', 1.0))

            item['labels'] = torch.tensor(rating - 1, dtype=torch.float)

            # Higher weight for lower std (higher confidence)
            weight = 1.0 / (std + 0.3)
            item['weight'] = torch.tensor(weight, dtype=torch.float)

        return item

# ============================================================================
# FOCAL LOSS
# ============================================================================
class FocalMSELoss(nn.Module):
    def __init__(self, gamma=1.5):
        super().__init__()
        self.gamma = gamma

    def forward(self, pred, target, weight=None):
        mse = (pred - target) ** 2
        focal_weight = torch.abs(pred - target) ** self.gamma
        loss = focal_weight * mse

        if weight is not None:
            loss = weight * loss

        return loss.mean()

# ============================================================================
# CUSTOM TRAINER
# ============================================================================
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if Config.USE_FOCAL_LOSS:
            self.focal_loss = FocalMSELoss()

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        weights = inputs.pop("weight", None)

        outputs = model(**inputs)
        logits = outputs.logits.squeeze()

        if Config.USE_FOCAL_LOSS:
            loss = self.focal_loss(logits, labels, weights)
        elif weights is not None:
            loss = (weights * (logits - labels) ** 2).mean()
        else:
            loss = ((logits - labels) ** 2).mean()

        return (loss, outputs) if return_outputs else loss

# ============================================================================
# METRICS
# ============================================================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()

    if predictions.ndim == 0:
        predictions = np.array([predictions])
    if labels.ndim == 0:
        labels = np.array([labels])

    predictions = np.clip(predictions + 1, 1, 5)
    labels = labels + 1

    rounded_preds = np.round(predictions)
    accuracy_within_1 = np.mean(np.abs(rounded_preds - labels) <= 1)
    spearman_corr, _ = spearmanr(predictions, labels)

    mse = np.mean((predictions - labels) ** 2)
    mae = np.mean(np.abs(predictions - labels))
    exact_acc = np.mean(rounded_preds == labels)

    return {
        'accuracy_within_1': accuracy_within_1,
        'spearman': spearman_corr,
        'mse': mse,
        'mae': mae,
        'exact_accuracy': exact_acc,
    }

# ============================================================================
# K-FOLD TRAINING - MEMORY OPTIMIZED
# ============================================================================
def train_fold(model_name, train_df, tokenizer, fold_idx, train_indices, val_indices):
    """Train single fold - Memory optimized"""
    print(f"\n  Fold {fold_idx + 1}/{Config.N_FOLDS}")
    print_gpu_memory()

    fold_train = train_df.iloc[train_indices].reset_index(drop=True)
    fold_val = train_df.iloc[val_indices].reset_index(drop=True)

    train_dataset = WSDDataset(fold_train, tokenizer, Config.MAX_LENGTH)
    val_dataset = WSDDataset(fold_val, tokenizer, Config.MAX_LENGTH)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,
        problem_type="regression"
    )

    output_dir = f"{Config.OUTPUT_DIR}/fold_{fold_idx}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=Config.EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=Config.GRADIENT_ACCUMULATION,
        learning_rate=Config.LEARNING_RATE,
        warmup_ratio=Config.WARMUP_RATIO,
        weight_decay=Config.WEIGHT_DECAY,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="spearman",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=Config.SEED + fold_idx,
        lr_scheduler_type="cosine_with_restarts",
        save_total_limit=1,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        dataloader_num_workers=0,
        dataloader_pin_memory=False,
        optim="adamw_torch",
        max_grad_norm=1.0,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()

    print(f"    Spearman: {eval_results['eval_spearman']:.4f}, "
          f"Acc: {eval_results['eval_accuracy_within_1']:.4f}")

    # Get model before clearing trainer
    trained_model = trainer.model
   
    # Clear trainer and datasets
    del trainer
    del train_dataset
    del val_dataset
    clear_memory()

    return trained_model

def train_model_kfold(model_name, train_df, tokenizer, model_idx):
    """Train model with k-fold cross-validation - Memory optimized"""
    print(f"\n{'='*70}")
    print(f"Model {model_idx + 1}/{len(Config.BASE_MODELS)}: {model_name}")
    print(f"Training with {Config.N_FOLDS}-fold cross-validation")
    print(f"{'='*70}")

    kf = KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)
    models = []

    for fold_idx, (train_indices, val_indices) in enumerate(kf.split(train_df)):
        model = train_fold(model_name, train_df, tokenizer, fold_idx, train_indices, val_indices)
        models.append(model)
       
        # Clear cache periodically
        if (fold_idx + 1) % Config.CLEAR_CACHE_FREQUENCY == 0:
            print(f"  Clearing cache after fold {fold_idx + 1}")
            clear_memory()
            print_gpu_memory()

    print(f"\n✓ Completed {Config.N_FOLDS}-fold training")
    return models

# ============================================================================
# PREDICTIONS - MEMORY OPTIMIZED
# ============================================================================
def get_predictions(models, tokenizer, data):
    """Get averaged predictions from model ensemble - Memory optimized"""
    all_predictions = []

    for model_idx, model in enumerate(models):
        dataset = WSDDataset(data, tokenizer, Config.MAX_LENGTH)
        loader = DataLoader(dataset, batch_size=8, shuffle=False)

        model.eval()
        model.to(Config.DEVICE)

        predictions = []
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(Config.DEVICE)
                attention_mask = batch['attention_mask'].to(Config.DEVICE)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = outputs.logits.squeeze().cpu().numpy()

                if preds.ndim == 0:
                    preds = np.array([preds])

                predictions.extend(preds)
               
                # Clear batch from GPU
                del input_ids, attention_mask, outputs
               
        predictions = np.array(predictions) + 1
        all_predictions.append(predictions)

        if Config.USE_KFOLD:
            print(f"    Fold {model_idx + 1}/{len(models)} predictions obtained")
       
        # Move model back to CPU to free GPU memory
        model.cpu()
        clear_memory()

    # Average across folds
    final_predictions = np.mean(all_predictions, axis=0)
    final_predictions = np.clip(final_predictions, 1, 5)

    return final_predictions

# ============================================================================
# META-LEARNER - FULL ENSEMBLE KEPT
# ============================================================================
def train_meta_learner(base_predictions, features, labels):
    """Train stacked meta-learner - FULL ENSEMBLE"""
    print("\n" + "="*70)
    print("TRAINING META-LEARNER")
    print("="*70)

    X = np.column_stack([*base_predictions, features])

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Level 1: Multiple models
    print("  Training Level 1 models...")
    level1_models = []
    level1_preds = []

    # GBM
    gbm = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.02,
        max_depth=7,
        min_samples_split=10,
        min_samples_leaf=5,
        subsample=0.8,
        max_features='sqrt',
        random_state=Config.SEED
    )
    gbm.fit(X, labels)
    level1_models.append(('gbm', gbm))
    level1_preds.append(gbm.predict(X))
    print("    ✓ GBM trained")

    # Random Forest
    rf = RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        min_samples_split=8,
        min_samples_leaf=4,
        max_features='sqrt',
        random_state=Config.SEED,
        n_jobs=-1
    )
    rf.fit(X, labels)
    level1_models.append(('rf', rf))
    level1_preds.append(rf.predict(X))
    print("    ✓ Random Forest trained")

    # Level 2: Meta-meta-learner
    print("  Training Level 2 meta-learner...")
    X_level2 = np.column_stack([*level1_preds, X])

    meta_meta = GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.03,
        max_depth=5,
        min_samples_split=10,
        subsample=0.9,
        random_state=Config.SEED
    )
    meta_meta.fit(X_level2, labels)
    print("    ✓ Level 2 trained")

    print("\n✓ Meta-learner training complete\n")

    return {
        'level1_models': level1_models,
        'level2_model': meta_meta,
        'scaler': scaler
    }

def predict_with_meta(meta_learner, base_predictions, features):
    """Make predictions with meta-learner - FULL ENSEMBLE"""
    X = np.column_stack([*base_predictions, features])
    X = meta_learner['scaler'].transform(X)

    # Level 1
    level1_preds = []
    for name, model in meta_learner['level1_models']:
        level1_preds.append(model.predict(X))

    # Level 2
    X_level2 = np.column_stack([*level1_preds, X])
    final_preds = meta_learner['level2_model'].predict(X_level2)

    return np.clip(final_preds, 1, 5)

# ============================================================================
# MAIN PIPELINE
# ============================================================================
def main():
    """Complete training pipeline - Memory optimized"""

    # Load data
    train_df, val_df = load_data(Config.TRAIN_PATH, Config.VAL_PATH)

    # Extract features with BOTH embedding models
    train_features = extract_features(train_df)
    val_features = extract_features(val_df)

    # Train base models
    print("="*70)
    print("TRAINING BASE MODELS")
    print("="*70)

    all_models = []
    all_tokenizers = []
    train_predictions = []
    val_predictions = []

    for idx, model_name in enumerate(Config.BASE_MODELS):
        print(f"\nLoading tokenizer for {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Train with k-fold
        models = train_model_kfold(model_name, train_df, tokenizer, idx)

        # Get predictions
        print(f"\n  Getting training predictions...")
        train_preds = get_predictions(models, tokenizer, train_df)
        print(f"  Getting validation predictions...")
        val_preds = get_predictions(models, tokenizer, val_df)

        all_models.append(models)
        all_tokenizers.append(tokenizer)
        train_predictions.append(train_preds)
        val_predictions.append(val_preds)

        # Evaluate base model
        labels = val_df['avg_rating'].values
        spearman, _ = spearmanr(val_preds, labels)
        rounded = np.round(val_preds)
        acc = np.mean(np.abs(rounded - labels) <= 1)
        print(f"\n  Base model {idx + 1} - Spearman: {spearman:.4f}, Accuracy: {acc:.4f}")
       
        # Clear models from GPU after predictions
        for model in models:
            model.cpu()
        clear_memory()
        print_gpu_memory()

    # Train meta-learner (FULL ENSEMBLE)
    meta_learner = train_meta_learner(
        train_predictions,
        train_features.values,
        train_df['avg_rating'].values
    )

    # Final predictions
    print("="*70)
    print("FINAL EVALUATION")
    print("="*70)

    final_preds = predict_with_meta(meta_learner, val_predictions, val_features.values)

    labels = val_df['avg_rating'].values
    stds = val_df['std_rating'].values

    rounded_preds = np.round(final_preds)

    # Calculate metrics
    accuracy_within_1 = np.mean(np.abs(rounded_preds - labels) <= 1)
    accuracy_within_sd = np.mean(np.abs(rounded_preds - labels) <= np.maximum(stds, 1.0))
    spearman_corr, _ = spearmanr(final_preds, labels)
    exact_acc = np.mean(rounded_preds == labels)
    mae = np.mean(np.abs(final_preds - labels))

    print(f"\n{'='*70}")
    print(f"FINAL RESULTS")
    print(f"{'='*70}")
    print(f"Accuracy within 1:  {accuracy_within_1:.4f}")
    print(f"Accuracy within SD: {accuracy_within_sd:.4f} {'✓ TARGET MET!' if accuracy_within_sd >= 0.95 else '(Target: >0.95)'}")
    print(f"Spearman:           {spearman_corr:.4f} {'✓ TARGET MET!' if spearman_corr >= 0.77 else '(Target: >0.77)'}")
    print(f"Exact accuracy:     {exact_acc:.4f}")
    print(f"MAE:                {mae:.4f}")
    print(f"{'='*70}\n")

    # Save predictions
    results_df = val_df.copy()
    results_df['predicted_rating'] = final_preds
    results_df['rounded_prediction'] = rounded_preds
    results_df.to_csv('/kaggle/working/predictions.csv', index=False)
    print("✓ Predictions saved to '/kaggle/working/predictions.csv'")

    return {
        'accuracy_within_sd': accuracy_within_sd,
        'spearman': spearman_corr,
        'models': all_models,
        'meta_learner': meta_learner
    }

# ============================================================================
# RUN
# ============================================================================
if __name__ == "__main__":
    # Create output directory
    os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
   
    # Clear initial memory
    clear_memory()
    print_gpu_memory()

    # Run pipeline
    results = main()

    print("\n" + "="*70)
    print("PIPELINE COMPLETE!")
    print("="*70)


2025-12-18 11:19:29.514516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766056769.925082      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766056770.030581      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


WSD Competition System - Memory Optimized for Kaggle
Device: cuda

  GPU Memory: 0.00GB allocated, 0.00GB reserved

DATA LOADING
Loading: /kaggle/input/datasetllm/train.json
  ✓ Loaded 2280 records
Loading: /kaggle/input/datasetllm/dev.json
  ✓ Loaded 588 records

✓ Training samples: 2280
✓ Validation samples: 588

FEATURE EXTRACTION
  Loading embedding models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  ✓ Both models loaded
  Progress: 500/2280
  Progress: 1000/2280
  Progress: 1500/2280
  Progress: 2000/2280
✓ Extracted 2280 feature sets with BOTH embedding models


FEATURE EXTRACTION
  Loading embedding models...
  ✓ Both models loaded
  Progress: 500/588
✓ Extracted 588 feature sets with BOTH embedding models

TRAINING BASE MODELS

Loading tokenizer for microsoft/deberta-v3-base


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


Model 1/2: microsoft/deberta-v3-base
Training with 3-fold cross-validation

  Fold 1/3
  GPU Memory: 0.01GB allocated, 0.02GB reserved


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,26.718834,0.278947,-0.033314,5.04498,1.902653,0.036842
2,No log,3.468052,0.522368,0.048459,1.571248,1.082804,0.040789
3,21.980000,3.795277,0.522368,0.075583,1.630675,1.097558,0.043421
4,21.980000,3.113776,0.522368,0.118551,1.503041,1.067659,0.040789
5,2.737100,3.00295,0.530263,0.134332,1.474539,1.057253,0.044737
6,2.737100,4.057825,0.55,0.155641,1.662034,1.098437,0.05
7,2.512900,4.896299,0.555263,0.152945,1.816458,1.13357,0.051316
8,2.512900,4.401138,0.556579,0.170099,1.718248,1.109293,0.051316
9,2.336000,4.347707,0.559211,0.174946,1.705474,1.105585,0.051316
10,2.336000,4.418045,0.556579,0.174038,1.719037,1.108864,0.05


    Spearman: 0.1749, Acc: 0.5592
  Clearing cache after fold 1
  GPU Memory: 0.71GB allocated, 0.76GB reserved

  Fold 2/3
  GPU Memory: 0.71GB allocated, 0.76GB reserved


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,26.08252,0.203947,-0.058602,5.132494,1.975696,0.023684
2,No log,2.509433,0.573684,-0.014338,1.288978,0.973132,0.044737
3,21.358300,2.519687,0.573684,0.053329,1.288387,0.971385,0.044737
4,21.358300,2.489761,0.573684,0.079256,1.279251,0.967649,0.044737
5,2.964100,2.897015,0.586842,0.078029,1.350702,0.982376,0.039474
6,2.964100,3.175083,0.590789,0.084848,1.403141,0.995346,0.040789
7,2.686200,4.061028,0.596053,0.084378,1.567586,1.033641,0.047368
8,2.686200,3.690825,0.607895,0.102877,1.497866,1.019006,0.046053
9,2.388300,3.750475,0.601316,0.105394,1.508441,1.021814,0.048684
10,2.388300,3.789344,0.6,0.105685,1.515436,1.023314,0.047368


    Spearman: 0.1057, Acc: 0.6000
  Clearing cache after fold 2
  GPU Memory: 1.40GB allocated, 1.48GB reserved

  Fold 3/3
  GPU Memory: 1.40GB allocated, 1.48GB reserved


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,28.563734,0.255263,-0.156044,5.30746,1.970274,0.027632
2,No log,3.416252,0.514474,0.058673,1.550895,1.078266,0.044737
3,22.014600,3.112591,0.510526,0.100598,1.497905,1.06636,0.043421
4,22.014600,3.165622,0.510526,0.130935,1.504527,1.066741,0.043421
5,2.749300,2.965734,0.510526,0.177557,1.464134,1.055795,0.043421
6,2.749300,3.314839,0.564474,0.222053,1.518705,1.065238,0.056579
7,2.687100,3.147905,0.565789,0.257385,1.480083,1.05352,0.056579
8,2.687100,3.091736,0.572368,0.273455,1.458711,1.043733,0.051316
9,2.413400,2.980242,0.573684,0.282911,1.430604,1.033832,0.053947
10,2.413400,2.965259,0.571053,0.284456,1.426706,1.032416,0.053947


    Spearman: 0.2845, Acc: 0.5711
  Clearing cache after fold 3
  GPU Memory: 2.09GB allocated, 2.20GB reserved

✓ Completed 3-fold training

  Getting training predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained
  Getting validation predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained

  Base model 1 - Spearman: 0.1686, Accuracy: 0.5986
  GPU Memory: 0.03GB allocated, 0.06GB reserved

Loading tokenizer for roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


Model 2/2: roberta-base
Training with 3-fold cross-validation

  Fold 1/3
  GPU Memory: 0.03GB allocated, 0.06GB reserved


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,26.548082,0.278947,-0.013936,5.023792,1.897667,0.036842
2,No log,3.540797,0.523684,0.149192,1.581255,1.084474,0.042105
3,21.345900,4.246769,0.538158,0.205533,1.709633,1.113418,0.048684
4,21.345900,3.014607,0.522368,0.249059,1.480788,1.061213,0.040789
5,2.773100,2.789521,0.525,0.287073,1.423122,1.042017,0.040789
6,2.773100,3.2115,0.577632,0.351282,1.47748,1.044685,0.046053
7,2.455300,3.810195,0.585526,0.387312,1.586107,1.065367,0.051316
8,2.455300,2.837948,0.610526,0.414359,1.343575,0.987198,0.052632
9,1.987000,2.798332,0.625,0.421123,1.318487,0.973033,0.056579
10,1.987000,2.70848,0.627632,0.422818,1.295169,0.964307,0.056579


    Spearman: 0.4228, Acc: 0.6276
  Clearing cache after fold 1
  GPU Memory: 0.49GB allocated, 0.54GB reserved

  Fold 2/3
  GPU Memory: 0.49GB allocated, 0.54GB reserved


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,14.793439,0.456579,-0.0156,3.546767,1.584773,0.05
2,No log,2.416595,0.573684,-0.051569,1.283903,0.976701,0.044737
3,15.641800,2.576595,0.573684,0.046093,1.298684,0.972577,0.044737
4,15.641800,2.382909,0.573684,0.059905,1.271128,0.970323,0.044737
5,2.946000,2.390645,0.582895,0.101907,1.258435,0.962559,0.043421
6,2.946000,2.470782,0.617105,0.149138,1.256966,0.954671,0.048684
7,2.679500,2.687977,0.617105,0.178933,1.287936,0.956213,0.048684
8,2.679500,2.562568,0.617105,0.203916,1.251932,0.939555,0.047368
9,2.330800,2.673577,0.631579,0.212558,1.270151,0.942382,0.05
10,2.330800,2.638292,0.627632,0.21453,1.262475,0.939758,0.051316


    Spearman: 0.2145, Acc: 0.6276
  Clearing cache after fold 2
  GPU Memory: 0.96GB allocated, 1.03GB reserved

  Fold 3/3
  GPU Memory: 0.96GB allocated, 1.03GB reserved


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,32.312962,0.255263,-0.087422,5.756065,2.075182,0.027632
2,No log,3.460458,0.543421,0.134574,1.555807,1.079429,0.044737
3,23.620300,2.882974,0.510526,0.185976,1.452028,1.053274,0.043421
4,23.620300,3.505198,0.606579,0.313402,1.547209,1.070489,0.053947
5,2.777200,2.536194,0.586842,0.405405,1.315057,0.992904,0.051316
6,2.777200,2.515676,0.648684,0.474906,1.245954,0.949733,0.059211
7,2.282500,2.069786,0.698684,0.506433,1.091222,0.877604,0.068421
8,2.282500,2.32788,0.684211,0.515579,1.134919,0.888488,0.068421
9,1.692300,2.11961,0.710526,0.521455,1.070669,0.858552,0.073684
10,1.692300,2.125389,0.713158,0.521889,1.071361,0.858569,0.073684


    Spearman: 0.5219, Acc: 0.7132
  Clearing cache after fold 3
  GPU Memory: 1.42GB allocated, 1.53GB reserved

✓ Completed 3-fold training

  Getting training predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained
  Getting validation predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained

  Base model 2 - Spearman: 0.3076, Accuracy: 0.6276
  GPU Memory: 0.03GB allocated, 0.06GB reserved

TRAINING META-LEARNER
  Training Level 1 models...
    ✓ GBM trained
    ✓ Random Forest trained
  Training Level 2 meta-learner...
    ✓ Level 2 trained

✓ Meta-learner training complete

FINAL EVALUATION

FINAL RESULTS
Accuracy within 1:  0.6497
Accuracy within SD: 0.6922 (Target: >0.95)
Spearman:           0.3764 (Target: >0.77)
Exact accuracy:     0.0765
MAE:                0.9216

✓ Predictions saved to '/kaggle/working/predictions.csv'

PIPELINE COMPLETE!
