In [1]:
!pip install peft



In [2]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
import json
import os
import gc

# ============================================================================
# CONFIGURATION - LoRA OPTIMIZED
# ============================================================================
class Config:
    """Competition-grade configuration with LoRA"""
    # Paths
    TRAIN_PATH = "/kaggle/input/datasetllm/train.json"
    VAL_PATH = "/kaggle/input/datasetllm/dev.json"
    OUTPUT_DIR = "/kaggle/working/competition_models"

    # NOW WE CAN USE LARGE MODELS with LoRA!
    BASE_MODELS = [
        'microsoft/deberta-v3-large',      # 304M params but only train ~1%
        'roberta-large',                    # 355M params but only train ~1%
    ]

    # Training settings - BETTER with LoRA
    EPOCHS = 10                          # Back to original
    BATCH_SIZE = 12                       # Can increase with LoRA!
    GRADIENT_ACCUMULATION = 2            # Less needed with bigger batch
    LEARNING_RATE = 3e-4                 # Higher LR works better for LoRA
    WARMUP_RATIO = 0.1
    MAX_LENGTH = 512                     # Back to full length!
    WEIGHT_DECAY = 0.01

    # LoRA Configuration
    LORA_R = 16                          # Rank - higher = more capacity
    LORA_ALPHA = 32                      # Scaling factor
    LORA_DROPOUT = 0.1
    
    # Advanced settings
    USE_KFOLD = True
    N_FOLDS = 3                          # Back to 5 folds!
    USE_FOCAL_LOSS = True
    USE_ORDINAL = True

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    SEED = 42

torch.manual_seed(Config.SEED)
np.random.seed(Config.SEED)

print(f"\n{'='*70}")
print(f"WSD Competition System - LoRA/PEFT Optimized")
print(f"Device: {Config.DEVICE}")
print(f"Using LoRA: 3x memory reduction, full model performance!")
print(f"{'='*70}\n")

# ============================================================================
# MEMORY MANAGEMENT
# ============================================================================
def clear_memory():
    """Clear GPU and CPU memory"""
    gc.collect()
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.synchronize()

def print_gpu_memory():
    """Print current GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"  GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

# ============================================================================
# DATA LOADING
# ============================================================================
def load_json_data(filepath):
    """Load data from JSON file"""
    print(f"Loading: {filepath}")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, dict):
            if all(k.isdigit() for k in data.keys()):
                data = [data[k] for k in sorted(data.keys(), key=int)]
            else:
                data = [data]
        print(f"  ✓ Loaded {len(data)} records")
        return pd.DataFrame(data)
    except json.JSONDecodeError:
        data = []
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    data.append(json.loads(line))
        print(f"  ✓ Loaded {len(data)} records (JSONL)")
        return pd.DataFrame(data)

def process_dataframe(df):
    """Process and validate dataframe"""
    column_mapping = {
        'precontext': ['precontext', 'pre_context', 'context'],
        'ambiguous_sentence': ['ambiguous_sentence', 'ambiguous', 'sentence'],
        'ending': ['ending', 'end', 'conclusion'],
        'homonym': ['homonym', 'target_word', 'word'],
        'sense_definition': ['sense_definition', 'sense', 'judged_meaning'],
    }

    final_mapping = {}
    for target_col, possible_names in column_mapping.items():
        for name in possible_names:
            if name in df.columns:
                final_mapping[name] = target_col
                break

    if final_mapping:
        df = df.rename(columns=final_mapping)

    if 'ending' not in df.columns:
        df['ending'] = ''

    if 'average' in df.columns:
        df['avg_rating'] = df['average'].astype(float)
        df['std_rating'] = df.get('stdev', pd.Series([1.0] * len(df))).astype(float).fillna(1.0)
    elif 'avg_rating' in df.columns:
        df['avg_rating'] = df['avg_rating'].astype(float)
        df['std_rating'] = df.get('std_rating', pd.Series([1.0] * len(df)))
    else:
        df['avg_rating'] = np.nan
        df['std_rating'] = 1.0

    for col in ['precontext', 'ambiguous_sentence', 'ending', 'homonym', 'sense_definition']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    return df

def load_data(train_path, val_path=None):
    """Load and preprocess data"""
    print("\n" + "="*70)
    print("DATA LOADING")
    print("="*70)

    train_df = load_json_data(train_path)
    train_df = process_dataframe(train_df)

    if val_path:
        val_df = load_json_data(val_path)
        val_df = process_dataframe(val_df)
    else:
        from sklearn.model_selection import train_test_split
        train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=Config.SEED)

    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    print(f"\n✓ Training samples: {len(train_df)}")
    print(f"✓ Validation samples: {len(val_df)}")

    return train_df, val_df

# ============================================================================
# FEATURE EXTRACTION - BOTH MODELS
# ============================================================================
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

def word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0
    return len(words1 & words2) / len(words1 | words2)

def extract_features(data):
    """Extract semantic features with BOTH models"""
    print("\n" + "="*70)
    print("FEATURE EXTRACTION")
    print("="*70)

    model_mini = SentenceTransformer('all-MiniLM-L6-v2')
    model_mpnet = SentenceTransformer('all-mpnet-base-v2')

    features = []

    for idx, row in data.iterrows():
        if (idx + 1) % 500 == 0:
            print(f"  Progress: {idx + 1}/{len(data)}")

        precontext = str(row['precontext'])
        ambiguous = str(row['ambiguous_sentence'])
        sense = str(row['sense_definition'])
        homonym = str(row['homonym'])
        ending = str(row.get('ending', '')).strip()

        full_context = f"{precontext} {ambiguous}"
        if ending:
            full_context += f" {ending}"

        # Both models for maximum accuracy
        pre_mini = model_mini.encode(precontext)
        amb_mini = model_mini.encode(ambiguous)
        sense_mini = model_mini.encode(sense)
        full_mini = model_mini.encode(full_context)

        pre_mpnet = model_mpnet.encode(precontext)
        amb_mpnet = model_mpnet.encode(ambiguous)
        sense_mpnet = model_mpnet.encode(sense)
        full_mpnet = model_mpnet.encode(full_context)

        feat = {
            'sim_pre_sense_mini': cosine_sim(pre_mini, sense_mini),
            'sim_amb_sense_mini': cosine_sim(amb_mini, sense_mini),
            'sim_full_sense_mini': cosine_sim(full_mini, sense_mini),
            'sim_pre_sense_mpnet': cosine_sim(pre_mpnet, sense_mpnet),
            'sim_amb_sense_mpnet': cosine_sim(amb_mpnet, sense_mpnet),
            'sim_full_sense_mpnet': cosine_sim(full_mpnet, sense_mpnet),
            'sense_length': len(sense.split()),
            'context_length': len(full_context.split()),
            'homonym_in_sense': int(homonym.lower() in sense.lower()),
            'has_ending': int(bool(ending)),
            'precontext_sense_overlap': word_overlap(precontext, sense),
            'full_sense_overlap': word_overlap(full_context, sense),
        }

        if ending:
            end_mini = model_mini.encode(ending)
            end_mpnet = model_mpnet.encode(ending)
            feat['sim_end_sense_mini'] = cosine_sim(end_mini, sense_mini)
            feat['sim_end_sense_mpnet'] = cosine_sim(end_mpnet, sense_mpnet)
        else:
            feat['sim_end_sense_mini'] = 0
            feat['sim_end_sense_mpnet'] = 0

        features.append(feat)

    print(f"✓ Extracted {len(features)} feature sets\n")
    
    del model_mini, model_mpnet
    clear_memory()
    
    return pd.DataFrame(features)

# ============================================================================
# DATASET
# ============================================================================
class WSDDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        precontext = str(row['precontext']).strip()
        sentence = str(row['ambiguous_sentence']).strip()
        ending = str(row.get('ending', '')).strip()
        homonym = str(row['homonym']).strip()
        sense = str(row['sense_definition']).strip()

        if ending:
            text = (f"Context: {precontext} Target sentence: {sentence} "
                   f"Continuation: {ending} [SEP] Word: {homonym} [SEP] "
                   f"Meaning: {sense}")
        else:
            text = (f"Context: {precontext} Target sentence: {sentence} "
                   f"[SEP] Word: {homonym} [SEP] Meaning: {sense}")

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if 'avg_rating' in row and pd.notna(row['avg_rating']):
            rating = float(row['avg_rating'])
            std = float(row.get('std_rating', 1.0))

            item['labels'] = torch.tensor(rating - 1, dtype=torch.float)
            weight = 1.0 / (std + 0.3)
            item['weight'] = torch.tensor(weight, dtype=torch.float)

        return item

# ============================================================================
# FOCAL LOSS
# ============================================================================
class FocalMSELoss(nn.Module):
    def __init__(self, gamma=1.5):
        super().__init__()
        self.gamma = gamma

    def forward(self, pred, target, weight=None):
        mse = (pred - target) ** 2
        focal_weight = torch.abs(pred - target) ** self.gamma
        loss = focal_weight * mse

        if weight is not None:
            loss = weight * loss

        return loss.mean()

# ============================================================================
# CUSTOM TRAINER
# ============================================================================
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if Config.USE_FOCAL_LOSS:
            self.focal_loss = FocalMSELoss()

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        weights = inputs.pop("weight", None)

        outputs = model(**inputs)
        logits = outputs.logits.squeeze()

        if Config.USE_FOCAL_LOSS:
            loss = self.focal_loss(logits, labels, weights)
        elif weights is not None:
            loss = (weights * (logits - labels) ** 2).mean()
        else:
            loss = ((logits - labels) ** 2).mean()

        return (loss, outputs) if return_outputs else loss

# ============================================================================
# METRICS
# ============================================================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()

    if predictions.ndim == 0:
        predictions = np.array([predictions])
    if labels.ndim == 0:
        labels = np.array([labels])

    predictions = np.clip(predictions + 1, 1, 5)
    labels = labels + 1

    rounded_preds = np.round(predictions)
    accuracy_within_1 = np.mean(np.abs(rounded_preds - labels) <= 1)
    spearman_corr, _ = spearmanr(predictions, labels)

    mse = np.mean((predictions - labels) ** 2)
    mae = np.mean(np.abs(predictions - labels))
    exact_acc = np.mean(rounded_preds == labels)

    return {
        'accuracy_within_1': accuracy_within_1,
        'spearman': spearman_corr,
        'mse': mse,
        'mae': mae,
        'exact_accuracy': exact_acc,
    }

# ============================================================================
# LoRA MODEL SETUP
# ============================================================================
def create_lora_model(model_name):
    """Create model with LoRA adapters"""
    print(f"  Loading base model: {model_name}")
    
    # Load base model
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,
        problem_type="regression"
    )
    
    # LoRA configuration - targets all key layers in DeBERTa/RoBERTa
    lora_config = LoraConfig(
        r=Config.LORA_R,
        lora_alpha=Config.LORA_ALPHA,
        target_modules=["query_proj", "key_proj", "value_proj", "dense"],  # DeBERTa/RoBERTa layers
        lora_dropout=Config.LORA_DROPOUT,
        bias="none",
        task_type=TaskType.SEQ_CLS
    )
    
    # Apply LoRA
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()
    
    return model

# ============================================================================
# K-FOLD TRAINING with LoRA
# ============================================================================
def train_fold(model_name, train_df, tokenizer, fold_idx, train_indices, val_indices):
    """Train single fold with LoRA"""
    print(f"\n  Fold {fold_idx + 1}/{Config.N_FOLDS}")
    print_gpu_memory()

    fold_train = train_df.iloc[train_indices].reset_index(drop=True)
    fold_val = train_df.iloc[val_indices].reset_index(drop=True)

    train_dataset = WSDDataset(fold_train, tokenizer, Config.MAX_LENGTH)
    val_dataset = WSDDataset(fold_val, tokenizer, Config.MAX_LENGTH)

    # Create LoRA model
    model = create_lora_model(model_name)

    output_dir = f"{Config.OUTPUT_DIR}/fold_{fold_idx}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=Config.EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=Config.GRADIENT_ACCUMULATION,
        learning_rate=Config.LEARNING_RATE,
        warmup_ratio=Config.WARMUP_RATIO,
        weight_decay=Config.WEIGHT_DECAY,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="spearman",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=Config.SEED + fold_idx,
        lr_scheduler_type="cosine_with_restarts",
        save_total_limit=1,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        optim="adamw_torch",
        max_grad_norm=1.0,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()

    print(f"    Spearman: {eval_results['eval_spearman']:.4f}, "
          f"Acc: {eval_results['eval_accuracy_within_1']:.4f}")

    trained_model = trainer.model
    
    del trainer, train_dataset, val_dataset
    clear_memory()

    return trained_model

def train_model_kfold(model_name, train_df, tokenizer, model_idx):
    """Train model with k-fold cross-validation using LoRA"""
    print(f"\n{'='*70}")
    print(f"Model {model_idx + 1}/{len(Config.BASE_MODELS)}: {model_name}")
    print(f"Training with {Config.N_FOLDS}-fold cross-validation + LoRA")
    print(f"{'='*70}")

    kf = KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)
    models = []

    for fold_idx, (train_indices, val_indices) in enumerate(kf.split(train_df)):
        model = train_fold(model_name, train_df, tokenizer, fold_idx, train_indices, val_indices)
        models.append(model)
        
        clear_memory()
        print_gpu_memory()

    print(f"\n✓ Completed {Config.N_FOLDS}-fold training with LoRA")
    return models

# ============================================================================
# PREDICTIONS
# ============================================================================
def get_predictions(models, tokenizer, data):
    """Get averaged predictions from model ensemble"""
    all_predictions = []

    for model_idx, model in enumerate(models):
        dataset = WSDDataset(data, tokenizer, Config.MAX_LENGTH)
        loader = DataLoader(dataset, batch_size=16, shuffle=False)

        model.eval()
        model.to(Config.DEVICE)

        predictions = []
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(Config.DEVICE)
                attention_mask = batch['attention_mask'].to(Config.DEVICE)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = outputs.logits.squeeze().cpu().numpy()

                if preds.ndim == 0:
                    preds = np.array([preds])

                predictions.extend(preds)
                
                del input_ids, attention_mask, outputs
                
        predictions = np.array(predictions) + 1
        all_predictions.append(predictions)

        if Config.USE_KFOLD:
            print(f"    Fold {model_idx + 1}/{len(models)} predictions obtained")
        
        model.cpu()
        clear_memory()

    final_predictions = np.mean(all_predictions, axis=0)
    final_predictions = np.clip(final_predictions, 1, 5)

    return final_predictions

# ============================================================================
# META-LEARNER
# ============================================================================
def train_meta_learner(base_predictions, features, labels):
    """Train stacked meta-learner"""
    print("\n" + "="*70)
    print("TRAINING META-LEARNER")
    print("="*70)

    X = np.column_stack([*base_predictions, features])

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    print("  Training Level 1 models...")
    level1_models = []
    level1_preds = []

    gbm = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.02,
        max_depth=7,
        min_samples_split=10,
        min_samples_leaf=5,
        subsample=0.8,
        max_features='sqrt',
        random_state=Config.SEED
    )
    gbm.fit(X, labels)
    level1_models.append(('gbm', gbm))
    level1_preds.append(gbm.predict(X))
    print("    ✓ GBM trained")

    rf = RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        min_samples_split=8,
        min_samples_leaf=4,
        max_features='sqrt',
        random_state=Config.SEED,
        n_jobs=-1
    )
    rf.fit(X, labels)
    level1_models.append(('rf', rf))
    level1_preds.append(rf.predict(X))
    print("    ✓ Random Forest trained")

    print("  Training Level 2 meta-learner...")
    X_level2 = np.column_stack([*level1_preds, X])

    meta_meta = GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.03,
        max_depth=5,
        min_samples_split=10,
        subsample=0.9,
        random_state=Config.SEED
    )
    meta_meta.fit(X_level2, labels)
    print("    ✓ Level 2 trained")

    print("\n✓ Meta-learner training complete\n")

    return {
        'level1_models': level1_models,
        'level2_model': meta_meta,
        'scaler': scaler
    }

def predict_with_meta(meta_learner, base_predictions, features):
    """Make predictions with meta-learner"""
    X = np.column_stack([*base_predictions, features])
    X = meta_learner['scaler'].transform(X)

    level1_preds = []
    for name, model in meta_learner['level1_models']:
        level1_preds.append(model.predict(X))

    X_level2 = np.column_stack([*level1_preds, X])
    final_preds = meta_learner['level2_model'].predict(X_level2)

    return np.clip(final_preds, 1, 5)

# ============================================================================
# MAIN PIPELINE
# ============================================================================
def main():
    """Complete training pipeline with LoRA"""

    # Load data
    train_df, val_df = load_data(Config.TRAIN_PATH, Config.VAL_PATH)

    # Extract features
    train_features = extract_features(train_df)
    val_features = extract_features(val_df)

    # Train base models with LoRA
    print("="*70)
    print("TRAINING BASE MODELS WITH LoRA")
    print("="*70)

    all_models = []
    all_tokenizers = []
    train_predictions = []
    val_predictions = []

    for idx, model_name in enumerate(Config.BASE_MODELS):
        print(f"\nLoading tokenizer for {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Train with k-fold + LoRA
        models = train_model_kfold(model_name, train_df, tokenizer, idx)

        # Get predictions
        print(f"\n  Getting training predictions...")
        train_preds = get_predictions(models, tokenizer, train_df)
        print(f"  Getting validation predictions...")
        val_preds = get_predictions(models, tokenizer, val_df)

        all_models.append(models)
        all_tokenizers.append(tokenizer)
        train_predictions.append(train_preds)
        val_predictions.append(val_preds)

        # Evaluate base model
        labels = val_df['avg_rating'].values
        spearman, _ = spearmanr(val_preds, labels)
        rounded = np.round(val_preds)
        acc = np.mean(np.abs(rounded - labels) <= 1)
        print(f"\n  Base model {idx + 1} - Spearman: {spearman:.4f}, Accuracy: {acc:.4f}")
        
        for model in models:
            model.cpu()
        clear_memory()
        print_gpu_memory()

    # Train meta-learner
    meta_learner = train_meta_learner(
        train_predictions,
        train_features.values,
        train_df['avg_rating'].values
    )

    # Final predictions
    print("="*70)
    print("FINAL EVALUATION")
    print("="*70)

    final_preds = predict_with_meta(meta_learner, val_predictions, val_features.values)

    labels = val_df['avg_rating'].values
    stds = val_df['std_rating'].values

    rounded_preds = np.round(final_preds)

    # Calculate metrics
    accuracy_within_1 = np.mean(np.abs(rounded_preds - labels) <= 1)
    accuracy_within_sd = np.mean(np.abs(rounded_preds - labels) <= np.maximum(stds, 1.0))
    spearman_corr, _ = spearmanr(final_preds, labels)
    exact_acc = np.mean(rounded_preds == labels)
    mae = np.mean(np.abs(final_preds - labels))

    print(f"\n{'='*70}")
    print(f"FINAL RESULTS - LoRA OPTIMIZED")
    print(f"{'='*70}")
    print(f"Accuracy within 1:  {accuracy_within_1:.4f}")
    print(f"Accuracy within SD: {accuracy_within_sd:.4f} {'✓ TARGET MET!' if accuracy_within_sd >= 0.95 else '(Target: >0.95)'}")
    print(f"Spearman:           {spearman_corr:.4f} {'✓ TARGET MET!' if spearman_corr >= 0.77 else '(Target: >0.77)'}")
    print(f"Exact accuracy:     {exact_acc:.4f}")
    print(f"MAE:                {mae:.4f}")
    print(f"{'='*70}\n")

    # Save predictions
    results_df = val_df.copy()
    results_df['predicted_rating'] = final_preds
    results_df['rounded_prediction'] = rounded_preds
    results_df.to_csv('/kaggle/working/predictions.csv', index=False)
    print("✓ Predictions saved to '/kaggle/working/predictions.csv'")

    return {
        'accuracy_within_sd': accuracy_within_sd,
        'spearman': spearman_corr,
        'models': all_models,
        'meta_learner': meta_learner
    }

# ============================================================================
# RUN
# ============================================================================
if __name__ == "__main__":
    os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
    
    clear_memory()
    print_gpu_memory()

    # Run pipeline
    results = main()

    print("\n" + "="*70)
    print("PIPELINE COMPLETE!")
    print("="*70)


2025-12-20 10:11:32.155167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766225492.353365      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766225492.413018      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766225492.914157      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766225492.914192      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766225492.914198      23 computation_placer.cc:177] computation placer alr


WSD Competition System - LoRA/PEFT Optimized
Device: cuda
Using LoRA: 3x memory reduction, full model performance!

  GPU Memory: 0.00GB allocated, 0.00GB reserved

DATA LOADING
Loading: /kaggle/input/datasetllm/train.json
  ✓ Loaded 2280 records
Loading: /kaggle/input/datasetllm/dev.json
  ✓ Loaded 588 records

✓ Training samples: 2280
✓ Validation samples: 588

FEATURE EXTRACTION


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  Progress: 500/2280
  Progress: 1000/2280
  Progress: 1500/2280
  Progress: 2000/2280
✓ Extracted 2280 feature sets


FEATURE EXTRACTION
  Progress: 500/588
✓ Extracted 588 feature sets

TRAINING BASE MODELS WITH LoRA

Loading tokenizer for microsoft/deberta-v3-large


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


Model 1/2: microsoft/deberta-v3-large
Training with 3-fold cross-validation + LoRA

  Fold 1/3
  GPU Memory: 0.01GB allocated, 0.02GB reserved
  Loading base model: microsoft/deberta-v3-large


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,111,681 || all params: 442,174,466 || trainable%: 1.6083


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,3.231931,0.522368,0.000332,1.530262,1.076602,0.040789
2,9.043000,3.557775,0.522368,0.016316,1.588595,1.087015,0.040789
3,9.043000,3.68329,0.522368,0.003162,1.611214,1.09305,0.040789
4,2.863900,2.935202,0.522368,-0.018053,1.477102,1.064446,0.040789
5,2.738100,2.935191,0.522368,0.035666,1.477364,1.064312,0.040789
6,2.738100,3.166312,0.522368,0.05493,1.517928,1.073028,0.040789
7,2.812400,3.370461,0.522368,0.000378,1.554784,1.080489,0.040789
8,2.838900,3.09266,0.522368,0.02198,1.504672,1.069525,0.040789
9,2.838900,3.043317,0.522368,0.011396,1.49583,1.066686,0.040789
10,2.692300,3.059703,0.522368,0.017894,1.498761,1.067689,0.040789


    Spearman: 0.0549, Acc: 0.5224
  GPU Memory: 1.67GB allocated, 1.69GB reserved

  Fold 2/3
  GPU Memory: 1.67GB allocated, 1.69GB reserved
  Loading base model: microsoft/deberta-v3-large


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,111,681 || all params: 442,174,466 || trainable%: 1.6083


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,2.40649,0.573684,0.018484,1.278646,0.973951,0.044737
2,13.367700,2.92083,0.573684,0.017009,1.361852,0.988765,0.043421
3,13.367700,2.76143,0.573684,-0.007116,1.332653,0.980124,0.044737
4,3.186600,2.405601,0.573684,0.00992,1.276393,0.971457,0.044737
5,3.088100,2.496438,0.573684,0.003884,1.286883,0.972394,0.044737
6,3.088100,2.403667,0.573684,0.00602,1.277731,0.973507,0.044737
7,3.008100,2.468553,0.573684,-0.001563,1.282677,0.971734,0.044737
8,2.997300,2.437646,0.573684,0.012139,1.278511,0.970801,0.044737
9,2.997300,2.404509,0.573684,0.020592,1.278629,0.974379,0.044737
10,3.019400,2.404224,0.573684,-0.004439,1.276754,0.972195,0.044737


    Spearman: 0.0206, Acc: 0.5737
  GPU Memory: 3.32GB allocated, 3.34GB reserved

  Fold 3/3
  GPU Memory: 3.32GB allocated, 3.34GB reserved
  Loading base model: microsoft/deberta-v3-large


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,111,681 || all params: 442,174,466 || trainable%: 1.6083


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,3.053401,0.510526,0.009985,1.489492,1.065674,0.043421
2,17.076000,2.895275,0.510526,-0.009858,1.465427,1.058961,0.043421
3,17.076000,2.963378,0.510526,0.125059,1.474856,1.061556,0.043421
4,2.846300,3.104791,0.510526,0.101955,1.498154,1.067354,0.043421
5,2.835700,2.907996,0.510526,0.107509,1.46659,1.058074,0.043421
6,2.835700,2.958249,0.510526,0.12881,1.473792,1.061225,0.043421
7,2.697200,2.963061,0.510526,0.168587,1.474424,1.061463,0.043421
8,2.708500,2.906576,0.510526,0.167777,1.465759,1.057678,0.043421
9,2.708500,2.91367,0.510526,0.169175,1.466674,1.058115,0.043421
10,2.786100,2.903554,0.510526,0.170014,1.465242,1.057579,0.043421


    Spearman: 0.1700, Acc: 0.5105
  GPU Memory: 4.97GB allocated, 4.98GB reserved

✓ Completed 3-fold training with LoRA

  Getting training predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained
  Getting validation predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained

  Base model 1 - Spearman: 0.0312, Accuracy: 0.5544
  GPU Memory: 0.02GB allocated, 0.05GB reserved

Loading tokenizer for roberta-large


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


Model 2/2: roberta-large
Training with 3-fold cross-validation + LoRA

  Fold 1/3
  GPU Memory: 0.02GB allocated, 0.05GB reserved
  Loading base model: roberta-large


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 5,769,217 || all params: 361,129,986 || trainable%: 1.5975


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,2.925222,0.522368,0.078297,1.471028,1.060337,0.040789
2,6.015300,2.859376,0.522368,0.164892,1.456213,1.056787,0.040789
3,6.015300,3.688716,0.557895,0.238029,1.598016,1.084707,0.046053
4,2.934600,2.549423,0.551316,0.335561,1.330143,1.00019,0.040789
5,2.792600,2.55713,0.539474,0.354456,1.331651,1.000612,0.042105
6,2.792600,2.605881,0.634211,0.492788,1.262494,0.94682,0.055263
7,2.121400,1.815272,0.675,0.570616,1.042851,0.867769,0.065789
8,1.267700,2.048014,0.719737,0.609085,1.023364,0.82564,0.071053
9,1.267700,1.933418,0.736842,0.62675,0.975507,0.800453,0.076316
10,0.898200,1.733788,0.753947,0.632764,0.917157,0.778519,0.080263


    Spearman: 0.6328, Acc: 0.7539
  GPU Memory: 1.37GB allocated, 1.39GB reserved

  Fold 2/3
  GPU Memory: 1.37GB allocated, 1.39GB reserved
  Loading base model: roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 5,769,217 || all params: 361,129,986 || trainable%: 1.5975


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,2.487892,0.573684,0.036438,1.304043,0.984744,0.044737
2,6.280500,2.522106,0.573684,0.135066,1.28544,0.969041,0.044737
3,6.280500,4.998045,0.575,0.194587,1.767346,1.089082,0.05
4,3.281300,1.884252,0.653947,0.401721,1.065944,0.875355,0.052632
5,2.430000,1.731856,0.672368,0.490765,1.008556,0.848832,0.060526
6,2.430000,1.909873,0.718421,0.571829,0.987607,0.810191,0.072368
7,1.747300,1.914073,0.760526,0.601309,0.936969,0.767811,0.071053
8,0.954400,1.866119,0.765789,0.613208,0.908153,0.747271,0.077632
9,0.954400,1.736359,0.790789,0.620369,0.858776,0.724449,0.076316
10,0.685400,2.015613,0.764474,0.621317,0.934942,0.751399,0.081579


    Spearman: 0.6213, Acc: 0.7645
  GPU Memory: 2.72GB allocated, 2.74GB reserved

  Fold 3/3
  GPU Memory: 2.72GB allocated, 2.74GB reserved
  Loading base model: roberta-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 5,769,217 || all params: 361,129,986 || trainable%: 1.5975


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae,Exact Accuracy
1,No log,3.085313,0.510526,0.011417,1.509319,1.069868,0.043421
2,10.797700,3.175775,0.510526,0.135811,1.508029,1.068015,0.043421
3,10.797700,2.777354,0.510526,0.220122,1.429477,1.044431,0.043421
4,2.992700,2.463428,0.582895,0.411651,1.288326,0.980493,0.052632
5,2.591300,2.068004,0.630263,0.530466,1.123611,0.900845,0.067105
6,2.591300,2.097243,0.678947,0.582412,1.08656,0.869042,0.068421
7,1.758700,1.716128,0.753947,0.638947,0.905829,0.769479,0.081579
8,1.156900,1.610401,0.788158,0.651025,0.841234,0.730974,0.092105
9,1.156900,1.628864,0.775,0.658813,0.853315,0.738053,0.096053
10,0.805700,1.647124,0.769737,0.658903,0.859072,0.74081,0.096053


    Spearman: 0.6589, Acc: 0.7697
  GPU Memory: 4.06GB allocated, 4.08GB reserved

✓ Completed 3-fold training with LoRA

  Getting training predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained
  Getting validation predictions...
    Fold 1/3 predictions obtained
    Fold 2/3 predictions obtained
    Fold 3/3 predictions obtained

  Base model 2 - Spearman: 0.4597, Accuracy: 0.6990
  GPU Memory: 0.02GB allocated, 0.05GB reserved

TRAINING META-LEARNER
  Training Level 1 models...
    ✓ GBM trained
    ✓ Random Forest trained
  Training Level 2 meta-learner...
    ✓ Level 2 trained

✓ Meta-learner training complete

FINAL EVALUATION

FINAL RESULTS - LoRA OPTIMIZED
Accuracy within 1:  0.6837
Accuracy within SD: 0.7279 (Target: >0.95)
Spearman:           0.4644 (Target: >0.77)
Exact accuracy:     0.0918
MAE:                0.8816

✓ Predictions saved to '/kaggle/working/predictions.csv'

PIPELINE COMPLETE!
