In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import json
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP (GOOGLE COLAB DRIVE MOUNT)
# ============================================================================

# Mount Google Drive so the dataset paths under /content/drive/MyDrive
# (Config.TRAIN_PATH / Config.VAL_PATH) can be opened. This import only
# exists inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# ============================================================================
# CONFIGURATION
# ============================================================================
class Config:
    """Class-level settings for the fine-tuning and stacking pipeline.

    Values are read directly as ``Config.NAME``; the visible code never
    instantiates this class.
    """
    # Paths
    # Train/dev JSON files on the mounted Google Drive.
    TRAIN_PATH = "/content/drive/MyDrive/datasetLLMProject/train.json"
    VAL_PATH = "/content/drive/MyDrive/datasetLLMProject/dev.json"
    # NOTE(review): TEST_PATH and OUTPUT_PATH are not read by any visible code.
    TEST_PATH = None
    OUTPUT_PATH = "final_predictions.csv"

    # Model settings
    # Hugging Face checkpoint names; one regression model is fine-tuned per entry.
    BASE_MODELS = [
        'microsoft/deberta-v3-base',
        'microsoft/deberta-v3-large',
        'roberta-large',
    ]

    # Training settings (passed to TrainingArguments in train_single_model)
    EPOCHS = 6
    BATCH_SIZE = 4  # per-device train batch size
    GRADIENT_ACCUMULATION = 4  # 4 x 4 -> effective per-device batch size of 16
    LEARNING_RATE = 1e-5
    WARMUP_RATIO = 0.1
    # NOTE(review): MAX_LENGTH is not referenced in the visible code; WSDDataset
    # uses its own max_length=512 default.
    MAX_LENGTH = 512
    WEIGHT_DECAY = 0.01

    # Advanced settings
    # When True, WSDDataset attaches a per-sample weight of 1 / (std + 0.5) and
    # WeightedTrainer multiplies the squared error by it.
    USE_WEIGHTED_LOSS = True
    # NOTE(review): USE_LAYER_WISE_LR is not referenced in the visible code.
    USE_LAYER_WISE_LR = True
    # NOTE(review): ENSEMBLE_WEIGHTS is not referenced in the visible code; the
    # final ensemble is produced by a GradientBoostingRegressor meta-learner.
    ENSEMBLE_WEIGHTS = [0.35, 0.35, 0.3]

    # Other settings
    # Device used for inference in get_model_predictions.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    SEED = 42

# Seed the global PyTorch and NumPy random generators with the shared seed.
torch.manual_seed(Config.SEED)
np.random.seed(Config.SEED)

# ============================================================================
# DATA LOADING
# ============================================================================
def load_json_data(filepath):
    """Load records from a JSON or JSON Lines file into a DataFrame.

    The file is first parsed as a single JSON document. A top-level dict whose
    keys are all digit strings is treated as an index -> record mapping and
    flattened in numeric key order; any other top-level dict is treated as a
    single record. If the file is not one valid JSON document, it is re-read as
    JSON Lines (one JSON value per non-blank line).

    Args:
        filepath: Path of the file to read (opened as UTF-8).

    Returns:
        A ``pandas.DataFrame`` of the records, or ``None`` when the file can be
        parsed neither as JSON nor as JSON Lines.

    Raises:
        OSError: If the file cannot be opened.
    """
    print("  Attempting to load as JSON...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print("  Trying JSON Lines format...")
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                stripped = (line.strip() for line in f)
                records = [json.loads(line) for line in stripped if line]
            frame = pd.DataFrame(records)
        except (ValueError, TypeError) as err:
            # json.JSONDecodeError subclasses ValueError. Only parse/shape
            # problems are handled here so that unrelated errors (including
            # KeyboardInterrupt) are no longer silently swallowed.
            print(f"  ✗ Failed to parse as JSON Lines: {err}")
            return None
        print(f"  ✓ Loaded {len(records)} records from JSON Lines")
        return frame

    if isinstance(data, dict):
        if all(k.isdigit() for k in data):
            # {"0": {...}, "1": {...}} -> records ordered by numeric key.
            data = [data[k] for k in sorted(data, key=int)]
        else:
            data = [data]
    print(f"  ✓ Loaded {len(data)} records from JSON")
    return pd.DataFrame(data)

def process_dataframe(df):
    """Normalize column names, rating columns and text columns of a raw frame.

    Steps:
      1. Rename the first matching alias of each canonical column
         (``precontext``, ``ambiguous_sentence``, ``ending``, ``homonym``,
         ``sense_definition``) to its canonical name.
      2. Require all canonical columns except ``ending``; add an empty
         ``ending`` column if it is absent.
      3. Produce float ``avg_rating`` / ``std_rating`` columns from either
         ``average``/``stdev`` or existing ``avg_rating``/``std_rating``.
         Missing standard deviations default to 1.0; with no rating column at
         all, ``avg_rating`` is NaN.
      4. Replace missing text with ``''`` and cast text columns to ``str``.

    Args:
        df: Raw ``pandas.DataFrame`` of records.

    Returns:
        The processed DataFrame (a renamed copy of ``df``).

    Raises:
        ValueError: If a required column is missing after alias mapping.
    """
    print("\nProcessing dataframe...")
    print(f"  Shape: {df.shape}")

    column_mapping = {
        'precontext': ['precontext', 'pre_context', 'context', 'premise'],
        'ambiguous_sentence': ['ambiguous_sentence', 'ambiguous', 'sentence', 'target_sentence'],
        'ending': ['ending', 'end', 'conclusion'],
        'homonym': ['homonym', 'target_word', 'word', 'ambiguous_word'],
        'sense_definition': ['sense_definition', 'sense', 'definition', 'gloss', 'judged_meaning'],
    }

    # Pick, per canonical column, the first alias that is present.
    final_mapping = {}
    for target_col, possible_names in column_mapping.items():
        for name in possible_names:
            if name in df.columns:
                final_mapping[name] = target_col
                break

    if final_mapping:
        df = df.rename(columns=final_mapping)
        print(f"  ✓ Mapped columns: {final_mapping}")

    required = ['precontext', 'ambiguous_sentence', 'homonym', 'sense_definition']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    if 'ending' not in df.columns:
        df['ending'] = ''

    # Handle ratings
    if 'average' in df.columns and 'stdev' in df.columns:
        df['avg_rating'] = df['average'].astype(float)
        df['std_rating'] = df['stdev'].astype(float).fillna(1.0)
    elif 'avg_rating' in df.columns:
        df['avg_rating'] = df['avg_rating'].astype(float)
        if 'std_rating' in df.columns:
            # Cast and fill like the average/stdev branch so a missing std
            # cannot turn into a NaN loss weight downstream.
            df['std_rating'] = df['std_rating'].astype(float).fillna(1.0)
        else:
            # A scalar broadcasts to every row regardless of the frame's index;
            # a default-indexed Series would be index-aligned and yield NaN for
            # any frame whose index is not 0..n-1.
            df['std_rating'] = 1.0
    else:
        df['avg_rating'] = np.nan
        df['std_rating'] = 1.0

    # Fill NaN
    for col in ['precontext', 'ambiguous_sentence', 'ending', 'homonym', 'sense_definition']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    return df

def load_data(train_path, val_path=None):
    """Load and preprocess the training and validation sets.

    Args:
        train_path: Path of the training JSON / JSON Lines file.
        val_path: Optional path of the validation file. When falsy, 15% of the
            training data is held out as validation (seeded with
            ``Config.SEED``).

    Returns:
        ``(train_df, val_df)``, both processed by ``process_dataframe`` and
        re-indexed from 0.

    Raises:
        ValueError: If a file cannot be parsed, or a required column is
            missing.
    """
    print("\n" + "="*70)
    print("DATA LOADING")
    print("="*70)

    train_df = load_json_data(train_path)
    if train_df is None:
        # The loader signals an unparseable file with None; fail here with the
        # path instead of an opaque AttributeError further down.
        raise ValueError(f"Could not parse training data at {train_path!r}")
    train_df = process_dataframe(train_df)

    if val_path:
        val_df = load_json_data(val_path)
        if val_df is None:
            raise ValueError(f"Could not parse validation data at {val_path!r}")
        val_df = process_dataframe(val_df)
    else:
        from sklearn.model_selection import train_test_split
        train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=Config.SEED)

    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    print("\n✓ Data loaded successfully!")
    print(f"  Training samples: {len(train_df)}")
    print(f"  Validation samples: {len(val_df)}")

    return train_df, val_df

# ============================================================================
# ENHANCED DATASET WITH BETTER INPUT FORMATTING
# ============================================================================
class WSDDataset(Dataset):
    """Torch dataset turning each row into one tokenized context/sense example.

    Each example is a single string of the form
    ``Context: <precontext> <sentence> [<ending>] <sep> Target word: <homonym>
    <sep> Sense: <sense>``, where ``<sep>`` is the tokenizer's own separator
    token.
    """

    def __init__(self, data, tokenizer, max_length=512):
        """Store the frame (re-indexed from 0), the tokenizer and the length cap.

        Args:
            data: DataFrame with ``precontext``, ``ambiguous_sentence``,
                ``homonym`` and ``sense_definition`` columns; ``ending``,
                ``avg_rating`` and ``std_rating`` are optional.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors.
            max_length: Length every example is padded/truncated to.
        """
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the model inputs (and label/weight when rated) for row ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors. When the
            row has a non-missing ``avg_rating``, also ``labels``
            (``avg_rating - 1``) and, if ``Config.USE_WEIGHTED_LOSS`` is set,
            ``weight`` (``1 / (std + 0.5)``).
        """
        row = self.data.iloc[idx]

        precontext = str(row['precontext']).strip()
        sentence = str(row['ambiguous_sentence']).strip()
        ending = str(row.get('ending', '')).strip()
        homonym = str(row['homonym']).strip()
        sense = str(row['sense_definition']).strip()

        if ending:
            context = f"Context: {precontext} {sentence} {ending}"
        else:
            context = f"Context: {precontext} {sentence}"

        # Use the tokenizer's real separator. A literal "[SEP]" is only a
        # special token for BERT/DeBERTa-style vocabularies; other tokenizers
        # (e.g. RoBERTa, whose separator is "</s>") would split it into
        # ordinary sub-word pieces.
        sep = getattr(self.tokenizer, 'sep_token', None) or '[SEP]'
        text = f"{context} {sep} Target word: {homonym} {sep} Sense: {sense}"

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            # return_tensors='pt' adds a leading batch axis; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if 'avg_rating' in row and pd.notna(row['avg_rating']):
            rating = float(row['avg_rating'])
            std = float(row.get('std_rating', 1.0))
            if pd.isna(std) or std < 0:
                # A missing/invalid std would give a NaN or negative weight and
                # poison the loss; fall back to the pipeline-wide default.
                std = 1.0

            # Shift ratings by one so the regression target starts at 0.
            item['labels'] = torch.tensor(rating - 1, dtype=torch.float)

            if Config.USE_WEIGHTED_LOSS:
                # Lower annotator disagreement -> larger weight.
                weight = 1.0 / (std + 0.5)
                item['weight'] = torch.tensor(weight, dtype=torch.float)

        return item

# ============================================================================
# ENHANCED FEATURE EXTRACTION
# ============================================================================
def extract_semantic_features(data):
    """Compute sentence-embedding similarity and simple text features per row.

    For every row the precontext, ambiguous sentence, sense definition,
    homonym, full context and (if non-empty) ending are embedded with the
    ``all-MiniLM-L6-v2`` SentenceTransformer, and pairwise cosine similarities
    are recorded together with length and containment features.

    Args:
        data: DataFrame with ``precontext``, ``ambiguous_sentence``,
            ``sense_definition`` and ``homonym`` columns; ``ending`` is
            optional.

    Returns:
        DataFrame with one row per input row (in input order, indexed from 0)
        and columns ``sim_pre_sense``, ``sim_amb_sense``, ``sim_full_sense``,
        ``sim_homonym_sense``, ``sim_pre_amb``, ``sim_end_sense``,
        ``sim_end_amb``, ``has_ending``, ``sense_length``, ``context_length``
        and ``homonym_in_sense``.
    """
    print("Extracting semantic features...")

    model = SentenceTransformer('all-MiniLM-L6-v2')
    features = []
    total = len(data)

    # Count by position rather than by index label: labels from iterrows() may
    # be non-integer or non-contiguous, which breaks or skews the progress log.
    for position, (_, row) in enumerate(data.iterrows()):
        if position % 500 == 0 and position > 0:
            print(f"  Processed {position}/{total} samples...")

        precontext = str(row['precontext'])
        ambiguous = str(row['ambiguous_sentence'])
        sense = str(row['sense_definition'])
        homonym = str(row['homonym'])
        ending = str(row.get('ending', '')).strip()

        # Get embeddings
        precontext_emb = model.encode(precontext)
        ambiguous_emb = model.encode(ambiguous)
        sense_emb = model.encode(sense)
        homonym_emb = model.encode(homonym)

        # Full context embedding
        full_context = f"{precontext} {ambiguous}"
        if ending:
            full_context += f" {ending}"
        full_emb = model.encode(full_context)

        # Calculate similarities
        feat = {
            'sim_pre_sense': cosine_sim(precontext_emb, sense_emb),
            'sim_amb_sense': cosine_sim(ambiguous_emb, sense_emb),
            'sim_full_sense': cosine_sim(full_emb, sense_emb),
            'sim_homonym_sense': cosine_sim(homonym_emb, sense_emb),
            'sim_pre_amb': cosine_sim(precontext_emb, ambiguous_emb),
        }

        # Ending features; zeros plus has_ending=0 mark rows without an ending.
        if ending:
            ending_emb = model.encode(ending)
            feat['sim_end_sense'] = cosine_sim(ending_emb, sense_emb)
            feat['sim_end_amb'] = cosine_sim(ending_emb, ambiguous_emb)
            feat['has_ending'] = 1
        else:
            feat['sim_end_sense'] = 0
            feat['sim_end_amb'] = 0
            feat['has_ending'] = 0

        # Text-based features (whitespace token counts, substring containment)
        feat['sense_length'] = len(sense.split())
        feat['context_length'] = len(full_context.split())
        feat['homonym_in_sense'] = int(homonym.lower() in sense.lower())

        features.append(feat)

    print(f"  ✓ Extracted {len(features)} feature sets")
    return pd.DataFrame(features)

def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    A small constant (1e-8) is added to the product of the norms so that a
    zero vector gives 0 rather than a division by zero.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / (norm_product + 1e-8)

# ============================================================================
# CUSTOM TRAINER WITH WEIGHTED LOSS
# ============================================================================
class WeightedTrainer(Trainer):
    """Trainer whose loss is a (optionally per-sample weighted) mean squared error.

    NOTE(review): the batch only carries a ``weight`` entry if it survives
    collation; with ``TrainingArguments(remove_unused_columns=True)`` (the
    library default) keys that are not model ``forward`` arguments may be
    stripped before reaching ``compute_loss`` — confirm against the arguments
    in use.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE between squeezed logits and ``labels``.

        ``labels`` and ``weight`` are removed from ``inputs`` before the
        forward pass. When a ``weight`` tensor is present and
        ``Config.USE_WEIGHTED_LOSS`` is true, each squared error is multiplied
        by its weight before averaging.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        sample_weights = inputs.pop("weight", None)

        outputs = model(**inputs)
        preds = outputs.logits.squeeze()

        squared_error = (preds - targets) ** 2
        if sample_weights is not None and Config.USE_WEIGHTED_LOSS:
            squared_error = sample_weights * squared_error
        loss = squared_error.mean()

        if return_outputs:
            return loss, outputs
        return loss

# ============================================================================
# ENHANCED TRAINING
# ============================================================================
def compute_metrics(eval_pred):
    """Compute evaluation metrics on the original 1-5 rating scale.

    Args:
        eval_pred: ``(predictions, labels)`` pair. Both are on the shifted
            training scale (rating - 1); predictions may carry a trailing
            singleton axis.

    Returns:
        dict of plain floats:
          * ``accuracy_within_1``: share of rounded predictions within 1 of
            the label.
          * ``spearman``: Spearman rank correlation; 0.0 when it is undefined
            (fewer than two samples, or constant input).
          * ``mse`` / ``mae``: mean squared / absolute error.
    """
    predictions, labels = eval_pred
    # reshape(-1) flattens (N, 1) logits and also turns a single 0-d value
    # into a length-1 array, so no special-casing is needed.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # Undo the "rating - 1" shift applied to the training targets.
    predictions = np.clip(predictions + 1, 1, 5)
    labels = labels + 1

    rounded_preds = np.round(predictions)
    accuracy_within_1 = np.mean(np.abs(rounded_preds - labels) <= 1)

    if predictions.size < 2:
        spearman_corr = 0.0
    else:
        spearman_corr, _ = spearmanr(predictions, labels)
        if np.isnan(spearman_corr):
            # Constant predictions/labels have no defined rank correlation.
            # NaN must not reach best-checkpoint selection, so report 0.
            spearman_corr = 0.0

    # Additional metrics
    mse = np.mean((predictions - labels) ** 2)
    mae = np.mean(np.abs(predictions - labels))

    return {
        'accuracy_within_1': float(accuracy_within_1),
        'spearman': float(spearman_corr),
        'mse': float(mse),
        'mae': float(mae),
    }

def train_single_model(model_name, train_dataset, val_dataset, model_idx):
    """Fine-tune one single-output regression transformer and report its metrics.

    Args:
        model_name: Hugging Face checkpoint name to load.
        train_dataset: Training dataset yielding ``input_ids``,
            ``attention_mask``, ``labels`` and optionally ``weight``.
        val_dataset: Validation dataset of the same form, evaluated every
            epoch.
        model_idx: Zero-based position of the model; used for the log header
            and the ``./model_<idx>`` checkpoint directory.

    Returns:
        The trained model held by the trainer. Because
        ``load_best_model_at_end=True``, this is the checkpoint with the best
        validation Spearman correlation.
    """
    print(f"\n{'='*60}")
    print(f"Training Model {model_idx + 1}: {model_name}")
    print(f"{'='*60}\n")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,
        problem_type="regression"
    )

    training_args = TrainingArguments(
        output_dir=f'./model_{model_idx}',
        num_train_epochs=Config.EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=Config.GRADIENT_ACCUMULATION,
        learning_rate=Config.LEARNING_RATE,
        warmup_ratio=Config.WARMUP_RATIO,
        weight_decay=Config.WEIGHT_DECAY,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="spearman",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=Config.SEED,
        # Additional improvements
        lr_scheduler_type="cosine",
        save_total_limit=2,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        # Keep the per-sample 'weight' entry in each batch. With the default
        # (True) the Trainer drops every key that is not an argument of the
        # model's forward(), so WeightedTrainer.compute_loss would never see
        # the weights and would silently fall back to plain MSE.
        # compute_loss pops 'weight' before calling the model.
        remove_unused_columns=False,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    results = trainer.evaluate()

    print("\nResults:")
    print(f"  Accuracy within 1: {results['eval_accuracy_within_1']:.4f}")
    print(f"  Spearman: {results['eval_spearman']:.4f}")
    print(f"  MSE: {results['eval_mse']:.4f}")
    print(f"  MAE: {results['eval_mae']:.4f}")

    return trainer.model

def get_model_predictions(model, tokenizer, data):
    """Run ``model`` over ``data`` and return ratings clipped to [1, 5].

    The frame is wrapped in a ``WSDDataset`` and scored in order in batches
    of 16 on ``Config.DEVICE`` without gradient tracking. Raw outputs are
    shifted by +1 (undoing the training-target shift) before clipping.

    Returns:
        1-D numpy array with one prediction per row of ``data``.
    """
    loader = DataLoader(WSDDataset(data, tokenizer), batch_size=16, shuffle=False)
    device = Config.DEVICE

    model.eval()
    model.to(device)

    collected = []
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_preds = outputs.logits.squeeze().cpu().numpy()
            # A one-sample batch squeezes to a 0-d array; make it iterable.
            collected.extend(np.atleast_1d(batch_preds))

    shifted = np.array(collected) + 1
    return np.clip(shifted, 1, 5)

def train_meta_learner(base_predictions, features, labels):
    """Fit the stacking regressor on base-model predictions plus features.

    Args:
        base_predictions: Sequence of 1-D prediction arrays, one per base
            model.
        features: 2-D array of extra features, one row per sample.
        labels: Target ratings.

    Returns:
        ``(meta_model, scaler)``: the fitted ``GradientBoostingRegressor`` and
        the ``StandardScaler`` fitted on the stacked inputs. The same scaler
        must be applied to inputs at prediction time.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Training Meta-Learner")
    print(rule)

    # One column per base model, followed by the feature columns.
    stacked = np.column_stack(list(base_predictions) + [features])

    scaler = StandardScaler()
    stacked = scaler.fit_transform(stacked)

    booster_params = dict(
        n_estimators=300,
        learning_rate=0.03,
        max_depth=6,
        min_samples_split=8,
        min_samples_leaf=4,
        subsample=0.8,
        max_features='sqrt',
        random_state=Config.SEED,
    )
    meta_model = GradientBoostingRegressor(**booster_params)
    meta_model.fit(stacked, labels)

    print("✓ Meta-learner trained successfully!")
    return meta_model, scaler

# ============================================================================
# MAIN PIPELINE
# ============================================================================
def main():
    """Run the full pipeline: load data, fine-tune base models, stack, evaluate.

    Steps:
      1. Load and preprocess the train/validation frames (abort with a message
         on failure).
      2. Extract sentence-embedding features for both frames.
      3. Fine-tune every model in ``Config.BASE_MODELS`` and collect its
         predictions on both frames.
      4. Fit the meta-learner on train predictions + features and score the
         validation set.
      5. Print accuracy-within-SD and Spearman correlation.
    """
    print("\n" + "="*70)
    print("SemEval 2026 Task 5 - Enhanced WSD Plausibility Prediction")
    print("="*70)
    print(f"Device: {Config.DEVICE}")

    try:
        train_df, val_df = load_data(Config.TRAIN_PATH, Config.VAL_PATH)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        return

    # Extract features
    print("\n" + "="*60)
    print("Feature Extraction")
    print("="*60)
    train_features = extract_semantic_features(train_df)
    val_features = extract_semantic_features(val_df)

    # Train base models
    print("\n" + "="*60)
    print("Training Base Models")
    print("="*60)

    train_predictions = []
    val_predictions = []

    for idx, model_name in enumerate(Config.BASE_MODELS):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        train_dataset = WSDDataset(train_df, tokenizer)
        val_dataset = WSDDataset(val_df, tokenizer)

        model = train_single_model(model_name, train_dataset, val_dataset, idx)

        print(f"  Getting predictions from model {idx+1}...")
        # NOTE(review): these are in-sample predictions (the model was trained
        # on train_df), so the meta-learner sees optimistic base predictions;
        # out-of-fold predictions would avoid that.
        train_predictions.append(get_model_predictions(model, tokenizer, train_df))
        val_predictions.append(get_model_predictions(model, tokenizer, val_df))

        # Only the predictions are used from here on. Release the model so it
        # does not occupy GPU memory while the next (larger) model is trained.
        model.to('cpu')
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Train meta-learner
    meta_model, scaler = train_meta_learner(
        train_predictions,
        train_features.values,
        train_df['avg_rating'].values
    )

    # Final ensemble predictions
    X_val = np.column_stack([*val_predictions, val_features.values])
    X_val = scaler.transform(X_val)
    final_val_preds = meta_model.predict(X_val)
    final_val_preds = np.clip(final_val_preds, 1, 5)

    # Evaluate
    labels = val_df['avg_rating'].values
    stds = val_df['std_rating'].values

    # A prediction counts as correct when its rounded value is within
    # max(std, 1) of the average rating.
    rounded_preds = np.round(final_val_preds)
    accuracy_within_sd = np.mean(
        np.abs(rounded_preds - labels) <= np.maximum(stds, 1.0)
    )
    spearman_corr, _ = spearmanr(final_val_preds, labels)

    print("\n" + "="*70)
    print("FINAL ENSEMBLE RESULTS")
    print("="*70)
    print(f"Accuracy within SD: {accuracy_within_sd:.4f} (Target: >0.95)")
    print(f"Spearman Correlation: {spearman_corr:.4f} (Target: >0.88)")
    print("="*70)

    print("\n" + "="*70)
    print("Pipeline Complete!")
    print("="*70)

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

SemEval 2026 Task 5 - Enhanced WSD Plausibility Prediction
Device: cuda

DATA LOADING
  Attempting to load as JSON...
  ✓ Loaded 2280 records from JSON

Processing dataframe...
  Shape: (2280, 11)
  ✓ Mapped columns: {'precontext': 'precontext', 'sentence': 'ambiguous_sentence', 'ending': 'ending', 'homonym': 'homonym', 'judged_meaning': 'sense_definition'}
  Attempting to load as JSON...
  ✓ Loaded 588 records from JSON

Processing dataframe...
  Shape: (588, 11)
  ✓ Mapped columns: {'precontext': 'precontext', 'sentence': 'ambiguous_sentence', 'ending': 'ending', 'homonym': 'homonym', 'judged_meaning': 'sense_definition'}

✓ Data loaded successfully!
  Training samples: 2280
  Validation samples: 588

Feature Extraction
Extracting semantic features...
  Processed 500/2280 samples...
  Processed 1000/2280 samples...
  Processed 1500/2280 samples...
  Proces

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae
1,1.8247,1.437064,0.554422,0.037418,1.437064,1.033427
2,1.3578,1.418723,0.554422,0.069797,1.418723,1.026958
3,1.3445,1.448017,0.578231,0.105966,1.448017,1.03017
4,1.1759,1.471805,0.581633,0.13086,1.471805,1.029933
5,1.0934,1.77547,0.586735,0.154722,1.77547,1.100171
6,0.9287,1.775277,0.586735,0.156165,1.775277,1.100124



Results:
  Accuracy within 1: 0.5867
  Spearman: 0.1562
  MSE: 1.7753
  MAE: 1.1001
  Getting predictions from model 1...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


Training Model 2: microsoft/deberta-v3-large



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae
1,1.4719,1.138917,0.612245,0.455891,1.138917,0.903269
2,0.885,1.040146,0.715986,0.59536,1.040146,0.815753
3,0.5603,0.895504,0.77551,0.629195,0.895504,0.766523


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae
1,1.4719,1.138917,0.612245,0.455891,1.138917,0.903269
2,0.885,1.040146,0.715986,0.59536,1.040146,0.815753
3,0.5603,0.895504,0.77551,0.629195,0.895504,0.766523
4,0.3832,0.970883,0.727891,0.648395,0.970883,0.787651
5,0.273,1.018079,0.72449,0.656558,1.018079,0.798871
6,0.2263,1.060844,0.719388,0.653623,1.060844,0.811612



Results:
  Accuracy within 1: 0.7245
  Spearman: 0.6566
  MSE: 1.0181
  MAE: 0.7989
  Getting predictions from model 2...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


Training Model 3: roberta-large



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Within 1,Spearman,Mse,Mae
1,1.5645,1.400755,0.554422,0.178852,1.400755,1.019659
2,1.2493,1.415012,0.64966,0.36622,1.415012,0.959751
3,0.8164,1.196297,0.683673,0.508159,1.196297,0.871852
4,0.4779,1.067317,0.72619,0.530419,1.067317,0.827472
5,0.3196,1.140024,0.721088,0.529461,1.140024,0.844153
6,0.239,1.200845,0.697279,0.528646,1.200598,0.859872



Results:
  Accuracy within 1: 0.7262
  Spearman: 0.5304
  MSE: 1.0673
  MAE: 0.8275
  Getting predictions from model 3...

Training Meta-Learner
✓ Meta-learner trained successfully!

FINAL ENSEMBLE RESULTS
Accuracy within SD: 0.8044 (Target: >0.95)
Spearman Correlation: 0.6525 (Target: >0.88)

Pipeline Complete!
