In [1]:
import json
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# INSTALL PEFT (Run this first in Colab)
# ============================================================================
# !pip install peft -q

print("="*80)
print("  LoRA FINE-TUNING FOR SEMEVAL 2026")
print("="*80)

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n Device: {device}")
if not torch.cuda.is_available():
    print("  WARNING: No GPU detected!")
    print("   Enable GPU: Runtime ‚Üí Change runtime type ‚Üí T4 GPU")


  LoRA FINE-TUNING FOR SEMEVAL 2026

 Device: cuda



# SECTION 1: DATA LOADING


In [2]:
from google.colab import drive
drive.mount('/content/drive')
def load_data(data_path='/content/drive/MyDrive/datasetLLMProject/'):
    with open(f'{data_path}train.json', 'r') as f:
        train_data = json.load(f)
    with open(f'{data_path}dev.json', 'r') as f:
        dev_data = json.load(f)
    return train_data, dev_data

def prepare_dataframe(data):
    stories_list = []
    for story_id, content in data.items():
        story_content = content.copy()
        story_content['story_id'] = story_id
        stories_list.append(story_content)
    return pd.DataFrame(stories_list)

def create_full_story(row):
    precontext = row.get('precontext', '').strip()
    sentence = row.get('sentence', '').strip()
    ending = row.get('ending', '').strip()
    return f"{precontext} {sentence} {ending}" if ending else f"{precontext} {sentence}"

train_data, dev_data = load_data()
train_df = prepare_dataframe(train_data)
dev_df = prepare_dataframe(dev_data)
train_df['full_story'] = train_df.apply(create_full_story, axis=1)
dev_df['full_story'] = dev_df.apply(create_full_story, axis=1)

print(f"\nüìä Data: {len(train_df)} train, {len(dev_df)} dev samples")

Mounted at /content/drive

üìä Data: 2280 train, 588 dev samples


# SECTION 2: ENHANCED PROMPT

In [3]:
def create_structured_prompt(row):
    """Structured prompt for better understanding"""
    homonym = row['homonym']
    meaning = row['judged_meaning']
    story = row['full_story']

    # Highlight homonym
    story_highlighted = story.replace(f" {homonym} ", f" [{homonym}] ")
    story_highlighted = story_highlighted.replace(f" {homonym.capitalize()} ",
                                                  f" [{homonym.capitalize()}] ")

    prompt = (
        f"Story: {story_highlighted}\n\n"
        f"Question: How plausible is the meaning '{meaning}' for the word '{homonym}' "
        f"in this context?\n\n"
        f"Rating (1=not plausible, 5=very plausible):"
    )

    return prompt

# SECTION 3: LoRA MODEL WITH CUSTOM HEAD

In [4]:
class LoRAModelWithHead(nn.Module):
    """
    Model with LoRA + Custom Regression Head

    LoRA (Low-Rank Adaptation):
    - Only trains small matrices (rank r << hidden_dim)
    - Injects trainable rank decomposition into frozen weights
    - W = W_frozen + B*A where B,A are trainable low-rank matrices
    """

    def __init__(self, model_name, n_homonyms, lora_config):
        super().__init__()

        print(f"\nüîß Loading base model: {model_name}")
        base_model = AutoModel.from_pretrained(model_name)

        # Apply LoRA - this makes 99.9% of params non-trainable!
        print(f" Applying LoRA adaptation...")
        self.encoder = get_peft_model(base_model, lora_config)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in self.encoder.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in self.encoder.parameters())
        print(f"   Trainable params: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)")
        print(f"   Total params: {total_params:,}")
        print(f"   Memory reduction: {100*(1-trainable_params/total_params):.1f}%")

        hidden_size = base_model.config.hidden_size

        # Custom task-specific head (always trainable)
        self.homonym_embedding = nn.Embedding(n_homonyms, 64)

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size + 64, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(128, 1)
        )

        # Uncertainty head for better calibration
        self.uncertainty_head = nn.Sequential(
            nn.Linear(hidden_size + 64, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Softplus()
        )

    def forward(self, input_ids, attention_mask, homonym_ids, labels=None, stdevs=None):
        # Get LoRA-adapted embeddings
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Mean pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
        sum_embeddings = torch.sum(outputs.last_hidden_state * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        pooled = sum_embeddings / sum_mask

        # Add homonym context
        homonym_emb = self.homonym_embedding(homonym_ids)
        combined = torch.cat([pooled, homonym_emb], dim=1)

        # Predictions
        rating_logits = self.regressor(combined)
        uncertainty_logits = self.uncertainty_head(combined)

        loss = None
        if labels is not None:
            # Huber loss (robust to outliers)
            rating_loss = nn.SmoothL1Loss()(rating_logits.squeeze(), labels)

            if stdevs is not None:
                # Uncertainty-aware loss
                uncertainty_loss = nn.MSELoss()(uncertainty_logits.squeeze(), stdevs)
                # Weight by uncertainty (harder examples get more weight)
                weights = 1.0 + stdevs
                weighted_loss = (weights * (rating_logits.squeeze() - labels) ** 2).mean()
                loss = weighted_loss + 0.2 * uncertainty_loss
            else:
                loss = rating_loss

        return {'loss': loss, 'logits': rating_logits, 'uncertainty': uncertainty_logits}

# SECTION 4: DATASET

In [5]:

class LoRADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=320):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

        all_homonyms = sorted(set(dataframe['homonym'].unique()))
        self.homonym_to_id = {h: i for i, h in enumerate(all_homonyms)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = create_structured_prompt(row)

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        homonym_id = self.homonym_to_id.get(row['homonym'], 0)
        rating = (row['average'] - 1) / 4.0  # Normalize [1,5] ‚Üí [0,1]
        stdev = row['stdev'] / 2.0  # Normalize

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'homonym_ids': torch.tensor(homonym_id, dtype=torch.long),
            'labels': torch.tensor(rating, dtype=torch.float),
            'stdevs': torch.tensor(stdev, dtype=torch.float)
        }

# SECTION 5: CUSTOM TRAINER

In [6]:
class LoRATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        stdevs = inputs.pop("stdevs")
        outputs = model(**inputs, labels=labels, stdevs=stdevs)
        loss = outputs['loss']
        return (loss, outputs) if return_outputs else loss

# SECTION 6: TRAINING FUNCTION

In [7]:
def train_lora_model(
    model_name='microsoft/deberta-v3-base',  # Can use LARGER models now!
    lora_r=16,           # LoRA rank (8, 16, 32) - higher = more capacity
    lora_alpha=32,       # LoRA scaling factor (usually 2*r)
    lora_dropout=0.1,
    epochs=6,
    batch_size=16,
    lr=3e-4,             # LoRA allows higher learning rates!
    max_length=320
):
    """
    Train with LoRA

    Args:
        lora_r: Rank of LoRA matrices (higher = more parameters, better performance)
        lora_alpha: Scaling factor (controls adaptation strength)
        lora_dropout: Dropout in LoRA layers
    """

    print("\n" + "="*80)
    print("   LORA CONFIGURATION")
    print("="*80)
    print(f"\n Base Model: {model_name}")
    print(f" LoRA Rank (r): {lora_r}")
    print(f" LoRA Alpha: {lora_alpha}")
    print(f" LoRA Dropout: {lora_dropout}")
    print(f"\n Training:")
    print(f"   Epochs: {epochs}")
    print(f"   Batch Size: {batch_size}")
    print(f"   Learning Rate: {lr} (higher than usual - LoRA allows this!)")
    print(f"   Max Length: {max_length}")

    # LoRA Configuration
    lora_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        r=lora_r,                    # Rank
        lora_alpha=lora_alpha,       # Scaling
        lora_dropout=lora_dropout,
        target_modules=[              # Which layers to apply LoRA to
            "query_proj",             # Attention queries
            "value_proj",             # Attention values
            # "key_proj",             # Can add more for better performance
            # "dense",                # Feed-forward layers
        ],
        bias="none",
        inference_mode=False
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create datasets
    print(f"\n Creating datasets...")
    train_dataset = LoRADataset(train_df, tokenizer, max_length=max_length)
    eval_dataset = LoRADataset(dev_df, tokenizer, max_length=max_length)
    print(f"   Train: {len(train_dataset)} samples")
    print(f"   Dev: {len(eval_dataset)} samples")

    # Create model with LoRA
    n_homonyms = len(set(train_df['homonym'].unique()) | set(dev_df['homonym'].unique()))
    model = LoRAModelWithHead(model_name, n_homonyms, lora_config)
    model.to(device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results_lora',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        report_to='none',
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,  # Effective batch size = 32
        save_total_limit=2,             # Save space
    )

    # Train
    trainer = LoRATrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    print("\n" + "="*80)
    print("  TRAINING STARTED")
    print("="*80)

    trainer.train()

    return trainer, tokenizer, model, train_dataset.homonym_to_id


# SECTION 7: PREDICTION & EVALUATION

In [8]:
def predict(model, tokenizer, test_df, homonym_to_id, batch_size=16, max_length=320):
    print(f"\n Generating predictions...")

    model.eval()
    test_dataset = LoRADataset(test_df, tokenizer, max_length=max_length)
    test_dataset.homonym_to_id = homonym_to_id

    dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    predictions = []
    uncertainties = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            homonym_ids = batch['homonym_ids'].to(device)

            outputs = model(input_ids, attention_mask, homonym_ids)

            # Denormalize [0,1] ‚Üí [1,5]
            preds = (outputs['logits'].squeeze().cpu().numpy() * 4) + 1
            uncerts = outputs['uncertainty'].squeeze().cpu().numpy() * 2

            predictions.extend(preds if isinstance(preds, np.ndarray) else [preds])
            uncertainties.extend(uncerts if isinstance(uncerts, np.ndarray) else [uncerts])

    return np.array(predictions), np.array(uncertainties)

def evaluate(predictions, targets, stdevs):
    predictions = np.clip(predictions, 1, 5)

    spearman_corr, _ = spearmanr(predictions, targets)
    acc = sum(abs(p - t) <= max(s, 1.0)
              for p, t, s in zip(predictions, targets, stdevs)) / len(predictions)
    rmse = np.sqrt(mean_squared_error(targets, predictions))
    mae = mean_absolute_error(targets, predictions)

    print("\n" + "="*80)
    print("  EVALUATION RESULTS")
    print("="*80)
    print(f"\n {'Metric':<30} {'Score':<15} {'Target':<15} {'Status'}")
    print("-" * 80)
    print(f" {'Spearman Correlation':<30} {spearman_corr:<15.4f} {'>0.45':<15} {'‚úÖ' if spearman_corr > 0.45 else '‚ö†Ô∏è'}")
    print(f" {'Accuracy Within StdDev':<30} {acc:<15.4f} {'>0.70':<15} {'‚úÖ' if acc > 0.70 else '‚ö†Ô∏è'}")
    print(f" {'RMSE':<30} {rmse:<15.4f} {'<1.10':<15} {'‚úÖ' if rmse < 1.10 else '‚ö†Ô∏è'}")
    print(f" {'MAE':<30} {mae:<15.4f} {'<0.90':<15} {'‚úÖ' if mae < 0.90 else '‚ö†Ô∏è'}")

    return {'spearman': spearman_corr, 'accuracy': acc, 'rmse': rmse, 'mae': mae}

# SECTION 8: MAIN EXECUTION

In [9]:
print("\n" + "="*80)
print("  LORA FINE-TUNING - BEST APPROACH!")
print("="*80)

# Train with LoRA
trainer, tokenizer, model, homonym_to_id = train_lora_model(
    model_name='microsoft/deberta-v3-base',  # 184M params but only train 0.5M!
    lora_r=16,              # Try 8, 16, or 32
    lora_alpha=32,
    lora_dropout=0.1,
    epochs=6,
    batch_size=16,          # Can use larger batch due to memory savings
    lr=3e-4,                # Higher LR works with LoRA!
    max_length=320
)

# Predict
predictions, uncertainties = predict(
    model, tokenizer, dev_df, homonym_to_id,
    batch_size=16, max_length=320
)
# Evaluate
results = evaluate(predictions, dev_df['average'].values, dev_df['stdev'].values)

# Save
dev_df['prediction'] = predictions
dev_df[['story_id', 'prediction']].to_csv('predictions_lora.csv', index=False)

print("\n" + "="*80)
print("  DONE!")
print("="*80)



  LORA FINE-TUNING - BEST APPROACH!

   LORA CONFIGURATION

 Base Model: microsoft/deberta-v3-base
 LoRA Rank (r): 16
 LoRA Alpha: 32
 LoRA Dropout: 0.1

 Training:
   Epochs: 6
   Batch Size: 16
   Learning Rate: 0.0003 (higher than usual - LoRA allows this!)
   Max Length: 320


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


 Creating datasets...
   Train: 2280 samples
   Dev: 588 samples

üîß Loading base model: microsoft/deberta-v3-base


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

 Applying LoRA adaptation...
   Trainable params: 589,824 (0.32%)
   Total params: 184,421,376
   Memory reduction: 99.7%

  TRAINING STARTED


Epoch,Training Loss,Validation Loss
1,0.1854,0.143828
2,0.1386,0.126951
3,0.1222,0.12081
4,0.1171,0.117199
5,0.0998,0.131244
6,0.0925,0.136164



 Generating predictions...

  EVALUATION RESULTS

 Metric                         Score           Target          Status
--------------------------------------------------------------------------------
 Spearman Correlation           0.3726          >0.45           ‚ö†Ô∏è
 Accuracy Within StdDev         0.6224          >0.70           ‚ö†Ô∏è
 RMSE                           1.1226          <1.10           ‚ö†Ô∏è
 MAE                            0.9472          <0.90           ‚ö†Ô∏è

  DONE!
