<a href="https://colab.research.google.com/github/GuillaumeDesa/XLM_RoBERTa_Cool_4/blob/main/XLM_Roberta_improved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Enhanced XLM-RoBERTa Training for Text Classification

This notebook provides an enhanced training pipeline with:
- Automatic GPU/TPU/CPU detection and fallback
- Progress bars for training and data processing
- Adaptive training until target performance is reached
- Comprehensive metrics and analysis


## 1. Install Required Packages

In [1]:
!pip install transformers datasets scikit-learn pandas openpyxl evaluate tqdm accelerate scipy

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp311-

## 2. Import Libraries and Setup

In [2]:
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

## 3. Device Setup with TPU → GPU → CPU Fallback

In [3]:
def setup_device():
    """Pick the best available accelerator, preferring TPU, then GPU, then CPU.

    Returns:
        tuple: ``(device_type, device)`` where ``device_type`` is ``"tpu"``,
        ``"gpu"`` or ``"cpu"`` and ``device`` is the matching device handle
        (the XLA device for TPU, a ``torch.device`` otherwise).
    """
    try:
        # Try TPU first (for Google Colab)
        import torch_xla  # noqa: F401 - imported to confirm the package is installed
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
        print(f"✅ Using TPU: {device}")
        return "tpu", device
    except ImportError:
        # torch_xla is not installed: the normal case on GPU/CPU runtimes.
        pass
    except Exception as exc:
        # torch_xla is installed but the TPU could not be initialised (for
        # example a GPU/CPU runtime with the package present). Fall through
        # instead of aborting the notebook.
        print(f"⚠️  TPU unavailable ({type(exc).__name__}: {exc}); falling back to GPU/CPU")

    # Fallback to GPU if available
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"✅ Using GPU: {torch.cuda.get_device_name()}")
        print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
        return "gpu", device

    device = torch.device("cpu")
    print("⚠️  Using CPU (training will be slower)")
    return "cpu", device


# Set up device
device_type, device = setup_device()

✅ Using TPU: xla:0


## 4. Upload Data Files

In [4]:
# Colab-only step: `google.colab.files` is not importable outside Colab.
# The two workbooks uploaded here are read by relative path in the next cell.
from google.colab import files
print("Please upload your Excel files:")
# `files` is reused at the end of the notebook to download the predictions.
uploaded = files.upload()

Please upload your Excel files:


Saving cool.annotated.filtered.cleaned.xlsx to cool.annotated.filtered.cleaned.xlsx
Saving cool.unannotated.filtered.xlsx to cool.unannotated.filtered.xlsx


## 5. Load and Preprocess Data

In [5]:
# Load the uploaded Excel files
annotated_df = pd.read_excel("cool.annotated.filtered.cleaned.xlsx")
unannotated_df = pd.read_excel("cool.unannotated.filtered.xlsx")

print(f"📊 Annotated samples: {len(annotated_df)}")
print(f"📊 Unannotated samples: {len(unannotated_df)}")

# Data preprocessing
print("\n🔄 Preprocessing data...")

# A row without text or without a label cannot be used for training, and a
# missing value in either column breaks label encoding / tokenization later.
rows_before = len(annotated_df)
annotated_df = annotated_df.dropna(subset=["occurrences", "interpretation"]).reset_index(drop=True)
rows_dropped = rows_before - len(annotated_df)
if rows_dropped:
    print(f"⚠️  Dropped {rows_dropped} annotated rows with missing text or label")

# Normalise label spelling so that e.g. " basic" and "Basic" map to one class.
annotated_df["interpretation"] = (
    annotated_df["interpretation"].astype(str).str.strip().str.capitalize()
)

# Encode labels
label_encoder = LabelEncoder()
annotated_df["label"] = label_encoder.fit_transform(annotated_df["interpretation"])

# Show label mapping
# `classes_` is ordered by encoded id, so the position is the id. Plain
# str/int values keep the printed mapping free of numpy wrappers.
label_mapping = {str(name): int(idx) for idx, name in enumerate(label_encoder.classes_)}
print("🏷️  Label Mapping:", label_mapping)
num_labels = len(label_mapping)

📊 Annotated samples: 1008
📊 Unannotated samples: 19149

🔄 Preprocessing data...
🏷️  Label Mapping: {'Basic': np.int64(0), 'Emotion': np.int64(1), 'Nonliteral': np.int64(2)}


## 6. Split Data and Create Datasets

In [6]:
# Hold out 20% of the annotated rows; stratifying on the encoded label keeps
# the class proportions the same in both parts.
train_df, test_df = train_test_split(
    annotated_df, test_size=0.2, random_state=42, stratify=annotated_df["label"]
)

print(f"📈 Training samples: {len(train_df)}")
print(f"🧪 Test samples: {len(test_df)}")


def _as_text_label_dataset(frame):
    """Wrap a frame's `occurrences`/`label` columns as a Dataset with `text`/`label` columns."""
    renamed = frame[["occurrences", "label"]].rename(columns={"occurrences": "text"})
    return Dataset.from_pandas(renamed)


# Create datasets
train_dataset = _as_text_label_dataset(train_df)
test_dataset = _as_text_label_dataset(test_df)

📈 Training samples: 806
🧪 Test samples: 202


## 7. Load Model and Tokenizer

In [7]:
model_name = "xlm-roberta-base"
print(f"\n🤖 Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the class names in the model config so the saved model reports real
# labels; the fitted `label_encoder` itself is not saved with the model.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Move model to device
if device_type != "tpu":  # TPU handling is different
    model = model.to(device)


🤖 Loading xlm-roberta-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 8. Tokenize Datasets

In [8]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch's `text` field, truncating and padding each sequence to 512 tokens."""
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits in batches. Each name is rebound to the mapped dataset,
# so the untokenized versions are no longer reachable after this cell.
print("🔤 Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Format for PyTorch
# Only the three listed columns are exposed, as torch tensors, when rows are read.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

🔤 Tokenizing datasets...


Map:   0%|          | 0/806 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

## 9. Define Metrics and Custom Trainer with Progress Bars

In [9]:
# Enhanced metrics computation
# Metric objects are loaded once and reused; loading them inside
# `compute_metrics` would repeat the load on every evaluation pass.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        dict: Keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    # Some models return several outputs; the logits are then the first item.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": _get_metric("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }
    # The remaining metrics are averaged over classes, weighted by class support.
    for name in ("f1", "precision", "recall"):
        scores[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[name]
    return scores

# Custom Trainer with progress bar
class _ProgressBarCallback(TrainerCallback):
    """Forward Trainer lifecycle events to the hook methods of a `ProgressTrainer`."""

    def __init__(self, owner):
        self._owner = owner

    def on_train_begin(self, args, state, control, **kwargs):
        self._owner.on_train_begin(args, state, control, **kwargs)

    def on_step_end(self, args, state, control, **kwargs):
        self._owner.on_step_end(args, state, control, **kwargs)

    def on_train_end(self, args, state, control, **kwargs):
        self._owner.on_train_end(args, state, control, **kwargs)


class ProgressTrainer(Trainer):
    """Trainer that shows a tqdm bar labelled with the latest logged loss / F1.

    Trainer dispatches ``on_*`` events only to registered callbacks, never to
    methods on the Trainer itself, so the hooks below are wired up through
    `_ProgressBarCallback` in ``__init__``.

    NOTE(review): Trainer also shows its own progress bar unless
    ``disable_tqdm=True`` is set in the training arguments.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.progress_bar = None
        self.add_callback(_ProgressBarCallback(self))

    def on_train_begin(self, args, state, control, **kwargs):
        """Create a bar sized to the total number of optimisation steps."""
        if self.progress_bar is not None:
            # A bar left over from an earlier `train()` call.
            self.progress_bar.close()
            self.progress_bar = None
        if state.max_steps > 0:
            self.progress_bar = tqdm(total=state.max_steps, desc="Training Progress")

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the bar by one step and refresh its description."""
        if self.progress_bar is None:
            return
        self.progress_bar.update(1)
        # Update description with current metrics if available
        if state.log_history:
            last_log = state.log_history[-1]
            desc = f"Step {state.global_step}"
            if 'loss' in last_log:
                desc += f" | Loss: {last_log['loss']:.4f}"
            if 'eval_f1' in last_log:
                desc += f" | F1: {last_log['eval_f1']:.4f}"
            self.progress_bar.set_description(desc)

    def on_train_end(self, args, state, control, **kwargs):
        """Close the bar so later output is not drawn into it."""
        if self.progress_bar is not None:
            self.progress_bar.close()
            self.progress_bar = None

## 10. Training Configuration with Device Optimization

In [10]:
def get_training_args(max_epochs=10, patience=3):
    """Build `TrainingArguments` with batch sizes chosen for the detected device.

    Args:
        max_epochs: Upper bound on the number of training epochs.
        patience: Not used when building the arguments (early stopping is
            configured by the caller); kept so existing call sites keep working.

    Returns:
        TrainingArguments: Evaluates and checkpoints once per epoch and reloads
        the checkpoint with the best ``eval_f1`` when training ends.
    """
    # Adjust batch size based on device
    if device_type == "tpu":
        train_batch_size = 32
        eval_batch_size = 32
    elif device_type == "gpu":
        # Adjust based on GPU memory
        gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        if gpu_memory_gb >= 16:
            train_batch_size = 16
            eval_batch_size = 32
        elif gpu_memory_gb >= 8:
            train_batch_size = 8
            eval_batch_size = 16
        else:
            train_batch_size = 4
            eval_batch_size = 8
    else:  # CPU
        train_batch_size = 4
        eval_batch_size = 8

    return TrainingArguments(
        output_dir="./cool_model_xlm",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True,
        num_train_epochs=max_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_strategy="steps",
        logging_steps=50,
        save_total_limit=2,
        seed=42,
        fp16=device_type == "gpu",  # Use fp16 only on GPU
        dataloader_num_workers=0 if device_type == "tpu" else 2,
        # The string "none" disables every reporting integration. `None` is
        # treated as "use the default", which on versions where the default is
        # "all" turns wandb/tensorboard on.
        report_to="none",
    )

## 11. Adaptive Training Function

In [11]:
class _TargetF1Callback(TrainerCallback):
    """Ask the Trainer to stop once an evaluation reports ``eval_f1`` >= the target."""

    def __init__(self, target_f1):
        self.target_f1 = target_f1

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        score = (metrics or {}).get("eval_f1")
        if score is not None and score >= self.target_f1:
            control.should_training_stop = True
        return control


def train_with_target_performance(target_f1=0.85, max_epochs=15, patience=3):
    """Train until the target F1 is reached, early stopping triggers, or max epochs pass.

    Trains the module-level ``model`` in place, so calling this again continues
    from the already fine-tuned weights.

    Args:
        target_f1: Evaluation F1 at which training stops.
        max_epochs: Upper bound on the number of training epochs.
        patience: Evaluations without improvement tolerated before early stopping.

    Returns:
        tuple: ``(trainer, final_metrics)`` where ``final_metrics`` is the dict
        returned by ``trainer.evaluate()`` after training.

    NOTE(review): ``test_dataset`` is used both for early stopping / checkpoint
    selection and for the reported final score, so that score is optimistic.
    A separate validation split would fix this.
    """
    print(f"\n🎯 Target F1 Score: {target_f1}")
    print(f"📊 Max Epochs: {max_epochs}")
    print(f"⏱️  Early Stopping Patience: {patience}")

    training_args = get_training_args(max_epochs, patience)

    trainer = ProgressTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=patience),
            # Stops training as soon as the target is met.
            _TargetF1Callback(target_f1),
        ],
    )

    print(f"\n🚀 Starting training on {device_type.upper()}...")
    print(f"   Batch size: {training_args.per_device_train_batch_size}")

    # Train the model
    trainer.train()

    # Get final metrics
    final_metrics = trainer.evaluate()
    final_f1 = final_metrics.get('eval_f1', 0)

    print("\n📈 Training completed!")
    print(f"   Final F1 Score: {final_f1:.4f}")
    print(f"   Final Accuracy: {final_metrics.get('eval_accuracy', 0):.4f}")

    if final_f1 >= target_f1:
        print(f"🎉 Target F1 score of {target_f1} achieved!")
    else:
        print("⚠️  Target F1 score not reached. Consider:")
        print("   - Increasing max_epochs")
        print("   - Adjusting learning rate")
        print("   - Adding more training data")

    return trainer, final_metrics

## 12. Train the Model

In [None]:
# Train the model with target performance
# Returns the trainer (reused below for prediction and for saving the model)
# and the metrics dict from the final evaluation.
trainer, metrics = train_with_target_performance(target_f1=0.85, max_epochs=15, patience=3)


🎯 Target F1 Score: 0.85
📊 Max Epochs: 15
⏱️  Early Stopping Patience: 3

🚀 Starting training on TPU...
   Batch size: 32


## 13. Prepare Unannotated Data for Prediction

In [None]:
print("\n🔮 Making predictions on unannotated data...")

# Prepare unannotated dataset
# Missing contexts become empty strings instead of being dropped: the dataset
# must keep one row per `unannotated_df` row because predictions are written
# back to that frame by position.
unannotated_text = (
    unannotated_df[["match_context"]]
    .rename(columns={"match_context": "text"})
    .fillna({"text": ""})
    .astype({"text": str})
)
unannotated_dataset = Dataset.from_pandas(unannotated_text)

# Tokenize
# Reuse the training tokenization so both code paths apply identical settings.
tokenized_unannotated = unannotated_dataset.map(tokenize_function, batched=True)
tokenized_unannotated.set_format("torch", columns=["input_ids", "attention_mask"])

## 14. Run Predictions and Save Results

In [None]:
print("🔍 Running inference...")
predictions = trainer.predict(tokenized_unannotated)

# Highest-scoring class per row, mapped back to the original label strings
predicted_class_ids = np.argmax(predictions.predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

# Confidence = softmax probability of the winning class
probabilities = softmax(predictions.predictions, axis=1)
prediction_scores = probabilities.max(axis=1)

# Attach the results to the source frame (adds two columns in place)
unannotated_df["predicted_interpretation"] = predicted_labels
unannotated_df["prediction_score"] = prediction_scores

# Write the annotated frame to disk
output_file = "cool.unannotated.with_predictions.xlsx"
unannotated_df.to_excel(output_file, index=False)

print(
    f"\n✅ Predictions saved to: {output_file}",
    "📊 Prediction confidence stats:",
    f"   Mean confidence: {prediction_scores.mean():.3f}",
    f"   Min confidence: {prediction_scores.min():.3f}",
    f"   Max confidence: {prediction_scores.max():.3f}",
    sep="\n",
)

# Per-label counts and their share of all predictions
prediction_counts = pd.Series(predicted_labels).value_counts()
print("\n📈 Prediction distribution:")
for label, count in prediction_counts.items():
    percentage = (count / len(predicted_labels)) * 100
    print(f"   {label}: {count} ({percentage:.1f}%)")

## 15. Download Results and Save Model

In [None]:
# Download file (for Colab)
try:
    files.download(output_file)
    print(f"⬇️  File downloaded: {output_file}")
except Exception as exc:
    # Best effort: the browser download is optional (for example when the
    # Colab `files` helper is unavailable). `Exception` rather than a bare
    # `except` lets KeyboardInterrupt / SystemExit propagate, and the reason
    # is printed instead of hidden.
    print(f"ℹ️  File saved locally: {output_file}")
    print(f"   (browser download skipped: {type(exc).__name__}: {exc})")

# Save model for future use
# The tokenizer goes into the same directory so the folder can be reloaded on its own.
model_save_path = "./final_cool_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\n💾 Model saved to: {model_save_path}")

print("\n🎉 Training and prediction pipeline completed successfully!")