# XLM-RoBERTa Training for Multilingual Sentiment Analysis

This notebook fine-tunes XLM-RoBERTa on the English Financial PhraseBank dataset
and then tests zero-shot cross-lingual transfer on a small set of hard-coded Spanish financial sentences.

**Model:** `xlm-roberta-base` (270M parameters, 100+ languages)

## 1. Setup

In [None]:
import sys
import warnings
from pathlib import Path

# Add project root to path: the imports of `config` and `src` below need it, so the
# parent of the current working directory is placed first on sys.path
# (assumes the notebook is launched from a direct subfolder of the project root — TODO confirm)
sys.path.insert(0, str(Path.cwd().parent))
# Suppress every Python warning from here on to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from config import XLM_ROBERTA_CONFIG, print_config, MODELS_DIR
from config.params import ID2LABEL
from src.data import load_financial_phrasebank, create_data_splits, create_dataloaders
from src.models import create_model, Trainer, ModelEvaluator, SentimentPredictor
from src.models.classifier import print_model_info

# Seed PyTorch's random number generators so randomly initialised weights and
# other torch-driven randomness repeat from run to run. 42 matches the seed
# passed to create_data_splits in the data-preparation section. GPU kernels can
# still be non-deterministic, so CUDA runs may differ slightly between runs.
SEED = 42
torch.manual_seed(SEED)

# Check device: use CUDA when PyTorch can see a GPU, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
if device == 'cuda':
    # Report name and capacity of GPU index 0
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; shown here in decimal gigabytes
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Show configuration: print the XLM-RoBERTa run settings via the project helper.
# Later cells read model_checkpoint, batch_size and max_seq_length from this object.
print_config(XLM_ROBERTA_CONFIG)

## 2. Data Preparation

In [None]:
# Read the Financial PhraseBank subset selected by `agreement_level`
# (presumably the 75%-annotator-agreement split — confirm in src.data)
print("Loading Financial PhraseBank...")
df = load_financial_phrasebank(agreement_level="sentences_75agree")

n_samples = len(df)
print(f"Total samples: {n_samples}")

# Row count per label, ordered by label value rather than by frequency
label_counts = df['label'].value_counts().sort_index()
print("\nLabel distribution:")
print(label_counts)

In [None]:
# Partition the full frame into train / validation / test frames (fixed seed)
print("Creating train/val/test splits...")
train_df, val_df, test_df = create_data_splits(df, seed=42)

# One "Name: size" entry per split, joined on a single line
split_summary = ", ".join(
    f"{split_name}: {len(split_frame)}"
    for split_name, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df))
)
print(split_summary)

In [None]:
# Build the three DataLoaders, tokenising with the checkpoint named in the config
print("Creating DataLoaders...")
loader_options = dict(
    tokenizer_name=XLM_ROBERTA_CONFIG.model_checkpoint,
    batch_size=XLM_ROBERTA_CONFIG.batch_size,
    max_length=XLM_ROBERTA_CONFIG.max_seq_length,
    num_workers=0,  # Windows compatibility
)
train_loader, val_loader, test_loader = create_dataloaders(
    train_df, val_df, test_df, **loader_options
)
print(
    f"Train batches: {len(train_loader)}, "
    f"Val batches: {len(val_loader)}, "
    f"Test batches: {len(test_loader)}"
)

## 3. Model Training

In [None]:
# Instantiate the classifier from the configured checkpoint with a 3-class head
print("Creating XLM-RoBERTa model...")
model_options = {
    "model_checkpoint": XLM_ROBERTA_CONFIG.model_checkpoint,
    "num_labels": 3,
    "device": device,
}
model = create_model(**model_options)
print_model_info(model, "XLM-RoBERTa-base")

In [None]:
# Build the trainer from the shared config; it is given a model-specific
# sub-directory of MODELS_DIR as its save location
checkpoint_dir = MODELS_DIR / "xlm-roberta"
trainer = Trainer.from_config(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    config=XLM_ROBERTA_CONFIG,
    save_dir=str(checkpoint_dir),
)

In [None]:
# Train! Runs the trainer and keeps what it returns as `history`.
# The plotting cell below indexes it with 'train_loss', 'val_loss' and 'val_accuracy'.
print("Starting training...")
print("="*60)
history = trainer.train()

In [None]:
# Two side-by-side panels built from the training history
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, acc_ax = axes

# Left panel: training and validation loss on a shared axis
for history_key, legend_label in (('train_loss', 'Train'), ('val_loss', 'Validation')):
    loss_ax.plot(history[history_key], label=legend_label)
loss_ax.set(xlabel='Epoch', ylabel='Loss', title='Training & Validation Loss')
loss_ax.legend()

# Right panel: validation accuracy
acc_ax.plot(history['val_accuracy'], label='Validation', color='green')
acc_ax.set(xlabel='Epoch', ylabel='Accuracy', title='Validation Accuracy')
acc_ax.legend()

plt.tight_layout()
plt.show()

# Highest validation accuracy recorded in the history
best_val_accuracy = max(history['val_accuracy'])
print(f"\nBest validation accuracy: {best_val_accuracy:.4f}")

## 4. Evaluation on English Test Set

In [None]:
# Replace the in-memory model with the weights stored in best_model.pt
# (presumably written by the trainer into its save_dir — confirm in src.models)
from src.models import load_model

best_model_path = MODELS_DIR.joinpath("xlm-roberta", "best_model.pt")
restore_options = {
    "checkpoint_path": str(best_model_path),
    "model_checkpoint": XLM_ROBERTA_CONFIG.model_checkpoint,
    "num_labels": 3,
    "device": device,
}
model = load_model(**restore_options)
print(f"Loaded best model from {best_model_path}")

In [None]:
# Score the reloaded model on the held-out English test loader
evaluator = ModelEvaluator(model, device=device)
metrics, predictions, labels = evaluator.evaluate(test_loader)

banner = "=" * 60
print("\n" + banner)
print("TEST SET RESULTS (English)")
print(banner)

# Caption padding is kept exactly as in the printed report
for caption, metric_key in (
    ("Accuracy:  ", 'accuracy'),
    ("F1 Macro:  ", 'f1_macro'),
    ("F1 Weighted: ", 'f1_weighted'),
):
    print(f"{caption}{metrics[metric_key]:.4f}")

In [None]:
# Confusion matrix heatmap and per-class report for the English test set
from sklearn.metrics import classification_report, confusion_matrix

# Display names shared by both heatmap axes and the text report.
# NOTE(review): assumes the sorted label values correspond to
# negative, neutral, positive in that order — confirm against ID2LABEL.
class_names = ['Negative', 'Neutral', 'Positive']

cm = confusion_matrix(labels, predictions)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('XLM-RoBERTa - Confusion Matrix (English Test Set)')
plt.show()

print("\nClassification Report:")
report_text = classification_report(labels, predictions, target_names=class_names)
print(report_text)

## 5. Cross-lingual Testing (Spanish)

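Test zero-shot transfer: the model is fine-tuned on English only and then evaluated on Spanish financial sentences. The Spanish set below contains just 15 hard-coded sentences (5 per class), so treat the resulting accuracy as a rough indication rather than a benchmark.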

In [None]:
# Create predictor: wraps the checkpoint at best_model_path (set in the evaluation
# section) together with the matching tokenizer for single-text inference.
# The cells below call predictor.predict(text) and read the 'label', 'confidence'
# and 'probabilities' entries of its result.
predictor = SentimentPredictor(
    model_path=str(best_model_path),
    tokenizer_name=XLM_ROBERTA_CONFIG.model_checkpoint,
    device=device
)

In [None]:
# Spanish financial test sentences.
# Each entry is a (sentence, expected_label) tuple with five sentences per class.
# Expected labels are lowercase strings because the prediction cell compares them
# directly with the 'label' value returned by predictor.predict.
spanish_texts = [
    # Positive
    ("Los ingresos de la empresa aumentaron un 25% en el tercer trimestre.", "positive"),
    ("La compañía reportó ganancias récord este año.", "positive"),
    ("Las acciones subieron tras el anuncio de dividendos.", "positive"),
    ("El nuevo producto superó las expectativas del mercado.", "positive"),
    ("La fusión generará sinergias significativas.", "positive"),
    
    # Negative
    ("La empresa anunció pérdidas significativas en el último trimestre.", "negative"),
    ("Las ventas cayeron un 15% debido a la competencia.", "negative"),
    ("El CEO renunció tras el escándalo financiero.", "negative"),
    ("La compañía recortará 500 empleos para reducir costos.", "negative"),
    ("Los inversores perdieron confianza en la gestión.", "negative"),
    
    # Neutral
    ("La empresa publicará sus resultados el próximo lunes.", "neutral"),
    ("El consejo de administración se reunirá mañana.", "neutral"),
    ("La compañía tiene sede en Madrid.", "neutral"),
    ("El informe anual estará disponible en línea.", "neutral"),
    ("La empresa opera en el sector tecnológico.", "neutral"),
]

# Report how many sentences will be scored
print(f"Testing on {len(spanish_texts)} Spanish sentences...\n")

In [None]:
# Score every Spanish sentence and collect one record per sentence
results = []
for sentence, expected in spanish_texts:
    prediction = predictor.predict(sentence)
    predicted_label = prediction['label']

    # Keep the table narrow: show at most 60 characters of the sentence
    if len(sentence) > 60:
        shown_text = sentence[:60] + '...'
    else:
        shown_text = sentence

    results.append(dict(
        text=shown_text,
        true=expected,
        predicted=predicted_label,
        confidence=prediction['confidence'],
        correct=predicted_label == expected,
    ))

# One row per sentence, printed as plain text without the index column
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

In [None]:
# Overall and per-class accuracy on the Spanish sentences
is_correct = results_df['correct']
spanish_accuracy = is_correct.mean()

rule = '=' * 60
print(f"\n{rule}")
print("SPANISH ZERO-SHOT RESULTS")
print(rule)
print(f"Accuracy: {spanish_accuracy:.1%} ({is_correct.sum()}/{len(results_df)})")

print("\nPer-class accuracy:")
for label in ('positive', 'neutral', 'negative'):
    # Correctness flags of the rows whose expected label is this class
    class_hits = is_correct[results_df['true'] == label]
    print(f"  {label}: {class_hits.mean():.1%} ({class_hits.sum()}/{len(class_hits)})")

In [None]:
# Side-by-side summary of English test accuracy and Spanish zero-shot accuracy
english_accuracy = metrics['accuracy']
# Spanish accuracy expressed as a fraction of the English accuracy
transfer_ratio = spanish_accuracy / english_accuracy

divider = '=' * 60
print(f"\n{divider}")
print("COMPARISON: English vs Spanish")
print(divider)
print(f"English test accuracy: {english_accuracy:.1%}")
print(f"Spanish zero-shot:     {spanish_accuracy:.1%}")
print(f"Transfer efficiency:   {transfer_ratio:.1%}")

## 6. Interactive Demo

In [None]:
def analyze_sentiment(text: str):
    """Print the predicted sentiment for ``text``.

    Runs the notebook-level ``predictor`` on the text and prints the input,
    the upper-cased label, the confidence as a percentage, and the
    negative / neutral / positive probabilities, followed by a blank line.
    Nothing is returned.
    """
    outcome = predictor.predict(text)
    print(f"Text: {text}")
    print(f"Sentiment: {outcome['label'].upper()}")
    print(f"Confidence: {outcome['confidence']:.1%}")

    # Short tag shown in the output paired with the key in the probabilities mapping
    class_probs = outcome['probabilities']
    prob_parts = [
        f"{tag}={class_probs[class_key]:.2f}"
        for tag, class_key in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive"))
    ]
    print("Probabilities: " + ", ".join(prob_parts))
    print()

# Run the helper on one sample sentence per language
demo_rule = "=" * 60
print(demo_rule)
print("MULTILINGUAL DEMO")
print(demo_rule + "\n")

demo_sentences = (
    "Revenue increased by 25% this quarter.",    # English
    "Los beneficios cayeron un 10% este año.",   # Spanish
    "Компания объявила о рекордной прибыли.",    # Russian (bonus test)
)
for demo_sentence in demo_sentences:
    analyze_sentiment(demo_sentence)

In [None]:
# Try your own text!
# analyze_sentiment("Your text here in any language")