# Imports and installations 

In [None]:
# Install Required Packages
"""
Install necessary packages for XLM-RoBERTa fine-tuning
Run this cell first to ensure all dependencies are available
"""

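# PyTorch wheels are served from the PyTorch index; the remaining packages live on PyPI, so install them separately.
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers datasets accelerate evaluate scikit-learn pandas numpy matplotlib seaborn sentencepiece tensorboard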

In [None]:


# Import Libraries and Setup
"""
Import all necessary libraries and set up the environment
"""

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    XLMRobertaTokenizer, 
    XLMRobertaForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from datasets import Dataset as HFDataset, DatasetDict
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional
import os
import json
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: a single seed shared by NumPy and PyTorch
# TODO: choose the seed for your experiment
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)

# TODO: choose which GPU this process may see.
# The value is written to CUDA_VISIBLE_DEVICES: use a device index such as "0" .. "3"
# to select a GPU; "-1" is the conventional value for hiding every GPU (CPU-only run).
DEVICE = "-1"
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICE

# Resolve the torch device once and report what will be used
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

if torch.cuda.is_available():
    # Device index 0 is the first GPU visible to this process
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

















# Dataset Configuration and Loading

In [None]:
"""
Functions to load and preprocess your dataset
Modify the load_data function according to your data format
"""

def load_data(data_path: str, text_column: str = 'text', label_column: str = 'label') -> pd.DataFrame:
    """
    Load a dataset from a CSV, TSV, JSON or JSONL file and drop incomplete rows.

    The file format is chosen from the file extension (case-insensitive).
    Rows with a missing text or label value are removed and the text column
    is converted to ``str``.

    Args:
        data_path (str): Path to your dataset file
        text_column (str): Name of the column containing text data
        label_column (str): Name of the column containing labels

    Returns:
        pd.DataFrame: Loaded dataset (an independent copy, safe to modify)

    Raises:
        ValueError: If the extension is unsupported or a required column is missing
    """
    # Compare against the lower-cased path so "DATA.CSV" is treated like "data.csv"
    lowered_path = data_path.lower()

    if lowered_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    elif lowered_path.endswith(('.json', '.jsonl')):
        # JSONL stores one JSON object per line
        df = pd.read_json(data_path, lines=lowered_path.endswith('.jsonl'))
    elif lowered_path.endswith('.tsv'):
        df = pd.read_csv(data_path, sep='\t')
    else:
        raise ValueError("Unsupported file format. Use CSV, JSON, JSONL, or TSV")

    # Ensure required columns exist
    if text_column not in df.columns:
        raise ValueError(f"Text column '{text_column}' not found in dataset")
    if label_column not in df.columns:
        raise ValueError(f"Label column '{label_column}' not found in dataset")

    # Clean data; copy() so the assignment below never writes into a view of the raw frame
    df = df.dropna(subset=[text_column, label_column]).copy()
    df[text_column] = df[text_column].astype(str)

    print(f"Loaded {len(df)} samples")
    print(f"Unique labels: {df[label_column].unique()}")
    print(f"Label distribution:\n{df[label_column].value_counts()}")

    return df

def create_label_mapping(labels: List[str]) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Build the two lookup tables that translate between labels and integer ids.

    Ids are assigned in sorted order of the distinct labels, starting at 0.

    Args:
        labels (List[str]): Labels to index; duplicates are allowed

    Returns:
        Tuple[Dict[str, int], Dict[int, str]]: (label_to_id, id_to_label) mappings
    """
    label_to_id: Dict[str, int] = {}
    id_to_label: Dict[int, str] = {}

    # Sorting the de-duplicated labels makes the id assignment deterministic
    for class_id, class_label in enumerate(sorted(set(labels))):
        label_to_id[class_label] = class_id
        id_to_label[class_id] = class_label

    print(f"Label mapping: {label_to_id}")
    return label_to_id, id_to_label

In [None]:
"""
Configure your dataset paths and parameters
Modify these variables according to your setup
"""

# --- Dataset configuration ---
# TODO: point these at your own dataset file and column names
DATA_PATH = "/path/to/your/dataset.csv"
TEXT_COLUMN = "text"    # column holding the raw text
LABEL_COLUMN = "label"  # column holding the class label


print("Loading your dataset...")
df = load_data(data_path=DATA_PATH, text_column=TEXT_COLUMN, label_column=LABEL_COLUMN)

# Build the label <-> integer id lookup tables from the labels present in the data
labels = df[LABEL_COLUMN].tolist()
label_to_id, id_to_label = create_label_mapping(labels)
num_labels = len(id_to_label)

# Integer-encoded copy of the label column, used for stratified splitting and training
df['label_id'] = df[LABEL_COLUMN].map(label_to_id)

print(f"\nDataset shape: {df.shape}")
print(f"Number of classes: {num_labels}")

In [None]:
"""
Split the dataset into train, validation, and test sets
"""

# Fraction of the full dataset assigned to each split
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

# The three fractions must cover the whole dataset
assert abs(TRAIN_SIZE + VAL_SIZE + TEST_SIZE - 1.0) < 1e-6, "Split sizes must sum to 1.0"

# Step 1: carve the test set out of the full data, keeping class proportions
train_val_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    stratify=df['label_id'],
    random_state=SEED,
)

# Step 2: split the remainder into train and validation.
# VAL_SIZE is a fraction of the full dataset, so rescale it to the remainder.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=VAL_SIZE/(TRAIN_SIZE + VAL_SIZE),
    stratify=train_val_df['label_id'],
    random_state=SEED,
)

print("Data splits:")
print(f"Train: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")

# Show the per-label counts of every split to confirm the stratification
print("\nLabel distribution across splits:")
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    label_counts = split_df[LABEL_COLUMN].value_counts()
    dist = label_counts.sort_index()
    print(f"{split_name}: {dict(dist)}")

# Model and Tokenizer Setup

In [None]:

"""
Initialize XLM-RoBERTa model and tokenizer
"""

# Model configuration
# NOTE(review): the original comment said "~125M parameters"; XLM-R base is usually quoted
# at roughly 270M+ because of its large multilingual vocabulary. The exact count is printed below.
MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 512  # Maximum sequence length (in tokens) used for tokenization

print(f"Loading model and tokenizer: {MODEL_NAME}")

# Tokenizer matching the pretrained checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

# Pretrained encoder with a classification head sized for this dataset's labels
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True,  # In case of size mismatches
)

# Place the model on the device selected during setup
model = model.to(device)

print("Model loaded successfully!")

# Count all parameters, and separately those that will receive gradients
total_param_count = 0
trainable_param_count = 0
for p in model.parameters():
    total_param_count += p.numel()
    if p.requires_grad:
        trainable_param_count += p.numel()

print(f"Model parameters: {total_param_count:,}")
print(f"Trainable parameters: {trainable_param_count:,}")

# Dataset Tokenization

In [None]:

"""
Tokenize the text data for training
"""

def tokenize_function(examples):
    """
    Tokenize the text column of a batch of dataset examples.

    Reads the notebook-level ``tokenizer``, ``TEXT_COLUMN`` and ``MAX_LENGTH``.

    Args:
        examples: Batch of examples from the dataset, indexable by column name

    Returns:
        Dict: Tokenized inputs
    """
    batch_texts = examples[TEXT_COLUMN]
    tokenizer_options = {
        "truncation": True,         # cut sequences longer than MAX_LENGTH
        "padding": False,           # padding is left to the data collator
        "max_length": MAX_LENGTH,
        "return_tensors": None,     # keep plain Python lists
    }
    return tokenizer(batch_texts, **tokenizer_options)

def _build_tokenized_split(split_df):
    """Convert one pandas split into a tokenized Hugging Face dataset with a ``labels`` column."""
    dataset = HFDataset.from_pandas(
        split_df[[TEXT_COLUMN, 'label_id']],
        # The splits carry a shuffled pandas index; without this flag it is
        # stored as an extra "__index_level_0__" column in the dataset.
        preserve_index=False,
    )
    # Rename label_id column to labels (required by transformers)
    dataset = dataset.rename_column('label_id', 'labels')
    dataset = dataset.map(tokenize_function, batched=True)
    # The raw text is no longer needed once it has been tokenized
    return dataset.remove_columns([TEXT_COLUMN])


# Convert each pandas split to a tokenized Hugging Face Dataset
print("Tokenizing datasets...")
train_dataset = _build_tokenized_split(train_df)
val_dataset = _build_tokenized_split(val_df)
test_dataset = _build_tokenized_split(test_df)

print("Tokenization completed!")
print(f"Training set features: {train_dataset.features}")

# Training Arguments Configuration

In [None]:
"""
Configure training parameters.
Each argument used below is documented inline.
"""

import inspect

# Output directory for model checkpoints and logs
OUTPUT_DIR = "./xlm-roberta-finetuned"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed version accepts so this cell runs on both.
_EVAL_STRATEGY_ARG = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training Arguments with inline documentation
training_args = TrainingArguments(
    # === OUTPUT AND LOGGING ===
    output_dir=OUTPUT_DIR,                    # Directory to save model checkpoints and logs
    logging_dir=f"{OUTPUT_DIR}/logs",         # Directory for TensorBoard logs
    logging_steps=10,                         # Log training metrics every N steps
    logging_strategy="steps",                 # When to log ("steps" or "epoch")

    # === TRAINING CONFIGURATION ===
    num_train_epochs=3,                       # Number of training epochs
    per_device_train_batch_size=8,            # Batch size per GPU for training
    per_device_eval_batch_size=16,            # Batch size per GPU for evaluation
    gradient_accumulation_steps=2,            # Accumulate gradients over N steps (effective batch size = batch_size * gradient_accumulation_steps * num_gpus)

    # === LEARNING RATE AND OPTIMIZATION ===
    learning_rate=2e-5,                       # Learning rate for AdamW optimizer
    weight_decay=0.01,                        # Weight decay for regularization
    adam_beta1=0.9,                           # Beta1 parameter for AdamW
    adam_beta2=0.999,                         # Beta2 parameter for AdamW
    adam_epsilon=1e-8,                        # Epsilon parameter for AdamW
    max_grad_norm=1.0,                        # Maximum gradient norm for clipping

    # === LEARNING RATE SCHEDULING ===
    lr_scheduler_type="linear",               # Type of learning rate scheduler ("linear", "cosine", "polynomial", etc.)
    warmup_steps=100,                         # Number of warmup steps for learning rate scheduler
    # warmup_ratio=0.1,                       # Alternative: warmup as ratio of total steps

    # === EVALUATION AND SAVING ===
    # (the evaluation strategy itself is passed at the end of this call, see _EVAL_STRATEGY_ARG)
    eval_steps=50,                            # Evaluate every N steps
    save_strategy="steps",                    # When to save checkpoints ("steps", "epoch", or "no")
    save_steps=50,                            # Save checkpoint every N steps
    save_total_limit=3,                       # Maximum number of checkpoints to keep
    load_best_model_at_end=True,              # Load best model at end of training
    metric_for_best_model="eval_accuracy",    # Metric to use for selecting best model
    greater_is_better=True,                   # Whether higher metric values are better

    # === EARLY STOPPING ===
    # Early stopping is not configured through TrainingArguments. To enable it, pass
    # EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.001)
    # in the Trainer's `callbacks` list.

    # === HARDWARE AND PERFORMANCE ===
    fp16=torch.cuda.is_available(),           # Use mixed precision training (faster on modern GPUs)
    # bf16=False,                             # Use bfloat16 (alternative to fp16, better on A100/H100)
    dataloader_num_workers=4,                 # Number of subprocesses for data loading
    dataloader_pin_memory=True,               # Pin memory for faster GPU transfer

    # === REPRODUCIBILITY ===
    seed=SEED,                                # Random seed for reproducibility
    data_seed=SEED,                           # Seed for data shuffling

    # === REPORTING AND INTEGRATION ===
    report_to=["tensorboard"],                # Reporting tools ("tensorboard", "wandb", "comet_ml", etc.)
    run_name="xlm-roberta-classification",    # Name for this training run

    # === ADVANCED TRAINING OPTIONS ===
    remove_unused_columns=True,               # Remove dataset columns not used by the model
    label_names=["labels"],                   # Names of the label columns
    # group_by_length=True,                   # Group samples by length to minimize padding
    # length_column_name="length",            # Column name for sequence lengths

    # === CHECKPOINTING AND RESUMING ===
    # To resume, pass resume_from_checkpoint=<path> to trainer.train()
    ignore_data_skip=False,                   # False: when resuming, fast-forward the data to where the checkpoint left off

    # === DISTRIBUTED TRAINING ===
    # local_rank=-1,                          # Local rank for distributed training
    # ddp_find_unused_parameters=False,       # Find unused parameters in DDP
    # ddp_bucket_cap_mb=25,                   # DDP bucket size in MB

    # NOTE: `predict_with_generate` was removed from this call. It belongs to
    # Seq2SeqTrainingArguments; TrainingArguments rejects it with a TypeError.

    # === PUSH TO HUB ===
    # push_to_hub=False,                      # Push model to Hugging Face Hub
    # hub_model_id="your-username/model-name", # Model name on the Hub
    # hub_token=None,                         # Hugging Face token for authentication

    # When to evaluate ("steps", "epoch", or "no"), under the keyword this version supports
    **{_EVAL_STRATEGY_ARG: "steps"},
)

print("Training arguments configured!")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Total training steps: {len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")




# Evaluation Metrics

In [None]:

"""
Define evaluation metrics for model performance
"""

def compute_metrics(eval_pred):
    """
    Compute overall and per-class classification metrics.

    Reads the notebook-level ``id_to_label`` mapping to name the per-class metrics.

    Args:
        eval_pred: Pair of (model outputs, true label ids); the class axis of
            the outputs is axis 1

    Returns:
        Dict: ``accuracy`` plus weighted ``f1``/``precision``/``recall``, and
        ``f1_<label>``/``precision_<label>``/``recall_<label>`` for every class
    """
    predictions, labels = eval_pred

    # Predicted class = index of the highest score
    predicted_ids = np.argmax(predictions, axis=1)

    # Overall metrics
    accuracy = accuracy_score(labels, predicted_ids)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_ids, average='weighted', zero_division=0
    )

    # Per-class metrics. Passing the full list of class ids fixes the position of every
    # class in the returned arrays. Without it, a class absent from this evaluation set
    # is dropped and the scores of later classes are reported under the wrong label.
    class_ids = sorted(id_to_label)
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        labels, predicted_ids, labels=class_ids, average=None, zero_division=0
    )

    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

    # Add per-class metrics, keyed by the human-readable label
    for position, class_id in enumerate(class_ids):
        label_name = id_to_label[class_id]
        metrics[f'f1_{label_name}'] = f1_per_class[position]
        metrics[f'precision_{label_name}'] = precision_per_class[position]
        metrics[f'recall_{label_name}'] = recall_per_class[position]

    return metrics

# Data collator for dynamic padding: each batch is padded when it is assembled,
# which is why tokenize_function tokenizes with padding=False.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Evaluation metrics and data collator configured!")




# Initialize Trainer and Start Training

In [None]:

"""
Initialize the Trainer and begin fine-tuning
"""

# Collect everything the Trainer needs, then build it
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

print("Trainer initialized!")
print("Starting training...")
print("=" * 50)

# Run fine-tuning; the returned object carries the run's timing metrics
train_result = trainer.train()

print("=" * 50)
print("Training completed!")
run_metrics = train_result.metrics
print(f"Training time: {run_metrics['train_runtime']:.2f} seconds")
print(f"Training samples per second: {run_metrics['train_samples_per_second']:.2f}")

# Persist the final model and its tokenizer side by side
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

# Persist the label mappings next to the model so predictions can be decoded later
label_mappings_payload = {
    "label_to_id": label_to_id,
    # JSON keys must be strings, so the integer ids are stringified here
    "id_to_label": {str(class_id): class_label for class_id, class_label in id_to_label.items()},
}
with open(f"{OUTPUT_DIR}/label_mappings.json", "w") as f:
    json.dump(label_mappings_payload, f, indent=2)

print(f"Model and tokenizer saved to {OUTPUT_DIR}")

# Evaluation on Test Set

In [None]:
"""
Evaluate the fine-tuned model on the test set
"""

print("Evaluating on test set...")

# Evaluate on test set; metric names are prefixed with "test_"
test_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

print("Test Results:")
print("=" * 30)
for key, value in test_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Get predictions for detailed analysis
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Fix the class order explicitly. Without `labels=`, scikit-learn only reports classes
# that occur in y_true or y_pred, so a class missing from the test split would shift
# every later row/column away from its name in class_names.
class_ids = sorted(id_to_label)
class_names = [id_to_label[i] for i in class_ids]

# Create confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Per-class performance, in the same class order as the confusion matrix
print("\nPer-class Performance:")
print("=" * 30)
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    y_true, y_pred, labels=class_ids, average=None, zero_division=0
)

for i, class_name in enumerate(class_names):
    print(f"{class_name}:")
    print(f"  Precision: {precision_per_class[i]:.4f}")
    print(f"  Recall: {recall_per_class[i]:.4f}")
    print(f"  F1-Score: {f1_per_class[i]:.4f}")
    print(f"  Support: {support[i]}")
    print()

# Inference Function and Testing

In [None]:
"""
Create inference function and test with new examples
"""

def predict_text(text: str, model, tokenizer, device, label_mapping, max_length: Optional[int] = None):
    """
    Predict the class of a single text input

    Args:
        text (str): Input text to classify
        model: Fine-tuned model
        tokenizer: Model tokenizer
        device: Device to run inference on
        label_mapping: Dictionary mapping class IDs to labels
        max_length (Optional[int]): Truncation length in tokens. When omitted,
            the notebook-level MAX_LENGTH is used if it is defined, else 512.

    Returns:
        Dict: ``text``, ``predicted_class``, ``confidence`` (float) and
        ``all_probabilities`` (label -> float); all values are plain Python
        types, so the result can be serialized with ``json``
    """
    if max_length is None:
        # Keep working in a fresh session (e.g. after load_trained_model) where
        # the training notebook's MAX_LENGTH global does not exist.
        max_length = globals().get("MAX_LENGTH", 512)

    # Tokenize input
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    ).to(device)

    # Get predictions without tracking gradients
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Single input -> take the first (only) row of the batch
    probabilities = probabilities.cpu().numpy()[0]
    predicted_class_id = int(np.argmax(probabilities))

    return {
        "text": text,
        "predicted_class": label_mapping[predicted_class_id],
        # float() turns numpy scalars into JSON-serializable Python floats
        "confidence": float(probabilities[predicted_class_id]),
        "all_probabilities": {
            label_mapping[i]: float(prob) for i, prob in enumerate(probabilities)
        },
    }

def predict_texts(texts: List[str], model, tokenizer, device, label_mapping):
    """
    Classify several texts, one at a time, via ``predict_text``.

    Args:
        texts (List[str]): List of input texts
        model: Fine-tuned model
        tokenizer: Model tokenizer
        device: Device to run inference on
        label_mapping: Dictionary mapping class IDs to labels

    Returns:
        List[Dict]: One prediction result per input text, in input order
    """
    return [
        predict_text(single_text, model, tokenizer, device, label_mapping)
        for single_text in texts
    ]

# A few multilingual sample sentences to smoke-test inference
test_texts = [
    "This product is absolutely amazing! I love it!",
    "Worst purchase ever. Complete waste of money.",
    "It's okay, nothing special but not bad either.",
    "¡Este producto es increíble! Lo recomiendo totalmente.",
    "Ce produit est vraiment décevant.",
]

print("Testing inference with example texts:")
print("=" * 40)

predictions = predict_texts(test_texts, model, tokenizer, device, id_to_label)

# Print one report per text: the text, the winning class, then every class probability
for result in predictions:
    report_lines = [
        f"Text: {result['text']}",
        f"Predicted: {result['predicted_class']} (confidence: {result['confidence']:.4f})",
        "All probabilities:",
    ]
    report_lines.extend(
        f"  {class_name}: {prob:.4f}"
        for class_name, prob in result['all_probabilities'].items()
    )
    report_lines.append("-" * 40)
    print("\n".join(report_lines))

# Model Loading and Deployment Function

In [None]:


"""
Functions to load the saved model for deployment
"""

def load_trained_model(model_path: str, device: str = "cuda"):
    """
    Load the fine-tuned model, its tokenizer and the label mappings from disk

    Args:
        model_path (str): Path to the saved model directory; it must also
            contain ``label_mappings.json``
        device (str): Device to load the model on. If a CUDA device is
            requested but CUDA is unavailable, the model is loaded on CPU.

    Returns:
        Tuple: (model, tokenizer, id_to_label, label_to_id)
    """
    # Do not crash on CPU-only machines just because the default is "cuda"
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        print("CUDA is not available; loading the model on CPU instead")
        device = "cpu"

    # Load tokenizer
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)

    # Load model and switch it to inference mode
    model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
    model = model.to(device)
    model.eval()

    # Load label mappings
    with open(os.path.join(model_path, "label_mappings.json"), "r", encoding="utf-8") as f:
        mappings = json.load(f)
    label_to_id = mappings["label_to_id"]
    # The ids were stored as JSON string keys; convert them back to integers
    id_to_label = {int(k): v for k, v in mappings["id_to_label"].items()}

    print(f"Model loaded from {model_path}")
    print(f"Available classes: {list(label_to_id.keys())}")

    return model, tokenizer, id_to_label, label_to_id

# Example of loading the model (uncomment to test).
# load_trained_model returns four values: model, tokenizer, id_to_label, label_to_id.
# loaded_model, loaded_tokenizer, loaded_id_to_label, loaded_label_to_id = load_trained_model(OUTPUT_DIR, device)

print("Model loading function created!")
print(f"To load your trained model later, use: load_trained_model('{OUTPUT_DIR}')")

# Training Summary and Next Steps

In [None]:
"""
Display training summary and provide next steps
"""

print("🎉 TRAINING COMPLETED SUCCESSFULLY! 🎉")
print("=" * 50)

# Report the measured parameter count rather than a hard-coded estimate
total_parameters = sum(p.numel() for p in model.parameters())

print("\n📊 TRAINING SUMMARY:")
print(f"• Model: {MODEL_NAME} ({total_parameters:,} parameters)")
print(f"• Training samples: {len(train_dataset)}")
print(f"• Validation samples: {len(val_dataset)}")
print(f"• Test samples: {len(test_dataset)}")
print(f"• Number of classes: {num_labels}")
print(f"• Classes: {list(label_to_id.keys())}")
print(f"• Training epochs: {training_args.num_train_epochs}")
print(f"• Final test accuracy: {test_results['test_accuracy']:.4f}")

print("\n💾 SAVED FILES:")
print(f"• Model directory: {OUTPUT_DIR}")
# List what is actually on disk: the model/tokenizer file names differ between
# library versions, so a hard-coded list can be wrong.
if os.path.isdir(OUTPUT_DIR):
    for saved_entry in sorted(os.listdir(OUTPUT_DIR)):
        print(f"  - {saved_entry}")
print("• Label mappings: label_mappings.json")
print(f"• Training logs: {OUTPUT_DIR}/logs/")

