In [None]:
# Import required libraries
import os
import json
import gc
import numpy as np
from collections import Counter
from datasets import load_dataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    EarlyStoppingCallback
)
import nlpaug.augmenter.word as naw

# Start from a clean slate when the cell is re-run. gc.collect() goes first so
# that tensors kept alive only by reference cycles are destroyed before the
# caching allocator is asked to hand its unused blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

# Report GPU availability and the current allocator state
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1e6:.2f} MB")
    print(f"Memory reserved: {torch.cuda.memory_reserved(0) / 1e6:.2f} MB")

# Let cuDNN auto-tune its kernels; speeds up training if input sizes don't change
torch.backends.cudnn.benchmark = True

# NOTE: CUDA_LAUNCH_BLOCKING is deliberately NOT set here. It is a debugging
# switch that makes every kernel launch synchronous (slower training, no gain
# in determinism), and it was previously assigned only after the CUDA queries
# above had already run. When debugging a device-side error, export
# CUDA_LAUNCH_BLOCKING=1 in the environment *before* importing torch.

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Function to free up GPU memory
def free_gpu_memory():
    """Drop unreachable Python objects, then release cached CUDA memory.

    The garbage collector runs first: tensors that are only kept alive by
    reference cycles must be destroyed before ``torch.cuda.empty_cache()``
    can return their blocks. On a machine without CUDA only the garbage
    collection happens and nothing is printed.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"Memory allocated after cleanup: {torch.cuda.memory_allocated(0) / 1e6:.2f} MB")

# Load the dataset with caching to avoid reloading
print("Loading MMLU-Pro dataset...")
cache_dir = "/content/cache"
os.makedirs(cache_dir, exist_ok=True)
dataset = load_dataset("TIGER-Lab/MMLU-Pro", cache_dir=cache_dir)

# Extract questions and categories (two parallel lists over the "test" split)
print("Extracting questions and categories...")
preguntas = [item["question"] for item in dataset["test"]]
categorias = [item["category"] for item in dataset["test"]]

# Check category distribution
category_counts = Counter(categorias)
print("Category distribution:")
for category, count in category_counts.most_common():
    print(f"{category}: {count}")

# Build the category -> integer id mapping.
# The categories are sorted so that the mapping is identical on every run:
# iterating a plain set of strings follows hash order, which changes between
# interpreter sessions and would silently re-assign label ids, making models
# and mapping files from different runs incompatible.
categorias_unicas = sorted(set(categorias))
categorias_a_id = {categoria: i for i, categoria in enumerate(categorias_unicas)}

# Save category mapping next to the notebook so inference code can reuse it
with open('categorias_a_id.json', 'w') as f:
    json.dump(categorias_a_id, f)

# Prepend a fixed instruction to a question before it is fed to the model
def format_prompt(question):
    """Return *question* prefixed with the classification instruction."""
    template = "Classify the academic category: {}"
    return template.format(question)

# Turn every raw question into a prompt and every category name into its id
formatted_questions = list(map(format_prompt, preguntas))
category_ids = list(map(categorias_a_id.__getitem__, categorias))

# Checkpoint selection: DistilBERT is smaller and faster than BERT, which
# suits a T4 GPU. SciBERT ("allenai/scibert_scivocab_uncased") is a possible
# drop-in replacement when domain-specific understanding is needed.
model_name = "distilbert-base-uncased"

print(f"Using model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``pred`` must expose ``label_ids`` (the gold labels) and ``predictions``
    (an array whose last axis is reduced with ``argmax`` to obtain the
    predicted class).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

# Map-style dataset pairing tokenizer output with integer labels
class MMLUProDataset(torch.utils.data.Dataset):
    """Serve one example at a time as a dict of tensors.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence of integer class ids of the same length.
    Tensors are created lazily in ``__getitem__``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored as int64 under the 'labels' key
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Optional K-fold cross-validation over the whole question set.
# When disabled (the default) this entire section is skipped and only the
# final model further below is trained.
use_cross_validation = False  # Set to True if you want cross-validation

if use_cross_validation:
    n_splits = 3  # Reduced from 5 to save time on T4
    # Plain (non-stratified) K-fold with a fixed seed
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_results = []  # one trainer.evaluate() dict per fold

    print(f"Starting {n_splits}-fold cross-validation...")

    for fold, (train_idx, val_idx) in enumerate(kf.split(formatted_questions)):
        print(f"\nTraining fold {fold+1}/{n_splits}")

        # Free memory before starting a new fold
        free_gpu_memory()

        # Split data for this fold
        fold_train_texts = [formatted_questions[i] for i in train_idx]
        fold_train_labels = [category_ids[i] for i in train_idx]
        fold_val_texts = [formatted_questions[i] for i in val_idx]
        fold_val_labels = [category_ids[i] for i in val_idx]

        # No data augmentation inside the folds (skipped for speed on T4).
        # Tokenize data - use a smaller max_length to conserve memory
        print("Tokenizing data...")
        max_length = 256  # Reduced from 512 to save memory on T4
        train_encodings = tokenizer(fold_train_texts, truncation=True, padding=True, max_length=max_length)
        val_encodings = tokenizer(fold_val_texts, truncation=True, padding=True, max_length=max_length)

        # Create datasets
        train_dataset = MMLUProDataset(train_encodings, fold_train_labels)
        val_dataset = MMLUProDataset(val_encodings, fold_val_labels)

        # Compute class weights to handle imbalance.
        # NOTE(review): neither class_weights nor class_weights_dict is read
        # anywhere below - the Trainer is built without a weighted loss, so
        # these values currently have no effect on training. Confirm whether
        # a weighted loss was intended.
        # NOTE(review): the dict is keyed by position in np.unique(...), which
        # equals the class id only if every class occurs in this fold.
        class_weights = compute_class_weight('balanced', classes=np.unique(fold_train_labels), y=fold_train_labels)
        class_weights_dict = {i: float(weight) for i, weight in enumerate(class_weights)}

        # Fresh model for every fold, with one output per category
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(categorias_unicas)
        )

        # T4-optimized training arguments
        training_args = TrainingArguments(
            output_dir=f'./results/fold_{fold+1}',
            evaluation_strategy="steps",  # Changed from epoch to steps for more frequent evaluation
            eval_steps=100,  # Evaluate every 100 steps
            save_strategy="steps",
            save_steps=100,  # Same cadence as eval_steps
            save_total_limit=2,  # Cap the number of checkpoints kept on disk to save space
            learning_rate=5e-5,  # Slightly higher learning rate
            per_device_train_batch_size=16,  # Adjusted for T4 memory
            per_device_eval_batch_size=32,
            gradient_accumulation_steps=2,  # Effective batch size = 16*2 = 32
            num_train_epochs=3,  # Reduced epochs for T4
            warmup_ratio=0.1,  # Use ratio instead of steps
            weight_decay=0.01,
            logging_dir=f'./logs/fold_{fold+1}',
            logging_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",  # the 'f1' key returned by compute_metrics
            greater_is_better=True,
            fp16=True,  # Mixed precision training for T4
            dataloader_num_workers=2,  # Parallel data loading
            dataloader_pin_memory=True,  # Faster data transfer to GPU
            report_to="none"  # Disable wandb explicitly
        )

        # Create trainer; early stopping is configured with a patience of 3
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        # Train the model
        print(f"Training model for fold {fold+1}...")
        trainer.train()

        # Evaluate on this fold's validation set
        print(f"Evaluating model for fold {fold+1}...")
        eval_result = trainer.evaluate()
        fold_results.append(eval_result)

        print(f"Fold {fold+1} results: {eval_result}")

        # Save the model and tokenizer for this fold
        model.save_pretrained(f"modelo_mmlu_fold_{fold+1}")
        tokenizer.save_pretrained(f"modelo_mmlu_fold_{fold+1}")

        # Free up memory after each fold.
        # NOTE: `model` no longer exists after this statement, so code that
        # runs after the loop must not refer to it.
        del model, trainer, train_dataset, val_dataset
        free_gpu_memory()

    # Print cross-validation results (unweighted mean over the folds)
    print("\nCross-validation results:")
    avg_accuracy = sum(result['eval_accuracy'] for result in fold_results) / len(fold_results)
    avg_f1 = sum(result['eval_f1'] for result in fold_results) / len(fold_results)
    print(f"Average Accuracy: {avg_accuracy:.4f}")
    print(f"Average F1 Score: {avg_f1:.4f}")

# ---- Final model: data preparation ----
print("\nTraining final model...")
free_gpu_memory()

# Prompts and integer labels for every question in the dataset
all_formatted_questions = list(map(format_prompt, preguntas))
all_labels = [categorias_a_id[nombre] for nombre in categorias]

# Stratified 90/10 split: 90% for training, 10% held out for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_formatted_questions,
    all_labels,
    test_size=0.1,
    random_state=42,
    stratify=all_labels,
)

# Simple data augmentation (limited for T4 efficiency)
print("Applying targeted data augmentation...")
aug = naw.SynonymAug(aug_src='wordnet')

# Only augment underrepresented classes: those with fewer training examples
# than the median class size.
label_counts = Counter(train_labels)
median_count = np.median(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < median_count]

augmented_texts = []
augmented_labels = []
failed_augmentations = 0
for text, label in zip(train_texts, train_labels):
    if label not in underrepresented_labels:
        continue
    try:
        augmented = aug.augment(text)
    except Exception as exc:
        # Augmentation is best-effort: skip the example but keep a count and
        # show the first error so a systematic failure (e.g. missing WordNet
        # data) is visible. Catching Exception rather than using a bare
        # except keeps Ctrl-C / KeyboardInterrupt working.
        failed_augmentations += 1
        if failed_augmentations == 1:
            print(f"Warning: augmentation failed ({type(exc).__name__}: {exc}); skipping such examples")
        continue
    # Depending on the nlpaug version, augment() returns either a string or a
    # list of strings; indexing a plain string with [0] would keep only its
    # first character.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else None
    if not augmented:
        failed_augmentations += 1
        continue
    augmented_texts.append(augmented)
    augmented_labels.append(label)

print(f"Added {len(augmented_texts)} augmented examples for underrepresented classes")
if failed_augmentations:
    print(f"Skipped {failed_augmentations} examples that could not be augmented")

# Combine original and augmented data
train_texts = train_texts + augmented_texts
train_labels = train_labels + augmented_labels

# ---- Final model: tokenization, datasets and model instantiation ----
print("Tokenizing data for final model...")
max_length = 256  # Reduced from 512 to save memory
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=max_length,
)
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=max_length,
)

# Wrap the encodings and labels for the Trainer
train_dataset = MMLUProDataset(train_encodings, train_labels)
val_dataset = MMLUProDataset(val_encodings, val_labels)

# 'balanced' class weights over the (augmented) training labels.
# NOTE(review): nothing later in this file reads class_weights or
# class_weights_dict, so they do not influence the loss - confirm intent.
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights_dict = dict(enumerate(map(float, class_weights)))

# Sequence-classification model with one output per category
final_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categorias_unicas),
)

# ---- Final model: training configuration (tuned for a T4 GPU) ----
final_training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir='./results/final_model',
    logging_dir='./logs/final_model',
    logging_steps=50,
    report_to="none",
    # Evaluate and checkpoint on the same 100-step cadence
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    # Model selection: reload the checkpoint with the highest eval_f1
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batching: 16 per step x 2 accumulation steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    # Memory / throughput settings
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

# Trainer for the final model, with early stopping (patience of 3)
final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Fit
print("Training final model...")
final_trainer.train()

# Score on the held-out validation split
print("Evaluating final model...")
final_eval_result = final_trainer.evaluate()
print(f"Final model evaluation: {final_eval_result}")

# Persist model weights and tokenizer side by side
print("Saving final model...")
for artefact in (final_model, tokenizer):
    artefact.save_pretrained("modelo_mmlu_final")

# Inference demo: load the saved model into a text-classification pipeline.
# The import stays here because evaluate_model_performance also relies on
# `pipeline` being available at module level.
from transformers import pipeline
print("Creating classification pipeline...")
clasificador = pipeline(
    "text-classification",
    model="modelo_mmlu_final",
    tokenizer="modelo_mmlu_final",
)

# Invert the mapping: integer id -> category name
id_to_category = {}
for cat, id_val in categorias_a_id.items():
    id_to_category[id_val] = cat

# Smoke-test the classifier on a handful of hand-written questions
print("\nTesting the model with examples:")
test_questions = [
    "What is the capital of France?",
    "Who discovered America?",
    "What is an index fund?",
    "What is 2+2?",
    "What is the chemical symbol for water?",
    "Who wrote Hamlet?",
]

for question in test_questions:
    top_prediction = clasificador(format_prompt(question))[0]
    # The label is parsed as "<prefix>_<id>"; the numeric part is the class id
    label_id = int(top_prediction['label'].split('_')[1])
    category_name = id_to_category.get(label_id, "Unknown")
    print(f"Question: {question}")
    print(f"Predicted category: {category_name} (confidence: {top_prediction['score']:.4f})\n")

# Function to evaluate model performance on a larger test set
def evaluate_model_performance(model_path, test_data, test_labels, category_mapping):
    """Evaluate a saved classifier on a labelled set of questions.

    Args:
        model_path: Path or identifier passed to ``pipeline`` as both the
            model and the tokenizer.
        test_data: Iterable of RAW question strings. The instruction prefix
            is added here through ``format_prompt``, so callers must not
            pass already-formatted prompts.
        test_labels: Integer class ids aligned with ``test_data``.
        category_mapping: Dict mapping category name -> integer class id.

    Returns:
        ``(accuracy, report)`` where ``report`` is the dict produced by
        ``classification_report(..., output_dict=True)``, with one entry per
        category in ``category_mapping``.
    """
    # Create a classification pipeline
    test_classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)

    # Get predictions, one question at a time
    predictions = []
    for question in test_data:
        formatted_question = format_prompt(question)
        result = test_classifier(formatted_question, truncation=True, max_length=512)
        # Assumes the pipeline emits labels of the form "LABEL_<id>"
        label_id = int(result[0]['label'].split('_')[1])
        predictions.append(label_id)

    # Calculate accuracy
    accuracy = accuracy_score(test_labels, predictions)

    # Build the per-category report. The label ids are passed explicitly so
    # that target_names always lines up with them: without `labels`,
    # classification_report raises when some category is absent from both
    # test_labels and predictions. Sorting the ids also removes the
    # assumption that they are exactly 0..n-1.
    id_to_category = {v: k for k, v in category_mapping.items()}
    label_ids = sorted(id_to_category)
    target_names = [id_to_category[i] for i in label_ids]

    report = classification_report(
        test_labels,
        predictions,
        labels=label_ids,
        target_names=target_names,
        output_dict=True,
        zero_division=0,  # categories with no predictions score 0 without warnings
    )

    return accuracy, report

# Evaluate the final model on data it was NOT trained on.
# The split below repeats the training split's parameters (test_size=0.1,
# random_state=42, same stratification labels), so it selects exactly the
# held-out 10% of examples. Two problems with the previous version are
# avoided:
#   * a fresh split with a different seed drew most of its "test" examples
#     from the training data, inflating the reported scores;
#   * already-formatted prompts were passed in, and evaluate_model_performance
#     formats its inputs again, so every question carried the prefix twice.
# Raw questions (`preguntas`) are therefore split here.
# NOTE: this held-out set was also used for checkpoint selection / early
# stopping, so the numbers are still slightly optimistic; a fully independent
# test set requires a three-way split before training.
print("\nEvaluating final model on test set...")
_, test_texts, _, test_labels = train_test_split(
    preguntas, all_labels, test_size=0.1, random_state=42, stratify=all_labels
)

accuracy, report = evaluate_model_performance("modelo_mmlu_final", test_texts, test_labels, categorias_a_id)
print(f"Test set accuracy: {accuracy:.4f}")
print("Performance by category:")
for category, metrics in report.items():
    # Skip the aggregate rows; only per-category entries are dicts of interest
    if category not in ['accuracy', 'macro avg', 'weighted avg']:
        print(f"{category}: F1-score = {metrics['f1-score']:.4f}, Precision = {metrics['precision']:.4f}, Recall = {metrics['recall']:.4f}")

print("\nTraining and evaluation complete!")

In [None]:
# Save the trained weights as a plain PyTorch state dict.
# `final_model` is the model trained above; the name `model` only exists
# inside the cross-validation loop (and is deleted at the end of each fold),
# so referring to it here raised a NameError.
torch.save(final_model.state_dict(), 'modelo_entrenado.pth')

# Download the file to the local machine (Google Colab only)
from google.colab import files
files.download('modelo_entrenado.pth')