# Cargar y preparar los datos

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the MMLU-Pro dataset from the Hugging Face Hub and show one record.
dataset = load_dataset("TIGER-Lab/MMLU-Pro")
print(dataset["test"][0])

# Pull the question text and the category label out of the "test" split.
# Column access avoids building a full dict for every row twice.
test_split = dataset["test"]
preguntas = list(test_split["question"])
categorias = list(test_split["category"])

# 80% of the questions are used for training and 20% for validation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the category proportions the same in both subsets.
# train_texts / val_texts   -> questions used to train / validate
# train_labels / val_labels -> matching category names
train_texts, val_texts, train_labels, val_labels = train_test_split(
    preguntas,
    categorias,
    test_size=0.2,
    random_state=42,
    stratify=categorias,
)

  from .autonotebook import tqdm as notebook_tqdm


{'question_id': 70, 'question': 'Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.', 'options': ['Safe practices, Fear, Jealousy, Trivial', 'Unsafe practices, Distress, Joy, Trivial', 'Safe practices, Wants, Jealousy, Trivial', 'Safe practices, Distress, Fear, Trivial', 'Unsafe practices, Wants, Jealousy, Serious', 'Safe practices, Distress, Jealousy, Serious', 'Safe practices, Wants, Fear, Serious', 'Unsafe practices, Wants, Fear, Trivial', 'Unsafe practices, Distress, Fear, Serious'], 'answer': 'I', 'answer_index': 8, 'cot_content': '', 'category': 'business', 'src': 'ori_mmlu-business_ethics'}


# Tokenizar los datos

In [2]:
from transformers import BertTokenizer
 
# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with identical settings: truncation enabled with a
# 512-token ceiling, and padding enabled.
opciones_tokenizacion = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **opciones_tokenizacion)
val_encodings = tokenizer(val_texts, **opciones_tokenizacion)

# Crear modelo

In [None]:
from transformers import BertForSequenceClassification

# The classification head gets one output per distinct category name.
categorias_distintas = set(categorias)
num_categorias = len(categorias_distintas)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_categorias,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Convertir categorías a números

In [6]:
# Collect the distinct category names in sorted order. Iterating a bare set of
# strings has no guaranteed order, so without sorting the name -> id mapping
# could differ from one kernel session to the next and a saved model's
# numeric labels could no longer be decoded.
categorias_unicas = sorted(set(categorias))

# Build the {category name: integer id} lookup.
categorias_a_id = {categoria: i for i, categoria in enumerate(categorias_unicas)}

# Replace the category names with their ids.
# NOTE: this overwrites train_labels / val_labels in place, so running this
# cell a second time without re-running the split raises KeyError.
train_labels = [categorias_a_id[label] for label in train_labels]
val_labels = [categorias_a_id[label] for label in val_labels]

In [None]:
import os

# Set WANDB_DISABLED for this process before the Trainer is built
# (presumably read by the Weights & Biases integration — confirm).
os.environ.update(WANDB_DISABLED="true")

# Entrenar el modelo

In [None]:
from transformers import TrainingArguments, Trainer
import torch

# Dataset wrapper that serves tokenizer encodings and labels as tensors.
class MMLUProDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with integer labels, one sample per index.

    Args:
        encodings: mapping from field name to a per-sample indexable
            collection (anything exposing ``.items()``).
        labels: indexable collection of integer class ids; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` as a tensor, plus 'labels'."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # The label is stored explicitly as a long (int64) tensor.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the tokenized splits so the Trainer can index them.
train_dataset = MMLUProDataset(train_encodings, train_labels)
val_dataset = MMLUProDataset(val_encodings, val_labels)

# Define the training arguments.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',         # output directory
    evaluation_strategy="epoch",    # evaluate at the end of each epoch
    save_strategy="epoch",          # save at the end of each epoch
    per_device_train_batch_size=8,  # batch size for training
    per_device_eval_batch_size=8,   # batch size for evaluation
    num_train_epochs=3,             # number of training epochs
    logging_dir='./logs',           # directory for storing logs
    report_to="none"                # no external loggers; does not rely on the WANDB_DISABLED env var
)

# Create a Trainer instance.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model.
trainer.train()




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# Guardar y usar el modelo

In [None]:
from transformers import pipeline

# Write the model and the tokenizer into the same directory.
directorio_modelo = "modelo_mmlu"
model.save_pretrained(directorio_modelo)
tokenizer.save_pretrained(directorio_modelo)

# Build a text-classification pipeline from the saved directory.
clasificador = pipeline("text-classification", model=directorio_modelo)

# Pruebas

In [None]:
# Ad-hoc questions used to eyeball the classifier's output.
preguntas_de_prueba = [
    "What is the capital of France?",
    "Who discovered America?",
    "What is an index fund?",
    "What is 2+2?",
    "What is the chemical symbol for water?",
    "Who wrote Hamlet?",
]

# Classify each question separately and print the raw pipeline result.
for pregunta_de_prueba in preguntas_de_prueba:
    print(clasificador(pregunta_de_prueba))

In [None]:
# Show the (category name, id) pairs so numeric predictions can be decoded by eye.
pares_categoria_id = list(categorias_a_id.items())
print(pares_categoria_id)

# Evaluar aciertos

In [None]:
from random import sample

import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import pipeline

# Reload the corpus the questions are drawn from.
# NOTE(review): the training cell split this same "test" split 80/20, so most
# sampled questions were seen during training and the score printed below is
# optimistic; score val_texts / val_labels for a held-out estimate.
# NOTE: this rebinds `val_dataset`, which the training cell bound to its
# MMLUProDataset instance.
dataset = load_dataset("TIGER-Lab/MMLU-Pro")
val_dataset = dataset["test"]

# random.sample() needs a sequence, which a Dataset is not, so draw row
# indices and select those rows. The draw is unseeded and differs per run.
tamano_muestra = min(1000, len(val_dataset))
indices_muestra = sample(range(len(val_dataset)), tamano_muestra)
random_val_dataset = val_dataset.select(indices_muestra)

# Reuse the category -> id mapping the model was trained with. A mapping
# rebuilt from the sample would not be guaranteed to assign the same ids, and
# the accuracy would then compare unrelated numbers.
categorias_prueba = sorted(set(random_val_dataset["category"]))
categorias_a_id_prueba = {categoria: categorias_a_id[categoria] for categoria in categorias_prueba}
id_a_categorias_prueba = {i: categoria for categoria, i in categorias_a_id_prueba.items()}

# Load the fine-tuned model and tokenizer; use the first GPU when one is
# available and fall back to CPU (-1) otherwise.
clasificador = pipeline(
    "text-classification",
    model="modelo_mmlu",
    tokenizer="modelo_mmlu",
    device=0 if torch.cuda.is_available() else -1,
)

# Collect the model's predictions alongside the true label ids.
predicciones = []
categorias_reales = []

# Score only the sampled rows, not the whole split.
for item in random_val_dataset:
    pregunta = item["question"]
    categoria_real = categorias_a_id_prueba[item["category"]]
    # Truncate exactly as during training so long questions fit the model.
    resultado = clasificador(pregunta, truncation=True, max_length=512)
    # The pipeline reports labels as "LABEL_<n>"; keep the numeric part.
    etiqueta_predicha = int(resultado[0]['label'].split('_')[1])
    categorias_reales.append(categoria_real)
    predicciones.append(etiqueta_predicha)

# Compute the accuracy (fraction of exact label matches).
tasa_aciertos = accuracy_score(categorias_reales, predicciones)
print(f"Tasa de aciertos: {tasa_aciertos * 100:.2f}%")