In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
# Delete everything under /content/cache (absolute path, so this only makes sense
# on a host that has that directory).
# NOTE(review): presumably clears stale downloads from an earlier session — confirm
# what writes to this directory before relying on it.
!rm -rf /content/cache/*


# **Cargar y preparar los datos**

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the MMLU-Pro benchmark and print one raw record to document its schema.
dataset = load_dataset("TIGER-Lab/MMLU-Pro")
print(dataset["test"][0])

# Read the two needed columns directly instead of materialising every row as a
# dict twice; `list(...)` guarantees plain Python lists for scikit-learn.
test_split = dataset["test"]
preguntas = list(test_split["question"])
categorias = list(test_split["category"])

# 80% of the (question, category) pairs are used for training, 20% for validation.
# `random_state` makes the split reproducible across kernel restarts and
# `stratify` keeps the category proportions identical in both parts.
#   train_texts / val_texts   -> questions
#   train_labels / val_labels -> category names (converted to ids in a later cell)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    preguntas,
    categorias,
    test_size=0.2,
    random_state=42,
    stratify=categorias,
)

{'question_id': 70, 'question': 'Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.', 'options': ['Safe practices, Fear, Jealousy, Trivial', 'Unsafe practices, Distress, Joy, Trivial', 'Safe practices, Wants, Jealousy, Trivial', 'Safe practices, Distress, Fear, Trivial', 'Unsafe practices, Wants, Jealousy, Serious', 'Safe practices, Distress, Jealousy, Serious', 'Safe practices, Wants, Fear, Serious', 'Unsafe practices, Wants, Fear, Trivial', 'Unsafe practices, Distress, Fear, Serious'], 'answer': 'I', 'answer_index': 8, 'cot_content': '', 'category': 'business', 'src': 'ori_mmlu-business_ethics'}


# **Tokenizar los datos**
Convertir el texto en números que el modelo pueda entender

In [None]:
from transformers import BertTokenizer

# Tokenizer that matches the checkpoint fine-tuned later in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncate to 512 tokens and pad.
opciones_tokenizado = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **opciones_tokenizado)
val_encodings = tokenizer(val_texts, **opciones_tokenizado)

# **Crear el modelo**

In [None]:
from transformers import BertForSequenceClassification

# The classification head needs one output per distinct category in the data.
categorias_distintas = set(categorias)
num_categorias = len(categorias_distintas)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_categorias,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Convertir categorías a números**

In [None]:
import json

# Distinct category names in sorted order. Sorting matters: the iteration order
# of a set of strings can change between Python processes, which would give
# every kernel session a different category -> id mapping and make the saved
# JSON disagree with a model trained in another session.
categorias_unicas = sorted(set(categorias))

# Mapping {category name: integer id}.
categorias_a_id = {categoria: i for i, categoria in enumerate(categorias_unicas)}

# Persist the mapping so predictions can be decoded later.
with open('categorias_a_id.json', 'w') as f:
    json.dump(categorias_a_id, f)

# Replace the category names with their ids.
# NOTE: this overwrites train_labels / val_labels in place, so re-running this
# cell without re-running the split cell raises KeyError (the labels are
# already integers by then).
train_labels = [categorias_a_id[label] for label in train_labels]
val_labels = [categorias_a_id[label] for label in val_labels]

Deshabilitar Wandb (Weights & Biases)

In [None]:
import os

# Switch off the Weights & Biases integration for this process by setting the
# WANDB_DISABLED environment variable before training starts.
os.environ.update(WANDB_DISABLED="true")

# **Entrenar el modelo**

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch
from sklearn.metrics import accuracy_score, f1_score


class MMLUProDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping of feature name to a per-example sequence, as
            returned by calling the tokenizer on a list of texts.
        labels: sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The classification loss expects integer (long) class indices.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples (one label per example)."""
        return len(self.labels)


def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: evaluation result exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
    }


train_dataset = MMLUProDataset(train_encodings, train_labels)
val_dataset = MMLUProDataset(val_encodings, val_labels)

# Training configuration.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    evaluation_strategy="epoch",     # evaluate at the end of each epoch
    save_strategy="epoch",           # save at the end of each epoch (must match evaluation for best-model loading)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    load_best_model_at_end=True,     # restore the checkpoint with the lowest eval_loss
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir='./logs',
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
)

# `compute_metrics` is a Trainer argument (TrainingArguments does not accept it),
# and it has to be defined before this point.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once eval_loss has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train the model
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.7332,0.629607
2,0.3462,0.630594
3,0.1316,0.686813


TrainOutput(global_step=3612, training_loss=0.46433605096928965, metrics={'train_runtime': 2944.6812, 'train_samples_per_second': 9.806, 'train_steps_per_second': 1.227, 'total_flos': 7598150284032000.0, 'train_loss': 0.46433605096928965, 'epoch': 3.0})

# **Guardar y usar el modelo entrenado**

In [None]:
from transformers import pipeline

# Write the fine-tuned weights and the tokenizer files into the same folder so
# the pair can be reloaded from a single path.
directorio_modelo = "modelo_mmlu"
for artefacto in (model, tokenizer):
    artefacto.save_pretrained(directorio_modelo)

# Reload the saved model from disk as a text-classification pipeline.
clasificador = pipeline("text-classification", model=directorio_modelo)


Device set to use cuda:0


# **Pruebas**

In [None]:
# Hand-written questions used as a quick sanity check of the classifier;
# each prediction is printed on its own line, in this order.
preguntas_de_prueba = (
    "What is the capital of France?",
    "Who discovered America?",
    "What is an index fund?",
    "What is 2+2?",
    "What is the chemical symbol for water?",
    "Who wrote Hamlet?",
)
for pregunta_de_prueba in preguntas_de_prueba:
    print(clasificador(pregunta_de_prueba))

[{'label': 'LABEL_9', 'score': 0.9982511401176453}]
[{'label': 'LABEL_10', 'score': 0.7519559860229492}]
[{'label': 'LABEL_11', 'score': 0.7832615971565247}]
[{'label': 'LABEL_4', 'score': 0.9984478950500488}]
[{'label': 'LABEL_13', 'score': 0.9808743596076965}]
[{'label': 'LABEL_9', 'score': 0.9969940185546875}]


In [None]:
# Show the (category, id) pairs so the numeric LABEL_<n> names returned by the
# classifier can be decoded by hand.
pares_categoria_id = [*categorias_a_id.items()]
print(pares_categoria_id)


[('psychology', 0), ('philosophy', 1), ('computer science', 2), ('biology', 3), ('math', 4), ('health', 5), ('business', 6), ('physics', 7), ('engineering', 8), ('other', 9), ('history', 10), ('economics', 11), ('law', 12), ('chemistry', 13)]


# **Evaluar tasa de aciertos**

In [None]:
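# NOTE: superseded draft, kept commented out for reference only. It reported
# "Tasa de aciertos: 0.00%" because it scored category names (strings) against
# predicted integer ids; the next cell converts the names to ids before scoring.
# from transformers import pipeline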
# from sklearn.metrics import accuracy_score
# from datasets import load_dataset
# from random import sample
# import json

# #cargar mapeo de categorias
# with open('categorias_a_id.json', 'r') as f:
#     categorias_a_id = json.load(f)

# # Cargar el dataset
# dataset = load_dataset("TIGER-Lab/MMLU-Pro")
# val_dataset = dataset["test"]
# random_val_dataset = sample(list(val_dataset), 1000)

# clasificador = pipeline("text-classification", model="modelo_mmlu", tokenizer="modelo_mmlu", device=0)

# def filtrar_preguntas(dataset, max_tokens=512):
#     """ Filtra preguntas que tengan menos de max_tokens al tokenizarlas. """
#     preguntas_filtradas = []
#     for item in dataset:
#         num_tokens = len(clasificador.tokenizer.encode(item["question"], truncation=False))
#         if num_tokens <= max_tokens:
#             preguntas_filtradas.append(item)
#     return preguntas_filtradas

# # Filtrar preguntas largas
# max_tokens = 512
# dataset_filtrado = filtrar_preguntas(random_val_dataset, max_tokens)
# print(f"Preguntas tras el filtrado: {len(dataset_filtrado)}")

# # Obtener predicciones del modelo para las preguntas filtradas
# predicciones = []
# categorias_reales = []

# for item in dataset_filtrado:
#     pregunta = item["question"]
#     categoria_real = item["category"]

#     resultado = clasificador(pregunta, truncation=True, max_length=max_tokens)  # Truncar la entrada al modelo
#     etiqueta_predicha = int(resultado[0]['label'].split('_')[1])  # Convertir LABEL_X a número
#     categorias_reales.append(categoria_real)
#     predicciones.append(etiqueta_predicha)

# # Calcular la precisión
# tasa_aciertos = accuracy_score(categorias_reales, predicciones)
# print(f"Tasa de aciertos: {tasa_aciertos * 100:.2f}%")



Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors


Preguntas tras el filtrado: 999
Tasa de aciertos: 0.00%


In [None]:
from transformers import pipeline
from sklearn.metrics import accuracy_score
from random import Random
import json

import torch

MUESTRA_MAX = 1000   # upper bound on the number of evaluated questions
SEMILLA = 42         # fixed seed so the evaluated sample is reproducible

# Load the category -> id mapping written by the label-encoding cell, plus its inverse.
with open("categorias_a_id.json", "r") as f:
    categorias_a_id = json.load(f)
id_a_categoria = {idx: nombre for nombre, idx in categorias_a_id.items()}

# Evaluate ONLY on the held-out validation split (val_texts / val_labels from
# the split cell). Sampling from the full dataset["test"] would mostly re-score
# questions the model was trained on and inflate the accuracy.
# val_labels may hold ids (after the label-encoding cell) or names; accept both.
ejemplos_validacion = [
    {
        "question": texto,
        "category": etiqueta if isinstance(etiqueta, str) else id_a_categoria[etiqueta],
    }
    for texto, etiqueta in zip(val_texts, val_labels)
]
random_val_dataset = Random(SEMILLA).sample(
    ejemplos_validacion, min(MUESTRA_MAX, len(ejemplos_validacion))
)

# Use the first GPU when one is available, otherwise fall back to CPU.
clasificador = pipeline(
    "text-classification",
    model="modelo_mmlu",
    tokenizer="modelo_mmlu",
    device=0 if torch.cuda.is_available() else -1,
)


def filtrar_preguntas(dataset, max_tokens=512):
    """Keep the items whose question encodes to at most ``max_tokens`` tokens.

    Args:
        dataset: iterable of dicts with a ``"question"`` string.
        max_tokens: maximum allowed encoded length (special tokens included,
            since the length is measured on the full ``encode`` output).

    Returns:
        list of the items that satisfy the limit, in their original order.
    """
    return [
        item
        for item in dataset
        if len(clasificador.tokenizer.encode(item["question"], truncation=False)) <= max_tokens
    ]


max_tokens = 512
dataset_filtrado = filtrar_preguntas(random_val_dataset, max_tokens)
print(f"Preguntas tras el filtrado: {len(dataset_filtrado)}")

# Classify all questions in batches instead of one pipeline call per question.
categorias_reales = [item["category"] for item in dataset_filtrado]
resultados = clasificador(
    [item["question"] for item in dataset_filtrado],
    truncation=True,
    max_length=max_tokens,
    batch_size=32,
)

# Translate the predicted label names to ids through the model's own mapping
# rather than parsing the "LABEL_<n>" text.
label2id = clasificador.model.config.label2id
predicciones = [label2id[resultado["label"]] for resultado in resultados]

# Convert the true category names to ids so both sides are compared as integers.
categorias_reales_ids = [categorias_a_id[categoria] for categoria in categorias_reales]

tasa_aciertos = accuracy_score(categorias_reales_ids, predicciones)
print(f"Tasa de aciertos: {tasa_aciertos * 100:.2f}%")

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors


Preguntas tras el filtrado: 999
