Realizaremos un modelo de clasificacion de la emocion segun el texto, para ellos es necesario tokenizar el texto y usaremos la arquitectura transformer
1. Hacemos las importaciones necesarias

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

  from .autonotebook import tqdm as notebook_tqdm


2. Importamos el data set

In [2]:
from datasets import load_dataset

dataset = load_dataset("dair-ai/emotion", "split")

3. importamos el tokenizer

In [3]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [4]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

Map: 100%|██████████| 2000/2000 [00:00<00:00, 5186.30 examples/s]


In [5]:
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [6]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)  # 6 etiquetas de emoción

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [9]:
trainer.train()

  1%|          | 16/3000 [04:26<14:27:42, 17.45s/it]

KeyboardInterrupt: 

In [None]:
trainer.evaluate()