In [1]:
import os

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
import seaborn as sns
import tensorflow as tf

from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from tftrainer import Trainer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    confusion_matrix,
    classification_report
)




  from .autonotebook import tqdm as notebook_tqdm


## Cargar los datos y dividirlos en training y validation

In [2]:
df = pd.read_csv('./data/goemotions_clean.csv', sep=",")
df.head()

Unnamed: 0,text,emotion
0,Shhh dont give idea,anger
1,Thank much kind stranger I really need,gratitude
2,Ion know would better buy trim make hard dose,neutral
3,Im honestly surprised We fallen much farther,excitement
4,Jurisprudence fetishist get technicality,neutral


In [3]:
# Dividir el dataset en train y validation
X_train, X_val, y_train, y_val = train_test_split(df['text'], df['emotion'], test_size=0.2, random_state=0)

## Preprocesamiento de los datos

In [4]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [5]:
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder

class PyTorchDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels.values.reshape(-1, 1)
        
        # One-hot encode the labels
        self.encoder = OneHotEncoder()
        self.labels_encoded = self.encoder.fit_transform(self.labels).toarray() 
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        sample = {
            'input_ids': torch.tensor(self.inputs[idx]),
            'labels': torch.tensor(self.labels_encoded[idx], dtype=torch.float32)  # Use float32 for binary labels
        }
        return sample

In [6]:
# Tokenize train and test sets
X_train_tokenized = tokenizer(X_train.tolist(), truncation=True, padding=True)
X_val_tokenized = tokenizer(X_val.tolist(), truncation=True, padding=True)

# Create PyTorch datasets
train_dataset = PyTorchDataset(X_train_tokenized["input_ids"], y_train)
test_dataset = PyTorchDataset(X_val_tokenized["input_ids"], y_val)

In [7]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=23, 
    problem_type="multi_label_classification")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# !pip install -r requirements.txt

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_steps = 10,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [10]:
trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,0.6867
20,0.6775
30,0.6588
40,0.6279
50,0.58
60,0.5281
70,0.4867
80,0.4502
90,0.4177
100,0.3878


TrainOutput(global_step=4038, training_loss=0.15431371462587204, metrics={'train_runtime': 1430.347, 'train_samples_per_second': 180.552, 'train_steps_per_second': 2.823, 'total_flos': 5146785641771928.0, 'train_loss': 0.15431371462587204, 'epoch': 3.0})

In [11]:
model_path = os.path.join("./models", "distilbert_model")
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Load model:

In [12]:
# See loss
trainer.evaluate(test_dataset)

{'eval_loss': 0.13384085893630981,
 'eval_runtime': 26.072,
 'eval_samples_per_second': 825.483,
 'eval_steps_per_second': 12.926,
 'epoch': 3.0}

In [13]:
# Predict validation set
output = tf.argmax(trainer.predict(test_dataset)[0], axis=1)

2024-05-11 09:24:29.333447: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-11 09:24:29.338490: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-11 09:24:29.338710: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [14]:
cm = confusion_matrix(y_val, output)
cm

NameError: name 'y_test' is not defined

In [15]:
print(classification_report(y_val, output))

NameError: name 'y_test' is not defined

In [16]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_path = os.path.join("./models", "distilbert_model")

# Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

# Load the model
model = DistilBertForSequenceClassification.from_pretrained(model_path)

In [19]:
# Asegúrate de que el modelo esté en modo de evaluación
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 