# Haciendo la sintonización fina de BERT para el dataset

El notebook esta diseñado para correr en colab, por lo que se incluyeron los comandos de instalación para librerías no presentes por defecto

In [1]:
!pip install transformers[torch]
!pip install datasets evaluate transformers[sentencepiece]

Collecting transformers[torch]
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import datasets
from transformers import (AutoTokenizer, DataCollatorWithPadding, AutoConfig, AutoModel,
                          AutoModelForSequenceClassification, BertConfig, Trainer, BertForSequenceClassification,)
from transformers import TrainingArguments
import evaluate
import torch.nn as nn
import torch
import numpy as np

Los tokens especiales son:

|PAD| = 13

Los normales son:

{"other":0, "1200":1, "1300":2, "1400":3, "1600":4, "1700":5, "1800":6, "1900":7, "2100":8, "2400":9,
            "2405":9, "2600":10, "2800":11, "2805":11, "AR-10":12, "AR-20":12, "AR-30":12, "AR-40":12,
            "AR-50":12, "AR-70":12, "AR-90":12}

En el dataset tenemos este número de cada clase:

6     377

11    253

5     167

0     117

10     69

12     34

3      30

4      29

9      21

8      18

1      15

7       4

In [4]:
checkpoint = "bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
VOCAB = 14
#Limitamos el vocab_size = 22 (tipos de productos) y una secuencia máxima de 1024 productos, además de definir 13 etiquetas de salida
config = BertConfig(vocab_size=VOCAB, max_position_embeddings=1024, pad_token_id=13, num_labels=VOCAB-1) 
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 13,
  "position_embedding_type": "ab

In [11]:
model = AutoModelForSequenceClassification.from_config(config)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(14, 768, padding_idx=13)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [12]:
#Prueba para ver como es que el modelo saca las predicciones
input = torch.tensor([[0, 0, 1, 2, 2, 3, 13]]).to(device)
softmax = torch.nn.Softmax(dim=1)
softmax(model(input)[0]), model(input)[0][0]

(tensor([[0.1181, 0.0685, 0.0822, 0.0676, 0.0714, 0.0719, 0.0681, 0.0504, 0.0430,
          0.0546, 0.0606, 0.0572, 0.0913, 0.0951]], grad_fn=<SoftmaxBackward0>),
 tensor([ 0.3425,  0.0263,  0.2152, -0.0845,  0.2268,  0.0005,  0.1771, -0.3026,
         -0.3570, -0.3475,  0.4641,  0.0442,  0.0008,  0.3607],
        grad_fn=<SelectBackward0>))

In [None]:
#Cargamos el dataset
df = pd.read_hdf('./datasets/data.h5', key='df')
df['X_tuple'] = df.X.apply(tuple)
df['y_tuple'] = df.y.apply(tuple)
df = df.drop_duplicates(subset=['X_tuple', 'y_tuple']) #Eliminamos los datos replicados X->y
df = df.drop(labels=['FE_EMIT', 'FE_ENTREGA', 'PEDIDO', 'X_tuple', 'y_tuple'], axis=1) #Eliminamos columnas que no son útiles
df['y'] = df['y'].apply(lambda x: x[0])
df

Unnamed: 0,X,y
3503,[0],0
3502,[6],6
3499,"[11, 0]",11
3500,"[11, 0]",0
3497,"[6, 11]",6
...,...,...
31,"[6, 6, 6, 6, 11, 11, 11, 0, 11, 11, 11, 11, 11...",0
22,"[11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, 1...",11
17,"[11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, 1...",10
10,"[11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 1...",11


In [None]:
#Vemos la distribución de clases del dataframe
df.y.value_counts()

6     377
11    253
5     167
0     117
10     69
12     34
3      30
4      29
9      21
8      18
1      15
7       4
Name: y, dtype: int64

In [None]:
#Definimos funciones que son necesarias para el input de BERT, consultese: https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt para más información
from typing import Union
#Función para padear las secuencias
def pad(max_len:int, sequence:Union[list, np.array], pad_token:int=21):
  padding = np.full(max_len-len(sequence), pad_token)
  padded_seq = np.concatenate((sequence, padding))
  return padded_seq.tolist()

#Función para crear las máscaras de atención
def attention_mask(pad_sequence:Union[list, np.array], pad_token:int=21):
  seq = np.asarray(pad_sequence, dtype=int)
  attention_mask = np.asarray(seq != pad_token, dtype=int)
  return attention_mask.tolist()

#Función para crear los arreglos que especifican el tipo de token
def token_types(pad_sequence:Union[list, np.array]):
  return np.zeros(len(pad_sequence), dtype=int).tolist()

In [None]:
#Preparamos el df para BERT con las columnas que requiere('idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids')
max_len = df.X.apply(lambda x: len(x))
max_len = max(max_len)
dt = df.copy(True)
PAD = 13

dt.insert(0, 'idx', np.full(len(df), np.nan))
dt.insert(1, 'label', dt.y)
dt.insert(2, 'input_ids', np.full(len(df), np.nan))
dt.insert(3, 'attention_mask', np.full(len(df), np.nan))
dt.insert(4, 'token_type_ids', np.full(len(df), np.nan))
data = dt.to_numpy()

for i in range(len(data)):
  data[i][2] = pad(max_len, data[i][5], pad_token=PAD)
  data[i][3] = attention_mask(data[i][2], pad_token=PAD)
  data[i][4] = token_types(data[i][2])
dt['input_ids'] = data[:, 2]
dt['attention_mask'] = data[:, 3]
dt['token_type_ids'] = data[:, 4]
df = dt.copy(True)
df

Unnamed: 0,idx,label,input_ids,attention_mask,token_type_ids,X,y
3503,,0,"[0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[0],0
3502,,6,"[6, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[6],6
3499,,11,"[11, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11, 0]",11
3500,,0,"[11, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11, 0]",0
3497,,6,"[6, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 11]",6
...,...,...,...,...,...,...,...
31,,0,"[6, 6, 6, 6, 11, 11, 11, 0, 11, 11, 11, 11, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 11, 11, 11, 0, 11, 11, 11, 11, 11...",0
22,,11,"[11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, 1...",11
17,,10,"[11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, 1...",10
10,,11,"[11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 1...",11


In [None]:
dt = df.sample(frac = 1).reset_index(drop=True) #Mezclamos el df
# El split es 80/20, sin dataset de testeo
train = int(len(dt)*0.8)
tdf = dt[:train]
vdf = dt[train:]
#Agregamos finalmente el idx a cada elemento
tdf['idx'] = tdf.reset_index(drop=True).index.values
vdf['idx'] = vdf.reset_index(drop=True).index.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tdf['idx'] = tdf.reset_index(drop=True).index.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vdf['idx'] = vdf.reset_index(drop=True).index.values


In [None]:
#Se pasa todo a un datasetdict porque así trabajan los modelos de HuggingFace
tds = Dataset.from_pandas(tdf)
vds = Dataset.from_pandas(vdf)

ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids', 'X', 'y'],
        num_rows: 907
    })
    validation: Dataset({
        features: ['idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids', 'X', 'y'],
        num_rows: 227
    })
})

In [None]:
#Especificamos los argumentos de entrenamiento
training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=24,
                                  num_train_epochs=10,
                                  learning_rate = 3e-05,
                                  evaluation_strategy="epoch",
                                  save_steps = 1500,
                                  )

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    ord = np.argsort(logits, axis=-1) #Ordenamos los índices de los logits de menor a mayor
    sec_pred = ord[:, -2]
    third_pred = ord[:, -3]
    accuracy = metric.compute(predictions=predictions, references=labels)['accuracy']
    accuracy += metric.compute(predictions=sec_pred, references=labels)['accuracy']
    accuracy += metric.compute(predictions=third_pred, references=labels)['accuracy']
    acc = {'accuracy':accuracy}
    return acc

In [None]:
#Testeamos la función de precisión definida
eval = np.array([[-1, 2, 4],[2, 3, -2],[-5, 9, -10]]), [0, 1, 2]
print(compute_metrics(eval))

{'accuracy': 1.0}


In [None]:
#Creamos un CustomTrainer que deriva del Trainer, ya que el costo del Trainer base lanza errores
class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                token_type_ids=inputs['token_type_ids']
            )
            labels = torch.zeros(outputs['logits'].shape)
            labels[0, inputs['labels']] = 1
            loss = nn.BCEWithLogitsLoss()(outputs['logits'].to(device),
                                             labels.to(device))
            return (loss, outputs) if return_outputs else loss

In [None]:
#Inicializamos el CustomTrainer
trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    compute_metrics=compute_metrics
)

In [None]:
#Entrenamos el modelo
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2118,0.177752,0.660793
2,0.2148,0.169941,0.665198
3,0.2122,0.161153,0.656388
4,0.2067,0.152575,0.709251
5,0.1964,0.184217,0.709251
6,0.2059,0.182496,0.709251
7,0.201,0.190468,0.709251
8,0.203,0.190053,0.709251
9,0.2065,0.189073,0.709251
10,0.1994,0.190766,0.709251


TrainOutput(global_step=9070, training_loss=0.20477372309338823, metrics={'train_runtime': 1021.3849, 'train_samples_per_second': 8.88, 'train_steps_per_second': 8.88, 'total_flos': 1599181726129200.0, 'train_loss': 0.20477372309338823, 'epoch': 10.0})

# Etapa de evaluación de modelo

In [None]:
y_train = trainer.predict(ds["train"])
print(y_train.predictions.shape, y_train.label_ids.shape)

(907, 13) (907,)


In [None]:
y_test = trainer.predict(ds["validation"])
print(y_test.predictions.shape, y_test.label_ids.shape)

(227, 13) (227,)


In [None]:
import numpy as np

train_pred = np.argmax(y_train.predictions, axis=-1)
test_pred = np.argmax(y_test.predictions, axis=-1)

In [None]:
import evaluate

metric = evaluate.load('accuracy')
metric.compute(predictions=train_pred, references=y_train.label_ids), metric.compute(predictions=test_pred, references=y_test.label_ids)

({'accuracy': 0.4211686879823594}, {'accuracy': 0.41409691629955947})