# Haciendo la sintonización fina de BERT para el dataset

El notebook esta diseñado para correr en colab, por lo que se incluyeron los comandos de instalación para librerías no presentes por defecto

In [1]:
!pip install transformers[torch]
!pip install datasets evaluate transformers[sentencepiece]

Collecting transformers[torch]
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import datasets
from transformers import (AutoTokenizer, DataCollatorWithPadding, AutoConfig, AutoModel,
                          AutoModelForSequenceClassification, BertConfig, Trainer, BertForSequenceClassification)
from transformers import TrainingArguments
import torch.nn as nn
import torch
import numpy as np

Los tokens especiales son:

|PAD| = 21

Los normales son:

{"other":0, "1200":1, "1300":2, "1400":3, "1600":4, "1700":5, "1800":6, "1900":7, "2100":8, "2400":9,
            "2405":10, "2600":11, "2800":12, "2805":13, "AR-10":14, "AR-20":15, "AR-30":16, "AR-40":17,
            "AR-50":18, "AR-70":19, "AR-90":20}

In [8]:
checkpoint = "bert-base-uncased"
device = torch.device('cuda:0')

In [4]:
config = BertConfig(vocab_size=22, max_position_embeddings=1024) #Limitamos el vocab_size = 22 (tipos de productos)
                                                                 #y una secuencia máxima de 1024 productos
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.32.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 22
}

In [9]:
model = AutoModelForSequenceClassification.from_config(config)
model.classifier = nn.Linear(768, 21, bias=True) #Se modifica la última capa para que tenga 21 clases
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(22, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [12]:
#Prueba para ver como es que el modelo saca las predicciones
input = torch.tensor([[0, 0, 1, 2, 2, 3, 21]]).to(device)
softmax = torch.nn.Softmax(dim=1)
softmax(model(input)[0]), model(input)[0][0]

(tensor([[0.0560, 0.0411, 0.0318, 0.0515, 0.0577, 0.0450, 0.0459, 0.0536, 0.0432,
          0.0490, 0.0427, 0.0521, 0.0673, 0.0526, 0.0513, 0.0557, 0.0348, 0.0452,
          0.0445, 0.0357, 0.0431]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor([ 0.3599,  0.1381,  0.0541,  0.2688,  0.0283, -0.1138, -0.1881,  0.1004,
          0.3485,  0.1625,  0.2715,  0.0830,  0.4782,  0.5236,  0.2931,  0.6373,
         -0.0347,  0.1574, -0.1091,  0.0129,  0.1783], device='cuda:0',
        grad_fn=<SelectBackward0>))

In [13]:
#Cargamos el dataset
df = pd.read_hdf('data.h5', key='df')
df['X_tuple'] = df.X.apply(tuple)
df['y_tuple'] = df.y.apply(tuple)
df = df.drop_duplicates(subset=['X_tuple', 'y_tuple']) #Eliminamos los datos replicados X->y
df = df.drop(labels=['FE_EMIT', 'FE_ENTREGA', 'PEDIDO', 'X_tuple', 'y_tuple'], axis=1) #Eliminamos columnas que no son útiles
df['y'] = df['y'].apply(lambda x: x[0])
df

Unnamed: 0,X,y
3503,[0],0
3502,[6],6
3499,"[12, 0]",12
3500,"[12, 0]",0
3497,"[6, 12]",6
...,...,...
31,"[6, 6, 6, 6, 12, 12, 12, 0, 12, 12, 12, 12, 12...",0
22,"[12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 1...",12
17,"[12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 1...",11
10,"[12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 1...",12


In [14]:
#Definimos funciones que son necesarias para el input de BERT, consultese: https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt para más información
from typing import Union
#Función para padear las secuencias
def pad(max_len:int, sequence:Union[list, np.array], pad_token:int=21):
  padding = np.full(max_len-len(sequence), pad_token)
  padded_seq = np.concatenate((sequence, padding))
  return padded_seq.tolist()

#Función para crear las máscaras de atención
def attention_mask(pad_sequence:Union[list, np.array], pad_token:int=21):
  seq = np.asarray(pad_sequence, dtype=int)
  attention_mask = np.asarray(seq != pad_token, dtype=int)
  return attention_mask.tolist()

#Función para crear los arreglos que especifican el tipo de token
def token_types(pad_sequence:Union[list, np.array]):
  return np.zeros(len(pad_sequence), dtype=int).tolist()

In [15]:
#Preparamos el df para BERT con las columnas que requiere('idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids')
max_len = df.X.apply(lambda x: len(x))
max_len = max(max_len)
dt = df.copy(True)

dt.insert(0, 'idx', np.full(len(df), np.nan))
dt.insert(1, 'label', dt.y)
dt.insert(2, 'input_ids', np.full(len(df), np.nan))
dt.insert(3, 'attention_mask', np.full(len(df), np.nan))
dt.insert(4, 'token_type_ids', np.full(len(df), np.nan))
data = dt.to_numpy()

for i in range(len(data)):
  data[i][2] = pad(max_len, data[i][5], pad_token=21)
  data[i][3] = attention_mask(data[i][2], pad_token=21)
  data[i][4] = token_types(data[i][2])
dt['input_ids'] = data[:, 2]
dt['attention_mask'] = data[:, 3]
dt['token_type_ids'] = data[:, 4]
df = dt.copy(True)
df

Unnamed: 0,idx,label,input_ids,attention_mask,token_type_ids,X,y
3503,,0,"[0, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[0],0
3502,,6,"[6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[6],6
3499,,12,"[12, 0, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 0]",12
3500,,0,"[12, 0, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 0]",0
3497,,6,"[6, 12, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 12]",6
...,...,...,...,...,...,...,...
31,,0,"[6, 6, 6, 6, 12, 12, 12, 0, 12, 12, 12, 12, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 12, 12, 12, 0, 12, 12, 12, 12, 12...",0
22,,12,"[12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 1...",12
17,,11,"[12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 1...",11
10,,12,"[12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 1...",12


In [16]:
dt = df.sample(frac = 1).reset_index(drop=True) #Mezclamos el df
# El split es 80/20, sin dataset de testeo
train = int(len(dt)*0.8)
tdf = dt[:train]
vdf = dt[train:]
#Agregamos finalmente el idx a cada elemento
tdf['idx'] = tdf.reset_index(drop=True).index.values
vdf['idx'] = vdf.reset_index(drop=True).index.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tdf['idx'] = tdf.reset_index(drop=True).index.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vdf['idx'] = vdf.reset_index(drop=True).index.values


In [17]:
#Se pasa todo a un datasetdict porque así trabajan los modelos de HuggingFace
tds = Dataset.from_pandas(tdf)
vds = Dataset.from_pandas(vdf)

ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids', 'X', 'y'],
        num_rows: 911
    })
    validation: Dataset({
        features: ['idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids', 'X', 'y'],
        num_rows: 228
    })
})

In [18]:
#Especificamos los argumentos de entrenamiento
training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1)

In [22]:
#Creamos un CustomTrainer que deriva del Trainer, ya que el costo del Trainer base lanza errores
class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                token_type_ids=inputs['token_type_ids']
            )
            labels = torch.zeros(outputs['logits'].shape)
            labels[0, inputs['labels']] = 1
            loss = nn.BCEWithLogitsLoss()(outputs['logits'].to(device),
                                             labels.to(device))
            return (loss, outputs) if return_outputs else loss

In [23]:
#Inicializamos el CustomTrainer
trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['validation']
)

In [24]:
#Entrenamos el modelo
trainer.train()

Step,Training Loss
500,0.1491
1000,0.1308
1500,0.1239


KeyboardInterrupt: ignored

# A partir de acá no lo ejecuté todavía

In [None]:
predictions = trainer.predict(ds["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)

In [None]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()