# Haciendo la sintonización fina de BERT para el dataset

El notebook esta diseñado para correr en colab, por lo que se incluyeron los comandos de instalación para librerías no presentes por defecto

In [None]:
!pip install transformers[torch]
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import datasets
from transformers import (AutoTokenizer, DataCollatorWithPadding, AutoConfig, AutoModel,
                          AutoModelForSequenceClassification, BertConfig, Trainer, BertForSequenceClassification,)
from transformers import TrainingArguments
import evaluate
import torch.nn as nn
import torch
import numpy as np

In [None]:
checkpoint = "bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
VOCAB = 21
#Limitamos el vocab_size = 22 (tipos de productos) y una secuencia máxima de 1024 productos, además de definir 13 etiquetas de salida
config = BertConfig(vocab_size=VOCAB, max_position_embeddings=1024, pad_token_id=13, num_labels=VOCAB-1)
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LABEL_16": 16,
    "LABEL_17": 17,
    "LABEL_18": 18,
    "LABEL_19": 19,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
  

In [None]:
model = AutoModelForSequenceClassification.from_config(config)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21, 768, padding_idx=13)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [None]:
#Prueba para ver como es que el modelo saca las predicciones
input = torch.tensor([[0, 0, 1, 2, 2, 3, 13]]).to(device)
softmax = torch.nn.Softmax(dim=1)
softmax(model(input)[0]), model(input)[0][0]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


(tensor([[0.0468, 0.0457, 0.0433, 0.0658, 0.0488, 0.0554, 0.0662, 0.0546, 0.0553,
          0.0570, 0.0417, 0.0516, 0.0521, 0.0529, 0.0494, 0.0392, 0.0316, 0.0425,
          0.0474, 0.0528]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor([-0.0858, -0.3687, -0.0094,  0.1434,  0.2813, -0.0050,  0.0144, -0.2040,
          0.1097,  0.4017,  0.0663,  0.0339, -0.0941,  0.0220, -0.2616, -0.0973,
         -0.3470,  0.0187, -0.3394,  0.0864], device='cuda:0',
        grad_fn=<SelectBackward0>))

In [None]:
#Cargamos el dataset
df = pd.read_hdf('data_piezas.h5', key='df')
df['X_tuple'] = df.X.apply(tuple)
df['y_tuple'] = df.y.apply(tuple)

df = df.drop(labels=['FE_EMIT', 'FECHA', 'CG_ORDF','DES','SIMUL', 'X_tuple', 'y_tuple'], axis=1) #Eliminamos columnas que no son útiles
df['y'] = df['y'].apply(lambda x: x[0])
df

Unnamed: 0,X,y
6019,"[5, 5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, ...",5
6009,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",5
6012,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",7
6016,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",5
6014,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",1
...,...,...
5,"[1, 0, 2, 3, 18, 18, 0, 18, 1]",18
6,"[1, 0, 2, 3, 18, 18, 0, 18, 1]",18
7,"[1, 0, 2, 3, 18, 18, 0, 18, 1]",0
4,"[1, 0, 2, 3, 18, 18, 0, 18, 1]",18


In [None]:
#Vemos la distribución de clases del dataframe
df.y.value_counts()

2     994
3     721
5     597
12    471
8     447
0     399
10    306
4     244
1     229
13    220
17     93
7      90
18     42
11     30
15     27
19     27
9       6
Name: y, dtype: int64

In [None]:
#Definimos funciones que son necesarias para el input de BERT, consultese: https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt para más información
from typing import Union
#Función para padear las secuencias
def pad(max_len:int, sequence:Union[list, np.array], pad_token:int=20):
  padding = np.full(max_len-len(sequence), pad_token)
  padded_seq = np.concatenate((sequence, padding))
  return padded_seq.tolist()

#Función para crear las máscaras de atención
def attention_mask(pad_sequence:Union[list, np.array], pad_token:int=20):
  seq = np.asarray(pad_sequence, dtype=int)
  attention_mask = np.asarray(seq != pad_token, dtype=int)
  return attention_mask.tolist()

#Función para crear los arreglos que especifican el tipo de token
def token_types(pad_sequence:Union[list, np.array]):
  return np.zeros(len(pad_sequence), dtype=int).tolist()

In [None]:
#Preparamos el df para BERT con las columnas que requiere('idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids')
max_len = df.X.apply(lambda x: len(x))
max_len = max(max_len)
dt = df.copy(True)
PAD = 20

dt.insert(0, 'idx', np.full(len(df), np.nan))
dt.insert(1, 'label', dt.y)
dt.insert(2, 'input_ids', np.full(len(df), np.nan))
dt.insert(3, 'attention_mask', np.full(len(df), np.nan))
dt.insert(4, 'token_type_ids', np.full(len(df), np.nan))
data = dt.to_numpy()

for i in range(len(data)):
  data[i][2] = pad(max_len, data[i][5], pad_token=PAD)
  data[i][3] = attention_mask(data[i][2], pad_token=PAD)
  data[i][4] = token_types(data[i][2])
dt['input_ids'] = data[:, 2]
dt['attention_mask'] = data[:, 3]
dt['token_type_ids'] = data[:, 4]
df = dt.copy(True)
df

Unnamed: 0,idx,label,input_ids,attention_mask,token_type_ids,X,y
6019,,5,"[5, 5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, ...",5
6009,,5,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",5
6012,,7,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",7
6016,,5,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",5
6014,,1,"[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 7, 5, 1, 1, 7, 2, 0, 3, 8, 8, 3, 5, 8, 0, ...",1
...,...,...,...,...,...,...,...
5,,18,"[1, 0, 2, 3, 18, 18, 0, 18, 1, 20, 20, 20, 20,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 2, 3, 18, 18, 0, 18, 1]",18
6,,18,"[1, 0, 2, 3, 18, 18, 0, 18, 1, 20, 20, 20, 20,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 2, 3, 18, 18, 0, 18, 1]",18
7,,0,"[1, 0, 2, 3, 18, 18, 0, 18, 1, 20, 20, 20, 20,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 2, 3, 18, 18, 0, 18, 1]",0
4,,18,"[1, 0, 2, 3, 18, 18, 0, 18, 1, 20, 20, 20, 20,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 2, 3, 18, 18, 0, 18, 1]",18


In [None]:
dt = df.sample(frac = 1).reset_index(drop=True) #Mezclamos el df
# El split es 80/20, sin dataset de testeo
train = int(len(dt)*0.8)
tdf = dt[:train]
vdf = dt[train:]
#Agregamos finalmente el idx a cada elemento
tdf['idx'] = tdf.reset_index(drop=True).index.values
vdf['idx'] = vdf.reset_index(drop=True).index.values
tdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tdf['idx'] = tdf.reset_index(drop=True).index.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vdf['idx'] = vdf.reset_index(drop=True).index.values


Unnamed: 0,idx,label,input_ids,attention_mask,token_type_ids,X,y
0,0,5,"[12, 5, 8, 10, 10, 17, 3, 10, 1, 0, 0, 8, 8, 8...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 5, 8, 10, 10, 17, 3, 10, 1, 0, 0, 8, 8, 8...",5
1,1,1,"[12, 10, 10, 1, 1, 1, 12, 7, 1, 0, 3, 2, 3, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 10, 10, 1, 1, 1, 12, 7, 1, 0, 3, 2, 3, 2,...",1
2,2,12,"[12, 5, 8, 13, 0, 5, 8, 8, 2, 2, 2, 2, 2, 2, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 5, 8, 13, 0, 5, 8, 8, 2, 2, 2, 2, 2, 2, 1...",12
3,3,7,"[0, 1, 0, 5, 3, 1, 2, 13, 2, 2, 3, 3, 3, 0, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 5, 3, 1, 2, 13, 2, 2, 3, 3, 3, 0, 12...",7
4,4,8,"[10, 5, 8, 13, 2, 8, 2, 0, 7, 2, 10, 4, 10, 5,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10, 5, 8, 13, 2, 8, 2, 0, 7, 2, 10, 4, 10, 5,...",8
...,...,...,...,...,...,...,...
3949,3949,4,"[1, 2, 2, 10, 12, 2, 11, 10, 12, 4, 5, 1, 11, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 2, 2, 10, 12, 2, 11, 10, 12, 4, 5, 1, 11, ...",4
3950,3950,3,"[12, 5, 2, 2, 2, 12, 18, 11, 3, 3, 3, 0, 2, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 5, 2, 2, 2, 12, 18, 11, 3, 3, 3, 0, 2, 2,...",3
3951,3951,0,"[0, 12, 3, 7, 0, 2, 3, 15, 2, 5, 8, 5, 4, 5, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 12, 3, 7, 0, 2, 3, 15, 2, 5, 8, 5, 4, 5, 5...",0
3952,3952,17,"[12, 3, 3, 10, 10, 3, 10, 17, 3, 12, 3, 12, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 3, 3, 10, 10, 3, 10, 17, 3, 12, 3, 12, 12...",17


In [None]:
#Se pasa todo a un datasetdict porque así trabajan los modelos de HuggingFace
tds = Dataset.from_pandas(tdf)
vds = Dataset.from_pandas(vdf)

ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids', 'X', 'y'],
        num_rows: 3954
    })
    validation: Dataset({
        features: ['idx', 'label', 'input_ids', 'attention_mask', 'token_type_ids', 'X', 'y'],
        num_rows: 989
    })
})

In [None]:
#Especificamos los argumentos de entrenamiento
training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=24,
                                  num_train_epochs=17,
                                  learning_rate = 3e-05,
                                  evaluation_strategy="epoch",
                                  save_steps = 1500,
                                  )

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    ord = np.argsort(logits, axis=-1) #Ordenamos los índices de los logits de menor a mayor
    sec_pred = ord[:, -2]
    third_pred = ord[:, -3]
    accuracy = metric.compute(predictions=predictions, references=labels)['accuracy']
    accuracy += metric.compute(predictions=sec_pred, references=labels)['accuracy']
    accuracy += metric.compute(predictions=third_pred, references=labels)['accuracy']
    acc = {'accuracy':accuracy}
    return acc

In [None]:
#Testeamos la función de precisión definida
eval = np.array([[-1, 2, 4],[2, 3, -2],[-5, 9, -10]]), [0, 1, 2]
print(compute_metrics(eval))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 1.0}


In [None]:
#Creamos un CustomTrainer que deriva del Trainer, ya que el costo del Trainer base lanza errores
class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                token_type_ids=inputs['token_type_ids']
            )
            labels = torch.zeros(outputs['logits'].shape)
            labels[0, inputs['labels']] = 1
            loss = nn.BCEWithLogitsLoss()(outputs['logits'].to(device),
                                             labels.to(device))
            return (loss, outputs) if return_outputs else loss

In [None]:
#Inicializamos el CustomTrainer
trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    compute_metrics=compute_metrics
)

In [None]:
#Entrenamos el modelo
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1722,0.106976,0.471183
2,0.1675,0.105906,0.471183
3,0.1698,0.108019,0.471183
4,0.1711,0.106889,0.471183
5,0.1676,0.108353,0.444894


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1722,0.106976,0.471183
2,0.1675,0.105906,0.471183
3,0.1698,0.108019,0.471183
4,0.1711,0.106889,0.471183
5,0.1676,0.108353,0.444894


# Etapa de evaluación de modelo

In [None]:
y_train = trainer.predict(ds["train"])
print(y_train.predictions.shape, y_train.label_ids.shape)

(3954, 20) (3954,)


In [None]:
y_test = trainer.predict(ds["validation"])
print(y_test.predictions.shape, y_test.label_ids.shape)

(989, 20) (989,)


In [None]:
# Get the top 3 predicted class indices for each instance in the training set
train_top3_preds = np.argsort(y_train.predictions, axis=-1)[:, -3:]

# Get the top 3 predicted class indices for each instance in the test set
test_top3_preds = np.argsort(y_test.predictions, axis=-1)[:, -3:]

In [None]:
import evaluate
# For the training set
train_accuracy = np.mean(np.any(train_top3_preds == y_train.label_ids.reshape(-1, 1), axis=-1))

# For the test set
test_accuracy = np.mean(np.any(test_top3_preds == y_test.label_ids.reshape(-1, 1), axis=-1))

# Print or use train_accuracy and test_accuracy as needed
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Training Accuracy: 0.46307536671724836
Test Accuracy: 0.4863498483316481


In [None]:
import torch
# Save your model to Google Drive
torch.save(model.state_dict(), '/content/drive/MyDrive/BERT PIEZAS/BERT_PIEZAS.pth')

In [None]:
from google.colab import files

# Compress the model and tokenizer files into a zip archive (optional)
!zip -r model.zip /content/drive/MyDrive/BERT PIEZAS/

# Download the zip file to your local machine (optional)
files.download('model.zip')


zip error: Nothing to do! (try: zip -r model.zip . -i /content/drive/MyDrive/BERT PIEZAS/)


FileNotFoundError: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
