In [None]:
!pip install transformers

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy import datasets
from torchtext.legacy.data import Field, LabelField, BucketIterator
from torch.utils.data import TensorDataset

import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import confusion_matrix, classification_report

import numpy as np
import random

# Use one fixed seed for every random number generator the notebook touches:
# Python's `random`, NumPy, and PyTorch (default generator and CUDA devices).
RANDOM_SEED = 42
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(RANDOM_SEED)



In [None]:
# Field definitions passed to the IMDB loader below.
TEXT = Field(tokenize = 'spacy', lower = True) # Text field: tokenized with spaCy and lowercased
LABEL = LabelField(dtype = torch.int64) # Label field stored with a 64-bit integer dtype

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) # Fetch the IMDB dataset as train/test splits

# Show the attributes (tokens and label) of the first training example.
print(vars(train_data.examples[0]))

{'text': ['for', 'those', 'who', 'like', 'their', 'murder', 'mysteries', 'busy', ',', 'this', 'is', 'definitely', 'the', 'one', 'to', 'see', ',', 'as', 'it', 'is', 'chock', 'full', 'of', 'interesting', 'and', 'suspicious', 'characters', ',', 'most', 'of', 'them', 'wealthy', 'long', 'island', 'socialite', 'types', '.', 'as', 'the', 'star', 'detective', ',', 'william', 'powell', 'is', 'alternately', 'starchy', 'and', 'inspired', ',', 'behaving', 'at', 'times', 'as', 'if', 'he', 'and', 'his', 'suit', 'went', 'to', 'the', 'cleaners', 'and', 'got', 'pressed', 'together', '.', 'mary', 'astor', 'is', 'very', 'lovely', 'here', '.', '<', 'br', '/><br', '/>powell', 'had', 'made', 'a', 'career', 'out', 'of', 'playing', 'the', 'lead', 'character', ',', 'philo', 'vance', ',', 'in', 'a', 'series', 'of', 'movies', 'made', 'at', 'a', 'couple', 'of', 'studios', 'over', 'several', 'years', '.', 'in', '-', 'between', 'these', 'films', 'he', 'developed', 'into', 'a', 'somewhat', 'offbeat', 'romantic', 'le

In [None]:
# Convert the tokenized examples into lists of sentence strings and labels
def token_to_sentence(data):
  """Rebuild sentence strings and binary labels from tokenized examples.

  Args:
    data: Iterable of examples. Each example must expose ``text`` (an
      iterable of string tokens) and ``label``.

  Returns:
    A ``(list_sentence, list_labels)`` tuple of equal-length lists.
    Each sentence is every token followed by one space (so a non-empty
    sentence keeps a trailing space, exactly as before). Each label is 1
    when the example's label equals ``"pos"`` and 0 otherwise.
  """
  list_sentence = []
  list_labels = []
  for example in data:
    # A single join replaces the repeated `+=` on a variable that used to
    # shadow the builtin `str`; the resulting text is identical.
    list_sentence.append("".join(token + " " for token in example.text))
    list_labels.append(1 if example.label == "pos" else 0)
  return list_sentence, list_labels

# Rebuild sentence strings and 0/1 labels for the training and test splits.
train_sentences, train_labels = token_to_sentence(train_data)
test_sentences, test_labels = token_to_sentence(test_data)

In [None]:
# Select the pre-trained checkpoint to use.
# Reference for the DistilBERT alternative (API): https://huggingface.co/distilbert-base-uncased, paper: https://arxiv.org/abs/1910.01108
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased' # Options noted by the author: 'bert-base-uncased' (standard BERT) or 'distilbert-base-uncased'

# Load the tokenizer used to turn each sentence into token IDs.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True) # Alternatives: BertTokenizer, DistilBertTokenizer

# Length passed as `max_length` when encoding the train and test sentences.
MAX_LENGTH = 200

def BERT_Tokenize(sentences, max_length):
  """Encode each sentence with the notebook-level `tokenizer`.

  Args:
    sentences: Iterable of text strings to encode.
    max_length: Length every encoding is truncated or padded to. This
      argument is now honoured; previously the global MAX_LENGTH was used
      regardless of what the caller passed.

  Returns:
    ``(input_ids, attention_masks)``: two lists with one entry per
    sentence, holding the tokenizer's 'input_ids' and 'attention_mask'
    outputs (requested as PyTorch tensors via ``return_tensors='pt'``).
  """
  input_ids = []
  attention_masks = []
  for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                       # Instance to encode
                        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
                        max_length = max_length,    # Truncate to the requested length
                        truncation=True,
                        padding = 'max_length',     # Pad up to max_length
                        return_attention_mask = True,   # Build the attention masks
                        return_tensors = 'pt')

    # Collect the encoded instance
    input_ids.append(encoded_dict['input_ids'])

    # Collect its attention mask
    attention_masks.append(encoded_dict['attention_mask'])
  return input_ids, attention_masks

def _encode_split(sentences, labels):
  """Tokenize one split and return (input ids, attention masks, labels) as tensors."""
  ids_per_sentence, masks_per_sentence = BERT_Tokenize(sentences, MAX_LENGTH)
  # Concatenate the per-sentence encodings along dim 0 and wrap the labels.
  return (torch.cat(ids_per_sentence, dim=0),
          torch.cat(masks_per_sentence, dim=0),
          torch.tensor(labels))

# Turn the lists into tensors (training set)
input_ids_train, attention_masks_train, train_labels = _encode_split(train_sentences, train_labels)

# Turn the lists into tensors (test set)
input_ids_test, attention_masks_test, test_labels = _encode_split(test_sentences, test_labels)

# Show sentence 0 as text and as its list of token IDs.
print('Original: ', train_sentences[0])
print('Token IDs:', input_ids_train[0])

Original:  for those who like their murder mysteries busy , this is definitely the one to see , as it is chock full of interesting and suspicious characters , most of them wealthy long island socialite types . as the star detective , william powell is alternately starchy and inspired , behaving at times as if he and his suit went to the cleaners and got pressed together . mary astor is very lovely here . < br /><br />powell had made a career out of playing the lead character , philo vance , in a series of movies made at a couple of studios over several years . in - between these films he developed into a somewhat offbeat romantic lead , at times even essaying gentleman gangster roles . already middle - aged , he was stuck in somewhat of a career rut by the time this one came along . as with so many early talkie stars , it seemed that his time had come and gone , that he was fine for early depression prohibition - era films , but that with changing times he was perhaps too mature and da

In [None]:
# Bundle each split's input IDs, attention masks and labels into its own TensorDataset
train_dataset = TensorDataset(input_ids_train, attention_masks_train, train_labels)
test_dataset = TensorDataset(input_ids_test, attention_masks_test, test_labels)

BATCH_SIZE = 64

# Create the DataLoaders for training and testing
train_dataloader = DataLoader(
            train_dataset,  # Training instances
            sampler = RandomSampler(train_dataset), # Draw the training samples in random order
            batch_size = BATCH_SIZE # Batch size used for training
        )

test_dataloader = DataLoader(
            test_dataset, # Test instances
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = BATCH_SIZE # Evaluate with this batch size.
        )

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head
model = BertForSequenceClassification.from_pretrained( # Alternatives: DistilBertForSequenceClassification, BertForSequenceClassification
    PRE_TRAINED_MODEL_NAME, # Checkpoint name chosen in PRE_TRAINED_MODEL_NAME
    num_labels = 2, # Number of classes
    output_attentions = False, # Do not return attention values
    output_hidden_states = False, # Do not return hidden states
)

# Pick the device the model will be trained on (CUDA when available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to that device
model.to(device)

# Number of training epochs
epochs = 4

# Learning rate given to the optimizer
LEARNING_RATE = 5e-5

# Optimization algorithm
# NOTE(review): this AdamW is the one imported from `transformers`; later releases
# reportedly deprecate it in favor of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Total number of training steps (epochs * number of batches)
total_steps = len(train_dataloader) * epochs

# Create a schedule that updates the learning rate (optional); no warmup steps are used
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Count how many labels were classified correctly
def sum_correct(preds, labels):
  """Compare arg-max predictions with the true labels.

  Args:
    preds: Array-like of per-class scores; the arg-max is taken along axis 1.
    labels: NumPy array of true class indices.

  Returns:
    A tuple ``(n_correct, pred_flat, labels_flat)``: the number of positions
    where prediction and label agree, the flattened predicted classes and
    the flattened labels.
  """
  predicted_classes = np.argmax(preds, axis=1).flatten()
  true_classes = labels.flatten()
  matches = predicted_classes == true_classes
  return np.sum(matches), predicted_classes, true_classes

# Run one training epoch for the model
def train(model, iterator, optimizer=optimizer, clip=1):
    """Train `model` for one pass over `iterator` and return the mean batch loss.

    Args:
        model: Model called with `input_ids`, `attention_mask` and `labels`;
            the first two elements of its output are used as (loss, logits).
        iterator: Iterable of batches, each indexed as [0] input ids,
            [1] attention mask, [2] labels.
        optimizer: Optimizer stepped after every batch. Defaults to the
            notebook-level `optimizer` bound when this function is defined.
        clip: Maximum gradient norm passed to `clip_grad_norm_`.

    Returns:
        The sum of the batch losses divided by the number of batches that
        were actually processed from `iterator`.

    Notes:
        Uses the notebook-level `device` and `scheduler`; the scheduler is
        stepped once per batch. Prints the training accuracy of the epoch.
    """
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_count = 0
    num_batches = 0

    for batch in iterator:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Reset the gradients accumulated by the previous step
        model.zero_grad()

        outputs = model(input_ids = b_input_ids,
                        attention_mask = b_input_mask,
                        labels = b_labels)

        loss, logits = outputs[:2]

        epoch_loss += loss.item()
        num_batches += 1

        # Move logits and labels to the CPU as NumPy arrays for the metric
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the number of correct predictions
        correct, _, _ = sum_correct(logits, label_ids)
        total_correct += correct
        total_count += len(b_labels)

        # Compute the gradients
        loss.backward()
        # Clip the gradient norm to help prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Update the model weights
        optimizer.step()
        # Update the learning rate
        scheduler.step()

    print(f'Train accuracy: {(total_correct/total_count):.6f}')
    # Average over the batches of `iterator` itself; the previous code divided
    # by len(train_dataloader), which is wrong for any other iterator.
    mean_loss = epoch_loss / num_batches
    return mean_loss # Mean loss

In [None]:
# Training loop: run one epoch per iteration and report its mean loss
for epoch in range(epochs):
  result = train(model=model, iterator=train_dataloader)
  print(f'Epoch {epoch + 1} / {epochs}, Mean loss: {result:.6f}')

Train accuracy: 0.873440
Epoch 1 / 4, Mean loss: 0.296311
Train accuracy: 0.943520
Epoch 2 / 4, Mean loss: 0.152798
Train accuracy: 0.978520
Epoch 3 / 4, Mean loss: 0.069406
Train accuracy: 0.991360
Epoch 4 / 4, Mean loss: 0.034112


In [None]:
# Evaluate the model on the test DataLoader, accumulating accuracy counts and
# the per-batch predictions / labels.
total_correct = 0
total_count = 0
model_prediction = []  # one list of predicted classes per batch
ground_truth = []  # one list of true labels per batch
model.eval()

for batch in test_dataloader:
  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)

  # No gradients are needed for evaluation
  with torch.no_grad():        
    outputs = model(      input_ids = b_input_ids, 
                          attention_mask = b_input_mask,
                          labels = b_labels)
    loss, logits = outputs[:2]

  # Move logits and labels to the CPU as NumPy arrays
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Accumulate the number of correct predictions
  correct, predictions, labels = sum_correct(logits, label_ids)
  total_correct += correct
  total_count += len(b_labels)
  # Appended per batch, so both lists are nested (one sub-list per batch)
  model_prediction.append(predictions.tolist())
  ground_truth.append(labels.tolist())

print(f'Test accuracy: {(total_correct/total_count):.6f}')

Test accuracy: 0.909400


In [None]:
# Classification report
def _flatten_batches(batches):
  """Return a flat list from a list of per-batch lists.

  Entries that are already scalars are kept as they are, so calling this on
  an already-flattened list returns an equal list. That makes this cell safe
  to re-run: the previous comprehension raised TypeError on a second run
  because it tried to iterate over ints.
  """
  flat = []
  for entry in batches:
    if isinstance(entry, list):
      flat.extend(entry)
    else:
      flat.append(entry)
  return flat

# Same names as before, now holding one prediction / label per test example.
model_prediction = _flatten_batches(model_prediction)
ground_truth = _flatten_batches(ground_truth)
print(classification_report(ground_truth, model_prediction, labels=[0 ,1], digits=4))

              precision    recall  f1-score   support

           0     0.9242    0.8920    0.9078     12500
           1     0.8956    0.9268    0.9109     12500

    accuracy                         0.9094     25000
   macro avg     0.9099    0.9094    0.9094     25000
weighted avg     0.9099    0.9094    0.9094     25000

