In [255]:
! pip install portalocker>=2.0
! pip install torchtext

zsh:1: 2.0 not found


In [256]:
import torch
import torchtext
torchtext.disable_torchtext_deprecation_warning()
print(torch.__version__, torchtext.__version__)
from torchtext.datasets import DBpedia

2.3.0 0.18.0


In [257]:
from dbpedia import DBpedia

train_iter = iter(DBpedia(split="train"))
print(next(train_iter))

(1, 'E. D. Abbott Ltd', ' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.')


In [258]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizador = get_tokenizer("basic_english")
train_iter = DBpedia(split="train")
def yield_tokens(data_iter):
    for _, text, description in data_iter:
        yield tokenizador(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [259]:
vocab(tokenizador("hello how are you? i am a platzi student"))

[2673, 629, 480, 97, 0, 40, 812, 17, 0, 2421]

In [260]:
texto_pipeline = lambda x: vocab(tokenizador(x))
label_pipeline = lambda x: int(x) - 1

In [261]:
texto_pipeline("hello i am joaquin")

[2673, 40, 812, 7907]

In [262]:
label_pipeline("10")

9

In [263]:
#Configuración del dispositivo: Determina si se usará CPU o GPU (CUDA) según la disponibilidad.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
#Definición de collateBatch: Desarrollar una función collateBatch para procesar cada lote.
def collateBatch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(texto_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0) + offsets[-1])
    
    label_list = torch.tensor(label_list, dtype=torch.int64).to(device)
    #La función cumsum en PyTorch proporciona la suma acumulativa de los elementos a lo largo de una dimensión especificada. En este contexto, se utiliza para determinar los puntos de inicio de cada nuevo texto dentro del tensor de datos concatenados. Este método es vital para gestionar adecuadamente el flujo de datos en estructuras de texto:
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0).to(device)
    text_list = torch.cat(text_list).to(device)
    return label_list, text_list, offsets

In [264]:
from torch.utils.data import DataLoader
trainIter = DBpedia(split='train') #Definición del dataset: Se utiliza un dataset de entrenamiento, en este caso un iterador llamado trainIter del dataset DBpedia, especificando su uso para el set de entrenamiento:
data_loader = DataLoader(dataset=trainIter, batch_size=8, shuffle=False, collate_fn=collateBatch) #Creación del Data Loader: Aquí se establece el tamaño del lote, se elige si se quiere aleatorizar la secuencia de los datos (shuffling) y se selecciona una función collate:


In [265]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x38cd44a60>

In [266]:
from torch import nn
import torch.nn.functional as F

In [267]:
class ModeloClasificacionTexto(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(ModeloClasificacionTexto, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.bn1 = nn.BatchNorm1d(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        normEmbedded = self.bn1(embedded)
        embeddedActivated = F.relu(normEmbedded)
        return self.fc(embeddedActivated)

In [268]:
train_iter = DBpedia(split="train")
num_class = len(set([item[1] for item in train_iter]))
vocab_size = len(vocab)
embedding_size = 100
modelo = ModeloClasificacionTexto(vocab_size=vocab_size, embed_dim=embedding_size, num_classes=num_class).to(device)

In [269]:
vocab_size

313710

In [270]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

modelo = ModeloClasificacionTexto(vocab_size=vocab_size, embed_dim=embedding_size, num_classes=num_class)
print(f"El modelo tiene {count_parameters(modelo):,} parámetros entrenables")

El modelo tiene 87,931,200 parámetros entrenables


In [271]:
def entrena(dataloader):
    # Colocar el modelo en formato de entrenamiento
    modelo.train()

    # Inicializa accuracy, count y loss para cada epoch
    epoch_acc = 0
    epoch_loss = 0
    total_count = 0 

    for idx, (label, text, offsets) in enumerate(dataloader):
        # reestablece los gradientes después de cada batch
        optimizer.zero_grad()
        # Obten predicciones del modelo
        prediccion = modelo(text, offsets)

        # Obten la pérdida
        loss = criterio(prediccion, label)
        
        # backpropage la pérdida y calcular los gradientes
        loss.backward()
        
        # Obten la accuracy
        acc = (prediccion.argmax(1) == label).sum()
        
        # Evita que los gradientes sean demasiado grandes 
        torch.nn.utils.clip_grad_norm_(modelo.parameters(), 0.1, foreach=False)

        # Actualiza los pesos
        optimizer.step()

        # Llevamos el conteo de la pérdida y el accuracy para esta epoch
        epoch_acc += acc.item()
        epoch_loss += loss.item()
        total_count += label.size(0)

        if idx % 500 == 0 and idx > 0:
          print(f" epoca {epoch} | {idx}/{len(dataloader)} batches | perdida {epoch_loss/total_count} | accuracy {epoch_acc/total_count}")

    return epoch_acc/total_count, epoch_loss/total_count


In [272]:
def evalua(dataloader):
  modelo.eval()
  epoch_acc = 0
  total_count = 0
  epoch_loss = 0

  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dataloader):
            # Obtenemos la la etiqueta predecida
      prediccion = modelo(text, offsets)

            # Obtenemos pérdida y accuracy
      loss = criterio(prediccion, label)
      acc = (prediccion.argmax(1) == label).sum()
            
            # Llevamos el conteo de la pérdida y el accuracy para esta epoch
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      total_count += label.size(0)

  return epoch_acc/total_count, epoch_loss/total_count


In [273]:
# Hiperparámetros

EPOCHS = 4 # epochs
TASA_APRENDIZAJE = 0.2  # tasa de aprendizaje
BATCH_TAMANO = 64 # tamaño de los batches

In [274]:
# Pérdida, optimizador
criterio = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(modelo.parameters(), lr= TASA_APRENDIZAJE)

In [275]:
def collate_batch(batch):
    labels = []
    texts = []

    for label, title, text in batch:
        full_text = (title + " " + text).lower()
        token_ids = vocab(tokenizador(full_text))
        labels.append(label - 1)
        texts.append(torch.tensor(token_ids, dtype=torch.int64))

    labels = torch.tensor(labels, dtype=torch.int64)
    offsets = torch.tensor([0] + [len(t) for t in texts[:-1]]).cumsum(0)
    texts = torch.cat(texts)

    return labels.to(device), texts.to(device), offsets.to(device)




In [276]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

train_iter = list(DBpedia("train"))
test_iter = list(DBpedia("test"))

train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_TAMANO, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_TAMANO, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_TAMANO, shuffle=True, collate_fn=collate_batch)



In [277]:
ejemplo = next(iter(DBpedia(split="train")))
print(ejemplo)

(1, 'E. D. Abbott Ltd', ' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.')


In [None]:
# Obten la mejor pérdida 
major_loss_validation = float('inf')

# Entrenamos
for epoch in range(1, EPOCHS + 1):
    # Entrenamiento
    entrenamiento_acc, entrenamiento_loss = entrena(train_dataloader)
    
    # Validación
    validacion_acc, validacion_loss = evalua(valid_dataloader)

    # Guarda el mejor modelo
    if validacion_loss < major_loss_validation:
      best_valid_loss = validacion_loss
      torch.save(modelo.state_dict(), "mejores_guardados.pt")
    

In [None]:
test_acc, test_loss = evalua(test_dataloader)

print(f'Accuracy del test dataset -> {test_acc}')
print(f'Pérdida del test dataset -> {test_loss}')

In [None]:
DBpedia_label = {1: 'Company',
                2: 'EducationalInstitution',
                3: 'Artist',
                4: 'Athlete',
                5: 'OfficeHolder',
                6: 'MeanOfTransportation',
                7: 'Building',
                8: 'NaturalPlace',
                9: 'Village',
                10: 'Animal',
                11: 'Plant',
                12: 'Album',
                13: 'Film',
                14: 'WrittenWork'}

def predict(text, texto_pipeline):
  with torch.no_grad():
    text = torch.tensor(texto_pipeline(text))
    opt_mod = torch.compile(model, mode="reduce-overhead")
    output = opt_mod(text, torch.tensor([0]))
    return output.argmax(1).item() + 1


ejemplo_1 = "Nithari is a village in the western part of the state of Uttar Pradesh India bordering on New Delhi. Nithari forms part of the New Okhla Industrial Development Authority's planned industrial city Noida falling in Sector 31. Nithari made international news headlines in December 2006 when the skeletons of a number of apparently murdered women and children were unearthed in the village."


model = modelo.to("cpu")


print(f"El ejemplo 1 es de categoría {DBpedia_label[predict(ejemplo_1, texto_pipeline)]}")