<a href="https://colab.research.google.com/github/LennyRBriones/pytorch/blob/main/torchtext_data_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library & dataset


In [1]:
%%capture
!pip install portalocker>=2.0.0
!pip install torchtext --updgrade

In [2]:
import torch
import torchtext
from torchtext.datasets import DBpedia

# version
torchtext.__version__

'0.15.2+cpu'

## Processing the dataset and starting the vocabulary

In [3]:
train_iter = iter(DBpedia(split="train"))

In [4]:
next(train_iter)

(1,
 'E. D. Abbott Ltd  Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.')

In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
train_iter = DBpedia(split="train")

def yield_tokens(data_iter):
  for _, text in data_iter:
    yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


Our vocabulary transforms the list of tokens in int numbers

In [6]:
tokenizer("Hi there!, here Lenny making tests")

['hi', 'there', '!', ',', 'here', 'lenny', 'making', 'tests']

In [7]:
vocab(tokenizer("Hi there!, here Lenny making tests, nihao!"))

[10371, 313, 403, 90515, 1538, 13823, 1031, 5247, 90515, 0, 403]

The tokenizer classify every word that is register, in this case tnakns to `<unk> ` the words that are not register as "nihao" is store as 0

In [8]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) -1
#to save starting in 0

In [9]:
text_pipeline("Hi, I'am Lenny")

[10371, 90515, 187, 17, 2409, 13823]

In [10]:
label_pipeline("10")

9

The `Dataloader` allows to load big data in a small batchers

In [11]:
#Using cuda to big data process
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
  label_list = []
  text_list = []
  offsets = [0]

  for (_label, _text) in batch:
    label_list.append(label_pipeline(_label))
    processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))

  label_list = torch.tensor(label_list, dtype=torch.int64)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return label_list.to(device), text_list.to(device), offsets.to(device)

In [12]:
from torch.utils.data import DataLoader

train_iter = DBpedia(split="train")
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch )
                          # 8 in this case using a colab CPU

In [13]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x79c8c8932500>

In [14]:
from torch import nn
import torch.nn.functional as F


class Classificationtextmodel(nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
   super(Classificationtextmodel, self).__init__()

      # Embedding layer
   self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)

      # Batch Normalization
   self.bn1 = nn.BatchNorm1d(embed_dim)

      # Fully conenected layer

   self.fc = nn.Linear(embed_dim, num_class)

  def forward(self, text, offsets):
      # Embed the text
    embedded = self.embedding(text, offsets)

      # Apply Batch normalization
    embedded_norm = self.bn1(embedded)

      # Apply the ReLU activation function
    embedded_activated = F.relu(embedded_norm)

      # Output the class probabilities
    return self.fc(embedded_activated)


### Generating the model with embeddin of 100

In [15]:
train_iter = DBpedia(split="train")
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
embedding_size = 100

model = Classificationtextmodel(vocab_size=vocab_size, embed_dim=embedding_size, num_class=num_class).to(device)

In [16]:
vocab_size

802998

In [17]:
# model architecture
print(model)



Classificationtextmodel(
  (embedding): EmbeddingBag(802998, 100, mode='mean')
  (bn1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=100, out_features=14, bias=True)
)


In [18]:
#Number of trainable parameters in our model
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)
          #pnumel to get parameters                          #gradient = trainable

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 80,301,414 trainable parameters


## Training and evaluate in our Model

In [19]:
def training(dataloader):
    # Turns the model to training mode
    model.train()

    # initialize the accuracy, count & loss for every epoch
    epoch_acc = 0
    epoch_loss = 0
    total_count = 0

    for idx, (label, text, offsets) in enumerate(dataloader):
      # Reestar gradient every batch
      optimizer.zero_grad()

      # Get model predictions
      prediction = model(text, offsets)

      #Get the loss
      loss = criteria(prediction, label)

      #backpropage the loss and get the gradients
      loss.backward()

      #Get the accuracy

      acc = (prediction.argmax(1) == label).sum()

      #Avoid the gradients elevated it´s values
      torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

      # Update the wheights
      optimizer.step()

      # Get the sum of loss & the accuracy for every epoch
      epoch_acc += acc.item()
      epoch_loss += loss.item()
      total_count += label.size(0)

      if idx % 500 == 0 and idx > 0:
        print(f" epoch {epoch} | {idx}/{len(dataloader)} batches | loss {epoch_loss/total_count} | accuracy {epoch_acc/total_count}")


    return epoch_acc/total_count, epoch_loss/total_count




In [27]:
def evaluate(dataloader):
  model.eval()
  epoch_acc = 0
  total_count = 0
  epoch_loss = 0

  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dataloader):
              # Get the predicted label
      prediction = model(text, offsets)

              # Get loss & accuracy
      loss = criteria(prediction, label)
      acc = (prediction.argmax(1) == label).sum()

              # Get the new sum of loss & accuracy
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      total_count += label.size(0)

  return epoch_acc/total_count, epoch_loss/total_count



## Training: Data Split, loss & Optimization

In [21]:
#Hyperpatameters

epochs = 3
learning_rate = 0.2
batch_size = 64

In [22]:
# loss, optimizer

criteria = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

### Split of Data

In [23]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Get the trainset & testset
train_iter, test_iter = DBpedia()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Train the model using the 95% of data from trainset
num_train = int(len(train_dataset) * 0.95)

# Get a validation dataset with 5% of trainset
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset)-num_train])

# Get dataloader ready to upload to our model
train_dataloader = DataLoader(split_train_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

## Training & Evaluation

In [28]:
# Get the best loss value
major_loss_validation = float("inf")

# Training
for epoch in range(1, epochs + 1):
    # Train
    training_acc, training_loss = training(train_dataloader)

    #Validation
    validation_acc, validation_loss = evaluate(valid_dataloader)

    #Save the best model
    if validation_loss < major_loss_validation:
      best_valid_loss = validation_loss
      torch.save(model.state_dict(), "best_models.pt")


 epoch 1 | 500/8313 batches | loss 0.012067549609153452 | accuracy 0.7594186626746507
 epoch 1 | 1000/8313 batches | loss 0.011983260302548553 | accuracy 0.7614572927072927
 epoch 1 | 1500/8313 batches | loss 0.011996481639595527 | accuracy 0.7619711858760826
 epoch 1 | 2000/8313 batches | loss 0.011977427110340314 | accuracy 0.7617285107446277
 epoch 1 | 2500/8313 batches | loss 0.011935190605621536 | accuracy 0.7622263594562175
 epoch 1 | 3000/8313 batches | loss 0.011899726866610703 | accuracy 0.7627665778073975
 epoch 1 | 3500/8313 batches | loss 0.011863212522797094 | accuracy 0.7633354755784062
 epoch 1 | 4000/8313 batches | loss 0.011827568239603407 | accuracy 0.7642620594851287
 epoch 1 | 4500/8313 batches | loss 0.011790084726885012 | accuracy 0.7647363085980893
 epoch 1 | 5000/8313 batches | loss 0.011744137418543344 | accuracy 0.7655718856228754
 epoch 1 | 5500/8313 batches | loss 0.011713607635784476 | accuracy 0.7662669287402291
 epoch 1 | 6000/8313 batches | loss 0.011678

In [30]:
test_acc, test_loss = evaluate(test_dataloader)

print(f"The Accuracy in the test dataset is: {test_acc}")
print(f"The Loss in the test dataset is: {test_loss}")


The Accuracy in the test dataset is: 0.8185428571428571
The Loss in the test dataset is: 0.009112035138692175
