<a href="https://colab.research.google.com/github/LennyRBriones/pytorch/blob/main/torchtext_data_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification with Pytorch

## Library & dataset


In [25]:
%%capture
!pip install portalocker>=2.0.0
!pip install torchtext --updgrade

In [26]:
import torch
import torchtext
from torchtext.datasets import DBpedia

# version
torchtext.__version__

'0.15.2+cpu'

## Processing the dataset and starting the vocabulary

In [27]:
train_iter = iter(DBpedia(split="train"))

In [28]:
next(train_iter)

(1,
 'E. D. Abbott Ltd  Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.')

In [29]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
train_iter = DBpedia(split="train")

def yield_tokens(data_iter):
  for _, text in data_iter:
    yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


Our vocabulary transforms the list of tokens in int numbers

In [30]:
tokenizer("Hi there!, here Lenny making tests")

['hi', 'there', '!', ',', 'here', 'lenny', 'making', 'tests']

In [31]:
vocab(tokenizer("Hi there!, here Lenny making tests, nihao!"))

[10371, 313, 403, 90515, 1538, 13823, 1031, 5247, 90515, 0, 403]

The tokenizer classify every word that is register, in this case tnakns to `<unk> ` the words that are not register as "nihao" is store as 0

In [32]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) -1
#to save starting in 0

In [33]:
text_pipeline("Hi, I'am Lenny")

[10371, 90515, 187, 17, 2409, 13823]

In [34]:
label_pipeline("10")

9

The `Dataloader` allows to load big data in a small batchers

In [35]:
#Using cuda to big data process
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
  label_list = []
  text_list = []
  offsets = [0]

  for (_label, _text) in batch:
    label_list.append(label_pipeline(_label))
    processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))

  label_list = torch.tensor(label_list, dtype=torch.int64)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return label_list.to(device), text_list.to(device), offsets.to(device)

In [36]:
from torch.utils.data import DataLoader

train_iter = DBpedia(split="train")
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch )
                          # 8 in this case using a colab CPU

In [37]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fe471538490>

In [38]:
from torch import nn
import torch.nn.functional as F


class Classificationtextmodel(nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
   super(Classificationtextmodel, self).__init__()

      # Embedding layer
   self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)

      # Batch Normalization
   self.bn1 = nn.BatchNorm1d(embed_dim)

      # Fully conenected layer

   self.fc = nn.Linear(embed_dim, num_class)

  def forward(self, text, offsets):
      # Embed the text
    embedded = self.embedding(text, offsets)

      # Apply Batch normalization
    embedded_norm = self.bn1(embedded)

      # Apply the ReLU activation function
    embedded_activated = F.relu(embedded_norm)

      # Output the class probabilities
    return self.fc(embedded_activated)


### Generating the model with embeddin of 100

In [39]:
train_iter = DBpedia(split="train")
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
embedding_size = 100

model = Classificationtextmodel(vocab_size=vocab_size, embed_dim=embedding_size, num_class=num_class).to(device)

In [40]:
vocab_size

802998

In [41]:
# model architecture
print(model)



Classificationtextmodel(
  (embedding): EmbeddingBag(802998, 100, mode='mean')
  (bn1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=100, out_features=14, bias=True)
)


In [42]:
#Number of trainable parameters in our model
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)
          #pnumel to get parameters                          #gradient = trainable

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 80,301,414 trainable parameters


## Training and evaluate in our Model

In [43]:
def training(dataloader):
    # Turns the model to training mode
    model.train()

    # initialize the accuracy, count & loss for every epoch
    epoch_acc = 0
    epoch_loss = 0
    total_count = 0

    for idx, (label, text, offsets) in enumerate(dataloader):
      # Reestar gradient every batch
      optimizer.zero_grad()

      # Get model predictions
      prediction = model(text, offsets)

      #Get the loss
      loss = criteria(prediction, label)

      #backpropage the loss and get the gradients
      loss.backward()

      #Get the accuracy

      acc = (prediction.argmax(1) == label).sum()

      #Avoid the gradients elevated it´s values
      torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

      # Update the wheights
      optimizer.step()

      # Get the sum of loss & the accuracy for every epoch
      epoch_acc += acc.item()
      epoch_loss += loss.item()
      total_count += label.size(0)

      if idx % 500 == 0 and idx > 0:
        print(f" epoch {epoch} | {idx}/{len(dataloader)} batches | loss {epoch_loss/total_count} | accuracy {epoch_acc/total_count}")


    return epoch_acc/total_count, epoch_loss/total_count




In [44]:
def evaluate(dataloader):
  model.eval()
  epoch_acc = 0
  total_count = 0
  epoch_loss = 0

  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dataloader):
              # Get the predicted label
      prediction = model(text, offsets)

              # Get loss & accuracy
      loss = criteria(prediction, label)
      acc = (prediction.argmax(1) == label).sum()

              # Get the new sum of loss & accuracy
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      total_count += label.size(0)

  return epoch_acc/total_count, epoch_loss/total_count



## Training: Data Split, loss & Optimization

In [45]:
#Hyperpatameters

epochs = 6
learning_rate = 0.15
batch_size = 86

In [46]:
# loss, optimizer

criteria = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

### Split of Data

In [47]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Get the trainset & testset
train_iter, test_iter = DBpedia()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Train the model using the 95% of data from trainset
num_train = int(len(train_dataset) * 0.95)

# Get a validation dataset with 5% of trainset
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset)-num_train])

# Get dataloader ready to upload to our model
train_dataloader = DataLoader(split_train_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

## Training & Evaluation

In [48]:
# Get the best loss value
major_loss_validation = float("inf")

# Training
for epoch in range(1, epochs + 1):
    # Train
    training_acc, training_loss = training(train_dataloader)

    #Validation
    validation_acc, validation_loss = evaluate(valid_dataloader)

    #Save the best model
    if validation_loss < major_loss_validation:
      best_valid_loss = validation_loss
      torch.save(model.state_dict(), "best_models.pt")


 epoch 1 | 500/6187 batches | loss 0.025818540210738927 | accuracy 0.3509260548670102
 epoch 1 | 1000/6187 batches | loss 0.022533882248095053 | accuracy 0.4567757823571777
 epoch 1 | 1500/6187 batches | loss 0.020437451173250628 | accuracy 0.5064065816587391
 epoch 1 | 2000/6187 batches | loss 0.019008530099198832 | accuracy 0.5378996548237509
 epoch 1 | 2500/6187 batches | loss 0.017986463289190898 | accuracy 0.5588927219809751
 epoch 1 | 3000/6187 batches | loss 0.017211621653409823 | accuracy 0.5747192796199716
 epoch 1 | 3500/6187 batches | loss 0.016578671121242852 | accuracy 0.5874401333838173
 epoch 1 | 4000/6187 batches | loss 0.016071531376417925 | accuracy 0.5974087873380493
 epoch 1 | 4500/6187 batches | loss 0.015655409086510825 | accuracy 0.6056121895392755
 epoch 1 | 5000/6187 batches | loss 0.01528614392589596 | accuracy 0.6129704291699799
 epoch 1 | 5500/6187 batches | loss 0.014975361293023303 | accuracy 0.6193461653906478
 epoch 1 | 6000/6187 batches | loss 0.0147011

In [49]:
test_acc, test_loss = evaluate(test_dataloader)

print(f"The Accuracy in the test dataset is: {test_acc}")
print(f"The Loss in the test dataset is: {test_loss}")


The Accuracy in the test dataset is: 0.8034714285714286
The Loss in the test dataset is: 0.007318142723185675


## Optimization using Inference

In [53]:
# In case of heavy models, this case of optimization allow us to reduce production costs

DBpedia_label = {1: 'Company',
                2: 'EducationalInstitution',
                3: 'Artist',
                4: 'Athlete',
                5: 'OfficeHolder',
                6: 'MeanOfTransportation',
                7: 'Building',
                8: 'NaturalPlace',
                9: 'Village',
                10: 'Animal',
                11: 'Plant',
                12: 'Album',
                13: 'Film',
                14: 'WrittenWork'}

def predict(text, text_pipeline):
  with torch.no_grad():
    text = torch.tensor(text_pipeline(text))
    opt_mod = torch.compile(model, mode="reduce-overhead")
    #optimized model
    output = opt_mod(text, torch.tensor([0]))
    return output.argmax(1).item() + 1
                        #get only the format an 1 to 14

example_1 = "Zaanse Schans is a village  kwon for their windmills & harvest"

model = model.to("cpu")

print(f"The example number 1 belong to the category {DBpedia_label[predict(example_1, text_pipeline)]}")

The example number 1 belong to the category Village


## Saving the model

In [54]:
model_state_dict = model.state_dict()
optimizer_state_dict = optimizer.state_dict()

checkpoint = {
    "model_state_dict" : model_state_dict,
    "optimizer_state_dict": optimizer_state_dict,
    "epoch" : epoch,
    "loss" : training_loss,
}

torch.save(checkpoint, "model_checkpoint.pth")

## Uploading to Hugging Face

In [55]:
%%capture
!pip install huggingface_hub

In [57]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [59]:
from huggingface_hub import HfApi
api = HfApi()

api.create_repo(repo_id="LennyRBriones/textclassification-DBpedia-lenny-rbriones")

RepoUrl('https://huggingface.co/LennyRBriones/textclassification-DBpedia-lenny-rbriones', endpoint='https://huggingface.co', repo_type='model', repo_id='LennyRBriones/textclassification-DBpedia-lenny-rbriones')

### Upload our checkpoin

In [60]:
!ls

'=2.0.0'   best_models.pt   model_checkpoint.pth   sample_data


In [64]:
api.upload_file(
    path_or_fileobj="./model_checkpoint.pth",
    path_in_repo="model_checkpoint.pth",
    repo_id="LennyRBriones/textclassification-DBpedia-lenny-rbriones"
)

model_checkpoint.pth:   0%|          | 0.00/321M [00:00<?, ?B/s]

'https://huggingface.co/LennyRBriones/textclassification-DBpedia-lenny-rbriones/blob/main/model_checkpoint.pth'

### Downloading the model from hugging face to local directory

In [65]:
!mkdir weights

In [66]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="LennyRBriones/textclassification-DBpedia-lenny-rbriones", filename="model_checkpoint.pth", local_dir="weights/")

Downloading model_checkpoint.pth:   0%|          | 0.00/321M [00:00<?, ?B/s]

'weights/model_checkpoint.pth'

### Loading the downloaded model

In [67]:
checkpoint = torch.load("weights/model_checkpoint.pth")

In [68]:
train_iter =DBpedia(split="train")
num_class = len(set([label for (label, text ) in train_iter]))
vocab_size = len(vocab)
embedding_size = 100

model_2 = Classificationtextmodel(vocab_size = vocab_size, embed_dim = embedding_size, num_class=num_class)

In [69]:
optimizer_2 = torch.optim.SGD(model_2.parameters(), lr=0.2)

In [72]:
checkpoint["model_state_dict"]

OrderedDict([('embedding.weight',
              tensor([[-0.2452, -1.2445, -0.5474,  ...,  1.9547,  0.6863, -1.0317],
                      [-1.4177,  0.3655,  0.8530,  ..., -0.3490, -0.1362,  1.5133],
                      [-0.0567,  0.3204, -2.0167,  ...,  0.9700, -0.2322,  0.1036],
                      ...,
                      [ 1.3814,  0.2670,  0.5674,  ...,  0.5943, -0.4915, -1.2665],
                      [ 0.3377,  1.0727, -1.2215,  ...,  0.3235,  0.9733,  0.3194],
                      [ 0.3777,  0.0786,  0.9783,  ..., -0.4042, -0.5603, -0.0184]])),
             ('bn1.weight',
              tensor([0.9272, 0.9416, 0.9772, 1.1068, 0.8722, 1.0011, 1.1578, 1.0804, 1.0119,
                      1.1129, 0.9800, 0.9941, 1.0818, 1.0081, 0.8985, 1.1512, 1.2241, 0.9705,
                      1.1213, 0.9675, 0.8094, 1.0561, 0.9022, 0.9606, 1.0165, 0.9735, 1.1263,
                      0.9097, 1.2037, 0.9690, 1.1338, 1.0121, 1.1558, 1.0870, 1.0120, 1.0896,
                      0.8426

In [73]:
model_2.load_state_dict(checkpoint["model_state_dict"])

<All keys matched successfully>

In [74]:
checkpoint["optimizer_state_dict"]

{'state': {0: {'momentum_buffer': None},
  1: {'momentum_buffer': None},
  2: {'momentum_buffer': None},
  3: {'momentum_buffer': None},
  4: {'momentum_buffer': None}},
 'param_groups': [{'lr': 0.15,
   'momentum': 0,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'maximize': False,
   'foreach': None,
   'differentiable': False,
   'params': [0, 1, 2, 3, 4]}]}

In [76]:
optimizer_2.load_state_dict(checkpoint["optimizer_state_dict"])

In [77]:
epoch_2 = checkpoint["epoch"]
loss_2 = checkpoint["loss"]

In [78]:
example_2 = "The Axolotl  It is unusual among amphibians in that it reaches adulthood without undergoing metamorphosis. Instead of taking to the land, adults remain aquatic and gilled. The species was originally found in several lakes underlying what is now Mexico City "

model_cpu = model_2.to("cpu")

DBpedia_label[predict(example_2, text_pipeline)]

'Animal'