<a href="https://colab.research.google.com/github/Horcruxno13/Multi-Class-Text-Classification-using-BERT-and-PyTorch/blob/main/ECommerce_3_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV read later in this notebook
# is addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pre-cleaned e-commerce dataset from the mounted Drive.
# NOTE(review): the preview output shows 'Unnamed: 0' and 'Unnamed: 0.1' columns, which
# look like index columns written out by earlier saves — confirm and consider dropping them.
ecom_df = pd.read_csv('/content/drive/My Drive/Datasets/ecommerceCleaned.csv')

In [None]:
# Preview the first 10 rows to check the available columns.
ecom_df.head(10)

Unnamed: 0.1,Unnamed: 0,Category,Description,Length,Labels
0,0,Household,paper plane design frame wall hang motivationa...,228,3
1,1,Household,saf frame paint wood inch x inch special effec...,59,3
2,2,Household,saf texture modern art print paint synthetic c...,224,3
3,3,Household,saf flower print frame paint synthetic inch x ...,184,3
4,4,Household,incredible gift india wooden happy birthday un...,184,3
5,5,Household,pitaara box romantic venice canvas painting th...,230,3
6,6,Household,paper plane design starry night vangoh wall ar...,275,3
7,7,Household,saf modern art paint synthetic cm x cm x cm se...,213,3
8,8,Household,painting villa uv texture modern art print fra...,111,3
9,9,Household,paint mantra art street jardin bird frame art ...,72,3


In [None]:
# Model inputs are the 'Description' column; targets are the 'Labels' column.
X = ecom_df["Description"].values
y = ecom_df["Labels"].values

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, Dataset
from tqdm import tqdm

In [None]:
# Make locale.getpreferredencoding() always report UTF-8 before shelling out to pip.
# NOTE(review): presumably a workaround for a Colab "UTF-8 locale is required" error —
# confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Upgrade catalyst and transformers; pip's stdout is discarded.
# NOTE(review): versions are not pinned, so a re-run may install different releases.
!pip install -U catalyst transformers > /dev/null

In [None]:
import os
import warnings
import logging
from typing import Mapping, List
from pprint import pprint

# Transformers 
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Catalyst
#from catalyst.dl import SupervisedRunner
#from catalyst.dl.callbacks import AccuracyCallback, F1ScoreCallback, OptimizerCallback
#from catalyst.dl.callbacks import CheckpointCallback, InferCallback
#from catalyst.utils import set_global_seed, prepare_cudnn

In [None]:
# Fine-tuning configuration shared by the cells below.
MODEL_NAME = 'distilbert-base-uncased' # pretrained checkpoint name passed to the model constructor
#LOG_DIR = "./logdir_amazon_reviews"    
NUM_EPOCHS = 4                         # something around 2-6 epochs is typically fine when fine-tuning transformers
BATCH_SIZE = 16                        # depends on your available GPU memory (in combination with max seq length)
MAX_SEQ_LENGTH = 128                    # depends on your available GPU memory (in combination with batch size)
NUM_CLASSES = 4               # number of output classes passed to the model constructor
LEARN_RATE = 5e-5                      # learning rate is typically ~1e-5 for transformers; note the AdamW cell later hard-codes lr=1e-5 instead of reading this
ACCUM_STEPS = 4                        # one optimization step for that many backward passes
SEED = 42                 # random seed for reproducibility
FP16_PARAMS = None             # FP16/apex options; stays None unless the apex cell below overrides it

In [None]:
%%capture
# Optional: clone NVIDIA apex, pip-install it with its C++ and CUDA extensions, delete
# the clone, and record the apex options (opt_level "O1") in FP16_PARAMS.
# If your machine doesn't support FP16, comment out the 4 lines below so FP16_PARAMS
# keeps the value set in the configuration cell.
# NOTE(review): no active code visible in this notebook reads FP16_PARAMS — confirm this cell is still needed.
!git clone https://github.com/NVIDIA/apex 
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex 
!rm -rf ./apex
FP16_PARAMS = dict(opt_level="O1") 

In [None]:
from transformers import DistilBertTokenizer

In [None]:
from torch import cuda
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the tokenizer from the same checkpoint name as the model (MODEL_NAME) so the
# two cannot drift apart if the configuration cell is edited.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Convert every description into a list of token ids, with the model's special tokens added.
# Truncation is delegated to the tokenizer so that MAX_SEQ_LENGTH counts *tokens*.
# The previous code sliced the raw string to MAX_SEQ_LENGTH *characters* before
# tokenizing, which threw away most of each description.
input_ids = [
    tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
    )
    for text in X
]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad (and, if needed, truncate) every id sequence at the end so each has exactly
# MAX_SEQ_LENGTH entries; the pad value is 0, which the attention-mask cell relies on.
input_ids = pad_sequences(input_ids, maxlen=MAX_SEQ_LENGTH, dtype="long", value=0, truncating="post", padding="post")

In [None]:
# Attention mask per sequence: 1 where the token id is greater than 0, 0 elsewhere
# (padded positions were filled with id 0).
attention_masks = [
    [int(token_id > 0) for token_id in sequence]
    for sequence in input_ids
]

In [None]:
from sklearn.model_selection import train_test_split 

In [None]:
# 70 / 15 / 15 split: first hold out 30% of the data, then halve that remainder into
# validation and test. Both splits are seeded from the notebook-wide SEED rather than
# a repeated literal, so reproducibility is controlled from the configuration cell.
input_train, input_remain, label_train, label_remain, mask_train, mask_remain = train_test_split(
    input_ids, y, attention_masks, test_size=0.3, random_state=SEED)
input_valid, input_test, label_valid, label_test, mask_valid, mask_test = train_test_split(
    input_remain, label_remain, mask_remain, test_size=0.5, random_state=SEED)

In [None]:
import torch
from torch import nn
#from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

In [None]:
# Take the batch size from the configuration cell; `batch_size` is kept as an alias
# in case other cells read it.
batch_size = BATCH_SIZE


def _to_dataset(ids, labels, masks):
    """Bundle ids, labels and attention masks into a TensorDataset, in that order."""
    return TensorDataset(torch.tensor(ids), torch.tensor(labels), torch.tensor(masks))


trainset = _to_dataset(input_train, label_train, mask_train)
validset = _to_dataset(input_valid, label_valid, mask_valid)
testset = _to_dataset(input_test, label_test, mask_test)

# Only the training batches are reshuffled each epoch; the evaluation loaders keep a
# fixed order so that per-example outputs can be lined up with the datasets.
trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
validloader = DataLoader(validset, shuffle=False, batch_size=batch_size)
testloader = DataLoader(testset, shuffle=False, batch_size=batch_size)

In [None]:
# Loaders keyed by stage name: shuffled batches for "train", fixed order for "valid".
train_val_loaders = {
    stage: DataLoader(dataset=split, batch_size=BATCH_SIZE, shuffle=reshuffle)
    for stage, split, reshuffle in (
        ("train", trainset, True),
        ("valid", validset, False),
    )
}

In [None]:
from transformers import DistilBertForSequenceClassification, AdamW

In [None]:
class DistilBertForSequenceClassification(nn.Module):
    """Pretrained encoder followed by a small classification head.

    The head applies Linear -> ReLU -> Dropout -> Linear to `[:, 0]` of the
    encoder's first output (presumably the first-token hidden state — TODO confirm)
    and returns one logit per class.

    NOTE(review): this class has the same name as the `transformers` class imported
    elsewhere in the notebook and therefore shadows it.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None):
        """Load the pretrained encoder and create the head layers.

        Args:
            pretrained_model_name: checkpoint name handed to
                `AutoConfig.from_pretrained` and `AutoModel.from_pretrained`.
            num_classes: width of the final linear layer; also forwarded to the
                config as `num_labels`.
        """
        super().__init__()
        model_config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)
        self.distilbert = AutoModel.from_pretrained(pretrained_model_name, config=model_config)
        hidden_dim = model_config.dim
        self.pre_classifier = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(model_config.seq_classif_dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute class logits for a batch.

        Args:
            features: passed to the encoder as `input_ids`.
            attention_mask: required; passed through to the encoder.
            head_mask: optional; passed through to the encoder.

        Returns:
            The output of the final linear layer (unnormalised logits).

        Raises:
            AssertionError: if `attention_mask` is None.
        """
        assert attention_mask is not None, "attention mask is none"
        encoder_out = self.distilbert(input_ids=features, attention_mask=attention_mask, head_mask=head_mask)
        first_position = encoder_out[0][:, 0]
        activated = nn.functional.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(activated))

In [None]:
# Build the classifier from the MODEL_NAME checkpoint with NUM_CLASSES outputs.
model = DistilBertForSequenceClassification(pretrained_model_name=MODEL_NAME, num_classes=NUM_CLASSES)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Cross-entropy loss and an Adam optimizer over all model parameters at LEARN_RATE.
# NOTE(review): `optimizer` is reassigned to an AdamW instance in a later cell before
# the training loop runs, and that loop uses `loss_function` rather than `criterion`,
# so these two objects are only referenced by the commented-out catalyst runner.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [None]:
# Move the model to the device selected earlier instead of calling .cuda()
# unconditionally, which fails on machines without a GPU. `.to()` returns the module,
# so the cell still displays the model summary.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# Upgrade catalyst to the latest release.
# NOTE(review): an earlier install cell already upgrades catalyst, so this looks
# redundant — confirm before removing.
!pip install -U catalyst

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from catalyst import dl
from catalyst.dl import SupervisedRunner
from catalyst.callbacks import AccuracyCallback, OptimizerCallback, SchedulerCallback
from catalyst.callbacks import CheckpointCallback
from catalyst.utils import set_global_seed, prepare_cudnn

In [None]:
# Use PyTorch's own AdamW; the AdamW exported by `transformers` is deprecated in its favour.
# weight_decay=0.0 is passed explicitly because torch defaults to 0.01 while the
# transformers implementation defaulted to 0.0, so the optimizer stays un-regularised.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
# Cross-entropy over the raw logits returned by the model.
loss_function = torch.nn.CrossEntropyLoss()

In [None]:
# Resolve the torch.device used by the training loop: GPU if available, else CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
def flat_accuracy(preds, labels):
  """Return the fraction of rows whose arg-max over axis 1 equals the label.

  Args:
    preds: array of per-class scores; the arg-max is taken along axis 1.
    labels: array of true class indices; flattened before comparison.

  Returns:
    Number of matches divided by the number of flattened labels.
  """
  predicted_classes = np.argmax(preds, axis=1).flatten()
  true_classes = labels.flatten()
  n_hits = np.sum(predicted_classes == true_classes)
  return n_hits / len(true_classes)
def calcuate_accu(big_idx, targets):
  """Count the positions where `big_idx` equals `targets`.

  Args:
    big_idx: tensor of predicted class indices.
    targets: tensor of true class indices, compared element-wise with `big_idx`.

  Returns:
    The number of matching positions as a Python number (via `.item()`).
  """
  matches = big_idx == targets
  return matches.sum().item()

In [None]:
# Fine-tune the model, printing the mean loss and the accuracy over the training set
# after every epoch.
# NOTE(review): the run recorded in this notebook stays at ~25% accuracy with a loss of
# ~1.39 (about ln 4) in all four epochs, i.e. the model is not improving on its 4
# classes — investigate before trusting any downstream results.
epochs = NUM_EPOCHS  # epoch count comes from the configuration cell
epochloop = tqdm(range(epochs), position=0, desc='Training', leave=True)

for epoch in epochloop:
  model.train()
  train_loss = 0
  n_correct = 0
  nb_tr_steps = 0
  nb_tr_examples = 0
  # `step` rather than `id`, which would shadow the builtin of the same name.
  for step, batch in enumerate(trainloader):
    if step % 50 == 0 and step != 0:
      print("ID Number: ", step)
    # Each batch is (input ids, labels, attention masks), matching the TensorDataset order.
    inputids = batch[0].to(device)
    inputlabels = batch[1].to(device)
    inputmasks = batch[2].to(device)

    optimizer.zero_grad()
    outputs = model(inputids, attention_mask = inputmasks)
    loss = loss_function(outputs, inputlabels)
    loss.backward()
    optimizer.step()

    # Running statistics for the epoch summary.
    train_loss += loss.item()
    n_correct += calcuate_accu(outputs.argmax(dim=1), inputlabels)
    nb_tr_steps += 1
    nb_tr_examples += inputlabels.size(0)

  # An empty loader would otherwise divide by zero in the summary below.
  if nb_tr_steps == 0:
    print(f'No training batches were produced for Epoch {epoch}; stopping.')
    break

  print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
  epoch_loss = train_loss/nb_tr_steps
  print(f'The Total Loss for Epoch {epoch}: {epoch_loss}')


Training:   0%|          | 0/4 [00:00<?, ?it/s]

ID Number:  50
ID Number:  100
ID Number:  150
ID Number:  200
ID Number:  250
ID Number:  300
ID Number:  350
ID Number:  400
ID Number:  450
ID Number:  500
ID Number:  550
ID Number:  600
ID Number:  650
ID Number:  700
ID Number:  750
ID Number:  800
ID Number:  850
ID Number:  900
ID Number:  950
ID Number:  1000
ID Number:  1050
ID Number:  1100
ID Number:  1150
ID Number:  1200


Training:  25%|██▌       | 1/4 [02:54<08:43, 174.51s/it]

The Total Accuracy for Epoch 0: 24.885668773444326
The Total Loss for Epoch 0: 1.3896299995353405
ID Number:  50
ID Number:  100
ID Number:  150
ID Number:  200
ID Number:  250
ID Number:  300
ID Number:  350
ID Number:  400
ID Number:  450
ID Number:  500
ID Number:  550
ID Number:  600
ID Number:  650
ID Number:  700
ID Number:  750
ID Number:  800
ID Number:  850
ID Number:  900
ID Number:  950
ID Number:  1000
ID Number:  1050
ID Number:  1100
ID Number:  1150
ID Number:  1200


Training:  50%|█████     | 2/4 [05:55<05:56, 178.05s/it]

The Total Accuracy for Epoch 1: 25.394378500590925
The Total Loss for Epoch 1: 1.3896855151153726
ID Number:  50
ID Number:  100
ID Number:  150
ID Number:  200
ID Number:  250
ID Number:  300
ID Number:  350
ID Number:  400
ID Number:  450
ID Number:  500
ID Number:  550
ID Number:  600
ID Number:  650
ID Number:  700
ID Number:  750
ID Number:  800
ID Number:  850
ID Number:  900
ID Number:  950
ID Number:  1000
ID Number:  1050
ID Number:  1100
ID Number:  1150
ID Number:  1200


Training:  75%|███████▌  | 3/4 [08:55<02:59, 179.07s/it]

The Total Accuracy for Epoch 2: 25.188839216895328
The Total Loss for Epoch 2: 1.3897494748663295
ID Number:  50
ID Number:  100
ID Number:  150
ID Number:  200
ID Number:  250
ID Number:  300
ID Number:  350
ID Number:  400
ID Number:  450
ID Number:  500
ID Number:  550
ID Number:  600
ID Number:  650
ID Number:  700
ID Number:  750
ID Number:  800
ID Number:  850
ID Number:  900
ID Number:  950
ID Number:  1000
ID Number:  1050
ID Number:  1100
ID Number:  1150
ID Number:  1200


Training: 100%|██████████| 4/4 [11:55<00:00, 178.98s/it]

The Total Accuracy for Epoch 3: 25.075792610862752
The Total Loss for Epoch 3: 1.3899801058561565





In [None]:
#runner = SupervisedRunner(input_key=("features", "attention_mask"), output_key = "logits", target_key="labels")

#runner.train(model=model, criterion=criterion, optimizer=optimizer, loaders=train_val_loaders, cpu = True,
    #callbacks=[AccuracyCallback(num_classes=NUM_CLASSES, input_key="logits", target_key="labels"), OptimizerCallback(accumulation_steps=ACCUM_STEPS, metric_key = "loss")],
    #num_epochs=NUM_EPOCHS,
    #verbose=True)

1/4 * Epoch (train):   0%|          | 0/1217 [00:00<?, ?it/s]

AssertionError: ignored