<a href="https://colab.research.google.com/github/GastonWoollands/NLP_classification/blob/main/NLP_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference tutorial: [Text Classification with BERT in PyTorch](https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f)

In [None]:
%%capture
pip install transformers

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
# from tqdm import tqdm

In [None]:
# Read the raw training CSV (latin-1 encoded) and make sure every text is a plain string.
df = pd.read_csv('/content/Training_Dataset_Jul19_Nov20.csv', encoding='latin1')
df['texts'] = df['texts'].astype(str)
# NOTE(review): this drops the first data row — confirm that is intentional.
df = df.iloc[1:]

# Two lookup tables built from the distinct labels in order of first appearance:
#   id2label: integer id -> label value
#   labels:   label value -> integer id
id2label = dict(enumerate(df['label'].unique().tolist()))
labels = {label: idx for idx, label in id2label.items()}

In [None]:
# Module-level tokenizer for the 'bert-base-cased' checkpoint; the Dataset class below reads this name directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):
    """Dataset pairing pre-tokenized texts with their integer class ids.

    Uses the module-level ``tokenizer`` and the module-level ``labels`` mapping
    (label value -> integer id) while it is being constructed.
    """

    def __init__(self, df):
        """Encode every row of ``df`` eagerly.

        Args:
            df: frame with a ``label`` column (values must be keys of ``labels``)
                and a ``texts`` column holding the raw strings.
        """
        # Translate each label value into its integer id.
        encoded_labels = []
        for raw_label in df['label']:
            encoded_labels.append(labels[raw_label])

        # Tokenize each text on its own, padded/truncated to 512 tokens, as PyTorch tensors.
        encoded_texts = []
        for raw_text in df['texts']:
            encoded_texts.append(
                tokenizer(
                    raw_text,
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors="pt",
                )
            )

        self.labels = encoded_labels
        self.texts = encoded_texts

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) stored at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:
class BertClassifier(nn.Module):
    """'bert-base-cased' encoder followed by dropout and one linear classification layer.

    Args:
        dropout: dropout probability applied to BERT's pooled output.
        num_labels: number of output classes. Defaults to 13, the value that was
            previously hard-coded, so ``BertClassifier()`` builds the same network.
    """

    def __init__(self, dropout=0.05, num_labels=13):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the encoder config instead of hard-coding 768,
        # so the head stays correct if the checkpoint is swapped.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)
        # self.relu = nn.ReLU()
        # Identity keeps the outputs as raw logits (nn.CrossEntropyLoss applies
        # log-softmax itself).
        self.final_layer = nn.Identity()

    def forward(self, input_id, mask):
        """Return one row of unnormalized class scores per input sequence.

        Args:
            input_id: token ids, forwarded to BERT as ``input_ids``
                (assumed shape (batch, seq_len) — confirm against the caller).
            mask: attention mask, forwarded to BERT as ``attention_mask``.

        Returns:
            Output of the linear layer (``num_labels`` scores per sample).
        """
        # With return_dict=False BERT returns a tuple; the first item (last hidden
        # state) is not needed, only the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output)
        final_layer = self.final_layer(linear_output)

        return final_layer

In [None]:
# class BertClassifier(nn.Module):
#     def __init__(self, num_labels, freeze_bert=True):
#         super(BertClassifier, self).__init__()
#         self.num_labels = num_labels
#         self.bert = BertModel.from_pretrained('bert-base-cased')
#         self.dropout = nn.Dropout(0.1)
#         self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
#         if freeze_bert:
#             for param in self.bert.parameters():
#                 param.requires_grad = False
#         else:
#             for name, param in self.bert.named_parameters():
#                 if 'encoder.layer.10' in name or 'encoder.layer.11' in name or 'encoder.layer.12' in name or 'pooler' in name:
#                     param.requires_grad = True

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs[1]
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

#         return logits


In [None]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size):
    """Fine-tune ``model`` with Adam and cross-entropy, printing metrics after every epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that returns class logits.
        train_data: frame handed to ``Dataset`` for the training split.
        val_data: frame handed to ``Dataset`` for the validation split.
        learning_rate: learning rate passed to Adam.
        epochs: number of passes over ``train_data``.
        batch_size: training batch size (validation always uses batches of 2).

    Returns:
        None. One summary line is printed per epoch. ``model`` is updated in place,
        moved to the selected device, and left in eval mode after the last
        validation pass.
    """
    # Named *_set so the local variables do not shadow this function's own name.
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model before building the optimizer so both refer to the same device.
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Guard the per-sample averages against an empty split.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    for epoch_num in range(epochs):

        # Training mode: dropout is active while the weights are being updated.
        model.train()
        total_acc_train = 0
        total_loss_train = 0.0

        for train_input, train_label in train_dataloader:

            train_label = train_label.to(device).long()
            # The tokenizer returned (1, seq_len) tensors per sample, so batching yields
            # (batch, 1, seq_len); drop the singleton dim from both tensors consistently.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # criterion returns the batch mean; weight it by the batch size so that the
            # epoch total divided by the sample count is a true per-sample average.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Evaluation mode: dropout is disabled so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0.0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} | Train Accuracy: {total_acc_train / n_train: .3f} | Val Loss: {total_loss_val / n_val: .3f} | Val Accuracy: {total_acc_val / n_val: .3f}')
                  

In [None]:
def evaluate(model, test_data):
    """Print the classification accuracy of ``model`` on ``test_data``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that returns class logits.
        test_data: frame handed to ``Dataset`` for the test split.

    Returns:
        None. The accuracy is printed. ``model`` is moved to the selected device; its
        train/eval mode is restored to what it was on entry.
    """
    test_dataloader = torch.utils.data.DataLoader(Dataset(test_data), batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Switch dropout off while scoring so the result is deterministic, and put the
    # previous mode back afterwards so callers that keep training are unaffected.
    was_training = model.training
    model.eval()

    total_acc_test = 0
    try:
        with torch.no_grad():

            for test_input, test_label in test_dataloader:

                test_label = test_label.to(device)
                # Batches arrive as (batch, 1, seq_len); drop the singleton dim from
                # both the mask and the ids.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                total_acc_test += (output.argmax(dim=1) == test_label).sum().item()
    finally:
        model.train(was_training)

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
# Seed both NumPy and PyTorch: torch drives DataLoader shuffling, dropout and the
# initialization of the classifier head, none of which np.random.seed controls.
np.random.seed(83)
torch.manual_seed(83)

# Shuffle once, then cut at 80% and 90% to get train / validation / test.
# Plain iloc slices give the same three frames as np.split(frame, [a, b]) without
# going through DataFrame.swapaxes, which newer pandas versions deprecate.
_shuffled = df.sample(frac=1, random_state=83)
_train_end, _val_end = int(.8*len(df)), int(.9*len(df))

df_train = _shuffled.iloc[:_train_end]
df_val = _shuffled.iloc[_train_end:_val_end]
df_test = _shuffled.iloc[_val_end:]

print(len(df_train),len(df_val), len(df_test))

1388 173 174


In [None]:
# First run: build a classifier with the default settings and fine-tune it for 50 epochs.
LR = 1e-6
model = BertClassifier()

train(
    model,
    df_train,
    df_val,
    learning_rate=LR,
    epochs=50,
    batch_size=15,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epochs: 1 | Train Loss:  0.182 | Train Accuracy:  0.061 | Val Loss:  1.324 | Val Accuracy:  0.040
Epochs: 2 | Train Loss:  0.171 | Train Accuracy:  0.100 | Val Loss:  1.277 | Val Accuracy:  0.145
Epochs: 3 | Train Loss:  0.163 | Train Accuracy:  0.233 | Val Loss:  1.221 | Val Accuracy:  0.225
Epochs: 4 | Train Loss:  0.154 | Train Accuracy:  0.272 | Val Loss:  1.164 | Val Accuracy:  0.220
Epochs: 5 | Train Loss:  0.146 | Train Accuracy:  0.297 | Val Loss:  1.128 | Val Accuracy:  0.272
Epochs: 6 | Train Loss:  0.138 | Train Accuracy:  0.344 | Val Loss:  1.093 | Val Accuracy:  0.306
Epochs: 7 | Train Loss:  0.132 | Train Accuracy:  0.397 | Val Loss:  1.068 | Val Accuracy:  0.341
Epochs: 8 | Train Loss:  0.124 | Train Accuracy:  0.442 | Val Loss:  1.047 | Val Accuracy:  0.335
Epochs: 9 | Train Loss:  0.118 | Train Accuracy:  0.485 | Val Loss:  1.022 | Val Accuracy:  0.376
Epochs: 10 | Train Loss:  0.111 | Train Accuracy:  0.531 | Val Loss:  1.010 | Val Accuracy:  0.376
Epochs: 11 | Train 

In [None]:
# Print the accuracy of the model trained above on the held-out test split.
evaluate(model, df_test)

Test Accuracy:  0.586


In [None]:
# Save the fine-tuned model. torch.save(model, ...) pickles the entire module object,
# not just a state_dict of weights, so loading it back needs the BertClassifier class defined.
# NOTE(review): the file is written by torch.save despite the '.h5' extension.
torch.save(model, 'BERT_pretrained_v1.h5')

#### Keep training the model for more epochs

In [None]:
# Reload the checkpoint written earlier and continue fine-tuning for 50 more epochs.
# - weights_only=False: the file holds a whole pickled module rather than a state_dict;
#   recent PyTorch releases default to weights_only=True and would refuse to load it.
# - map_location: lets a checkpoint saved on GPU be opened on a CPU-only runtime.
# NOTE: this unpickles arbitrary Python objects — only load checkpoint files you created yourself.
model_v2 = torch.load(
    '/content/BERT_pretrained_v1.h5',
    map_location='cuda' if torch.cuda.is_available() else 'cpu',
    weights_only=False,
)
LR = 1e-6

train(model_v2, df_train, df_val, LR, epochs = 50, batch_size = 15)

Epochs: 1 | Train Loss:  0.056 | Train Accuracy:  0.831 | Val Loss:  0.825 | Val Accuracy:  0.514
Epochs: 2 | Train Loss:  0.053 | Train Accuracy:  0.839 | Val Loss:  0.829 | Val Accuracy:  0.509
Epochs: 3 | Train Loss:  0.050 | Train Accuracy:  0.843 | Val Loss:  0.817 | Val Accuracy:  0.514
Epochs: 4 | Train Loss:  0.048 | Train Accuracy:  0.857 | Val Loss:  0.822 | Val Accuracy:  0.514
Epochs: 5 | Train Loss:  0.045 | Train Accuracy:  0.862 | Val Loss:  0.814 | Val Accuracy:  0.520
Epochs: 6 | Train Loss:  0.043 | Train Accuracy:  0.865 | Val Loss:  0.819 | Val Accuracy:  0.532
Epochs: 7 | Train Loss:  0.041 | Train Accuracy:  0.874 | Val Loss:  0.822 | Val Accuracy:  0.520
Epochs: 8 | Train Loss:  0.039 | Train Accuracy:  0.880 | Val Loss:  0.812 | Val Accuracy:  0.538
Epochs: 9 | Train Loss:  0.037 | Train Accuracy:  0.886 | Val Loss:  0.821 | Val Accuracy:  0.549
Epochs: 10 | Train Loss:  0.036 | Train Accuracy:  0.895 | Val Loss:  0.823 | Val Accuracy:  0.555
Epochs: 11 | Train 

In [None]:
# Print the accuracy of the further-trained model_v2 on the same held-out test split.
evaluate(model_v2, df_test)

Test Accuracy:  0.575


In [None]:
# Save a model object to disk (the whole pickled module, as with the v1 checkpoint).
# NOTE(review): this saves `model`, not `model_v2`, which is the object trained in the
# preceding cells — confirm which one is meant to be stored under the 'v0' name.
torch.save(model, 'BERT_pretrained_v0.h5')

In [None]:
  # test = Dataset(test_data)

  # test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

  # use_cuda = torch.cuda.is_available()
  # device = torch.device("cuda" if use_cuda else "cpu")

  # if use_cuda:

  #     model = model.cuda()

  # total_acc_test = 0
  # with torch.no_grad():

  #     for test_input, test_label in test_dataloader:

  #           test_label = test_label.to(device)
  #           mask = test_input['attention_mask'].to(device)
  #           input_id = test_input['input_ids'].squeeze(1).to(device)

  #           output = model(input_id, mask)

  #           acc = (output.argmax(dim=1) == test_label).sum().item()
  #           total_acc_test += acc