# 4 BERT

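BERT (Bidirectional Encoder Representations from Transformers) is a machine learning model built on the Transformer architecture, whose attention layers learn contextual relations between the words of a text. In this section we fine-tune the pretrained `bert-base-uncased` model for multi-label text classification over 50 categories.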

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive
# Mount Google Drive so the training CSV stored under /content/drive/MyDrive is readable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [None]:
# Read the training set and keep only the document text plus its category ids,
# exposing the `cat_id_all` column under the name `labels`.
data = (
    pd.read_csv('/content/drive/MyDrive/DataSolveUS_BIA667_Project/data/train_data.csv')
    .assign(labels=lambda frame: frame['cat_id_all'])
    .loc[:, ["text", "labels"]]
)

data

Unnamed: 0,text,labels
0,consent matter solium services llc solium serv...,203764182442
1,alberta warns investors top investment risks y...,23259
2,exempt dealer agrees settlement alberta asc co...,3412029437824
3,canadian regulators announces consultation acc...,1740243029
4,csa consultation paper consideration access eq...,1721304029
...,...,...
9854,difficulty repaying loans good track record co...,921
9855,russia restricted foreign blocked internationa...,132114834291121
9856,application pecuniary administrative sanction ...,30174423
9857,investor consultation investor service hotline...,1346


In [None]:
labels_num = 50

# Multi-hot target matrix: one row per document, one column per category id.
labels = torch.zeros((len(data), labels_num))
for row, raw_ids in enumerate(data['labels']):
    # Each entry is a comma-separated string of integer category ids.
    active = [int(token) for token in raw_ids.split(',')]
    # Switch on every listed category for this document in a single indexed write.
    labels[row, active] = 1

print('Shape of label tensor:', labels.shape)
print(labels[0:5])

Shape of label tensor: torch.Size([9859, 50])
tensor([[0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0

In [None]:
# Pull the text column out as an array so it can be split alongside the label tensor.
text = data['text'].values

In [None]:
from sklearn.model_selection import train_test_split

# `random.seed()` returns None, so passing its result as `random_state` left the
# split unseeded. Seed Python's RNG separately and hand the integer seed itself to
# scikit-learn so the same 90/10 partition is produced on every run.
SEED = 2022
random.seed(SEED)

X_train, X_test, Y_train, Y_test = train_test_split(text, labels, test_size=0.10, random_state=SEED)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(8873,) torch.Size([8873, 50])
(986,) torch.Size([986, 50])


In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that the BERT model class loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from torch.utils.data import Dataset, DataLoader

class BERT_Dataset(Dataset):
    """Map-style dataset that tokenizes one document per item for BERT.

    Args:
        text: indexable collection of documents (each is coerced with ``str``).
        labels: indexable collection of targets, aligned with ``text``.
        tokenizer: callable tokenizer returning ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` for a single text.
        max_len: fixed sequence length; inputs are truncated or padded to it.
    """

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of documents."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize document ``index`` and return its tensors with the target.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (``torch.long``
            tensors built from the tokenizer output) and ``targets`` (the entry
            of ``labels`` at ``index``, returned unchanged).
        """
        # Collapse any run of whitespace (newlines, tabs, repeated spaces) to one space.
        text = " ".join(str(self.text[index]).split())

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments for a single text.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': self.targets[index]
        }

In [None]:
# Fixed token length per document: BERT_Dataset truncates longer texts and pads shorter ones.
MAX_LEN = 200

train_dataset = BERT_Dataset(text=X_train, labels=Y_train, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = BERT_Dataset(text=X_test, labels=Y_test, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
import transformers
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BERT(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    The head emits one raw logit per label (no sigmoid is applied), so the
    output is meant to be paired with a logits-based loss.

    Args:
        num_labels: number of output logits. Defaults to 50.
        dropout: dropout probability applied before the linear head.
            Defaults to 0.2.
    """

    def __init__(self, num_labels=50, dropout=0.2):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding 768,
        # so the layer always matches the loaded checkpoint.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the per-label logits for a batch of tokenized inputs."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled sequence representation per the transformers
        # BertModel API) feeds the classification head.
        _, pooled = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict= False)
        return self.l3(self.l2(pooled))

# Build the classifier and move it to the selected device in one step
# (`Module.to` returns the module itself), then display its architecture.
model = BERT().to(device)
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
         

In [None]:
! pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from torchmetrics.classification import MultilabelF1Score
# Multi-label F1 metric over the 50 labels, placed on the same device as the model.
# train_model reads this module-level object to score each batch.
metric = MultilabelF1Score(num_labels=50).to(device)

def train_model(model, train_dataset, test_dataset, device, lr=0.0001, epochs=20, batch_size= 256, metric_fn=None):
    """Train ``model`` on a multi-label task and evaluate it after every epoch.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that
            returns raw logits with the same shape as the targets.
        train_dataset: dataset whose items are dicts with the keys ``ids``,
            ``mask``, ``token_type_ids`` and ``targets``.
        test_dataset: held-out dataset with the same item layout.
        device: torch device on which the model and batches are placed.
        lr: learning rate for the Adam optimizer.
        epochs: number of passes over ``train_dataset``.
        batch_size: batch size for both data loaders.
        metric_fn: callable ``(probabilities, integer_targets) -> scalar tensor``
            used to score each batch. Defaults to the module-level ``metric``.

    Returns:
        dict with the keys ``train_loss``, ``train_acc``, ``test_loss`` and
        ``test_acc``; each maps to a list with one entry per epoch. The
        ``*_acc`` entries hold the mean per-batch value of the metric (F1 with
        the default metric), not plain accuracy.
    """
    score_fn = metric if metric_fn is None else metric_fn

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation does not benefit from shuffling; keep the order deterministic.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = model.to(device)

    history = {'train_loss': [],
               'train_acc': [],
               'test_loss': [],
               'test_acc': []}

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def _forward_batch(batch):
        """Run one batch through the model; return (loss, metric score)."""
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device)

        logits = model(ids, mask, token_type_ids)
        loss = criterion(logits, targets)
        # The model emits raw logits. Rounding them (as was done before) does not
        # give 0/1 predictions; convert to probabilities first and let the metric
        # apply its own decision threshold. Detach so scoring never holds the graph.
        probabilities = torch.sigmoid(logits.detach())
        score = score_fn(probabilities, targets.long())
        return loss, score

    print('Training Start')
    for epoch in range(epochs):
        train_loss = 0
        train_acc = 0
        test_loss = 0
        test_acc = 0

        model.train()
        for batch in train_loader:
            cur_train_loss, cur_train_acc = _forward_batch(batch)
            # Clear gradients before backward so leftovers from an earlier
            # (possibly interrupted) run cannot leak into this update.
            optimizer.zero_grad()
            cur_train_loss.backward()
            optimizer.step()
            # Accumulate a detached value so the running sum carries no autograd history.
            train_loss += cur_train_loss.detach()
            train_acc += cur_train_acc

        model.eval()
        with torch.no_grad():
            for batch in test_loader:
                cur_test_loss, cur_test_acc = _forward_batch(batch)
                test_loss += cur_test_loss
                test_acc += cur_test_acc

        # Average the per-batch values over the epoch.
        train_loss = (train_loss / len(train_loader)).item()
        train_acc = train_acc / len(train_loader)
        val_loss = (test_loss / len(test_loader)).item()
        val_acc = test_acc / len(test_loader)
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc.cpu().numpy())
        history['test_loss'].append(val_loss)
        history['test_acc'].append(val_acc.cpu().numpy())
        print(
            f"Epoch:{epoch + 1} / {epochs}, train loss:{train_loss:.4f} train_acc:{train_acc:.4f}, valid loss:{val_loss:.4f} valid acc:{val_acc:.5f}")

    return history

# Fine-tune for up to 100 epochs with batches of 128 (overriding the function
# defaults of 20 epochs and batch size 256) and keep the per-epoch history.
history = train_model(model=model,
                      train_dataset=train_dataset,
                      test_dataset=test_dataset,
                      device=device,
                      lr=0.0001,
                      epochs= 100,
                      batch_size= 128)

Training Start
Epoch:1 / 100, train loss:0.3613 train_acc:0.0013, valid loss:0.2955 valid acc:0.00000
Epoch:2 / 100, train loss:0.2782 train_acc:0.0050, valid loss:0.2552 valid acc:0.10678
Epoch:3 / 100, train loss:0.2325 train_acc:0.1106, valid loss:0.2158 valid acc:0.15342
Epoch:4 / 100, train loss:0.2011 train_acc:0.2580, valid loss:0.1928 valid acc:0.29363
Epoch:5 / 100, train loss:0.1770 train_acc:0.3670, valid loss:0.1766 valid acc:0.36472
Epoch:6 / 100, train loss:0.1566 train_acc:0.4569, valid loss:0.1657 valid acc:0.41472
Epoch:7 / 100, train loss:0.1391 train_acc:0.5292, valid loss:0.1547 valid acc:0.49863
Epoch:8 / 100, train loss:0.1232 train_acc:0.5949, valid loss:0.1483 valid acc:0.56109
Epoch:9 / 100, train loss:0.1090 train_acc:0.6534, valid loss:0.1413 valid acc:0.60078
Epoch:10 / 100, train loss:0.0966 train_acc:0.7002, valid loss:0.1358 valid acc:0.63793
Epoch:11 / 100, train loss:0.0854 train_acc:0.7436, valid loss:0.1312 valid acc:0.66291
Epoch:12 / 100, train loss

KeyboardInterrupt: ignored