<a href="https://colab.research.google.com/github/Mostofa-Najmus-Sakib/NLP-works/blob/main/Assisgnment_8/A8_BERT_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### BERT fine-tuning for document classification

In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |▏                               | 10kB 22.8MB/s eta 0:00:01[K     |▎                               | 20kB 29.7MB/s eta 0:00:01[K     |▌                               | 30kB 33.3MB/s eta 0:00:01[K     |▋                               | 40kB 36.4MB/s eta 0:00:01[K     |▉                               | 51kB 34.4MB/s eta 0:00:01[K     |█                               | 61kB 36.6MB/s eta 0:00:01[K     |█▏                              | 71kB 22.8MB/s eta 0:00:01[K     |█▎                              | 81kB 21.4MB/s eta 0:00:01[K     |█▌                              | 92kB 23.0MB/s eta 0:00:01[K     |█▋                              | 102kB 21.6MB/s eta 0:00:01[K     |█▉                              | 112kB 21.6MB/s eta 0:00:01[K     |██                              | 

In [None]:
import os
import re
import numpy as np 
from sklearn.metrics import accuracy_score

import transformers
from transformers import BertTokenizer, BertModel

import torch
from torch import cuda
# `tqdm.tqdm_notebook` is deprecated and warns on every progress bar;
# `tqdm.notebook.tqdm` is the same widget under its supported name.
from tqdm.notebook import tqdm

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cuda'

In [None]:
# import tensorflow as tf
# # sess = tf.InteractiveSession()
# sess=tf.compat.v1.InteractiveSession()

- use X.txt and YL1.txt 

In [None]:
# Read the inputs and their integer labels line by line. The context managers
# make sure both file handles are closed once the lists are built.
with open('X.txt') as text_file:
    X = [line.strip() for line in text_file]

with open('YL1.txt') as label_file:
    # `train_data` is kept as a second name for the same list, as before.
    y = train_data = [int(line.strip()) for line in label_file]

len(X), len(y), max(y)

(46985, 46985, 6)

In [None]:
# Only a cell's last expression is displayed, so show both values as one tuple.
type(y), y[1]

5

### An easy train/test split

In [None]:
# Positional split: the first TRAIN_SIZE examples are used for training and
# everything after them is held out for testing (no shuffling is applied).
TRAIN_SIZE = 46000

# Texts and labels are matched by position, so they must be the same length.
assert len(X) == len(y), "X and y must contain the same number of examples"

train_X, test_X = X[:TRAIN_SIZE], X[TRAIN_SIZE:]
train_y = np.array(y[:TRAIN_SIZE])
test_y = np.array(y[TRAIN_SIZE:])

len(train_X), len(train_y), len(test_X), len(test_y)

(46000, 46000, 985, 985)

In [None]:
# Human-readable names for the integer class ids. Training and evaluation do
# not use this mapping; it is only handy for inspecting individual examples.
# The position of each name in the list is its class id.
labels = dict(enumerate([
    'Computer Science',
    'Electrical Engineering',
    'Psychology',
    'Mechanical Engineering',
    'Civil Engineering',
    'Medical Science',
    'Biochemistry',
]))

len(labels)

7

### Fine-tune BERT on the dataset

## Torch Datasets
- takes in inputs and outputs/labels
- interfaces with tokenizer
- handles batching

In [None]:
class MultiLabelDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item and pairs it with its target.

    Args:
        text: indexable collection of raw strings.
        labels: indexable collection of targets, aligned with ``text`` by position.
        tokenizer: object exposing ``encode_plus`` (for example a ``BertTokenizer``).
        max_len: length every example is padded or truncated to.
    """

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples, taken from ``text``."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize example ``index`` and return it as a dict of tensors.

        Returns:
            dict with ``'ids'``, ``'mask'`` and ``'token_type_ids'`` (all
            ``torch.long``) and ``'targets'`` (``torch.float``).
        """
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated `pad_to_max_length=True`: pad every
            # example up to `max_length` so items can be stacked into a batch.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

## Bert Class
- first "layer" is a pre-trained BERT model
- you can add whatever layers you want after that

In [None]:
class BERTClass(torch.nn.Module):
    """Pre-trained BERT encoder followed by one linear classification layer.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``torch.nn.CrossEntropyLoss`` applies log-softmax itself, so passing it
    already-softmaxed values squashes the scores twice and keeps the loss from
    dropping. Taking the argmax of the logits gives the same predicted class
    as taking it after a softmax.

    Args:
        NUM_OUT: number of output classes.
    """

    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()

        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        # 768 is the width of the encoder output fed to the classifier.
        # Extra layers (a hidden Linear, Tanh, Dropout, ...) could be inserted
        # between the encoder and this layer.
        self.classifier = torch.nn.Linear(768, NUM_OUT)
        # Not applied in forward(); kept so callers can turn the logits into
        # probabilities when they need them: `model.softmax(logits)`.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one row of ``NUM_OUT`` logits per example in the batch."""
        output_1 = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output: the per-position hidden states.
        hidden_state = output_1[0]
        # Keep only position 0 of every sequence as the sequence summary.
        pooler = hidden_state[:, 0]
        return self.classifier(pooler)

### Helpful Functions

Loss

- This task has 7 mutually exclusive classes, so it uses categorical cross-entropy loss (`torch.nn.CrossEntropyLoss`)
- A task with only two labels would use binary cross-entropy instead
- Tasks that don't have labels, but rather have distributions, should use KL divergence
- Tasks that have neither labels nor distributions, but continuous (real-valued) targets, should use something like RMSE loss

Train

- Steps through the data batch by batch
- grabs ids, masks, and token_type_ids which are required inputs for BERT
- inputs are passed through the model, compared to targets, computes loss function, backprops

Validation

- Takes a model, passes inputs
- Returns the targets it collected together with the outputs; score against these (not `test_y`), because the loader may have shuffled the examples!

In [None]:
def loss_fn(outputs, targets):
    """Mean categorical cross-entropy between class scores and class indices.

    Args:
        outputs: float tensor with one row of class scores per example;
            log-softmax is applied to it as part of the loss.
        targets: ``torch.long`` tensor holding one class index per example.

    Returns:
        0-dim tensor holding the loss averaged over the batch.
    """
    return torch.nn.functional.cross_entropy(outputs, targets)

def train(model, training_loader, optimizer):
    """Run one pass over ``training_loader``, updating ``model`` after every batch.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        training_loader: iterable of batches; each batch is a dict with the
            keys ``'ids'``, ``'mask'``, ``'token_type_ids'`` and ``'targets'``.
        optimizer: optimizer holding the parameters of ``model``.

    Returns:
        The loss of the last batch as a detached 0-dim tensor, so holding on
        to it does not keep the autograd graph alive.

    Raises:
        ValueError: if ``training_loader`` yields no batches.
    """
    model.train()
    loss = None
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # The targets are cast to long here before being passed to loss_fn.
        targets = data['targets'].to(device, dtype=torch.long)

        # Clear the gradients left over from the previous batch (once is enough).
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    if loss is None:
        raise ValueError("training_loader produced no batches")
    return loss.detach()
    
def validation(model, testing_loader):
    """Score every batch of ``testing_loader`` with ``model`` in eval mode.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: iterable of batches; each batch is a dict with the
            keys ``'ids'``, ``'mask'``, ``'token_type_ids'`` and ``'targets'``.

    Returns:
        ``(outputs, targets)``: the sigmoid of the model outputs (moved to the
        CPU) and the targets, both concatenated over all batches in the order
        the loader produced them.
    """
    model.eval()
    score_batches = []
    target_batches = []
    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in tqdm(testing_loader):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            scores = torch.sigmoid(model(input_ids, attention_mask, segment_ids))
            score_batches.append(scores.cpu().detach())
            # Targets are taken from the same batch so they stay aligned with the scores.
            target_batches.append(batch['targets'])
    return torch.cat(score_batches), torch.cat(target_batches)

### The Tokenizer

- Converts a raw string to the ids, masks, and token_type_ids

In [None]:
# train_data
# train_X
# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
# Load the tokenizer published under the same 'bert-base-uncased' name that
# BERTClass loads its encoder weights from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# What does the tokenizer do? Uncomment the lines below to inspect the
# encoding of a single example (pass one string, not the whole train_X list).
# print(train_X[0])

# tokenizer.encode_plus(
#             train_X[0],
#             None,
#             add_special_tokens=True,
#             max_length=128,
#             pad_to_max_length=True,
#             truncation=True,
#             return_token_type_ids=True
#         )

### Training setup

- hyperparameters
- setup dataset
- setup parameters
- setup dataloader

In [None]:
# test_X

In [None]:
# Hyperparameters
MAX_LEN = 128          # every example is padded/truncated to this many tokens
BATCH_SIZE = 64
EPOCHS = 3
NUM_OUT = 7            # number of classes (label ids 0-6)
LEARNING_RATE = 2e-05

# Datasets: raw texts plus their labels as tensors.
training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

# Training batches are reshuffled every epoch.
train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
test_params = {'batch_size': BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

### Train,  Evaluate

- model.to -> send to GPU, if available (anything computed should be put onto the GPU)
- setup optimizer - could use Stochastic Gradient Descent, but ADAM tends to work better
- for each epoch, train, show the loss, evaluate on the test data

In [None]:
model = BERTClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    # One pass over the training data; `loss` is what train() returns for the epoch.
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

    # Score the held-out data after every epoch.
    guess, targs = validation(model, testing_loader)
    # Predicted class = index of the highest score in each row.
    final_guess = guess.argmax(dim=1).tolist()
    targs_list = targs.tolist()

    # accuracy_score takes the true labels first, then the predictions.
    print('accurracy on test set {}'.format(accuracy_score(targs_list, final_guess)))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=719.0), HTML(value='')))




Epoch: 0, Loss:  1.3117804527282715


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


accurracy on test set 0.798984771573604


HBox(children=(FloatProgress(value=0.0, max=719.0), HTML(value='')))


Epoch: 1, Loss:  1.3079255819320679


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


accurracy on test set 0.8182741116751269


HBox(children=(FloatProgress(value=0.0, max=719.0), HTML(value='')))


Epoch: 2, Loss:  1.3097169399261475


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


accurracy on test set 0.8263959390862944
