|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>Fine-tuning BERT for classification</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

# Importing the IMDB dataset

In [None]:
# One-time setup: uncomment the pip line below and run it, then restart the python session
# (and then comment the pip line out again before re-running this cell).
# !pip install -U datasets huggingface_hub fsspec
from datasets import load_dataset, DatasetDict
# load the 'imdb' dataset; `dataset` is indexed by split name ('train', 'test') further below
dataset = load_dataset('imdb')

# And back to our regularly scheduled program

In [None]:
# typical python libraries
import numpy as np
import matplotlib.pyplot as plt

# pytorch libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# huggingface libraries
from transformers import BertModel, BertTokenizer

In [None]:
# use the first GPU when one is available; otherwise fall back to the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# pretrained BERT: its tokenizer, and the model itself (moved onto the chosen device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert = bert.to(device)

In [None]:
# display the module tree of the pretrained model
bert # note the shape of the final layer

In [None]:
# dropout probability used in BERT's embeddings block (reused for the new classification head below)
bert.embeddings.dropout.p

In [None]:
# size of the tokenizer's vocabulary
tokenizer.vocab_size

# More on model inputs and outputs

In [None]:
# tokenize a short example sentence, returning pytorch tensors ('pt') placed on the same device as the model
text = 'Replace me with any text you like.'
tokens = tokenizer(text, return_tensors='pt').to(device)
# show the tokenizer output (it contains at least 'input_ids' and 'attention_mask', which are used below)
tokens

In [None]:
# get model output using specific inputs
output = bert(
    input_ids      = tokens['input_ids'],
    attention_mask = tokens['attention_mask']
    )


# but there's a better way ;)
# (unpacking passes every entry of the tokenizer output as a keyword argument)
output = bert(**tokens)

In [None]:
# list the attributes and methods available on the model output
dir(output)

In [None]:
# shape of the final hidden states (presumably batch x tokens x embedding dimension -- confirm against the printed shape)
output['last_hidden_state'].shape

In [None]:
# shape of the pooled output (this is the tensor that the classification head below takes as input)
output['pooler_output'].shape

In [None]:
# can we get text output?
# This call is expected to fail: BertModel is a bare encoder, not a text-generation model.
# The error is caught and printed so the demonstration is preserved while the
# notebook can still be run top-to-bottom without stopping here.
try:
  generated = bert.generate(tokens, max_length=100, do_sample=True).cpu()
  print(generated)
except Exception as err:
  print(f'{type(err).__name__}: {err}')

# Create an LLM model using pretrained BERT with a new head

In [None]:
class BertForBinaryClassification(nn.Module):
  """Pretrained BERT ('bert-base-uncased') with a new linear classification head.

  The head maps BERT's pooled output to `num_labels` logits. Dropout (using the
  same probability as BERT's embeddings dropout) is applied to the pooled output
  before the head.

  Args:
    num_labels: number of output categories (default 2, i.e. binary classification).
  """

  def __init__(self, num_labels=2):
    super().__init__()

    # Load the pre-trained BERT model.
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # classification head that converts the pooled output (hidden_size-d; 768 for
    # bert-base) into `num_labels` final outputs. The sizes are read from the model
    # config and the constructor argument instead of being hard-coded.
    self.num_labels = num_labels
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
    self.dropout = nn.Dropout(self.bert.embeddings.dropout.p) # 10%

    # initialize the weights and biases of the new head
    nn.init.xavier_uniform_(self.classifier.weight)
    nn.init.zeros_(self.classifier.bias)


  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    """Return the classification logits for a batch of token sequences.

    Args:
      input_ids: token indices, passed through to BERT.
      attention_mask: optional mask passed through to BERT.
      token_type_ids: optional segment indices passed through to BERT.

    Returns:
      The output of the linear head applied to the (dropout-regularized) pooled
      BERT output; its last dimension has `num_labels` entries.
    """

    # forward pass through the downloaded (pretrained) BERT
    outputs = self.bert(
      input_ids      = input_ids,
      attention_mask = attention_mask,
      token_type_ids = token_type_ids)

    # extract the pooled output and apply dropout
    pooled_output = self.dropout( outputs.pooler_output )

    # final push through the classification layer.
    logits = self.classifier(pooled_output)
    return logits


In [None]:
# create an instance of the model and test it
model = BertForBinaryClassification().to(device)

# test the output: push the example sentence through the model
# (re-tokenized here, so `tokens` is overwritten with the same content as before)
tokens = tokenizer(text, return_tensors='pt').to(device)
out = model(**tokens)
# the logits produced by the new classification head
out

# Explore the imported dataset

In [None]:
# check out the dataset (its splits and their contents)
dataset

In [None]:
# look at one training sample (it has a 'text' field and a 'label' field, both used below)
dataset['train'][2000]

In [None]:
# one dot per training review: its label plus a little random vertical jitter,
# so that the dots spread out instead of collapsing onto two lines
plt.figure(figsize=(10,3.5))
plt.plot(
    dataset['train']['label'] + np.random.randn(len(dataset['train']))/20,
    'm.', markersize=1, alpha=.2)

plt.gca().set(
    xlabel='Review index', ylabel='Label',
    yticks=[0,1], yticklabels=['Negative','Positive'],
    xlim=[0,len(dataset['train'])], ylim=[-.5,1.5])
plt.show()

# It's a lot of data; let's take a small sample

In [None]:
# preview of .select(): keep only the training samples at indices 0..99 (result is displayed, not stored)
dataset['train'].select(range(100))

In [None]:
# reduce the dataset size while:
# 1) including both categories (see plot above and range() below)
# 2) preserving only 'train' and 'test'
# 3) using a DatasetDict (not a Python dict) to preserve methods
small_data = DatasetDict({split:dataset[split].select(range(10000,15000)) for split in ['train','test']})
small_data

In [None]:
# sanity check on the reduced training set: both label categories should still appear
# (label plus a little random vertical jitter, one dot per review)
plt.figure(figsize=(10,3.5))
plt.plot(
    small_data['train']['label'] + np.random.randn(len(small_data['train']))/20,
    'm.', markersize=1)

plt.gca().set(
    xlabel='Review index', ylabel='Label',
    yticks=[0,1], yticklabels=['Negative','Positive'],
    xlim=[0,len(small_data['train'])], ylim=[-.5,1.5])
plt.show()

# Tokenizing the text with padding

In [None]:
# tokenizing the raw review text as-is works...
first_try = tokenizer(dataset['train'][0]['text'])

# ...but a fixed length is better b/c reviews have differing lengths
better = tokenizer(
    dataset['train'][0]['text'], # the text to tokenize
    truncation = True,           # cut out tokens beyond max_length
    padding    = 'max_length',   # fill with pad_token up to max_length
    max_length = 512)            # the fixed sequence length (in tokens)


# compare the number of tokens and the token ids of the two versions
print(f"'Naive' tokenization (N={len(first_try['input_ids'])}):")
print(first_try['input_ids'])

print(f"\nBetter tokenization (N={len(better['input_ids'])}):")
print(better['input_ids'])

In [None]:
# tokenization function that is applied to the dataset via .map()
def tokenize_function(one_sample):
  """Tokenize the 'text' entry of the input to a fixed length of 512 tokens.

  Shorter sequences are padded up to 512 tokens and longer ones are truncated.
  Note that .map() is called with batched=True below, so despite the parameter
  name, `one_sample['text']` holds a batch of reviews rather than a single one.
  """
  texts = one_sample['text']
  return tokenizer(texts, truncation=True, padding='max_length', max_length=512)


# run the tokenizer over every split of the reduced dataset (batched for efficiency)
tokenized_dataset = small_data.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# remove the raw 'text' column (the model only needs the tokenized version)
# NOTE(review): re-running this cell presumably fails because 'text' is already gone -- re-run the .map() cell first
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

# change format to pytorch tensors, restricted to the columns used for training
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# create DataLoaders for training and testing
# (only the training data are shuffled; both use batches of 32 samples)
train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, batch_size=32)
test_dataloader  = DataLoader(tokenized_dataset['test'], batch_size=32)

In [None]:
# check dataset structure again (after removing 'text' and switching to torch format)
print(tokenized_dataset,'\n\n')

# optional: uncomment to inspect one tokenized training sample
# tokenized_dataset['train'][1000]

In [None]:
# and check out a dataloader iteration
# (one batch, containing the 'input_ids', 'attention_mask', and 'label' entries)
X = next(iter(train_dataloader))
X

# Now to fine-tune the model

In [None]:
# optimizer and loss function
# (the optimizer is given all model parameters: the pretrained BERT weights and the new head)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-5)
loss_fun = nn.CrossEntropyLoss() # (cross-entropy loss for multi-class classification)

In [None]:
# put the whole model into training mode
# (from_pretrained() returns BERT in eval mode, which keeps BERT's internal dropout
#  switched off until .train() is called)
model.train()

# get a batch of data
batch = next(iter(train_dataloader))

# and move it to the GPU (or whichever device the model is on)
tokenz  = batch['input_ids'].to(device)
att_msk = batch['attention_mask'].to(device)
labels  = batch['label'].to(device)

# clear the previous gradients
optimizer.zero_grad()

# forward pass and get model predictions (the category with the largest logit)
logits = model(tokenz, attention_mask=att_msk)
predLabels = torch.argmax(logits, dim=1)

# calculate and store loss + average accuracy
# (the mean is taken over the samples actually in the batch, so a smaller final
#  batch is scored correctly; dividing by the dataloader's batch_size would not be)
loss = loss_fun(logits, labels)
train_accuracy = (predLabels == labels).float().mean().item()

# backward pass
loss.backward()

# update the weights (there is no learning-rate scheduler here, so the learning rate stays fixed)
optimizer.step()

In [None]:
# inspect the raw model outputs from the training step above (one row per sample, one column per category)
print(f'Logits are of size {logits.shape} and are:\n',logits)

In [None]:
# compare the predicted categories (argmax of the logits) against the true labels for this batch
print('Model predictions:',predLabels)
print('True labels:',labels)

In [None]:
# proportion of correct predictions, averaged over the samples actually in this batch
# (dividing by the dataloader's nominal batch_size would be wrong for a smaller final batch)
accuracy = (predLabels==labels).float().mean().item()

print(f'Accuracy is {100*accuracy:.1f}%')