<a href="https://colab.research.google.com/github/IfeanyiEmeagi/fine-tune-llm-for-classification-task/blob/main/Finetune_Bert_model_for__sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from urllib.request import urlretrieve
from os.path import basename, exists
from typing import Tuple
import time
from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def download(url: str)-> None:
  """The function downloads the dataset from the provided url
  Parameters:
  Input: str (url)

  Return: None
  """
  filename = basename(url)
  if not exists(filename):
    local, _ = urlretrieve(url, filename)
    print("Downloaded " + str(local))
  else:
    print("File already exists...")

  return
download('https://raw.githubusercontent.com/IfeanyiEmeagi/Sentiment-Analysis-CETM47/main/CETM47-22_23-AS2-Data.json')

Downloaded CETM47-22_23-AS2-Data.json


In [3]:
#Load the downloaded dataset
def load(file_path: str) -> pd.DataFrame:
  """The function loads the dataset into memory and print out a few information.
  Parameters:
  Input: str (file path)

  Return: pd.DataFrame
  """
  data = pd.read_json(file_path)

  print(f'The dataset contains {data.shape[0]} rows and {data.shape[1]} columns.\n')
  print(f'Its data types are as follows:')
  print(f'{data.dtypes}\n')
  print(f'The data loaded successfully!')

  return data

file_path = 'CETM47-22_23-AS2-Data.json'
data = load(file_path)

The dataset contains 6443 rows and 5 columns.

Its data types are as follows:
text                  object
date          datetime64[ns]
label                  int64
id                     int64
label_name            object
dtype: object

The data loaded successfully!


In [4]:
def clean(data: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
  """The function cleans the dataset.
  Parameters:
  Input: pd.DataFrame

  Return: Tuple(pd.DataFrame, dict)
  """
  import re

  label_map = {data[data["label_name"] == label]['label'].iloc[0]: label for label in data['label_name'].unique()}

  #drop the date, label_name, id columns
  df = data.drop(columns=['date', 'id', 'label_name'], axis='columns')

  df['text'] = df['text'].replace(r'\s+', ' ', regex=True) #remove extra whitespaces
  df['text'] = df['text'].replace(r'\W', ' ', regex=True)  #remove special characters
  df['text'] = df['text'].replace(r'\[^a-zA-Z\s]', '', regex=True)  #remove all non-alphabetical characters
  df['text'] = df['text'].apply(lambda text: ' '.join(word for word in text.split() if len(word)> 2)) #remove all the double letter word

  df['text'] = df['text'].str.lower()

  return df, label_map


df, label_map = clean(data)

In [5]:
#Split the data into train, validation and test set. Ensure they are stratified using the label


def stratified_split(data:pd.DataFrame, stratify_label:str, train_size:float, validation_size:float)->pd.DataFrame:
  """The function split the dataset and stratify it based on the defined label.
  Parameters:
  Input: data:pd.DataFrame, stratify_label_str, train_size:float, validation_size:float

  Return: stratified_train: pd.DataFrame, stratified_validation: pd.DataFrame, stratified_test: pd.DataFrame

  """
  stratified_train = pd.DataFrame()
  stratified_validation = pd.DataFrame()
  stratified_test = pd.DataFrame()

  #stratify split based on the unique labels in the stratify label column
  for value in data[stratify_label].unique():
    data_class = data[data[stratify_label] == value]

    train_end = int(len(data_class) * train_size)
    validation_end = train_end + int(len(data_class) * validation_size)

    #slice the dataframe
    stratified_train = pd.concat([stratified_train, data_class[:train_end]], axis='rows')
    stratified_validation = pd.concat([stratified_validation, data_class[train_end:validation_end]], axis='rows')
    stratified_test = pd.concat([stratified_test, data_class[validation_end:]], axis = 'rows')

  #Shuffle the data
  stratified_train = stratified_train.sample(frac=1, random_state = 42).reset_index(drop=True)
  stratified_validation = stratified_validation.sample(frac=1, random_state=42).reset_index(drop=True)
  stratified_test = stratified_test.sample(frac=1, random_state = 42).reset_index(drop=True)

  #Save to disk storage
  stratified_train.to_csv('stratified_train.csv', index=False)
  stratified_validation.to_csv('stratified_validation.csv', index=False)
  stratified_test.to_csv('stratified_test.csv', index=False)
  print('Data stratified and saved to the disk.')

  return stratified_train, stratified_validation, stratified_test


In [6]:
#Prepare the cleaned dataset
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class SentimentDataset2(Dataset):
  def __init__(self, csv_file, tokenizer, max_length=None):
    self.data = pd.read_csv(csv_file)
    self.tokenizer = tokenizer
    if max_length is None:
      self.max_length = self._longest_encoded_length(tokenizer)
    else:
      self.max_length = max_length

  def __getitem__(self, index):
    words = self.data['text'][index]
    inputs = self.tokenizer.encode_plus(
        words,
        None,
        add_special_tokens=True,
        max_length = self.max_length,
        pad_to_max_length = True,
        return_token_type_ids = True,
        truncation = True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    label = int(self.data['label'][index])
    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'label': torch.tensor(label, dtype=torch.long)
        }

  def __len__(self):
    return len(self.data)

  def _longest_encoded_length(self, tokenizer):
    max_length = 0
    for text in self.data['text']:
      encoded_length = len(tokenizer.encode(text))
      if encoded_length > max_length:
        max_length = encoded_length
    return max_length

In [7]:
#Build the model

from transformers import DistilBertModel, RobertaModel

class BERTClass(torch.nn.Module):
  def __init__(self, bert_model):
    super(BERTClass, self).__init__()
    if bert_model == 'distilbert':
      self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
      self.hidden_size = 768
    else:
        self.l1 = RobertaModel.from_pretrained("FacebookAI/roberta-large")
        self.hidden_size = 1024
    self.pre_classifier = torch.nn.Linear(self.hidden_size, self.hidden_size)
    self.dropout = torch.nn.Dropout(0.3)
    self.classifier = torch.nn.Linear(self.hidden_size, 6)

  def forward(self, input_ids, attention_mask):
    output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
    hidden_state = output_1[0]
    pooler = hidden_state[:, 0]
    pooler = self.pre_classifier(pooler)
    pooler = torch.nn.ReLU()(pooler)
    pooler = self.dropout(pooler)
    output = self.classifier(pooler)
    return output

In [8]:
#The train function

def batch_loss(ids, mask, targets, loss_function, model):
  outputs = model(ids, mask)
  loss = loss_function(outputs, targets)
  return loss, outputs


def evaluate(dataloader, model):
  data_acc = 0
  data_loss = 0
  correct_predictions = 0
  num_total_predictions = 0
  num_step = 0
  model.eval()
  with torch.no_grad():
    for _, data in enumerate(dataloader):
      ids = data['ids'].to(device, dtype=torch.long)
      mask = data['mask'].to(device, dtype=torch.long)
      targets = data['label'].to(device, dtype=torch.long)

      loss, outputs = batch_loss(ids=ids, mask=mask, targets=targets, loss_function=loss_fn, model=model)
      data_loss += loss.item()
      predicted_label = torch.argmax(outputs, dim=1)
      correct_predictions += (predicted_label == targets).sum().item()
      num_step += 1
      num_total_predictions += targets.size(0)

      if _ % 500 == 0:
        #print(f"Look at {num_step * len(ids)}/{num_total_predictions} samples")
        pass

  loss = data_loss / num_step
  acc = (correct_predictions / num_total_predictions) *100
  #print(f"The {type} loss: {loss}")
  #print(f"The {type} accu: {acc}%.")

  return loss, acc



def train(epoch, model, train_loader, val_loader, loss_fn, optimizer):
  print(f"Epoch: {epoch}\n-----")

  tr_loss = 0
  tr_acc  = 0
  n_correct = 0
  n_samples = 0
  train_step = 0

  model.train()

  for _, data in enumerate(train_loader, 0):
    ids = data['ids'].to(device, dtype=torch.long)
    mask = data['mask'].to(device, dtype=torch.long)
    targets = data['label'].to(device, dtype=torch.long)

    loss, outputs = batch_loss(ids, mask, targets, loss_fn, model)

    tr_loss +=loss.item()

    optimizer.zero_grad() #reset the weight

    loss.backward() #backward propagate the loss

    optimizer.step() #update the weights

    predicted_label = torch.argmax(outputs, dim=1)
    n_correct += (predicted_label == targets).sum().item()
    n_samples += targets.size(0)
    train_step += 1

    num = 400
    if _ % num == 0:
      loss_step = tr_loss / train_step
      acc_step = (n_correct * 100) / n_samples
      print(f"Epoch: {epoch}. Training Loss {loss_step} per {num} steps")
      print(f"Epoch: {epoch}. Training accuracy {acc_step} per {num} steps")

  epoch_loss = tr_loss / train_step
  #print(f"train loss: {epoch_loss}")
  train_acc = (n_correct * 100) / n_samples

  #evaluation
  #train_acc, train_loss = eval(train_loader, model, type="train")
  val_loss, val_acc = evaluate(val_loader, model)

  print(f"Epoch {epoch} Train loss: {epoch_loss} | Train accuracy {train_acc} | Val loss: {val_loss} | Val accuracy: {val_acc}\n")

  return


In [13]:
#Program workflow
#import argparse
from ipywidgets import widgets

if __name__ == "__main__":


  # Create widgets for input
  trainable_layers_widget = widgets.Dropdown(options=['all', 'last_block', 'last_layer'], value='last_block', description='Trainable Layers:')
  bert_model_widget = widgets.Dropdown(options=['distilbert', 'roberta'], value='distilbert', description='BERT Model:')

  # Display widgets
  display(trainable_layers_widget)
  display(bert_model_widget)

  # Get the input values
  trainable_layers = trainable_layers_widget.value
  bert_model = bert_model_widget.value

  #load model and tokenizer
  if bert_model in ['distilbert', 'roberta']:
    if bert_model == 'distilbert':
      model = BERTClass('distilbert')
      tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    else:
      model = BERTClass('roberta')
      tokenizer = AutoTokenizer.from_pretrained('FacebookAI/roberta-large')

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  model.eval()

  #Download the dataset
  url = "https://raw.githubusercontent.com/IfeanyiEmeagi/Sentiment-Analysis-CETM47/main/CETM47-22_23-AS2-Data.json"
  download(url) #download the dataset
  file_path = 'CETM47-22_23-AS2-Data.json'
  data = load(file_path) #load the dataset
  cleaned_df, label_map = clean(data) #clean the dataset

  train_data, val_data, test_data = stratified_split(cleaned_df, "label", 0.7, 0.1) #split the dataset

  #Create Pytorch dataset
  max_length = 512
  train_dataset = SentimentDataset2('stratified_train.csv', tokenizer, max_length=max_length)
  validation_dataset = SentimentDataset2('stratified_validation.csv', tokenizer, max_length=max_length)
  test_dataset = SentimentDataset2('stratified_test.csv', tokenizer, max_length=max_length)

  #create the dataloaders
  num_workers = os.cpu_count()
  batch_size = 8

  train_loader = DataLoader (dataset=train_dataset,
                           batch_size=batch_size,
                           shuffle = True,
                           num_workers = num_workers,
                           drop_last=False)

  validation_loader = DataLoader (dataset= validation_dataset,
                           batch_size=batch_size,
                           shuffle = True,
                           num_workers = num_workers,
                           drop_last=False)

  test_loader = DataLoader (dataset=test_dataset,
                           batch_size=batch_size,
                           shuffle = True,
                           num_workers = num_workers,
                           drop_last=False)

  #Train model
  start_time = time.time()
  torch.manual_seed(42)
  torch.cuda.manual_seed(42)

  optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=0.1)
  loss_fn = torch.nn.CrossEntropyLoss()

  num_epoches = 4
  from tqdm.auto import tqdm
  for epoch in tqdm(range(num_epoches)):
    train(epoch=epoch, model=model, train_loader=train_loader, val_loader=validation_loader, loss_fn=loss_fn, optimizer=optimizer)

  end_time = time.time()
  execution_time_minutes = (end_time - start_time) / 60
  print(f"Training completed in {execution_time_minutes:.3f} minutes.")

  print(f"\nEvaluating on the test dataset")
  _, test_accuracy = evaluate(test_loader, model)

  print(f"Test Accuracy: {test_accuracy:.3f}")





Dropdown(description='Trainable Layers:', index=1, options=('all', 'last_block', 'last_layer'), value='last_bl…

Dropdown(description='BERT Model:', options=('distilbert', 'roberta'), value='distilbert')

File already exists...
The dataset contains 6443 rows and 5 columns.

Its data types are as follows:
text                  object
date          datetime64[ns]
label                  int64
id                     int64
label_name            object
dtype: object

The data loaded successfully!
Data stratified and saved to the disk.


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch: 0
-----
Epoch: 0. Training Loss 1.7731575965881348 per 400 steps
Epoch: 0. Training accuracy 12.5 per 400 steps
Epoch: 0. Training Loss 1.2711356184131783 per 400 steps
Epoch: 0. Training accuracy 57.231920199501246 per 400 steps
Epoch 0 Train loss: 1.1085853487226769 | Train accuracy 62.990903039715995 | Val loss: 0.7323015777417171 | Val accuracy: 76.4797507788162
Epoch: 1
-----
Epoch: 1. Training Loss 0.2774958312511444 per 400 steps
Epoch: 1. Training accuracy 100.0 per 400 steps
Epoch: 1. Training Loss 0.6254098279369145 per 400 steps
Epoch: 1. Training accuracy 80.143391521197 per 400 steps
Epoch 1 Train loss: 0.6286649706885747 | Train accuracy 79.83137341912581 | Val loss: 0.6190884804100166 | Val accuracy: 80.21806853582555
Epoch: 2
-----
Epoch: 2. Training Loss 0.3756439983844757 per 400 steps
Epoch: 2. Training accuracy 87.5 per 400 steps
Epoch: 2. Training Loss 0.5496281956942599 per 400 steps
Epoch: 2. Training accuracy 82.57481296758105 per 400 steps
Epoch 2 Train 

In [22]:
if not os.path.exists('./models'):
  os.makedirs('./models')

if not os.path.exists('./vocab'):
  os.makedirs('./vocab')

output_model_file = os.path.join('./models', 'pytorch_distilbert_classification.bin')
output_vocab_file = os.path.join('./vocab', 'vocab_distilbert_classification')

# Save the model
model_to_save = model
torch.save(model_to_save, output_model_file)

# Save the vocabulary
tokenizer.save_pretrained(output_vocab_file)

print('All files saved')
print('Thank you!')


All files saved
Thank you!
