# DistilBERT

In [None]:
import pandas as pd
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

In [None]:
! pip install --quiet lightning
! pip install --quiet transformers

In [None]:
import pandas as pd
from os import listdir
from os.path import join
from sklearn.model_selection import train_test_split
import string
from torch.utils.data import Dataset, DataLoader
import torch
import torchmetrics
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import DistilBertTokenizer

#### Dataset Preparation

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/... CSV paths read by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the preprocessed Kaggle essay data and preview its first rows.
kaggle_csv_path = '/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/kaggle_preprocessed.csv'
train_data_preprocessed = pd.read_csv(kaggle_csv_path)
train_data_preprocessed.head()

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,carfree cities become subject increasing inter...,1,0.466087,0.34087,5.926957,6.925217
1,car free cities carfree cities concept gaining...,1,0.518519,0.309942,6.111111,7.109162
2,sustainable urban future carfree cities emergi...,1,0.492188,0.302734,6.265625,7.265625
3,pioneering sustainable urban living era marked...,1,0.495183,0.333333,6.015414,7.015414
4,path sustainable urban living age rapid urbani...,1,0.481409,0.315068,6.039139,7.039139


In [None]:
# (rows, columns) of the loaded training frame, before any splitting or NaN removal.
train_data_preprocessed.shape

(27340, 6)

In [None]:
# Load the separately collected essay file that later serves as the held-out test set, and preview it.
new_essay_val_csv_path = '/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/new_essay_val_preprocessed.csv'
test_data_preprocessed = pd.read_csv(new_essay_val_csv_path)
test_data_preprocessed.head()

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,marshall plan test progress test progress whet...,0,0.444112,0.392216,5.384232,6.384232
1,promoting global regional security postcold wa...,0,0.414807,0.424949,5.150101,6.150101
2,womanhood peacemaking taking advantage unity c...,0,0.475947,0.37871,5.583419,6.583419
3,human rights universal western construct cruci...,0,0.370482,0.503012,4.875502,5.875502
4,challenges american foreign service rebuilding...,0,0.467425,0.379524,5.668046,6.668046


In [None]:
# Split the Kaggle data 70/30 into train/validation frames; the separately loaded
# essay file is used unchanged as the test set.
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(train_data_preprocessed, test_size=0.3, random_state=42)
df_test = test_data_preprocessed  # same rows, shorter name

# Shapes before removing incomplete rows.
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

# Drop rows with missing values by rebinding to new frames instead of `inplace=True`:
# the split results are derived from `train_data_preprocessed`, and `df_test` was an
# alias of `test_data_preprocessed`, so in-place dropping either triggers pandas'
# copy warnings or silently mutates the frame that was previewed earlier.
df_train = df_train.dropna()
df_val = df_val.dropna()
df_test = df_test.dropna()

# Shapes after removing incomplete rows.
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(19138, 6)
(8202, 6)
(40, 6)
(19137, 6)
(8202, 6)
(40, 6)


In [None]:
# Tokenize every essay up front. Each list entry is the tokenizer output for one
# text and later becomes one dataset item (labels/features are attached afterwards).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def _encode_text(text):
    """Tokenize a single text, truncated/padded to 512 tokens, returned as PyTorch tensors."""
    return tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')


X_train_token_list = [_encode_text(text) for text in df_train["text"]]
X_val_token_list = [_encode_text(text) for text in df_val["text"]]
X_test_token_list = [_encode_text(text) for text in df_test["text"]]


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Binarise the label column (0 stays 0, every other value becomes 1) and store each
# label as a tensor under the "label" key of the matching tokenized record.
train_labels = [0 if int(raw) == 0 else 1 for raw in df_train["label"]]
val_labels = [0 if int(raw) == 0 else 1 for raw in df_val["label"]]
test_labels = [0 if int(raw) == 0 else 1 for raw in df_test["label"]]

for records, split_labels in (
    (X_train_token_list, train_labels),
    (X_val_token_list, val_labels),
    (X_test_token_list, test_labels),
):
    for position in range(len(records)):
        records[position]["label"] = torch.tensor(split_labels[position])


In [None]:
# Add the additional numeric features to each tokenized record, cast to float32,
# pairing records and dataframe rows by position.
additional_feature_columns = (
    '%unique_word_total',
    '%stop_word_total',
    'mean_word_length',
    'mean_char_count_per_word',
)

for records, frame in (
    (X_train_token_list, df_train),
    (X_val_token_list, df_val),
    (X_test_token_list, df_test),
):
    for position in range(len(records)):
        row = frame.iloc[position]
        for column in additional_feature_columns:
            records[position][column] = row[column].astype(np.float32)


#### Model Finetuning

In [None]:
!pip install --quiet torch torchmetrics pytorch_lightning transformers

In [None]:
# model finetuning
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torchmetrics
from transformers import AutoModelForSequenceClassification

# Load pretrained DistilBERT with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Freeze every parameter, then re-enable gradients only for the two head layers
# (pre_classifier and classifier) so that only they are updated inside this model.
model.requires_grad_(False)
for head_layer in (model.pre_classifier, model.classifier):
    head_layer.requires_grad_(True)

# Number of numeric features fed to the model next to the tokenized text.
additional_features_size = 4

# model structure
class LightningModel(pl.LightningModule):
    """Sequence classifier that combines a wrapped transformer with numeric features.

    The wrapped model's two logits are concatenated with a 2-unit projection of the
    four additional features and passed through a final linear layer. That layer's
    two-class logits are used for the loss and for every logged metric.
    """

    def __init__(self, model, learning_rate=5e-5):
        """Wrap ``model`` and build the extra layers and per-stage metrics.

        Args:
            model: sequence-classification model whose output exposes ``"logits"``;
                the combined head assumes two logits per example.
            learning_rate: learning rate handed to the Adam optimizer.

        Raises:
            ValueError: if ``model`` is None.
        """
        super().__init__()

        self.learning_rate = learning_rate

        if model is None:
            raise ValueError("A model must be provided for initialization.")

        self.model = model

        # Keep a handle on the wrapped model's configuration.
        self.config = model.config

        # Projects the numeric features (count read from the module-level
        # `additional_features_size`) to 2 units, followed by dropout.
        self.additional_layer = nn.Linear(additional_features_size, 2)
        self.additional_dropout = nn.Dropout(0.4)

        # Final head: 2 logits from the wrapped model + 2 projected feature units -> 2 classes.
        self.combined_classifier = nn.Linear(2 + 2, 2)

        # One metric object per stage so accumulated state is never shared between stages.
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.train_auroc = torchmetrics.AUROC(task="multiclass", num_classes=2)

        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.val_auroc = torchmetrics.AUROC(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_auroc = torchmetrics.AUROC(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, unique_word_total, stop_word_total, mean_word_length, mean_char_count_per_word, labels=None):
        """Compute combined logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: token ids passed to the wrapped model.
            attention_mask: attention mask passed to the wrapped model.
            unique_word_total, stop_word_total, mean_word_length,
            mean_char_count_per_word: one value per example; stacked along dim 1
                into a 4-column feature matrix.
            labels: optional class indices used for the loss.

        Returns:
            dict with ``"logits"`` (combined two-class logits) and ``"loss"``
            (``None`` when ``labels`` is None).
        """
        # Labels are intentionally not forwarded: the wrapped model would compute a
        # loss on its own intermediate logits that is never used here.
        bert_output = self.model(input_ids, attention_mask=attention_mask)
        bert_logits = bert_output["logits"]

        additional_features = torch.stack(
            [unique_word_total, stop_word_total, mean_word_length, mean_char_count_per_word],
            dim=1,
        )
        additional_processed = self.additional_layer(additional_features)
        additional_processed = self.additional_dropout(additional_processed)

        # Combine the wrapped model's logits with the projected features and classify.
        combined_input = torch.cat([bert_logits, additional_processed], dim=1)
        combined_logits = self.combined_classifier(combined_input)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(combined_logits, labels)

        return {"logits": combined_logits, "loss": loss}

    def _shared_step(self, batch, stage, *, log_loss, prog_bar):
        """Run the forward pass for one batch and log the ``stage`` metrics.

        Args:
            batch: mapping with the tokenizer outputs, the four feature columns
                and ``"label"``.
            stage: ``"train"``, ``"val"`` or ``"test"``; selects the metric
                attributes and the logged metric names.
            log_loss: whether to log ``"<stage>_loss"``.
            prog_bar: whether logged values are shown in the progress bar.

        Returns:
            The dict produced by ``forward``.
        """
        outputs = self(
            # Records were tokenized one text at a time, so both tensors carry an
            # extra singleton dimension after collation. Squeeze it from the mask as
            # well as the ids so the wrapped model always receives 2-D inputs;
            # squeeze(1) is a no-op when that dimension is not of size 1.
            input_ids=batch["input_ids"].squeeze(1),
            attention_mask=batch["attention_mask"].squeeze(1),
            unique_word_total=batch["%unique_word_total"],
            stop_word_total=batch["%stop_word_total"],
            mean_word_length=batch["mean_word_length"],
            mean_char_count_per_word=batch["mean_char_count_per_word"],
            labels=batch["label"],
        )
        if log_loss:
            self.log(f"{stage}_loss", outputs["loss"], prog_bar=prog_bar)

        logits = outputs["logits"]
        probabilities = torch.softmax(logits, dim=1)  # per-class probabilities for AUROC
        predicted_labels = torch.argmax(logits, 1)

        accuracy = getattr(self, f"{stage}_acc")
        auroc = getattr(self, f"{stage}_auroc")

        accuracy(predicted_labels, batch["label"])
        self.log(f"{stage}_acc", accuracy, prog_bar=prog_bar)

        auroc(probabilities, batch["label"])
        self.log(f"{stage}_auroc", auroc, prog_bar=prog_bar)

        return outputs

    def training_step(self, batch, batch_idx):
        """Log train_loss/train_acc/train_auroc and return the loss for backpropagation."""
        outputs = self._shared_step(batch, "train", log_loss=True, prog_bar=False)
        return outputs["loss"]

    def validation_step(self, batch, batch_idx):
        """Log val_loss/val_acc/val_auroc and return logits, loss and labels."""
        outputs = self._shared_step(batch, "val", log_loss=True, prog_bar=True)
        # output for classification report
        return {"logits": outputs["logits"], "loss": outputs["loss"], "labels": batch["label"]}

    def test_step(self, batch, batch_idx):
        """Log test_acc/test_auroc (no loss is logged) and return logits, loss and labels."""
        outputs = self._shared_step(batch, "test", log_loss=False, prog_bar=True)
        # output for classification report
        return {"logits": outputs["logits"], "loss": outputs["loss"], "labels": batch["label"]}

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer


lightning_model = LightningModel(model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create dataset and dataloader
class MyDataset(Dataset):
    """Map-style dataset that serves pre-built per-example records by position."""

    def __init__(self, array_of_dicts):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.data = array_of_dicts

    def __len__(self):
        """Number of records available."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the record stored at position ``idx`` unchanged."""
        return self.data[idx]

# Wrap the tokenized record lists as datasets.
trainset = MyDataset(X_train_token_list)
valset = MyDataset(X_val_token_list)
testset = MyDataset(X_test_token_list)

# All loaders share the batch size and worker count; only training data is shuffled.
shared_loader_options = {"batch_size": 4, "num_workers": 0}

train_loader = DataLoader(trainset, shuffle=True, **shared_loader_options)
val_loader = DataLoader(valset, **shared_loader_options)
test_loader = DataLoader(testset, **shared_loader_options)

#### Model Training

##### Monitor val_auroc

In [None]:
# Keep only the single checkpoint with the highest validation AUROC, and log to
# TensorBoard under distilbert-add-fea/finetuning/original.
best_auroc_checkpoint = ModelCheckpoint(monitor="val_auroc", mode="max", save_top_k=1)
callbacks = [best_auroc_checkpoint]
logger = TensorBoardLogger(save_dir="distilbert-add-fea/", name="finetuning", version="original")

In [None]:
# Configure the trainer for the first fine-tuning stage.
trainer_options = dict(
    max_epochs=3,
    callbacks=callbacks,
    # accelerator="gpu",  uncomment if gpu available
    devices=1,
    logger=logger,
    log_every_n_steps=1,
)
trainer = pl.Trainer(**trainer_options)

# Fit on the training loader, validating on the validation loader.
trainer.fit(lightning_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                | Type                                | Params
----------------------------------------------------------------------------
0 | model               | DistilBertForSequenceClassification | 67.0 M
1 | additional_layer    | Linear                              | 10    
2 | additional_dropout  | Dropout                             | 0     
3 | combined_classifier | Linear                              | 10    
4 | train_acc           | MulticlassAccuracy                  | 0     
5 | train_auroc         | MulticlassAUROC                     | 0     
6 | val_acc             | MulticlassAccuracy               

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Define the path to save the model
model_path = "/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/models/distilbert/latest/distilbert-dataset-v3-kaggle30k_metafea_ep3_dropout0.4.pth"

# Make sure the target folder exists first: torch.save raises if it is missing,
# which would otherwise lose the result of the training run above.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the entire model object (pickled module, not just a state_dict). Loading it
# back requires the LightningModel class to be defined/importable at load time.
torch.save(lightning_model, model_path)

In [None]:
# Evaluate on the validation split. ckpt_path="best" asks the trainer to restore the
# checkpoint its ModelCheckpoint callback tracked as best (monitored on val_auroc)
# before running validation. The bare last line displays the returned metrics.
val_results = trainer.validate(lightning_model, dataloaders=val_loader, ckpt_path="best")
val_results

In [None]:
# Evaluate on the held-out test loader (built from df_test), again restoring the
# trainer's best tracked checkpoint first. The bare last line displays the metrics.
test_results = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
test_results

#### TensorBoard

#### Test data for Finetuning Round 1

In [None]:
# Start tensorboard.
%load_ext tensorboard
%tensorboard --logdir distilbert/finetuning/original/

In [None]:
model_path = "./distilbert-dataset-v3-kaggle30k_metafea_ep3_dropout0.4.pth"

# Load the model
# The file is expected to hold a fully pickled LightningModel (saved with
# torch.save(model, path)), not a state_dict. Recent PyTorch releases default
# torch.load to weights_only=True, which refuses such files, so the flag is set
# explicitly. Unpickling executes arbitrary code: only load files you created yourself.
# The LightningModel class must already be defined in this session.
loaded_model = torch.load(model_path, weights_only=False)

print("Model loaded.")

Model loaded.


In [None]:
# Load the new-essay training file used for the second fine-tuning round, report
# its shape and preview the first rows.
new_essay_train_csv_path = './Data/new_essay_train_preprocessed.csv'
train_data_preprocessed_ft2 = pd.read_csv(new_essay_train_csv_path)
print(train_data_preprocessed_ft2.shape)
train_data_preprocessed_ft2.head()

(60, 6)


Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,strategic collaboration constructive communica...,0,0.479633,0.35336,5.606925,6.605906
1,united states china powerful combination ameri...,0,0.442183,0.443108,4.779833,5.778908
2,911 foreign service contributions shortterm re...,0,0.539461,0.318681,5.623377,6.623377
3,2002 united nations un secretarygeneral kofi a...,0,0.544498,0.362679,5.744498,6.744498
4,awakening witness empowering engagement levera...,0,0.492711,0.345967,5.464529,6.464529


In [None]:
# Split the second-round data 70/30 into train/validation frames.
df_train_ft2, df_val_ft2 = train_test_split(train_data_preprocessed_ft2, test_size=0.3, random_state=42)

# Shapes before removing incomplete rows.
print(df_train_ft2.shape)
print(df_val_ft2.shape)

# Rebind to new frames instead of dropping in place: the split results are derived
# from `train_data_preprocessed_ft2`, and in-place edits on them trigger pandas'
# copy warnings and make the cell non-idempotent.
df_train_ft2 = df_train_ft2.dropna()
df_val_ft2 = df_val_ft2.dropna()

# Shapes after removing incomplete rows.
print(df_train_ft2.shape)
print(df_val_ft2.shape)

(42, 6)
(18, 6)
(42, 6)
(18, 6)


In [None]:
# Build the second-round records: tokenizer output plus "label" and the four
# additional numeric features for every essay in the ft2 train/validation frames.
def _tokenize_frame_ft2(frame):
    """Tokenize frame["text"] one essay at a time, truncated/padded to 512 tokens."""
    return [
        tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        for text in frame["text"]
    ]


def _binary_labels_ft2(frame):
    """Map frame["label"] to 0/1: 0 stays 0, every other value becomes 1."""
    return [0 if int(raw) == 0 else 1 for raw in frame["label"]]


# Process the data
X_train_token_list_ft2 = _tokenize_frame_ft2(df_train_ft2)
X_val_token_list_ft2 = _tokenize_frame_ft2(df_val_ft2)

# add labels to token_list for use later as dataset
# NOTE: `train_labels` / `val_labels` are reassigned here, replacing any lists an
# earlier cell stored under the same names.
train_labels = _binary_labels_ft2(df_train_ft2)
val_labels = _binary_labels_ft2(df_val_ft2)

for records, split_labels in (
    (X_train_token_list_ft2, train_labels),
    (X_val_token_list_ft2, val_labels),
):
    for position in range(len(records)):
        records[position]["label"] = torch.tensor(split_labels[position])

# add additional features to token_list for use later as dataset
feature_columns_ft2 = (
    '%unique_word_total',
    '%stop_word_total',
    'mean_word_length',
    'mean_char_count_per_word',
)

for records, frame in (
    (X_train_token_list_ft2, df_train_ft2),
    (X_val_token_list_ft2, df_val_ft2),
):
    for position in range(len(records)):
        row = frame.iloc[position]
        for column in feature_columns_ft2:
            records[position][column] = row[column].astype(np.float32)

In [None]:
# Wrap the second-round records as datasets.
trainset_ft2 = MyDataset(X_train_token_list_ft2)
valset_ft2 = MyDataset(X_val_token_list_ft2)

# Create new DataLoaders for the new datasets; only the training loader shuffles.
loader_options_ft2 = {"batch_size": 4, "num_workers": 0}

train_loader_ft2 = DataLoader(trainset_ft2, shuffle=True, **loader_options_ft2)
val_loader_ft2 = DataLoader(valset_ft2, **loader_options_ft2)

In [None]:
# Define the PyTorch Lightning Trainer for the second fine-tuning round.
#
# A fresh ModelCheckpoint and a separate logger version are created here instead of
# reusing the first-round `callbacks` / `logger`. A ModelCheckpoint instance keeps
# its best score and best checkpoint path across fits, so reusing it means:
#   * new checkpoints are only saved if they beat the first round's best val_auroc, and
#   * `ckpt_path="best"` can resolve to a first-round checkpoint, so later
#     validate/test calls would report metrics for the wrong weights.
logger_ft2 = TensorBoardLogger("distilbert-add-fea/", name="finetuning", version="ft2")
callbacks_ft2 = [
    ModelCheckpoint(
        save_top_k=1, mode="max", monitor="val_auroc"  # Monitor the AUROC metric
    )
]

trainer = pl.Trainer(
    max_epochs=3,
    callbacks=callbacks_ft2,
    devices=1,
    logger=logger_ft2,
    log_every_n_steps=1,
)

# Continue training with the loaded model (finetuned v1) with the new dataset
trainer.fit(
    model=loaded_model,
    train_dataloaders=train_loader_ft2,
    val_dataloaders=val_loader_ft2
)

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                | Type                                | Params
----------------------------------------------------------------------------
0 | model               | DistilBertForSequenceClassification | 67.0 M
1 | additional_layer    | Linear                              | 10    
2 | additional_dropout  | Dropout                             | 0     
3 | combined_classifier | Linear                              | 10    
4 | train_acc           | MulticlassAccuracy                  | 0     
5 | train_auroc         | MulticlassAUROC                     | 0     
6 | val_acc             | MulticlassAccuracy                  | 0     
7 | val_auroc           | 

Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
# Validate on the second-round validation split. ckpt_path="best" makes the trainer
# restore the checkpoint its ModelCheckpoint callback recorded as best before
# evaluating, so the reported metrics belong to that checkpoint rather than to
# loaded_model's current in-memory weights.
# NOTE(review): check that the restored checkpoint path printed in the output was
# produced by this training round (its step count should match this round's steps).
val_results = trainer.validate(loaded_model, dataloaders=val_loader_ft2, ckpt_path="best")
val_results

Restoring states from the checkpoint path at distilbert-add-fea/finetuning/original/checkpoints/epoch=2-step=14355-v1.ckpt
Loaded model weights from the checkpoint at distilbert-add-fea/finetuning/original/checkpoints/epoch=2-step=14355-v1.ckpt


Validation: |                                                                                                 …

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.6111111044883728
        val_auroc           0.9125000238418579
        val_loss            2.0009870529174805
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 2.0009870529174805,
  'val_acc': 0.6111111044883728,
  'val_auroc': 0.9125000238418579}]