In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# DistilBERT

In [2]:
! pip install --quiet lightning
! pip install --quiet transformers

[0m

In [3]:
import pandas as pd
# importing all necessary packages
from os import listdir
from os.path import join
from sklearn.model_selection import train_test_split
import string
from torch.utils.data import Dataset, DataLoader
import torch
import torchmetrics
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import DistilBertTokenizer

#### Dataset Preparation

In [4]:
# Read the preprocessed Kaggle essays (text, label and summary statistics)
# and preview the first rows.
train_data_preprocessed = pd.read_csv(
    filepath_or_buffer='./Data/kaggle_preprocessed.csv'
)
train_data_preprocessed.head(n=5)

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,carfree cities become subject increasing inter...,1,0.466087,0.34087,5.926957,6.925217
1,car free cities carfree cities concept gaining...,1,0.518519,0.309942,6.111111,7.109162
2,sustainable urban future carfree cities emergi...,1,0.492188,0.302734,6.265625,7.265625
3,pioneering sustainable urban living era marked...,1,0.495183,0.333333,6.015414,7.015414
4,path sustainable urban living age rapid urbani...,1,0.481409,0.315068,6.039139,7.039139


In [5]:
# (rows, columns) of the training frame loaded above.
train_data_preprocessed.shape

(27340, 6)

In [6]:
# Read the separately collected essay file that serves as the held-out test
# set and preview the first rows.
test_data_preprocessed = pd.read_csv(
    filepath_or_buffer='./Data/new_essay_val_preprocessed.csv'
)
test_data_preprocessed.head(n=5)

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,marshall plan test progress test progress whet...,0,0.444112,0.392216,5.384232,6.384232
1,promoting global regional security postcold wa...,0,0.414807,0.424949,5.150101,6.150101
2,womanhood peacemaking taking advantage unity c...,0,0.475947,0.37871,5.583419,6.583419
3,human rights universal western construct cruci...,0,0.370482,0.503012,4.875502,5.875502
4,challenges american foreign service rebuilding...,0,0.467425,0.379524,5.668046,6.668046


In [7]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the Kaggle data for validation (fixed seed for a repeatable
# split); the separately loaded essay file is used as the test set.
df_train, df_validation = train_test_split(train_data_preprocessed, test_size=0.3, random_state=42)
df_test = test_data_preprocessed

# Shapes before removing incomplete rows.
print(df_train.shape)
print(df_validation.shape)
print(df_test.shape)

# Drop rows containing missing values by rebinding to new frames instead of
# using inplace=True: the split results are slices of the source frame and
# df_test is the very same object as test_data_preprocessed, so in-place
# edits would silently modify data displayed by earlier cells and make this
# cell non-idempotent.
df_train = df_train.dropna()
df_validation = df_validation.dropna()
df_test = df_test.dropna()

# Shapes after removing incomplete rows.
print(df_train.shape)
print(df_validation.shape)
print(df_test.shape)

(19138, 6)
(8202, 6)
(40, 6)
(19137, 6)
(8202, 6)
(40, 6)


In [8]:
# Tokenise every essay up front so the Dataset can hand out ready-made
# encodings; labels are attached in a later cell and are not converted to
# ordinal values here.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Each essay is encoded on its own: truncated / padded to exactly 512 tokens
# and returned as PyTorch tensors.
X_train_token_list = [
    tokenizer(essay, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    for essay in df_train["text"]
]
X_val_token_list = [
    tokenizer(essay, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    for essay in df_validation["text"]
]
X_test_token_list = [
    tokenizer(essay, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    for essay in df_test["text"]
]


In [9]:
# Collapse the raw label column of each split to a binary class id:
# a value whose int() is 0 stays 0, everything else becomes 1.
train_labels = [0 if int(raw) == 0 else 1 for raw in df_train["label"]]
val_labels = [0 if int(raw) == 0 else 1 for raw in df_validation["label"]]
test_labels = [0 if int(raw) == 0 else 1 for raw in df_test["label"]]

# Store each label as a tensor under the "label" key of the encoding at the
# same position, so one dataset item carries both inputs and target.
for encodings, labels in (
    (X_train_token_list, train_labels),
    (X_val_token_list, val_labels),
    (X_test_token_list, test_labels),
):
    for position, encoding in enumerate(encodings):
        encoding["label"] = torch.tensor(labels[position])


In [10]:
# Print one tokenised training example (index 1) to inspect what the model
# will receive: the padded input ids plus the other fields of the encoding.
print(X_train_token_list[1])

{'input_ids': tensor([[  101,  6203,  3836, 18442,  3047,  1996,  2869,  2724,  4550,  2451,
          2326,  2157,  2157,  2518,  2451,  2326,  2590,  2451,  2925,  2451,
          2689,  7955,  3407,  2111,  2987,  2102, 10587,  4783,  2393,  2477,
          2518,  2082,  2493,  2451,  2326,  2034,  3114,  2451,  2326,  2590,
          3664,  2103,  7098,  2111,  2360,  6752,  2451,  2326,  2272,  2393,
          2187, 11669,  2101,  5437,  2919,  2111,  6283,  3047,  3531,  2123,
          2102,  2681, 27042,  2723,  2404, 11669,  2117,  3114,  2045,  2015,
          2724,  2111,  2147,  4550,  2724,  4627,  2111,  2228,  3138,  2172,
          2051,  2185,  2215,  2393,  2111,  2451,  2082,  2518,  2814,  2987,
          2102,  2215,  2393,  2197,  3114,  2045,  2015,  2172,  4550,  2305,
          5553, 27287,  2342,  4550,  2305,  2279,  2154,  3310,  4550, 15708,
          2111,  4550,  4933,  2894,  2823,  2362,  2228,   102,     0,     0,
             0,     0,     0,     0,  

In [11]:
# Create dataset and dataloader
class MyDataset(Dataset):
    """Map-style dataset over a pre-built sequence of per-example dictionaries."""

    def __init__(self, array_of_dicts):
        """Keep a reference to the given examples; nothing is copied or transformed."""
        self.data = array_of_dicts

    def __getitem__(self, idx):
        """Return the example stored at position ``idx`` unchanged."""
        return self.data[idx]

    def __len__(self):
        """Return how many examples the dataset holds."""
        return len(self.data)

# Wrap each tokenised split in a dataset.
trainset, valset, testset = (
    MyDataset(examples)
    for examples in (X_train_token_list, X_val_token_list, X_test_token_list)
)

# All loaders use batches of 4 with num_workers=0; only the training loader
# shuffles its examples.
train_loader = DataLoader(trainset, batch_size=4, shuffle=True, num_workers=0)
val_loader = DataLoader(valset, batch_size=4, num_workers=0)
test_loader = DataLoader(testset, batch_size=4, num_workers=0)

In [12]:
# Load pretrained DistilBERT with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

# Freeze every parameter first ...
for param in model.parameters():
    param.requires_grad = False

# ... then re-enable gradients only for the two head layers, so training
# updates just the pre-classifier and the classifier.
for head in (model.pre_classifier, model.classifier):
    for param in head.parameters():
        param.requires_grad = True

# Lightning wrapper around the sequence-classification model
class LightningModel(pl.LightningModule):
    """Lightning module that trains and evaluates a two-class sequence classifier.

    The wrapped ``model`` is called with ``input_ids``, ``attention_mask`` and
    ``labels`` and must return an output exposing ``"loss"`` and ``"logits"``.
    Accuracy and AUROC are tracked separately for the train, validation and
    test stages.

    Args:
        model: The classification model to wrap; its ``config`` is copied
            onto the wrapper.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # Keep the wrapped model's configuration reachable from the wrapper.
        self.config = model.config

        # One metric object per stage so that their running state never mixes.
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.train_auroc = torchmetrics.AUROC(task="multiclass", num_classes=2)

        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.val_auroc = torchmetrics.AUROC(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_auroc = torchmetrics.AUROC(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels):
        """Run the wrapped model and return its output (loss and logits)."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch):
        """Forward one batch and derive the quantities every stage needs.

        Args:
            batch: Mapping with ``"input_ids"``, ``"attention_mask"`` and
                ``"label"`` tensors as produced by the DataLoader.

        Returns:
            Tuple ``(loss, logits, probabilities, predicted_labels)`` where
            ``probabilities`` is the softmax over both classes and
            ``predicted_labels`` is the argmax class index.
        """
        # Each example was tokenised individually with return_tensors='pt', so
        # after batching both tensors carry an extra singleton dimension at
        # position 1. Drop it from the attention mask as well as from the
        # input ids so the model receives both with the same
        # (batch, sequence) shape instead of relying on the model to reshape
        # a mismatched mask.
        input_ids = batch["input_ids"].squeeze(1)
        attention_mask = batch["attention_mask"].squeeze(1)

        outputs = self(input_ids, attention_mask=attention_mask, labels=batch["label"])
        logits = outputs["logits"]
        probabilities = torch.softmax(logits, dim=1)  # per-class probabilities
        predicted_labels = torch.argmax(logits, 1)
        return outputs["loss"], logits, probabilities, predicted_labels

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log loss, accuracy and AUROC."""
        loss, _, probabilities, predicted_labels = self._shared_step(batch)
        self.log("train_loss", loss)

        # Log accuracy
        self.train_acc(predicted_labels, batch["label"])
        self.log("train_acc", self.train_acc)

        # Update and log AUROC for training
        self.train_auroc(probabilities, batch["label"])
        self.log("train_auroc", self.train_auroc)

        return loss

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch and log loss, accuracy and AUROC.

        Returns:
            Dict with ``"logits"``, ``"loss"`` and ``"labels"`` for use in a
            classification report.
        """
        loss, logits, probabilities, predicted_labels = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)

        # Log accuracy
        self.val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self.val_acc, prog_bar=True)

        # Update and log AUROC
        self.val_auroc(probabilities, batch["label"])
        self.log("val_auroc", self.val_auroc, prog_bar=True)

        # output for classification report
        return {"logits": logits, "loss": loss, "labels": batch["label"]}

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch and log loss, accuracy and AUROC.

        Returns:
            Dict with ``"logits"``, ``"loss"`` and ``"labels"`` for use in a
            classification report.
        """
        loss, logits, probabilities, predicted_labels = self._shared_step(batch)
        self.log("test_loss", loss, prog_bar=True)

        # Log accuracy (the key "accuracy" is kept as-is so existing result
        # dictionaries and dashboards keep their metric name)
        self.test_acc(predicted_labels, batch["label"])
        self.log("accuracy", self.test_acc, prog_bar=True)

        # Update and log AUROC
        self.test_auroc(probabilities, batch["label"])
        self.log("test_auroc", self.test_auroc, prog_bar=True)

        # output for classification report
        return {"logits": logits, "loss": loss, "labels": batch["label"]}

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer


# Wrap the partially frozen DistilBERT classifier; learning_rate keeps its default (5e-5).
lightning_model = LightningModel(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# TensorBoard logs (and the checkpoints) are written under
# distilbert-add-fea/finetuning/original.
logger = TensorBoardLogger("distilbert-add-fea/", name="finetuning", version="original")

# Keep only the single checkpoint with the highest validation AUROC.
best_auroc_checkpoint = ModelCheckpoint(monitor="val_auroc", mode="max", save_top_k=1)
callbacks = [best_auroc_checkpoint]

In [14]:
# Train for three epochs on a single device, logging at every step and
# checkpointing through the callbacks defined above.
trainer = pl.Trainer(
    logger=logger,
    callbacks=callbacks,
    max_epochs=3,
    devices=1,
    log_every_n_steps=1,
    # accelerator="gpu",  # uncomment if gpu available
)

# Fit on the training loader and validate on the validation loader.
trainer.fit(lightning_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type                                | Params
--------------------------------------------------------------------
0 | model       | DistilBertForSequenceClassification | 67.0 M
1 | train_acc   | MulticlassAccuracy                  | 0     
2 | train_auroc | MulticlassAUROC                     | 0     
3 | val_acc     | MulticlassAccuracy                  | 0     
4 | val_auroc   | MulticlassAUROC                     | 0     
5 | test_acc    | MulticlassAccuracy                  | 0     
6 | test_auroc  | MulticlassAUROC                     | 0     
--------------------------------------------------------------------
592 K     Trainable params
66.4 M    Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=3` reached.


In [15]:
# Destination file for the serialized model.
# NOTE(review): the filename mentions "dropout0.4", but no dropout value is set
# in the visible cells — confirm the name matches the actual configuration.
model_path = "./distilbert-dataset-v3-kaggle30k_ep3_dropout0.4.pth"

# Serialize the whole LightningModel object (not just a state_dict).
# NOTE(review): this stores the in-memory model as left by trainer.fit, which is
# presumably the last-epoch weights rather than the "best" checkpoint — confirm
# that is intended.
torch.save(lightning_model, model_path)

In [16]:
# Re-evaluate on the validation loader using the checkpoint the
# ModelCheckpoint callback kept as "best" (highest val_auroc), then display
# the resulting metric dictionary.
val_results = trainer.validate(lightning_model, dataloaders=val_loader, ckpt_path="best")
val_results

Restoring states from the checkpoint path at distilbert-add-fea/finetuning/original/checkpoints/epoch=2-step=14355-v2.ckpt
Loaded model weights from the checkpoint at distilbert-add-fea/finetuning/original/checkpoints/epoch=2-step=14355-v2.ckpt


Validation: |                                                                                                 …

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc             0.980248749256134
        val_auroc           0.9968925714492798
        val_loss            0.05967448651790619
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.05967448651790619,
  'val_acc': 0.980248749256134,
  'val_auroc': 0.9968925714492798}]

In [17]:
# Evaluate the same "best" checkpoint on the held-out test loader and display
# the resulting metric dictionary.
test_results = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
test_results

Restoring states from the checkpoint path at distilbert-add-fea/finetuning/original/checkpoints/epoch=2-step=14355-v2.ckpt
Loaded model weights from the checkpoint at distilbert-add-fea/finetuning/original/checkpoints/epoch=2-step=14355-v2.ckpt


Testing: |                                                                                                    …

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy             0.675000011920929
       test_auroc           0.8899999856948853
        test_loss            1.382774829864502
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 1.382774829864502,
  'accuracy': 0.675000011920929,
  'test_auroc': 0.8899999856948853}]

In [18]:
# Start tensorboard.
%load_ext tensorboard
%tensorboard --logdir distilbert/finetuning/original/

Reusing TensorBoard on port 6006 (pid 658), started 7:14:09 ago. (Use '!kill 658' to kill it.)