In [68]:
## import the libs ##

import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset
from torch import cuda
from transformers import TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# !pip install accelerate -U

In [2]:
## check device availability ##
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
## load train and holdout csv ##

train_df = pd.read_csv(r"/content/training_df_info.csv")
test_df = pd.read_csv(r"/content/testing_df_info.csv")


In [6]:
# drop na
train_df.dropna(inplace = True)
test_df.dropna(inplace = True)

# get the labels

labels = train_df['target'].unique().tolist()
print(labels)

# create id <> mappings
label2id = {"Yes": 1, "No" : 0}
id2label = {1: "Yes", 0: "No"}

[1, 0]


In [39]:
## get the test texts and test labels for benchmarking scores after finetuning ##
test_texts = list(test_df['input_text'].values)
test_labels = list(test_df['target'].values)

In [7]:
## load the tokenizer from HF, BertFastTokenizer
## Idea is to finetune on this train datasets and then test on validation set
## final benchmarking will be done on holdout set / test set

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=512)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)
model.to(device)   #adding to cuda

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
## checking the size of the entire train data ##
SIZE = train_df.shape[0]

## to get train and val samples, a simple train-test-split is performed
## have checked the distribution of entire train data, it's kinda balanced

print(train_df['target'].value_counts())


1    435
0    408
Name: target, dtype: int64


In [24]:
from sklearn.model_selection import train_test_split
train_text, val_text, train_labels, val_labels = train_test_split(list(train_df['input_text'].values), list(train_df['target'].values), test_size = 0.2, random_state = 33)


In [25]:
len(train_text), len(val_text), len(train_labels), len(val_labels)


(674, 169, 674, 169)

In [41]:
## create the embeddings, token ids and attention-mask which will be the inp to the bert model for finetuning, using tokenizer

train_encodings = tokenizer(train_text, truncation=True, padding=True)
val_encodings = tokenizer(val_text, truncation=True, padding=True)


## create embeddings for the final test set too for score calculations
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [27]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [28]:
train_encodings['input_ids'][0][:10]

[101, 7479, 1012, 9033, 26941, 1012, 4012, 1013, 9686, 24761]

In [30]:
train_labels[:10]

[0, 0, 1, 0, 1, 0, 0, 0, 1, 0]

In [31]:
class DataLoader(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """
    def __init__(self, encodings, labels):
        """
        Initializes the DataLoader class with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                              (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing tokenized data and the corresponding label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary containing the tokenized data and the corresponding label.
        """
        # Retrieve tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the label for the given index to the item dictionary
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.

        Returns:
            (int): The number of data items in the dataset.
        """
        return len(self.labels)


In [42]:
## create the DataLoader for train and val using the Dataloader class inherited from Dataset Class of Transformers

train_dataloader = DataLoader(train_encodings, train_labels)
val_dataloader = DataLoader(val_encodings, val_labels)

test_dataloader = DataLoader(test_encodings, test_labels)

In [34]:

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of
              that observation belonging to a certain class.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids

    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)

    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


In [35]:
!mkdir bert_model_output

In [36]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='/content/bert_model_output',
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
   # TensorBoard log directory
    logging_dir='./binary-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    fp16=True,
    load_best_model_at_end=True
)

In [37]:
trainer = Trainer(
    # the pre-trained model that will be fine-tuned
    model=model,
     # training arguments that we defined above
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics= compute_metrics
)

In [38]:
## train the model ##
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.5819,0.422458,0.769231,0.76842,0.798077,0.785816
100,0.1639,0.211074,0.952663,0.952287,0.950914,0.954752
150,0.103,0.233206,0.928994,0.928268,0.927372,0.929433
200,0.0775,0.187856,0.952663,0.952057,0.952057,0.952057


TrainOutput(global_step=215, training_loss=0.2191014705702316, metrics={'train_runtime': 137.5134, 'train_samples_per_second': 24.507, 'train_steps_per_second': 1.563, 'total_flos': 886684256563200.0, 'train_loss': 0.2191014705702316, 'epoch': 5.0})

In [43]:
## cal scores for train, val and test data ##

q=[trainer.evaluate(eval_dataset=df_org) for df_org in [train_dataloader, val_dataloader, test_dataloader]]

pd.DataFrame(q, index=["train","val", "test"]).iloc[:,:5]

Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.033205,0.98368,0.983679,0.983874,0.983836
val,0.191988,0.95858,0.958205,0.95696,0.960071
test,1.673532,0.6875,0.664373,0.681363,0.741667


In [44]:
def predict(text):
    """
    Predicts the class label for a given input text

    Args:
        text (str): The input text for which the class label needs to be predicted.

    Returns:
        probs (torch.Tensor): Class probabilities for the input text.
        pred_label_idx (torch.Tensor): The index of the predicted class label.
        pred_label (str): The predicted class label.
    """
    # Tokenize the input text and move tensors to the GPU if available
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cuda")

    # Get model output (logits)
    outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    """ Explanation outputs: The BERT model returns a tuple containing the output logits (and possibly other elements depending on the model configuration). In this case, the output logits are the first element in the tuple, which is why we access it using outputs[0].

    outputs[0]: This is a tensor containing the raw output logits for each class. The shape of the tensor is (batch_size, num_classes) where batch_size is the number of input samples (in this case, 1, as we are predicting for a single input text) and num_classes is the number of target classes.

    softmax(1): The softmax function is applied along dimension 1 (the class dimension) to convert the raw logits into class probabilities. Softmax normalizes the logits so that they sum to 1, making them interpretable as probabilities. """

    # Get the index of the class with the highest probability
    # argmax() finds the index of the maximum value in the tensor along a specified dimension.
    # By default, if no dimension is specified, it returns the index of the maximum value in the flattened tensor.
    pred_label_idx = probs.argmax()

    # Now map the predicted class index to the actual class label
    # Since pred_label_idx is a tensor containing a single value (the predicted class index),
    # the .item() method is used to extract the value as a scalar
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label


In [71]:
len(test_labels)

80

In [74]:
## cal scores for the same

actual_labels = test_labels
pred_labels = []
for text in tqdm(test_texts):
  # print(text)
  print("Predicting info....")
  output_response = predict(text)

  if output_response:
    pred_labels.append(list(output_response)[1].item())

assert len(actual_labels) == len(pred_labels), "Mismatch in labels size"

from sklearn.metrics import f1_score, confusion_matrix
print(f1_score(actual_labels, pred_labels))
print(confusion_matrix(actual_labels, pred_labels))

  1%|▏         | 1/80 [00:00<00:08,  9.55it/s]

Predicting info....
Predicting info....
Predicting info....


  8%|▊         | 6/80 [00:00<00:03, 18.56it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 15%|█▌        | 12/80 [00:00<00:03, 22.33it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 22%|██▎       | 18/80 [00:00<00:02, 21.09it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....


 26%|██▋       | 21/80 [00:01<00:02, 20.91it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....


 30%|███       | 24/80 [00:01<00:03, 17.35it/s]

Predicting info....
Predicting info....
Predicting info....


 36%|███▋      | 29/80 [00:01<00:02, 18.75it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 40%|████      | 32/80 [00:01<00:02, 20.30it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 48%|████▊     | 38/80 [00:02<00:02, 18.60it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....


 54%|█████▍    | 43/80 [00:02<00:01, 19.18it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 57%|█████▊    | 46/80 [00:02<00:02, 16.88it/s]

Predicting info....
Predicting info....
Predicting info....


 64%|██████▍   | 51/80 [00:02<00:01, 18.74it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 68%|██████▊   | 54/80 [00:02<00:01, 20.27it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 75%|███████▌  | 60/80 [00:03<00:00, 20.80it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 79%|███████▉  | 63/80 [00:03<00:00, 21.99it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....


 86%|████████▋ | 69/80 [00:03<00:00, 20.84it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


 94%|█████████▍| 75/80 [00:03<00:00, 21.66it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
Predicting info....


100%|██████████| 80/80 [00:04<00:00, 19.67it/s]

Predicting info....
Predicting info....
Predicting info....
Predicting info....
0.5762711864406781
[[38 22]
 [ 3 17]]





In [75]:
## save the fine-tuned model ##

model_path = "bert-base-uncased-finetune-raw-data"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('bert-base-uncased-finetune-raw-data/tokenizer_config.json',
 'bert-base-uncased-finetune-raw-data/special_tokens_map.json',
 'bert-base-uncased-finetune-raw-data/vocab.txt',
 'bert-base-uncased-finetune-raw-data/added_tokens.json',
 'bert-base-uncased-finetune-raw-data/tokenizer.json')

In [76]:
## load the model from the local and do prediction ##

model_path = "bert-base-uncased-finetune-raw-data"

model = BertForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer= BertTokenizerFast.from_pretrained(model_path)


In [78]:
print(predict(test_texts[66]))

test_labels[66]

(tensor([[0.0145, 0.9855]], device='cuda:0', grad_fn=<SoftmaxBackward0>), tensor(1, device='cuda:0'), 'Yes')


0