<a href="https://colab.research.google.com/github/Manjesh80/ml/blob/main/classification_in_detail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!rm -rf ./ml
!git clone https://github.com/manjesh80/ml.git
!rm -f ml/*.ipynb
!cp ml/* .
!ls

Cloning into 'ml'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 27 (delta 8), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (27/27), 8.58 MiB | 5.77 MiB/s, done.
Resolving deltas: 100% (8/8), done.
AirlineTweets.csv  ner_dataset_2.csv	     README.md
imdbs.csv	   ner_dataset.csv	     sample_data
ml		   ner_datasetreference.csv  tweet_sentiment_data.csv


In [None]:
import subprocess
import torch

def useGPU():
    if not torch.cuda.is_available():
        print(f" !!!! No GPU detected! This notebook will be slow !!!! \n\n")
    else:
        if torch.cuda.is_available():
          has_gpu()
          gpu_device = torch.device('cuda')
          device = torch.device('cuda')
          gpu_info = torch.cuda.get_device_properties(gpu_device)
          gpu_memory = gpu_info.total_memory / 1e9  # Convert bytes to gigabytes
          print(f"GPU: {gpu_info.name}, Total Memory: {gpu_memory:.2f} GB")
        else:
            device = torch.device("cpu")
            print("No GPU detected.")

def install_packages(packages):
    check = u'\u2705'
    print("\033[1mInstalling base requirements...\n\033[0m")
    for package in packages:
        process_scatter = subprocess.run(
            ["pip", "install", package],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if process_scatter.returncode != 0:
            print(f"Installation of {package} failed with error:\n{process_scatter.stderr.decode('utf-8')}")
        else:
            print(f"{check} {package} installation completed successfully!\n")

def has_gpu():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)

useGPU()
device =  "cuda" if torch.cuda.is_available() else "cpu"


Fri Jan 26 21:17:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              24W / 300W |      2MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
packages = [ "transformers[torch]", "datasets", "torchinfo", "evaluate", "wget" ]
install_packages(packages)

[1mInstalling base requirements...
[0m
✅ transformers[torch] installation completed successfully!

✅ datasets installation completed successfully!

✅ torchinfo installation completed successfully!

✅ evaluate installation completed successfully!

✅ wget installation completed successfully!



# **DETAIL UNDERSTANDING OF TOKEN, SEGMENTS, PADDING, POSITION EMBEDDING**

In [None]:
import torch
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
sentence = 'Learning "BERT" inner workings from scratch'

In [None]:
raw_tokens = tokenizer.tokenize(sentence)
print(raw_tokens)

['Learning', '"', 'B', '##ER', '##T', '"', 'inner', 'working', '##s', 'from', 'scratch']


In [None]:
sentence = 'Machine learning inner working from scratch'
raw_tokens = tokenizer.tokenize(sentence)
print(raw_tokens)

['Machine', 'learning', 'inner', 'working', 'from', 'scratch']


In [None]:
tokens_with_cls = ['[CLS]'] + raw_tokens + ['[SEP]']
print(tokens_with_cls)

['[CLS]', 'Machine', 'learning', 'inner', 'working', 'from', 'scratch', '[SEP]']


In [None]:
tokens_with_cls_pad = tokens_with_cls + ['[PAD]'] + ['[PAD]']
print(tokens_with_cls_pad)

['[CLS]', 'Machine', 'learning', 'inner', 'working', 'from', 'scratch', '[SEP]', '[PAD]', '[PAD]']


In [None]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens_with_cls_pad]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens_with_cls_pad)
print(token_ids)

[101, 7792, 3776, 5047, 1684, 1121, 14515, 102, 0, 0]


In [None]:
token_ids_tensor = torch.tensor(token_ids).unsqueeze(0)
attention_mask_tensor = torch.tensor(attention_mask).unsqueeze(0)
print(token_ids_tensor)
print(attention_mask_tensor)

tensor([[  101,  7792,  3776,  5047,  1684,  1121, 14515,   102,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [None]:
result = model(token_ids_tensor, attention_mask = attention_mask_tensor, output_hidden_states = True)

last_hidden_state = result['last_hidden_state']
cls_head = result['pooler_output']
print(last_hidden_state.shape)
print(cls_head.shape)

torch.Size([1, 10, 768])
torch.Size([1, 768])


In [None]:
last_hidden_state[0].shape # 10 tokens embeddings

torch.Size([10, 768])

Embedding of **CLS**  from Hidden Representation

In [None]:
last_hidden_state[0][0][:10]

tensor([ 0.4326,  0.2300,  0.1559, -0.2808, -0.4072, -0.1503, -0.0449, -0.0232,
         0.2325, -1.2235], grad_fn=<SliceBackward0>)

Embedding of *CLS* from cls_head

In [None]:
cls_head[0][:10]

tensor([-0.5830,  0.4211,  0.9998, -0.9930,  0.9325,  0.8418,  0.9532, -0.9888,
        -0.9799, -0.5999], grad_fn=<SliceBackward0>)

In [None]:
hidden_states = result['hidden_states']
len(hidden_states)

13

In [None]:
hidden_states[1].shape

torch.Size([1, 10, 768])

## **INITIALIZE DATASET**

In [None]:
from datasets import *
import pandas as pd
import numpy as np
# df = pd.read_csv('imdbs.csv')
df = pd.read_csv('imdbs.csv')
raw_dataset = load_dataset('csv', data_files = 'imdbs.csv')
# raw_data = raw_dataset['train'].train_test_split(test_size=0.3, seed=42)
raw_data = raw_dataset['train'].train_test_split(test_size=0.3, seed=42)
raw_data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 70
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 30
    })
})

## **TOKENIZE DATA**

In [None]:
from transformers import  BertTokenizerFast

fast_tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def preprocess(data):
    return fast_tokenizer(data['text'], padding=True, truncation = True)

tokenized_dataset = raw_data.map(preprocess, batched = True)

train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["test"]

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format('torch',  columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
print('Created train & val datasets.')

Created train & val datasets.


In [None]:
train_dataset.data[:1]

pyarrow.Table
text: string
label: int64
input_ids: list<item: int32>
  child 0, item: int32
token_type_ids: list<item: int8>
  child 0, item: int8
attention_mask: list<item: int8>
  child 0, item: int8
----
text: [["Okay, so this series kind of takes the route of 'here we go again!' Week in, week out David Morse's character helps out his ride who is in a bit of a pickle - but what's wrong with that!? David Morse is one of the greatest character actors out there, and certainly the coolest, and to have him in a series created by David Koepp - a great writer - is heaven!!<br /><br />Due to the lack of love for this show by many, I can't see it going to a season series - but you never know? The amount of rubbish that has made it beyond that baffles me - let's hope something good can make it past a first series!!!"]]
label: [[1]]
input_ids: [[[101,3956,117,1177,1142,...,0,0,0,0,0]]]
token_type_ids: [[[0,0,0,0,0,...,0,0,0,0,0]]]
attention_mask: [[[1,1,1,1,1,...,0,0,0,0,0]]]

## **UTILS**

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def eval_prediction(y_batch_actual, y_batch_predicted):
    """Return batches of accuracy and f1 scores."""
    y_batch_actual_np = y_batch_actual.cpu().detach().numpy()
    y_batch_predicted_np = np.round(y_batch_predicted.cpu().detach().numpy())
    acc = accuracy_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np)
    f1 = f1_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np, average='weighted')
    return acc, f1

print(eval_prediction.__doc__)

def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  acc = np.mean(predictions == labels)
  f1 = f1_score(labels, predictions, average = 'micro')
  return {'accuracy': acc, 'f1_score': f1}

Return batches of accuracy and f1 scores.


## **BASIC SEQ CLASSIFIER**

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
classification_model = BertForSequenceClassification.from_pretrained('bert-base-cased',  num_labels = 3)

batch_size = 16
epochs = 10
warmup_steps = 500
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    logging_dir='./logs',
)

trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,1.098785,0.5,0.5
2,No log,1.07777,0.5,0.5
3,No log,1.039061,0.433333,0.433333
4,No log,0.994665,0.4,0.4
5,No log,0.964186,0.5,0.5
6,No log,0.950811,0.5,0.5
7,No log,0.945535,0.5,0.5
8,No log,0.938126,0.5,0.5
9,No log,0.905428,0.5,0.5
10,No log,0.851968,0.5,0.5


TrainOutput(global_step=50, training_loss=0.936255111694336, metrics={'train_runtime': 24.083, 'train_samples_per_second': 29.066, 'train_steps_per_second': 2.076, 'total_flos': 184179392409600.0, 'train_loss': 0.936255111694336, 'epoch': 10.0})

## **CUSTOM BERT OVERRIDE WITH DENSE LAYERS**

In [None]:
def training_step(dataloader, model, optimizer, loss_fn, if_freeze_bert):
    """Method to train the model"""

    model.train()
    model.freeze_bert() if if_freeze_bert else model.unfreeze_bert()

    epoch_loss = 0
    size = len(dataloader.dataset)

    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = torch.flatten(model(tokens=input_ids, attention_mask=attention_mask))
        optimizer.zero_grad()
        loss = loss_fn(outputs, labels.float())
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

print(training_step.__doc__)

def validation_step(dataloader, model, loss_fn):
    """Method to test the model's accuracy and loss on the validation set"""

    model.eval()
    model.freeze_bert()
    size = len(dataloader)
    f1, acc = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['label'].to(device)

            pred = model(tokens=X, attention_mask=attention_mask)

            acc_batch, f1_batch = eval_prediction(y.float(), pred)
            acc += acc_batch
            f1 += f1_batch
        acc = acc/size
        f1 = f1/size
    return acc, f1

print(validation_step.__doc__)

Method to train the model
Method to test the model's accuracy and loss on the validation set


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, BertModel
import torch.nn as nn

checkpoint='bert-base-cased'
model_name = 'bert-base-cased'

in_features = 768 # it's 768 because that's the size of the output provided by the underlying BERT model

class BertWithCustomNNClassifier(nn.Module):
    """
    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    """

    def __init__(self, linear_size):
        super(BertWithCustomNNClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout1 = nn.Dropout()
        self.linear1 = nn.Linear(in_features=in_features, out_features=linear_size)
        self.batch_norm1 = nn.BatchNorm1d(num_features=linear_size)
        self.dropout2 = nn.Dropout(p=0.8)
        self.linear2 = nn.Linear(in_features=linear_size, out_features=1)
        self.batch_norm2 = nn.BatchNorm1d(num_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, attention_mask):
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        x = self.dropout1(bert_output[1])
        x = self.linear1(x)
        x = self.dropout2(x)
        x = self.batch_norm1(x)
        x = self.linear2(x)
        x = self.batch_norm2(x)
        return self.sigmoid(x)

    def freeze_bert(self):
        """
        Freezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        only the wieghts of the custom classifier are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=False

    def unfreeze_bert(self):
        """
        Unfreezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        both the wieghts of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=True


print(BertWithCustomNNClassifier.__doc__)

class BertWithCustomNNClassifierLastNLayers(nn.Module):
    """
    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    """

    def __init__(self, linear_size):
        super(BertWithCustomNNClassifierLastNLayers, self).__init__()
        self.bert = BertModel.from_pretrained(model_name, output_hidden_states=True)
        self.dropout1 = nn.Dropout()
        self.linear1 = nn.Linear(in_features=in_features, out_features=linear_size)
        self.batch_norm1 = nn.BatchNorm1d(num_features=linear_size)
        self.dropout2 = nn.Dropout(p=0.8)
        self.linear2 = nn.Linear(in_features=linear_size, out_features=1)
        self.batch_norm2 = nn.BatchNorm1d(num_features=1)
        self.sigmoid = nn.Sigmoid()

    def _get_cls_vec(self, vec):
        return vec[:,0,:].view(-1, 768)

    def forward(self, tokens, attention_mask):
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        hidden_states = bert_output["hidden_states"]
        vec1 = self._get_cls_vec(hidden_states[-1])
        vec2 = self._get_cls_vec(hidden_states[-2])
        # vec3 = self._get_cls_vec(hidden_states[-3])
        # vec4 = self._get_cls_vec(hidden_states[-4])
        # vec = torch.cat([vec1, vec2, vec3, vec4], dim=1)
        vec = ( vec1 + vec2 ) / 2
        x = self.dropout1(vec)
        x = self.linear1(x)
        x = self.dropout2(x)
        x = self.batch_norm1(x)
        x = self.linear2(x)
        x = self.batch_norm2(x)
        return self.sigmoid(x)

    def freeze_bert(self):
        """
        Freezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        only the wieghts of the custom classifier are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=False

    def unfreeze_bert(self):
        """
        Unfreezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        both the wieghts of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=True


print(BertWithCustomNNClassifier.__doc__)


    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    

    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    


In [None]:
from transformers import AdamW
from tqdm.auto import tqdm

tqdm.pandas()

# parameters
num_of_epochs = 4
learning_rate = 27e-6
batch_size = 16
hidden_layers = 8

print("Epochs: {}".format(num_of_epochs))
print("Learning rate: {:.6f}".format(learning_rate))
print("Batch size: {}".format(batch_size))
print("The number of hidden layers in the custom head: {}".format(hidden_layers))

Epochs: 4
Learning rate: 0.000027
Batch size: 16
The number of hidden layers in the custom head: 8


In [None]:
model = BertWithCustomNNClassifier(linear_size=hidden_layers)
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
print('Initialized optimizer.')

loss_fn = nn.BCELoss()
print('Initialized loss function.')

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))
    if i < 5:
        if_freeze_bert = False
        # print("Bert is not freezed")
    else:
        if_freeze_bert = True
        # print("Bert is freezed")

    training_step(train_loader, model ,optimizer, loss_fn, if_freeze_bert)
    train_acc, train_f1 = validation_step(train_loader, model, loss_fn)
    val_acc, val_f1 = validation_step(val_loader, model, loss_fn)

    print("Training results:   Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    print("Validation results: Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model, path)

Initialized optimizer.
Initialized loss function.




  0%|          | 0/4 [00:00<?, ?it/s]

Epoch: #1
Training results:   Acc: 0.621, f1: 0.477
Validation results: Acc: 0.513, f1: 0.372
Epoch: #2
Training results:   Acc: 0.621, f1: 0.482
Validation results: Acc: 0.513, f1: 0.372
Epoch: #3
Training results:   Acc: 0.600, f1: 0.458
Validation results: Acc: 0.509, f1: 0.354
Epoch: #4
Training results:   Acc: 0.621, f1: 0.477
Validation results: Acc: 0.504, f1: 0.341


In [None]:
model = BertWithCustomNNClassifierLastNLayers(linear_size=hidden_layers)
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
print('Initialized optimizer.')

loss_fn = nn.BCELoss()
print('Initialized loss function.')

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))
    if i < 5:
        if_freeze_bert = False
    else:
        if_freeze_bert = True

    training_step(train_loader, model ,optimizer, loss_fn, if_freeze_bert)
    train_acc, train_f1 = validation_step(train_loader, model, loss_fn)
    val_acc, val_f1 = validation_step(val_loader, model, loss_fn)

    print("Training results:   Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    print("Validation results: Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model, path)

Initialized optimizer.
Initialized loss function.




  0%|          | 0/4 [00:00<?, ?it/s]

Epoch: #1
Bert is not freezed
Training results:   Acc: 0.600, f1: 0.458
Validation results: Acc: 0.500, f1: 0.333
Epoch: #2
Bert is not freezed
Training results:   Acc: 0.692, f1: 0.643
Validation results: Acc: 0.705, f1: 0.670
Epoch: #3
Bert is not freezed
Training results:   Acc: 0.650, f1: 0.649
Validation results: Acc: 0.795, f1: 0.781
Epoch: #4
Bert is not freezed
Training results:   Acc: 0.704, f1: 0.681
Validation results: Acc: 0.768, f1: 0.749


In [None]:
from threading import excepthook
import gc

try:
  if model:
    model.cpu()
    del model
except:
  pass
try:
  if checkpoint:
    del checkpoint
except:
  pass

gc.collect()
torch.cuda.empty_cache()