In [22]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset, random_split, ConcatDataset, Dataset
from tqdm import tqdm

from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertModel, AutoTokenizer
from datasets import load_dataset

In [23]:
dataset = load_dataset('dair-ai/emotion',trust_remote_code=True)

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=300)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [25]:
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

train_dataset = TensorDataset(tokenized_datasets['train']['input_ids'], tokenized_datasets['train']['attention_mask'], tokenized_datasets['train']['label'])
test_dataset = TensorDataset(tokenized_datasets['test']['input_ids'], tokenized_datasets['test']['attention_mask'], tokenized_datasets['test']['label'])
val_dataset = TensorDataset(tokenized_datasets['validation']['input_ids'], tokenized_datasets['validation']['attention_mask'], tokenized_datasets['validation']['label'])

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, num_workers=4, pin_memory=True)

In [26]:
class LoRA(nn.Module):
    def __init__(self, model, r=16):
        super(LoRA, self).__init__()
        self.model = model
        self.r = r
        self.lora_layers = nn.ModuleDict()

        # Initialize LoRA layers for BERT's attention weights
        for name, param in self.model.named_parameters():
            if 'attention' in name and 'weight' in name:
                lora_layer = nn.Parameter(torch.zeros_like(param))
                nn.init.kaiming_uniform_(lora_layer, a=0.5)
                self.lora_layers[name] = lora_layer

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        for name, param in self.model.named_parameters():
            if name in self.lora_layers:
                param.data += self.lora_layers[name] / self.r
        
        return outputs

In [27]:
class BertClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert=BertModel.from_pretrained('bert-base-uncased')
        self.fc1=nn.Linear(768, num_labels)

    def forward(self, **inputs):
        output=self.bert(**inputs)
        cls_token_hidden_state = output.last_hidden_state[:, 0, :]
        logits = self.fc1(cls_token_hidden_state)
        return logits
    
model= BertClassifier(num_labels=6)

In [28]:
for param in model.bert.embeddings.parameters():
    param.requires_grad = False

# 11 out of 12 layers are freezed 
for layer in model.bert.encoder.layer[:11]:
    for param in layer.parameters():
        param.requires_grad = False

In [29]:
from prettytable import PrettyTable
def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
count_parameters(model)

+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|    bert.encoder.layer.11.attention.self.query.weight    |   589824   |
|     bert.encoder.layer.11.attention.self.query.bias     |    768     |
|     bert.encoder.layer.11.attention.self.key.weight     |   589824   |
|      bert.encoder.layer.11.attention.self.key.bias      |    768     |
|    bert.encoder.layer.11.attention.self.value.weight    |   589824   |
|     bert.encoder.layer.11.attention.self.value.bias     |    768     |
|   bert.encoder.layer.11.attention.output.dense.weight   |   589824   |
|    bert.encoder.layer.11.attention.output.dense.bias    |    768     |
| bert.encoder.layer.11.attention.output.LayerNorm.weight |    768     |
|  bert.encoder.layer.11.attention.output.LayerNorm.bias  |    768     |
|     bert.encoder.layer.11.intermediate.dense.weig

7683078

In [30]:
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

In [31]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
model = model.to('cuda')
model.to(device)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}')

# validation
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss=criterion(outputs, labels)
            predictions = torch.argmax(outputs, dim=1)
            
            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    print(f'Validation Accuracy: {accuracy:.4f}')
    print(f'Validation loss: {loss:.4f}')

Let's use 2 GPUs!


100%|██████████| 63/63 [02:37<00:00,  2.50s/it]


Epoch 1/10, Training Loss: 1.5575


100%|██████████| 125/125 [00:22<00:00,  5.52it/s]


Validation Accuracy: 0.4945
Validation loss: 1.5027


100%|██████████| 63/63 [02:36<00:00,  2.49s/it]


Epoch 2/10, Training Loss: 1.2958


100%|██████████| 125/125 [00:22<00:00,  5.52it/s]


Validation Accuracy: 0.6085
Validation loss: 1.1661


100%|██████████| 63/63 [02:36<00:00,  2.49s/it]


Epoch 3/10, Training Loss: 1.0089


100%|██████████| 125/125 [00:22<00:00,  5.53it/s]


Validation Accuracy: 0.6770
Validation loss: 0.7870


100%|██████████| 63/63 [02:36<00:00,  2.49s/it]


Epoch 4/10, Training Loss: 0.8335


100%|██████████| 125/125 [00:22<00:00,  5.58it/s]


Validation Accuracy: 0.7320
Validation loss: 0.6067


100%|██████████| 63/63 [02:36<00:00,  2.48s/it]


Epoch 5/10, Training Loss: 0.7420


100%|██████████| 125/125 [00:22<00:00,  5.52it/s]


Validation Accuracy: 0.7525
Validation loss: 0.5736


100%|██████████| 63/63 [02:36<00:00,  2.49s/it]


Epoch 6/10, Training Loss: 0.6833


100%|██████████| 125/125 [00:22<00:00,  5.53it/s]


Validation Accuracy: 0.7750
Validation loss: 0.5349


100%|██████████| 63/63 [02:36<00:00,  2.48s/it]


Epoch 7/10, Training Loss: 0.6447


100%|██████████| 125/125 [00:22<00:00,  5.52it/s]


Validation Accuracy: 0.7795
Validation loss: 0.5107


100%|██████████| 63/63 [02:36<00:00,  2.48s/it]


Epoch 8/10, Training Loss: 0.6042


100%|██████████| 125/125 [00:22<00:00,  5.52it/s]


Validation Accuracy: 0.7995
Validation loss: 0.4523


100%|██████████| 63/63 [02:36<00:00,  2.48s/it]


Epoch 9/10, Training Loss: 0.5738


100%|██████████| 125/125 [00:22<00:00,  5.51it/s]


Validation Accuracy: 0.8120
Validation loss: 0.4341


100%|██████████| 63/63 [02:36<00:00,  2.48s/it]


Epoch 10/10, Training Loss: 0.5415


100%|██████████| 125/125 [00:22<00:00,  5.53it/s]

Validation Accuracy: 0.8205
Validation loss: 0.4315





In [34]:
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss=criterion(outputs, labels)
        predictions = torch.argmax(outputs, dim=1)

        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test loss: {loss:.4f}')

100%|██████████| 125/125 [00:22<00:00,  5.59it/s]

Test Accuracy: 0.8315
Test loss: 0.4979





In [32]:
# Save the model state dictionary
torch.save(model, 'bert_classifier_model.pt')


In [33]:
model.eval()

# Example sentence
sentence = "As Sarah stood on the edge of the cliff, her heart pounded wildly in her chest. The view was breathtaking, with the sun setting over the horizon and casting a golden glow on the ocean waves below. She felt a rush of emotions, a mix of awe and anxiety. This was the place where she had first met him, where their love story had begun. But now, standing alone, the memories brought a tinge of fear. Would she ever feel that kind of love again? The wind whispered around her, as if urging her to hold on to hope, but the shadow of doubt loomed large, making her question if it was all just an illusion."

# Tokenize the input sentence
inputs = tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=300)

In [46]:
inputs. to(device)

{'input_ids': tensor([[  101,  2004,  4532,  2768,  2006,  1996,  3341,  1997,  1996,  7656,
          1010,  2014,  2540, 13750, 13544,  1999,  2014,  3108,  1012,  1996,
          3193,  2001,  3052, 17904,  1010,  2007,  1996,  3103,  4292,  2058,
          1996,  9154,  1998,  9179,  1037,  3585,  8652,  2006,  1996,  4153,
          5975,  2917,  1012,  2016,  2371,  1037,  5481,  1997,  6699,  1010,
          1037,  4666,  1997, 15180,  1998, 10089,  1012,  2023,  2001,  1996,
          2173,  2073,  2016,  2018,  2034,  2777,  2032,  1010,  2073,  2037,
          2293,  2466,  2018,  5625,  1012,  2021,  2085,  1010,  3061,  2894,
          1010,  1996,  5758,  2716,  1037, 28642,  2063,  1997,  3571,  1012,
          2052,  2016,  2412,  2514,  2008,  2785,  1997,  2293,  2153,  1029,
          1996,  3612,  3990,  2105,  2014,  1010,  2004,  2065, 14328,  2014,
          2000,  2907,  2006,  2000,  3246,  1010,  2021,  1996,  5192,  1997,
          4797, 24358,  2312,  1010,  

In [47]:
# Perform inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted label
predictions = torch.argmax(outputs, dim=1)

# Map the predicted label to the emotion
label_map = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}
predicted_emotion = label_map[predictions.item()]

print(f'The emotion predicted for the sentence "{sentence}" is: {predicted_emotion}')
print (predictions)
print(outputs)

The emotion predicted for the sentence "As Sarah stood on the edge of the cliff, her heart pounded wildly in her chest. The view was breathtaking, with the sun setting over the horizon and casting a golden glow on the ocean waves below. She felt a rush of emotions, a mix of awe and anxiety. This was the place where she had first met him, where their love story had begun. But now, standing alone, the memories brought a tinge of fear. Would she ever feel that kind of love again? The wind whispered around her, as if urging her to hold on to hope, but the shadow of doubt loomed large, making her question if it was all just an illusion." is: fear
tensor([4], device='cuda:0')
tensor([[-0.2016,  0.6204, -1.6937, -1.5064,  3.2784, -0.7354]],
       device='cuda:0')


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertClassifier(num_labels=6)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Load the fine-tuned model weights
model.load_state_dict(torch.load(r'/kaggle/working/bert_classifier2.pt'), strict=False)

In [None]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor)

print("\nLoaded state_dict:")
loaded_state_dict = torch.load(r'/kaggle/working/bert_classifier.pt')
for param_tensor in loaded_state_dict:
    print(param_tensor)

In [72]:
model1=torch.load('/kaggle/working/bert_classifier_model.pt')
model1

DataParallel(
  (module): BertClassifier(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)


In [73]:
model1.eval()

# Example sentence
sentence = "As Sarah stood on the edge of the cliff, her heart pounded wildly in her chest. The view was breathtaking, with the sun setting over the horizon and casting a golden glow on the ocean waves below. She felt a rush of emotions, a mix of awe and anxiety. This was the place where she had first met him, where their love story had begun. But now, standing alone, the memories brought a tinge of fear. Would she ever feel that kind of love again? The wind whispered around her, as if urging her to hold on to hope, but the shadow of doubt loomed large, making her question if it was all just an illusion."

# Tokenize the input sentence
inputs = tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=128)

with torch.no_grad():
    outputs = model1(**inputs)

# Get the predicted label
predictions = torch.argmax(outputs, dim=1)

# Map the predicted label to the emotion
label_map = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}
predicted_emotion = label_map[predictions.item()]

print(f'The emotion predicted for the sentence "{sentence}" is: {predicted_emotion}')
print (predictions)
print(outputs)

The emotion predicted for the sentence "As Sarah stood on the edge of the cliff, her heart pounded wildly in her chest. The view was breathtaking, with the sun setting over the horizon and casting a golden glow on the ocean waves below. She felt a rush of emotions, a mix of awe and anxiety. This was the place where she had first met him, where their love story had begun. But now, standing alone, the memories brought a tinge of fear. Would she ever feel that kind of love again? The wind whispered around her, as if urging her to hold on to hope, but the shadow of doubt loomed large, making her question if it was all just an illusion." is: fear
tensor([4], device='cuda:0')
tensor([[-0.8899,  1.3056, -1.2069, -2.2569,  2.4537,  0.0702]],
       device='cuda:0')
