<a href="https://colab.research.google.com/github/LawrenceLLY/GNN_Pun_Detection/blob/main/BERT_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW
from tqdm import tqdm
import csv
from google.colab import drive
# Mount Google Drive at /content/drive: the dataset CSV is read from, and the
# model checkpoint is written to, paths under this mount point later on.
drive.mount('/content/drive')

Mounted at /content/drive


## The difference between using ***nn.BCEWithLogitsLoss*** and ***nn.BCELoss***:

**nn.BCEWithLogitsLoss**: This loss function combines the sigmoid activation function and the binary cross-entropy loss in a numerically stable way. When using this loss function, you should provide the **raw logits** (i.e., the output of the model before applying the sigmoid activation) as input to the loss function. In other words, you **should not** include the nn.Sigmoid layer in your model.

**nn.BCELoss**: This loss function assumes that the input is already transformed by the sigmoid activation function. When using this loss function, you should provide the **probabilities** (i.e., the output of the model after applying the sigmoid activation) as input to the loss function. In other words, you **should** include the nn.Sigmoid layer in your model.

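**In this task**: We use `nn.BCEWithLogitsLoss()` rather than `nn.BCELoss()`. It fuses the sigmoid activation and the binary cross-entropy loss into a single operation, which is more numerically stable than applying `nn.Sigmoid` followed by `nn.BCELoss` and therefore makes training more reliable. As a consequence, the model below outputs one **raw logit** per sentence, and a sigmoid is applied only when logits are converted into predictions.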

In [None]:
# Define the BERT-BiLSTM model
class BertBiLSTM(nn.Module):
    """BERT encoder followed by an LSTM and a single-logit linear head.

    The model returns one raw logit per sequence (no sigmoid is applied), so it
    is meant to be trained with ``nn.BCEWithLogitsLoss``, which applies the
    sigmoid internally in a numerically stable way.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Accepted for interface compatibility but not used; the
            head always emits exactly one logit.
        hidden_dim: Hidden size of the LSTM (per direction).
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability used between LSTM layers (only when
            ``num_layers > 1``) and in front of the classifier.
    """

    def __init__(self, bert_model_name, num_classes, hidden_dim, num_layers, bidirectional, dropout):
        super(BertBiLSTM, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # BERT is fine-tuned together with the LSTM. To freeze it instead:
        #     for param in self.bert.parameters():
        #         param.requires_grad = False
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer LSTM.
            dropout=dropout if num_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout)
        # One output unit: binary classification through a single logit.
        self.classifier = nn.Linear(hidden_dim * (2 if bidirectional else 1), 1)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape ``(batch, 1)``.

        Args:
            input_ids: Token ids, ``(batch, seq_len)``.
            attention_mask: 1 for real tokens and 0 for padding,
                ``(batch, seq_len)``. Assumes right-padded sequences (real
                tokens first), which is how the dataset class in this
                notebook builds them.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_output['last_hidden_state']

        # Sequences are padded to a fixed length, so the last time step is a
        # padding position for every shorter sentence, and the backward LSTM
        # has only seen that single position there. Packing makes the LSTM
        # stop at each sentence's true length, so the final hidden states
        # summarise exactly the real tokens in both directions.
        # pack_padded_sequence needs CPU lengths >= 1.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed_input = nn.utils.rnn.pack_padded_sequence(
            sequence_output, lengths, batch_first=True, enforce_sorted=False
        )
        _, (final_hidden, _) = self.lstm(packed_input)

        # final_hidden is (num_layers * num_directions, batch, hidden_dim);
        # the last two entries belong to the top layer (forward, backward).
        if self.lstm.bidirectional:
            pooled_output = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        else:
            pooled_output = final_hidden[-1]

        dropped_output = self.dropout(pooled_output)
        # No sigmoid here: nn.BCEWithLogitsLoss expects raw logits.
        logits = self.classifier(dropped_output)
        return logits


In [None]:
# Define the dataset class
class SentenceDataset(Dataset):
    """Map-style dataset that tokenizes a sentence each time an item is read.

    Every item is a dict holding the flattened ``input_ids`` and
    ``attention_mask`` produced by ``tokenizer.encode_plus`` plus the label as
    a ``torch.long`` tensor.
    """

    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences, self.labels = sentences, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The number of items is driven by the sentence list.
        return len(self.sentences)

    def __getitem__(self, idx):
        text, target = self.sentences[idx], self.labels[idx]

        # Pad/truncate to a fixed length so items can be stacked into batches.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tokenizer returns tensors with a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(target, dtype=torch.long)
        return item

In [None]:
# Seed PyTorch's random number generators before anything random happens
# (weight initialisation of the LSTM/classifier, dataset split, shuffling),
# so that "Restart & Run All" reproduces the same run.
SEED = 42
torch.manual_seed(SEED)

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model parameters
bert_model_name = 'bert-base-uncased'
# NOTE(review): num_classes is passed to the model but the model head always
# emits a single logit, so this value currently has no effect.
num_classes = 2
hidden_dim = 128
num_layers = 2
bidirectional = True
dropout = 0.3

# Dataset parameters
max_length = 50   # sentences are padded/truncated to this many tokens
batch_size = 32

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Initialize the model
model = BertBiLSTM(bert_model_name, num_classes, hidden_dim, num_layers, bidirectional, dropout).to(device)

# Use this path to save the model
model_path = "/content/drive/My Drive/my_PT_model.pt"  # Choose your desired path and filename

# Training parameters
num_epochs = 5
learning_rate = 2e-5
# NOTE(review): weight_decay is defined but not handed to the optimizer below;
# it only takes effect if the optimizer is switched to one that receives it
# (e.g. AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)).
weight_decay = 1e-2

# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Set up the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Sentences and their binary labels, filled from the CSV below.
texts_PTD = []
labels_PTD = []

# Open with newline='' as the csv module requires, so quoted fields that
# contain line breaks are parsed correctly; the encoding is stated explicitly
# instead of depending on the platform default.
with open("/content/drive/My Drive/puns_pos_neg_data.csv", mode='r', newline='', encoding='utf-8') as file:

    csvFile = csv.reader(file)

    # Skip the header row up front instead of appending it and deleting it later.
    next(csvFile, None)

    for line in csvFile:
        if not line:
            continue  # csv.reader yields an empty list for blank lines
        # Column 0 is the label ("-1" maps to class 0, anything else to class 1),
        # column 1 is the sentence text.
        labels_PTD.append(0 if line[0] == "-1" else 1)
        texts_PTD.append(line[1])

In [None]:
# Create the dataset and dataloader
assert len(texts_PTD) == len(labels_PTD), "every sentence needs exactly one label"
dataset = SentenceDataset(sentences=texts_PTD, labels=labels_PTD, tokenizer=tokenizer, max_length=max_length)

# Split the dataset into training (80%) and validation (20%) sets.
val_size = int(0.2 * len(labels_PTD))
train_size = len(labels_PTD) - val_size

# A dedicated, seeded generator keeps the split identical across re-runs.
# Without it, the validation set changes on every run, so a checkpoint
# reloaded in a later session could be evaluated on sentences it trained on.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator)

# Create DataLoaders for each set with a batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def _run_epoch(dataloader, training):
    """Run one pass over ``dataloader`` and return ``(mean batch loss, accuracy)``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. When ``training`` is true, the model is put in train mode and
    the weights are updated after every batch; otherwise the model is put in
    eval mode and gradients are disabled.
    """
    model.train(training)
    total_loss, num_correct, num_samples = 0, 0, 0

    with torch.set_grad_enabled(training):
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # BCEWithLogitsLoss needs float targets shaped like the logits: (batch, 1).
            labels = batch['label'].unsqueeze(1).float().to(device)

            # Forward pass
            logits = model(input_ids, attention_mask)

            # Compute the loss
            loss = criterion(logits, labels)

            if training:
                # Backward pass and weight update
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            total_loss += loss.item()

            # A sentence is predicted positive when its probability exceeds 0.5.
            preds = (torch.sigmoid(logits) > 0.5).float()
            num_correct += (preds == labels).sum().item()
            num_samples += labels.size(0)

    return total_loss / len(dataloader), num_correct / num_samples


# Training loop
best_val_loss = float('inf')
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}:')

    train_avg_loss, train_accuracy = _run_epoch(train_dataloader, training=True)
    print(f'Training Loss: {train_avg_loss:.4f} - Training Accuracy: {train_accuracy:.4f}')

    # Evaluate the model on the validation set
    val_avg_loss, val_accuracy = _run_epoch(val_dataloader, training=False)
    print(f'Validation Loss: {val_avg_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}')

    # Keep only the checkpoint with the lowest validation loss, so a later,
    # more overfit epoch cannot overwrite a better earlier one.
    # Only the parameters (state_dict) are saved; the architecture has to be
    # recreated before loading them.
    if val_avg_loss < best_val_loss:
        best_val_loss = val_avg_loss
        torch.save(model.state_dict(), model_path)


Epoch 1/5:


100%|██████████| 121/121 [30:10<00:00, 14.97s/it]


Training Loss: 0.4755 - Training Accuracy: 0.8205


100%|██████████| 31/31 [02:26<00:00,  4.74s/it]


Validation Loss: 0.2960 - Validation Accuracy: 0.9130
Epoch 2/5:


100%|██████████| 121/121 [30:08<00:00, 14.95s/it]


Training Loss: 0.2151 - Training Accuracy: 0.9404


100%|██████████| 31/31 [02:26<00:00,  4.73s/it]


Validation Loss: 0.2556 - Validation Accuracy: 0.9098
Epoch 3/5:


100%|██████████| 121/121 [30:11<00:00, 14.97s/it]


Training Loss: 0.1080 - Training Accuracy: 0.9780


100%|██████████| 31/31 [02:28<00:00,  4.80s/it]


Validation Loss: 0.2878 - Validation Accuracy: 0.9098
Epoch 4/5:


100%|██████████| 121/121 [30:15<00:00, 15.00s/it]


Training Loss: 0.0767 - Training Accuracy: 0.9850


100%|██████████| 31/31 [02:25<00:00,  4.70s/it]


Validation Loss: 0.2492 - Validation Accuracy: 0.9264
Epoch 5/5:


100%|██████████| 121/121 [30:09<00:00, 14.95s/it]


Training Loss: 0.0624 - Training Accuracy: 0.9873


100%|██████████| 31/31 [02:23<00:00,  4.64s/it]

Validation Loss: 0.2462 - Validation Accuracy: 0.9254





In [None]:
# Recreate the model architecture with the same parameters
loaded_model = BertBiLSTM(bert_model_name, num_classes, hidden_dim, num_layers, bidirectional, dropout)

# Load the state dictionary from the saved file.
# map_location="cpu" lets a checkpoint that was saved from a GPU runtime be
# read on a CPU-only runtime too; load_state_dict then copies the values into
# the freshly created (CPU) model.
loaded_model.load_state_dict(torch.load(model_path, map_location="cpu"))

# Set the model to evaluation mode (disables dropout) for inference
loaded_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertBiLSTM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine