In [1]:
# Basic imports
import json
import numpy as np
import pandas as pd

# PyTorch imports
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Hugging Face Transformers imports
from transformers import BertModel, BertTokenizer, BertForTokenClassification, AdamW, BertConfig

# Scikit-learn metrics for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# TQDM for progress bars
from tqdm.notebook import tqdm


class Bert_BiDirectional_LSTM(nn.Module):
    def __init__(self, bert_model_name, num_labels, hidden_dim=768, lstm_layers=1, dropout=0.1):
        super(Bert_BiDirectional_LSTM, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        self.lstm = nn.LSTM(input_size=hidden_dim, 
                            hidden_size=hidden_dim, 
                            num_layers=lstm_layers, 
                            bidirectional=True, 
                            batch_first=True,
                            dropout=dropout if lstm_layers > 1 else 0)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)  

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = self.dropout(lstm_output)

        logits = self.fc(lstm_output)

        return logits

In [2]:
def load_data_to_dataframe(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    df = pd.DataFrame(data)
    return df


class CONFIG: 
    EPS = 1e-8 
    EPOCHS = 5 # 3~5
    BATCH_SIZE = 16 # 8, 32
    LEARNING_RATE = 2e-4 # 1e-5
    MAX_LENGTH = 512 # 256
    BERT_MODEL_NAME = 'bert-base-uncased' # large, RoBERTa, DeBERTa
    DEVICE_NAME = "mps" # Cuda or alternative

'''BERTBASE (L=12, H=768, A=12, Total Param- eters=110M) and 
   BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).'''

'BERTBASE (L=12, H=768, A=12, Total Param- eters=110M) and \n   BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).'

In [3]:
train_df = load_data_to_dataframe('semeval_train.txt')
val_df = load_data_to_dataframe('semeval_val.txt')
test_df = load_data_to_dataframe('semeval_test.txt')

In [5]:
tokenizer = BertTokenizer.from_pretrained(CONFIG.BERT_MODEL_NAME)

def tokenize_and_align_labels(df, tokenizer, label_map, max_length=CONFIG.MAX_LENGTH):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    label_ids = []

    for _, row in df.iterrows():
        text = row['token']
        labels = ['O']*len(text)  

        h_start, h_end = row['h']['pos']
        t_start, t_end = row['t']['pos']
        labels[h_start:h_end] = ['H']*(h_end - h_start)
        labels[t_start:t_end] = ['T']*(t_end - t_start)

        encoded_dict = tokenizer.encode_plus(text,
                                             add_special_tokens=True,      
                                             max_length=max_length,       
                                             padding='max_length',
                                             return_attention_mask=True,  
                                             is_split_into_words=True,
                                             return_tensors='pt')

        numeric_labels = [label_map[label] for label in labels]
        numeric_labels = [label_map['O']] + numeric_labels[:max_length-2] + [label_map['O']]*(max_length - len(numeric_labels) - 1)

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        token_type_ids.append(encoded_dict['token_type_ids'])
        label_ids.append(torch.tensor(numeric_labels))

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    label_ids = torch.stack(label_ids, dim=0)
    return input_ids, attention_masks, token_type_ids, label_ids

label_map = {'O': 0, 'H': 1, 'T': 2}  

train_inputs, train_masks, train_type_ids, train_labels = tokenize_and_align_labels(train_df, tokenizer, label_map)
val_inputs, val_masks, val_type_ids, val_labels = tokenize_and_align_labels(val_df, tokenizer, label_map)
test_inputs, test_masks, test_type_ids, test_labels = tokenize_and_align_labels(test_df, tokenizer, label_map)

In [6]:
model = Bert_BiDirectional_LSTM(CONFIG.BERT_MODEL_NAME, num_labels = 3,)
device = torch.device(CONFIG.DEVICE_NAME)
model.to(device)

Bert_BiDirectional_LSTM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [7]:
batch_size = CONFIG.BATCH_SIZE
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [8]:
torch.mps.empty_cache()

optimizer = AdamW(model.parameters(), 
                  lr=CONFIG.LEARNING_RATE, 
                  eps=CONFIG.EPS)

epochs = CONFIG.EPOCHS



In [9]:
def evaluate_model(test_dataloader, model, label_map):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs  
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        batch_predictions = np.argmax(logits, axis=2)
        
        predictions.extend([list(p) for p in batch_predictions])
        true_labels.extend(label_ids)

    flat_predictions, flat_true_labels = [], []
    for prediction, true_label in zip(predictions, true_labels):
        for p, t in zip(prediction, true_label):
            if t != label_map['O']:  
                flat_predictions.append(p)
                flat_true_labels.append(t)
    
    accuracy = accuracy_score(flat_true_labels, flat_predictions)
    precision = precision_score(flat_true_labels, flat_predictions, average='weighted', zero_division=0)
    recall = recall_score(flat_true_labels, flat_predictions, average='weighted', zero_division=0)
    f1 = f1_score(flat_true_labels, flat_predictions, average='weighted', zero_division=0)
    report = classification_report(flat_true_labels, flat_predictions, target_names=label_map.keys(), zero_division=0)

    print("Classification Report:\n", report)

    return accuracy, precision, recall, f1


In [10]:
epoch_stats = {
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1_score": [],
    "train_loss": [],
    "val_loss": []
}

In [11]:
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=CONFIG.BATCH_SIZE)

In [12]:
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss()

for epoch in tqdm(range(epochs)):
    print(f"-- Epoch {epoch + 1} Started --")

    # Training Phase
    model.train()
    total_loss = 0
    for step, batch in tqdm(enumerate(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()

        logits = model(b_input_ids, attention_mask=b_input_mask)
        loss = loss_fn(logits.view(-1, 3), b_labels.view(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}: Average Training Loss: {avg_train_loss}')
    
    # Validation Phase
    model.eval()
    total_eval_loss = 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            logits = model(b_input_ids, attention_mask=b_input_mask)
            loss = loss_fn(logits.view(-1, 3), b_labels.view(-1))
            total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f'Epoch {epoch + 1}: Average Validation Loss: {avg_val_loss}')

        # Test Phase
    test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(test_dataloader, model, label_map
                                                                         
    epoch_stats["accuracy"].append(test_accuracy)
    epoch_stats["precision"].append(test_precision)
    epoch_stats["recall"].append(test_recall)
    epoch_stats["f1_score"].append(test_f1)
    epoch_stats["train_loss"].append(avg_train_loss)
    epoch_stats["val_loss"].append(avg_val_loss)

    print(f'Epoch {epoch}: Test Metrics: Accuracy={test_accuracy}, Precision={test_precision}, Recall={test_recall}, F1 Score={test_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

-- Epoch 1 Started --


0it [00:00, ?it/s]

Epoch 1: Average Training Loss: 0.016093471394142442
Epoch 1: Average Validation Loss: 0.007846244427553833


  0%|          | 0/170 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
           H       0.97      0.67      0.79      2789
           T       0.91      0.50      0.65      2874

    accuracy                           0.58      5663
   macro avg       0.63      0.39      0.48      5663
weighted avg       0.94      0.58      0.72      5663

Epoch 0: Test Metrics: Accuracy=0.5813173229736889, Precision=0.9440373635840386, Recall=0.5813173229736889, F1 Score=0.7174736662760671
-- Epoch 2 Started --


0it [00:00, ?it/s]

Epoch 2: Average Training Loss: 0.006434023950056743
Epoch 2: Average Validation Loss: 0.007044660160318017


  0%|          | 0/170 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
           H       0.98      0.68      0.80      2789
           T       0.93      0.51      0.66      2874

    accuracy                           0.59      5663
   macro avg       0.64      0.39      0.49      5663
weighted avg       0.95      0.59      0.73      5663

Epoch 1: Test Metrics: Accuracy=0.5899699805756666, Precision=0.95433328134315, Recall=0.5899699805756666, F1 Score=0.7267476349405455
-- Epoch 3 Started --


0it [00:00, ?it/s]

Epoch 3: Average Training Loss: 0.004764049011053797
Epoch 3: Average Validation Loss: 0.007217457295058572


  0%|          | 0/170 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
           H       0.97      0.69      0.81      2789
           T       0.94      0.55      0.70      2874

    accuracy                           0.62      5663
   macro avg       0.64      0.41      0.50      5663
weighted avg       0.96      0.62      0.75      5663

Epoch 2: Test Metrics: Accuracy=0.619283065512979, Precision=0.9582539971295336, Recall=0.619283065512979, F1 Score=0.7506796850347005
-- Epoch 4 Started --


0it [00:00, ?it/s]

Epoch 4: Average Training Loss: 0.004058925231005676
Epoch 4: Average Validation Loss: 0.008121631207301262


  0%|          | 0/170 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
           H       0.96      0.75      0.85      2789
           T       0.94      0.63      0.75      2874

    accuracy                           0.69      5663
   macro avg       0.63      0.46      0.53      5663
weighted avg       0.95      0.69      0.80      5663

Epoch 3: Test Metrics: Accuracy=0.6899170051209607, Precision=0.9515085813917117, Recall=0.6899170051209607, F1 Score=0.7987299581863723
-- Epoch 5 Started --


0it [00:00, ?it/s]

Epoch 5: Average Training Loss: 0.0037710621340328447
Epoch 5: Average Validation Loss: 0.008757868234286124


  0%|          | 0/170 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
           H       0.95      0.68      0.79      2789
           T       0.95      0.56      0.70      2874

    accuracy                           0.62      5663
   macro avg       0.63      0.41      0.50      5663
weighted avg       0.95      0.62      0.75      5663

Epoch 4: Test Metrics: Accuracy=0.6169874624757196, Precision=0.9505699955933502, Recall=0.6169874624757196, F1 Score=0.7467542855596536
