In [102]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import BertTokenizerFast, BertModel
import datasets
from gensim.models import Word2Vec
import numpy as np
from seqeval.metrics import f1_score, classification_report
from collections import Counter
from tqdm import tqdm
import os

dataset = datasets.load_dataset('surrey-nlp/PLOD-CW-25')
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

dataset_ext = datasets.load_dataset('surrey-nlp/PLODv2-filtered')
train_data_ext = dataset_ext['train']
val_data_ext = dataset_ext['validation']
test_data_ext = dataset_ext['test']

In [104]:
%%time
#The original glove file is 300mb, but we only need a portion of the words, search them and save them in NPZ format
def convert_glove_to_npz(glove_path, output_npz_path):
    word_counts = Counter()
    for sentence in train_data:
        word_counts.update(sentence['tokens'])
    for sentence in train_data_ext:
        word_counts.update(sentence['tokens'])
    vocab_all = ['<PAD>', '<UNK>'] + [word for word, count in sorted(word_counts.items(), key = lambda x:-x[1])]#sort by frequency
    word2id_all = {word: idx for idx, word in enumerate(vocab_all)}

    words = {}
    vectors = np.zeros((len(vocab_all), 100), dtype=np.float32)
    count = 0
    with open(glove_path, 'r', encoding='utf-8') as f:
        for idx, line in tqdm(enumerate(f), total=400000, desc="Processing"):
            row = line.rstrip().split()
            if row[0] in vocab_all:
                count += 1
                words[row[0]] = word2id_all[row[0]]
                vectors[word2id_all[row[0]]] = np.array(row[1:], dtype=np.float32)

    np.savez_compressed(
        output_npz_path,
        words=words,
        vectors=vectors
    )
    print(f"Saved compressed GloVe to {output_npz_path}, found {count}/{len(vocab_all)} words")

#convert_glove_to_npz("glove.6B.100d.txt", "glove.6B.100d.PLOD-CW-25.npz")

CPU times: total: 0 ns
Wall time: 0 ns


In [106]:
#Init
class Config:
    GLOVE_NPZ_PATH = 'glove.6B.100d.PLOD-CW-25.npz'
    EMBEDDING_VEC = 100
    HIDDEN = 128
    BATCH_SIZE = 32
    EPOCHS = 1
    BERT_MODEL_NAME = 'bert-base-cased'
    NUM_LSTM_LAYERS = 2
    NUM_RNN_LAYERS = 2
    MAX_LENGTH = 128
    LEARNING_RATE = 1e-5
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [108]:
#Build vocabulary, convert to ID
def build_vocab_id(data):
    word_counts = Counter()
    for sentence in data:
        word_counts.update(sentence['tokens'])
    vocab = ['<PAD>', '<UNK>'] + [word for word, count in sorted(word_counts.items(), key = lambda x:-x[1])]#sort by frequency
    word2id = {word: idx for idx, word in enumerate(vocab)}
    return vocab, word2id

vocab, word2id = build_vocab_id(train_data)

#Convert ner_tags to ID
ner_labels = set()
for split in [train_data, val_data, test_data]:
    for data in split:
        ner_labels.update(data['ner_tags'])
ner_tag2id = {tag: idx for idx, tag in enumerate(ner_labels)}
id2ner = {v: k for k, v in ner_tag2id.items()}

In [112]:
#Word2Vec
def build_w2v_matrix(word2id, w2v_model):
    embedding_w2v_matrix = np.zeros((len(vocab), Config.EMBEDDING_VEC))
    for word, idx in word2id.items():
        if word in w2v_model.wv:
            embedding_w2v_matrix[idx] = w2v_model.wv[word]
        elif word == '<PAD>':
            embedding_w2v_matrix[idx] = np.zeros(Config.EMBEDDING_VEC)
        else:
            embedding_w2v_matrix[idx] = np.random.normal(size=(Config.EMBEDDING_VEC,))
    return embedding_w2v_matrix

w2v_model = Word2Vec(train_data['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_matrix = build_w2v_matrix(word2id, w2v_model)

In [114]:
%%time
#GloVe
def load_compressed_glove(npz_path):
    data = np.load(npz_path, mmap_mode='r', allow_pickle=True)
    return data['words'], data['vectors']

glove_words, glove_vectors = load_compressed_glove(Config.GLOVE_NPZ_PATH)

def build_glove_matrix(word2id, glove_words, glove_vectors):
    embedding_glove_matrix = np.zeros((len(word2id), Config.EMBEDDING_VEC))
    for word, idx in word2id.items():
        if word in glove_words:
            embedding_glove_matrix[idx] = glove_vectors[glove_words['word']]
        elif word == '<PAD>':
            embedding_glove_matrix[idx] = np.zeros(Config.EMBEDDING_VEC)
        else:
            embedding_glove_matrix[idx] = np.random.normal(size=(Config.EMBEDDING_VEC,))
    return embedding_glove_matrix

embedding_glove_matrix = build_glove_matrix(word2id, glove_words, glove_vectors)

CPU times: total: 500 ms
Wall time: 485 ms


In [116]:
#BERT
tokenizer = BertTokenizerFast.from_pretrained(Config.BERT_MODEL_NAME)

In [118]:
#Custom dataset
class NER_Dataset(Dataset):
    def __init__(self, data, embedding_type = None, tokenizer = None):
        self.data = data
        self.embedding_type = embedding_type
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sentence = self.data[idx]
        tokens = sentence['tokens']
        ner_tags = sentence['ner_tags']

        if self.embedding_type == 'BERT':#BERT
            encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=Config.MAX_LENGTH,
            padding=False
            )

            word_ids = encoding.word_ids()
            labels = []
            current_word_id = None
            
            for word_id in word_ids:
                if word_id is None:
                    labels.append(-100)
                else:
                    if word_id != current_word_id:
                        labels.append(ner_tag2id[ner_tags[word_id]])
                        current_word_id = word_id
                    else:
                        labels.append(-100)
    
            return {
                'input_ids': encoding['input_ids'],
                'attention_mask': encoding['attention_mask'],
                'labels': labels
            }
        else:
            token_ids = [word2id.get(token, word2id['<UNK>']) for token in tokens]
            ner_ids = [ner_tag2id[tag] for tag in ner_tags]
            
            return {
                'token_ids': torch.LongTensor(token_ids),
                'ner_ids': torch.LongTensor(ner_ids),
                'lengths': len(token_ids)
            }

#Custom collate fuction
def collate_fn(batch):
    if 'attention_mask' in batch[0]:#BERT
        max_length = max(len(x['input_ids']) for x in batch)
    
        input_ids = []
        attention_masks = []
        labels = []
        
        for x in batch:
            pad_length = max_length - len(x['input_ids'])
            input_ids.append(x['input_ids'] + [tokenizer.pad_token_id] * pad_length)
            attention_masks.append(x['attention_mask'] + [0] * pad_length)
            labels.append(x['labels'] + [-100] * pad_length)
        
        return {
            'input_ids': torch.LongTensor(input_ids).to(Config.DEVICE),
            'attention_mask': torch.LongTensor(attention_masks).to(Config.DEVICE),
            'labels': torch.LongTensor(labels).to(Config.DEVICE)
        }
    else:
        token_ids = [x['token_ids'] for x in batch]
        ner_ids = [x['ner_ids'] for x in batch]
        lengths = [x['lengths'] for x in batch]
        
        sorted_indices = np.argsort(lengths)[::-1]
        token_ids = [token_ids[i] for i in sorted_indices]
        ner_ids = [ner_ids[i] for i in sorted_indices]
        lengths = [lengths[i] for i in sorted_indices]
    
        #Fill to the same length
        token_ids = pad_sequence(token_ids, batch_first=True, padding_value=word2id['<PAD>'])
        ner_ids = pad_sequence(ner_ids, batch_first=True, padding_value=-100)
        
        return {
            'token_ids': token_ids.to(Config.DEVICE),
            'ner_ids': ner_ids.to(Config.DEVICE),
            'lengths': torch.LongTensor(lengths).to(Config.DEVICE)
        }

In [120]:
#Word2Vec&GloVe
train_dataset = NER_Dataset(train_data)
val_dataset = NER_Dataset(val_data)
test_dataset = NER_Dataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)

#BERT
train_dataset_bert = NER_Dataset(train_data, 'BERT', tokenizer)
val_dataset_bert = NER_Dataset(val_data, 'BERT', tokenizer)
test_dataset_bert = NER_Dataset(test_data, 'BERT', tokenizer)

train_loader_bert = DataLoader(train_dataset_bert, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader_bert = DataLoader(val_dataset_bert, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)

In [122]:
#Init LSTM model
class NER_Model(nn.Module):
    def __init__(self, embedding_type, model_type='LSTM'):
        super().__init__()
        self.embedding_type = embedding_type
        self.model_type = model_type

        if self.embedding_type == 'Word2Vec':
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_w2v_matrix),
                padding_idx=word2id['<PAD>']
            )
            self.lstm = nn.LSTM(Config.EMBEDDING_VEC, Config.HIDDEN, bidirectional=True, num_layers=Config.NUM_LSTM_LAYERS, dropout=0.3)
        elif self.embedding_type == 'GloVe':
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_glove_matrix),
                padding_idx=word2id['<PAD>']
            )
            self.lstm = nn.LSTM(Config.EMBEDDING_VEC, Config.HIDDEN, bidirectional=True, num_layers=Config.NUM_LSTM_LAYERS, dropout=0.3)
        elif self.embedding_type == 'BERT':
            self.bertmodel = BertModel.from_pretrained(Config.BERT_MODEL_NAME)
            self.lstm = nn.LSTM(
            input_size=self.bertmodel.config.hidden_size,
            hidden_size=Config.HIDDEN,
            num_layers=Config.NUM_LSTM_LAYERS,
            bidirectional=True,
            batch_first=True,
            dropout=0.3
            )
            self.rnn = nn.RNN(
            input_size=self.bertmodel.config.hidden_size,
            hidden_size=Config.HIDDEN,
            num_layers=Config.NUM_RNN_LAYERS,
            bidirectional=True,
            batch_first=True,
            nonlinearity='relu',
            dropout=0.3
            )
        self.fc = nn.Linear(Config.HIDDEN*2, len(ner_tag2id))
        
    def forward(self, token_ids=None, lengths=None, input_ids=None, attention_mask=None):
        if self.embedding_type == 'BERT':
            outputs = self.bertmodel(input_ids=input_ids, attention_mask=attention_mask)
            sequence_output = outputs.last_hidden_state
            if self.model_type == 'LSTM':
                output, _ = self.lstm(sequence_output)
            else:
                output, _ = self.rnn(sequence_output)
            return self.fc(output)
        else:
            emb_tokens = self.embedding(token_ids)
            packed = pack_padded_sequence(emb_tokens, lengths.cpu(), batch_first=True, enforce_sorted=False)
            output, _ = self.lstm(packed)
            output, _ = pad_packed_sequence(output, batch_first=True)
            return self.fc(output)

In [None]:
def eval_ner(y_true, y_pred, labels):
    report = classification_report(y_true, y_pred, output_dict=True)
    precision = report['micro avg']['precision']
    recall = report['micro avg']['recall']
    f1 = report['micro avg']['f1-score']

    return precision, recall, f1

In [138]:
#Init training
def training(model, model_name, embedding_type, train_loader, val_loader):
    best_f1 = -1
    for epoch in range(Config.EPOCHS):
        #Train
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            if embedding_type == 'BERT':#BERT
                logits = model(None, None, batch['input_ids'], batch['attention_mask'])
                loss = criterion(logits.view(-1, len(ner_tag2id)), batch['labels'].view(-1))
            else:
                logits = model(batch['token_ids'], batch['lengths'], None, None)
                loss = criterion(logits.view(-1, len(ner_tag2id)), batch['ner_ids'].view(-1))         
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        #Evaluate
        model.eval()
        val_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
                if embedding_type == 'BERT':#BERT
                    logits = model(None, None, batch['input_ids'], batch['attention_mask'])
                    loss = criterion(logits.view(-1, len(ner_tag2id)), batch['labels'].view(-1))
                    val_loss += loss.item()
                    
                    preds = torch.argmax(logits, dim=-1).cpu().numpy()
                    labels = batch['labels'].cpu().numpy()

                    for i in range(preds.shape[0]):
                        valid_labels = []
                        valid_preds = []
                        for j in range(preds.shape[1]):
                            if labels[i][j] != -100:
                                valid_labels.append(id2ner[labels[i][j]])
                                valid_preds.append(id2ner[preds[i][j]])
                        all_labels.append(valid_labels)
                        all_preds.append(valid_preds)
                else:
                    logits = model(batch['token_ids'], batch['lengths'], None, None)
                    loss = criterion(logits.view(-1, len(ner_tag2id)), batch['ner_ids'].view(-1)) 
                    val_loss += loss.item()
                    
                    preds = torch.argmax(logits, dim=-1).cpu().numpy()
                    labels = batch['ner_ids'].cpu().numpy()

                    for i in range(len(batch['lengths'])):
                        valid_preds = []
                        valid_labels = []
                        length = batch['lengths'][i]
                        result_preds = preds[i, :length]
                        result_labels = labels[i, :length]
                        mask = result_labels != -100
                        for i in result_preds[mask].tolist():
                            valid_preds.append(id2ner[i])
                        for i in result_labels[mask].tolist():
                            valid_labels.append(id2ner[i])
                        all_preds.append(valid_preds)
                        all_labels.append(valid_labels)

        val_f1 = f1_score(all_labels, all_preds)
    
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), model_name)
            print('Best model saved!')
        
        print(f"Epoch {epoch+1}/{Config.EPOCHS}")
        print(f"Train Loss: {train_loss/len(train_loader):.4f}")
        print(f"Val Loss: {val_loss/len(val_loader):.4f}")
        print(f"Val F1: {val_f1:.4f}\n")

In [146]:
%%time
#BERT+LSTM
print('BERT+LSTM type training...')
model_bert_lstm = NER_Model('BERT','LSTM').to(Config.DEVICE)
optimizer = optim.Adam(model_bert_lstm.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)
training(model_bert_lstm,'best_bert_LSTM.pth', 'BERT', train_loader_bert, val_loader_bert)

BERT+LSTM type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:11<00:00,  5.56it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.87it/s]


Best model saved!
Epoch 1/1
Train Loss: 0.9113
Val Loss: 0.9216
Val F1: 0.0000

CPU times: total: 12.8 s
Wall time: 17.3 s


In [148]:
%%time
#BERT
print('BERT+RNN type training...')
model_bert_rnn = NER_Model('BERT','RNN').to(Config.DEVICE)
optimizer = optim.Adam(model_bert_rnn.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)
training(model_bert_rnn,'best_bert_RNN.pth', 'BERT', train_loader_bert, val_loader_bert)

BERT+RNN type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:11<00:00,  5.50it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.22it/s]


Best model saved!
Epoch 1/1
Train Loss: 0.9098
Val Loss: 0.9216
Val F1: 0.0000

CPU times: total: 13 s
Wall time: 19.5 s


In [132]:
%%time
#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)
training(model_w2v,'best_w2v_LSTM.pth', 'Word2Vec', train_loader, val_loader)

Word2Vec type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:05<00:00, 10.93it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 23.55it/s]


Best model saved!
Epoch 1/1
Train Loss: 1.3556
Val Loss: 1.3365
Val F1: 0.0536

CPU times: total: 8.84 s
Wall time: 6.12 s


In [134]:
%%time
#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)
training(model_glove, 'best_glove_LSTM.pth', 'GloVe', train_loader, val_loader)

GloVe type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:05<00:00, 11.22it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 24.47it/s]


Best model saved!
Epoch 1/1
Train Loss: 1.3991
Val Loss: 1.3801
Val F1: 0.1033

CPU times: total: 8.94 s
Wall time: 5.97 s


In [154]:
def testing(model, model_name, embedding_type, test_loader):
    model.load_state_dict(torch.load(model_name))
    model.eval()
    
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            if embedding_type == 'BERT':#BERT
                logits = model(None, None, batch['input_ids'], batch['attention_mask'])
                preds = torch.argmax(logits, dim=-1).cpu().numpy()
                labels = batch['labels'].cpu().numpy()

                for i in range(preds.shape[0]):
                    valid_labels = []
                    valid_preds = []
                    for j in range(preds.shape[1]):
                        if labels[i][j] != -100:
                            valid_labels.append(id2ner[labels[i][j]])
                            valid_preds.append(id2ner[preds[i][j]])
                    all_labels.append(valid_labels)
                    all_preds.append(valid_preds)
            else:
                logits = model(batch['token_ids'], batch['lengths'], None, None)
                preds = torch.argmax(logits, dim=-1).cpu().numpy()
                labels = batch['ner_ids'].cpu().numpy()

                for i in range(len(batch['lengths'])):
                    valid_preds = []
                    valid_labels = []
                    length = batch['lengths'][i]
                    result_preds = preds[i, :length]
                    result_labels = labels[i, :length]
                    mask = result_labels != -100
                    for i in result_preds[mask].tolist():
                        valid_preds.append(id2ner[i])
                    for i in result_labels[mask].tolist():
                        valid_labels.append(id2ner[i])
                    all_preds.append(valid_preds)
                    all_labels.append(valid_labels)
    
    test_f1 = f1_score(all_labels, all_preds)
    print(f"Test F1 Score: {test_f1:.4f}")
    report = classification_report(all_labels, all_preds)
    print(report)

In [156]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_LSTM.pth', 'Word2Vec', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_LSTM.pth', 'GloVe', test_loader)
print('BERT+LSTM type testing...')
testing(model_bert_lstm,'best_bert_LSTM.pth', 'BERT', test_loader_bert)
print('BERT+RNN type testing...')
testing(model_bert_rnn,'best_bert_RNN.pth', 'BERT', test_loader_bert)

Word2Vec type testing...


Testing: 100%|███████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 18.88it/s]


Test F1 Score: 0.0590
              precision    recall  f1-score   support

          AC       0.16      0.06      0.09       797
          LF       0.02      0.00      0.01       482

   micro avg       0.13      0.04      0.06      1279
   macro avg       0.09      0.03      0.05      1279
weighted avg       0.11      0.04      0.06      1279

GloVe type testing...


Testing: 100%|███████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 22.40it/s]


Test F1 Score: 0.0839
              precision    recall  f1-score   support

          AC       0.11      0.17      0.13       797
          LF       0.01      0.01      0.01       482

   micro avg       0.07      0.11      0.08      1279
   macro avg       0.06      0.09      0.07      1279
weighted avg       0.07      0.11      0.09      1279

BERT+LSTM type testing...


Testing: 100%|███████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 11.09it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test F1 Score: 0.0000
              precision    recall  f1-score   support

          AC       0.00      0.00      0.00       756
          LF       0.00      0.00      0.00       459

   micro avg       0.00      0.00      0.00      1215
   macro avg       0.00      0.00      0.00      1215
weighted avg       0.00      0.00      0.00      1215

BERT+RNN type testing...


Testing: 100%|███████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 14.53it/s]


Test F1 Score: 0.0052
              precision    recall  f1-score   support

          AC       0.04      0.00      0.01       756
          LF       0.00      0.05      0.01       459

   micro avg       0.00      0.02      0.01      1215
   macro avg       0.02      0.03      0.01      1215
weighted avg       0.03      0.02      0.01      1215

