In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import datasets
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics import classification_report
from collections import Counter
from tqdm import tqdm

dataset = datasets.load_dataset('surrey-nlp/PLOD-CW-25')
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

dataset_ext = datasets.load_dataset('surrey-nlp/PLODv2-filtered')
train_data_ext = dataset_ext['train']
val_data_ext = dataset_ext['validation']
test_data_ext = dataset_ext['test']

In [4]:
%%time
#The original glove file is 300mb, but we only need a portion of the words, search them and save them in NPZ format
def convert_glove_to_npz(glove_path, output_npz_path):
    word_counts = Counter()
    for sentence in train_data:
        word_counts.update(sentence['tokens'])
    for sentence in train_data_ext:
        word_counts.update(sentence['tokens'])
    vocab_all = ['<PAD>', '<UNK>'] + [word for word, count in sorted(word_counts.items(), key = lambda x:-x[1])]#sort by frequency
    word2id_all = {word: idx for idx, word in enumerate(vocab_all)}

    words = {}
    vectors = np.zeros((len(vocab_all), 100), dtype=np.float32)
    count = 0
    with open(glove_path, 'r', encoding='utf-8') as f:
        for idx, line in tqdm(enumerate(f), total=400000, desc="Processing"):
            row = line.rstrip().split()
            if row[0] in vocab_all:
                count += 1
                words[row[0]] = word2id_all[row[0]]
                vectors[word2id_all[row[0]]] = np.array(row[1:], dtype=np.float32)

    np.savez_compressed(
        output_npz_path,
        words=words,
        vectors=vectors
    )
    print(f"Saved compressed GloVe to {output_npz_path}, found {count}/{len(vocab_all)} words")

#convert_glove_to_npz("glove.6B.100d.txt", "glove.6B.100d.PLOD-CW-25.npz")

CPU times: total: 0 ns
Wall time: 0 ns


In [6]:
#Init
class Config:
    GLOVE_NPZ_PATH = 'glove.6B.100d.PLOD-CW-25.npz'
    EMBEDDING_VEC = 100
    HIDDEN = 128
    BATCH_SIZE = 32
    EPOCHS = 20
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
#Build vocabulary, convert to ID
def build_vocab_id(data):
    word_counts = Counter()
    for sentence in data:
        word_counts.update(sentence['tokens'])
    vocab = ['<PAD>', '<UNK>'] + [word for word, count in sorted(word_counts.items(), key = lambda x:-x[1])]#sort by frequency
    word2id = {word: idx for idx, word in enumerate(vocab)}
    return vocab, word2id

vocab, word2id = build_vocab_id(train_data)

#Convert ner_tags to ID
ner_labels = set()
for split in [train_data, val_data, test_data]:
    for data in split:
        ner_labels.update(data['ner_tags'])
ner_tag2id = {tag: idx for idx, tag in enumerate(ner_labels)}
ner_tag2id['<PAD>'] = len(ner_tag2id)
id2ner = {v: k for k, v in ner_tag2id.items()}

In [10]:
#Word2Vec
def build_w2v_matrix(word2id, w2v_model):
    embedding_w2v_matrix = np.zeros((len(vocab), Config.EMBEDDING_VEC))
    for word, idx in word2id.items():
        if word in w2v_model.wv:
            embedding_w2v_matrix[idx] = w2v_model.wv[word]
        elif word == '<PAD>':
            embedding_w2v_matrix[idx] = np.zeros(Config.EMBEDDING_VEC)
        else:
            embedding_w2v_matrix[idx] = np.random.normal(size=(Config.EMBEDDING_VEC,))
    return embedding_w2v_matrix

w2v_model = Word2Vec(train_data['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_matrix = build_w2v_matrix(word2id, w2v_model)

In [12]:
%%time
#GloVe
def load_compressed_glove(npz_path):
    data = np.load(npz_path, mmap_mode='r', allow_pickle=True)
    return data['words'], data['vectors']

glove_words, glove_vectors = load_compressed_glove(Config.GLOVE_NPZ_PATH)

def build_glove_matrix(word2id, glove_words, glove_vectors):
    embedding_glove_matrix = np.zeros((len(word2id), Config.EMBEDDING_VEC))
    for word, idx in word2id.items():
        if word in glove_words:
            embedding_glove_matrix[idx] = glove_vectors[glove_words['word']]
        elif word == '<PAD>':
            embedding_glove_matrix[idx] = np.zeros(Config.EMBEDDING_VEC)
        else:
            embedding_glove_matrix[idx] = np.random.normal(size=(Config.EMBEDDING_VEC,))
    return embedding_glove_matrix

embedding_glove_matrix = build_glove_matrix(word2id, glove_words, glove_vectors)

CPU times: total: 453 ms
Wall time: 438 ms


In [14]:
#Custom dataset
class NER_Dataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sentence = self.data[idx]
        tokens = sentence['tokens']
        ner_tags = sentence['ner_tags']
        
        token_ids = [word2id.get(token, word2id['<UNK>']) for token in tokens]
        ner_ids = [ner_tag2id[tag] for tag in ner_tags]
        
        return {
            'token_ids': torch.LongTensor(token_ids),
            'ner_ids': torch.LongTensor(ner_ids),
            'lengths': len(token_ids)
        }

#Custom collate fuction
def collate_fn(batch):
    token_ids = [x['token_ids'] for x in batch]
    ner_ids = [x['ner_ids'] for x in batch]
    lengths = [x['lengths'] for x in batch]
    
    sorted_indices = np.argsort(lengths)[::-1]
    token_ids = [token_ids[i] for i in sorted_indices]
    ner_ids = [ner_ids[i] for i in sorted_indices]
    lengths = [lengths[i] for i in sorted_indices]

    #Fill to the same length
    token_ids = pad_sequence(token_ids, batch_first=True, padding_value=word2id['<PAD>'])
    ner_ids = pad_sequence(ner_ids, batch_first=True, padding_value=ner_tag2id['<PAD>'])
    
    return {
        'token_ids': token_ids.to(Config.DEVICE),
        'ner_ids': ner_ids.to(Config.DEVICE),
        'lengths': torch.LongTensor(lengths).to(Config.DEVICE)
    }

In [16]:
train_dataset = NER_Dataset(train_data)
val_dataset = NER_Dataset(val_data)
test_dataset = NER_Dataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)

In [18]:
#Init LSTM model
class NER_Model(nn.Module):
    def __init__(self, embedding_type):
        super().__init__()
        if embedding_type == 'Word2Vec':
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_w2v_matrix),
                padding_idx=word2id['<PAD>']
            )
        elif embedding_type == 'GloVe':
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_glove_matrix),
                padding_idx=word2id['<PAD>']
            )
        self.lstm = nn.LSTM(Config.EMBEDDING_VEC, Config.HIDDEN, bidirectional=True, num_layers=2, dropout=0.3)
        self.fc = nn.Linear(Config.HIDDEN*2, len(ner_tag2id))
        
    def forward(self, token_ids, lengths):
        emb_tokens = self.embedding(token_ids)
        packed = pack_padded_sequence(emb_tokens, lengths.cpu(), batch_first=True, enforce_sorted=False)
        output, _ = self.lstm(packed)
        output, _ = pad_packed_sequence(output, batch_first=True)
        return self.fc(output)

In [20]:
#Init training
def training(model, model_name, train_loader):
    best_f1 = 0
    for epoch in range(Config.EPOCHS):
        #Train
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            logits = model(batch['token_ids'], batch['lengths'])
            loss = criterion(logits.view(-1, len(ner_tag2id)), batch['ner_ids'].view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        #Evaluate
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                logits = model(batch['token_ids'], batch['lengths'])
                preds = torch.argmax(logits, dim=-1)
                
                for i in range(len(batch['lengths'])):
                    length = batch['lengths'][i]
                    valid_preds = preds[i, :length].cpu().numpy()
                    valid_labels = batch['ner_ids'][i, :length].cpu().numpy()
                    mask = valid_labels != ner_tag2id['<PAD>']
                    all_preds.extend(valid_preds[mask])
                    all_labels.extend(valid_labels[mask])

        target_names = [tag for tag in ner_tag2id if tag != '<PAD>']
        report = classification_report(all_labels, all_preds, target_names=target_names, zero_division=0)
        current_f1 = float(report.split('\n')[-2].split()[-2])
        print(f'Epoch {epoch+1}/{Config.EPOCHS}')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}')
        
        #Save best model, using F1-score
        if current_f1 > best_f1:
            best_f1 = current_f1
            torch.save(model.state_dict(), model_name)
            print('Best model saved!')
    print(f'Validation Report:\n{report}')

In [22]:
%%time
#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_w2v,'best_w2v_model.pth', train_loader)

Word2Vec type training...
Epoch 1/20
Train Loss: 0.7150
Best model saved!
Epoch 2/20
Train Loss: 0.3799
Best model saved!
Epoch 3/20
Train Loss: 0.3236
Best model saved!
Epoch 4/20
Train Loss: 0.2976
Epoch 5/20
Train Loss: 0.2818
Best model saved!
Epoch 6/20
Train Loss: 0.2720
Epoch 7/20
Train Loss: 0.2544
Epoch 8/20
Train Loss: 0.2509
Epoch 9/20
Train Loss: 0.2381
Epoch 10/20
Train Loss: 0.2215
Epoch 11/20
Train Loss: 0.2172
Epoch 12/20
Train Loss: 0.2060
Epoch 13/20
Train Loss: 0.1943
Epoch 14/20
Train Loss: 0.1812
Epoch 15/20
Train Loss: 0.1710
Epoch 16/20
Train Loss: 0.1617
Epoch 17/20
Train Loss: 0.1510
Epoch 18/20
Train Loss: 0.1479
Epoch 19/20
Train Loss: 0.1364
Epoch 20/20
Train Loss: 0.1273
Validation Report:
              precision    recall  f1-score   support

        I-LF       0.77      0.80      0.79       730
           O       0.91      0.94      0.92      4460
        B-LF       0.70      0.68      0.69       306
        B-AC       0.82      0.54      0.65       508



In [24]:
%%time
#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_glove,'best_glove_model.pth', train_loader)

GloVe type training...
Epoch 1/20
Train Loss: 0.8013
Best model saved!
Epoch 2/20
Train Loss: 0.4547
Best model saved!
Epoch 3/20
Train Loss: 0.3695
Epoch 4/20
Train Loss: 0.3251
Best model saved!
Epoch 5/20
Train Loss: 0.2850
Epoch 6/20
Train Loss: 0.2477
Epoch 7/20
Train Loss: 0.2174
Epoch 8/20
Train Loss: 0.1878
Epoch 9/20
Train Loss: 0.1632
Epoch 10/20
Train Loss: 0.1364
Epoch 11/20
Train Loss: 0.1203
Epoch 12/20
Train Loss: 0.1015
Epoch 13/20
Train Loss: 0.0958
Epoch 14/20
Train Loss: 0.0720
Epoch 15/20
Train Loss: 0.0589
Epoch 16/20
Train Loss: 0.0502
Epoch 17/20
Train Loss: 0.0419
Epoch 18/20
Train Loss: 0.0362
Epoch 19/20
Train Loss: 0.0341
Epoch 20/20
Train Loss: 0.0284
Validation Report:
              precision    recall  f1-score   support

        I-LF       0.77      0.68      0.72       730
           O       0.90      0.93      0.92      4460
        B-LF       0.60      0.58      0.59       306
        B-AC       0.78      0.62      0.69       508

    accuracy         

In [26]:
def testing(model, model_name, test_loader):
    model.load_state_dict(torch.load(model_name))
    model.eval()
    
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            logits = model(batch['token_ids'], batch['lengths'])
            preds = torch.argmax(logits, dim=-1)
            
            for i in range(len(batch['lengths'])):
                length = batch['lengths'][i]
                valid_preds = preds[i, :length].cpu().numpy()
                valid_labels = batch['ner_ids'][i, :length].cpu().numpy()
                mask = valid_labels != ner_tag2id['<PAD>']
                all_preds.extend(valid_preds[mask])
                all_labels.extend(valid_labels[mask])
    
    target_names = [tag for tag in ner_tag2id if tag != '<PAD>']
    print('Final Test Report:')
    print(classification_report(all_labels, all_preds, target_names=target_names, zero_division=0))

In [28]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_model.pth', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_model.pth', test_loader)

Word2Vec type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.77      0.82      0.80      1227
           O       0.93      0.94      0.94      7751
        B-LF       0.71      0.72      0.72       482
        B-AC       0.84      0.67      0.75       797

    accuracy                           0.90     10257
   macro avg       0.82      0.79      0.80     10257
weighted avg       0.90      0.90      0.89     10257

GloVe type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.75      0.76      0.76      1227
           O       0.92      0.94      0.93      7751
        B-LF       0.68      0.61      0.64       482
        B-AC       0.83      0.62      0.71       797

    accuracy                           0.88     10257
   macro avg       0.79      0.74      0.76     10257
weighted avg       0.88      0.88      0.88     10257



In [30]:
%%time
#Experiment 3, 4000 training datasets
train_data_4000 = datasets.concatenate_datasets([train_data, train_data_ext.select(range(2000))])
vocab_4000, word2id_4000 = build_vocab_id(train_data_4000)
w2v_4000_model = Word2Vec(train_data_4000['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_4000_matrix = build_w2v_matrix(word2id, w2v_4000_model)
embedding_glove_4000_matrix = build_glove_matrix(word2id_4000, glove_words, glove_vectors)

train_4000_dataset = NER_Dataset(train_data_4000)
train_4000_loader = DataLoader(train_4000_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_w2v,'best_w2v_model_4000.pth', train_4000_loader)

#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_glove,'best_glove_model_4000.pth', train_4000_loader)

Word2Vec type training...
Epoch 1/20
Train Loss: 0.5276
Best model saved!
Epoch 2/20
Train Loss: 0.2909
Best model saved!
Epoch 3/20
Train Loss: 0.2681
Epoch 4/20
Train Loss: 0.2525
Epoch 5/20
Train Loss: 0.2382
Best model saved!
Epoch 6/20
Train Loss: 0.2307
Best model saved!
Epoch 7/20
Train Loss: 0.2201
Epoch 8/20
Train Loss: 0.2091
Epoch 9/20
Train Loss: 0.2025
Epoch 10/20
Train Loss: 0.1939
Epoch 11/20
Train Loss: 0.1823
Epoch 12/20
Train Loss: 0.1738
Epoch 13/20
Train Loss: 0.1635
Epoch 14/20
Train Loss: 0.1538
Epoch 15/20
Train Loss: 0.1477
Epoch 16/20
Train Loss: 0.1389
Epoch 17/20
Train Loss: 0.1295
Epoch 18/20
Train Loss: 0.1216
Epoch 19/20
Train Loss: 0.1134
Epoch 20/20
Train Loss: 0.1027
Validation Report:
              precision    recall  f1-score   support

        I-LF       0.78      0.77      0.77       730
           O       0.91      0.93      0.92      4460
        B-LF       0.68      0.67      0.68       306
        B-AC       0.78      0.65      0.71       508



In [32]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_model_4000.pth', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_model_4000.pth', test_loader)

Word2Vec type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.79      0.84      0.82      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.74      0.71      0.73       482
        B-AC       0.84      0.69      0.76       797

    accuracy                           0.90     10257
   macro avg       0.83      0.80      0.81     10257
weighted avg       0.90      0.90      0.90     10257

GloVe type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.78      0.83      0.80      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.71      0.73      0.72       482
        B-AC       0.86      0.64      0.74       797

    accuracy                           0.90     10257
   macro avg       0.82      0.78      0.80     10257
weighted avg       0.90      0.90      0.90     10257



In [34]:
%%time
#Experiment 3, 8000 training datasets
train_data_8000 = datasets.concatenate_datasets([train_data, train_data_ext.select(range(2000))])
vocab_8000, word2id_8000 = build_vocab_id(train_data_8000)
w2v_8000_model = Word2Vec(train_data_8000['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_8000_matrix = build_w2v_matrix(word2id, w2v_8000_model)
embedding_glove_8000_matrix = build_glove_matrix(word2id_8000, glove_words, glove_vectors)

train_8000_dataset = NER_Dataset(train_data_8000)
train_8000_loader = DataLoader(train_8000_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_w2v,'best_w2v_model_8000.pth', train_8000_loader)

#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_glove,'best_glove_model_8000.pth', train_8000_loader)

Word2Vec type training...
Epoch 1/20
Train Loss: 0.5136
Best model saved!
Epoch 2/20
Train Loss: 0.2947
Epoch 3/20
Train Loss: 0.2680
Epoch 4/20
Train Loss: 0.2527
Best model saved!
Epoch 5/20
Train Loss: 0.2383
Epoch 6/20
Train Loss: 0.2280
Epoch 7/20
Train Loss: 0.2167
Epoch 8/20
Train Loss: 0.2083
Epoch 9/20
Train Loss: 0.2015
Epoch 10/20
Train Loss: 0.1897
Epoch 11/20
Train Loss: 0.1803
Epoch 12/20
Train Loss: 0.1703
Epoch 13/20
Train Loss: 0.1622
Best model saved!
Epoch 14/20
Train Loss: 0.1539
Epoch 15/20
Train Loss: 0.1439
Epoch 16/20
Train Loss: 0.1351
Epoch 17/20
Train Loss: 0.1267
Epoch 18/20
Train Loss: 0.1187
Epoch 19/20
Train Loss: 0.1166
Epoch 20/20
Train Loss: 0.1053
Validation Report:
              precision    recall  f1-score   support

        I-LF       0.80      0.78      0.79       730
           O       0.91      0.94      0.92      4460
        B-LF       0.69      0.67      0.68       306
        B-AC       0.78      0.58      0.67       508

    accuracy      

In [36]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_model_8000.pth', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_model_8000.pth', test_loader)

Word2Vec type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.80      0.79      0.80      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.76      0.72      0.74       482
        B-AC       0.85      0.68      0.75       797

    accuracy                           0.90     10257
   macro avg       0.83      0.79      0.81     10257
weighted avg       0.90      0.90      0.90     10257

GloVe type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.79      0.81      0.80      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.73      0.72      0.72       482
        B-AC       0.83      0.69      0.75       797

    accuracy                           0.90     10257
   macro avg       0.82      0.79      0.80     10257
weighted avg       0.90      0.90      0.90     10257

