In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import BertTokenizerFast, BertModel
import datasets
from gensim.models import Word2Vec
import numpy as np
from seqeval.metrics import f1_score#, classification_report
from sklearn.metrics import classification_report
from collections import Counter
from tqdm import tqdm
import os

dataset = datasets.load_dataset('surrey-nlp/PLOD-CW-25')
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

dataset_ext = datasets.load_dataset('surrey-nlp/PLODv2-filtered')
train_data_ext = dataset_ext['train']
val_data_ext = dataset_ext['validation']
test_data_ext = dataset_ext['test']

In [3]:
%%time
#The original glove file is 300mb, but we only need a portion of the words, search them and save them in NPZ format
def convert_glove_to_npz(glove_path, output_npz_path):
    word_counts = Counter()
    for sentence in train_data:
        word_counts.update(sentence['tokens'])
    for sentence in train_data_ext:
        word_counts.update(sentence['tokens'])
    vocab_all = ['<PAD>', '<UNK>'] + [word for word, count in sorted(word_counts.items(), key = lambda x:-x[1])]#sort by frequency
    word2id_all = {word: idx for idx, word in enumerate(vocab_all)}

    words = {}
    vectors = np.zeros((len(vocab_all), 100), dtype=np.float32)
    count = 0
    with open(glove_path, 'r', encoding='utf-8') as f:
        for idx, line in tqdm(enumerate(f), total=400000, desc="Processing"):
            row = line.rstrip().split()
            if row[0] in vocab_all:
                count += 1
                words[row[0]] = word2id_all[row[0]]
                vectors[word2id_all[row[0]]] = np.array(row[1:], dtype=np.float32)

    np.savez_compressed(
        output_npz_path,
        words=words,
        vectors=vectors
    )
    print(f"Saved compressed GloVe to {output_npz_path}, found {count}/{len(vocab_all)} words")

#convert_glove_to_npz("glove.6B.100d.txt", "glove.6B.100d.PLOD-CW-25.npz")

CPU times: total: 0 ns
Wall time: 0 ns


In [25]:
#Init
class Config:
    GLOVE_NPZ_PATH = 'glove.6B.100d.PLOD-CW-25.npz'
    EMBEDDING_VEC = 100
    HIDDEN = 128
    BATCH_SIZE = 32
    EPOCHS = 5
    BERT_MODEL_NAME = 'bert-base-cased'
    NUM_LSTM_LAYERS = 2
    NUM_RNN_LAYERS = 2
    MAX_LENGTH = 128
    LEARNING_RATE = 1e-5
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
#Build vocabulary, convert to ID
def build_vocab_id(data):
    word_counts = Counter()
    for sentence in data:
        word_counts.update(sentence['tokens'])
    vocab = ['<PAD>', '<UNK>'] + [word for word, count in sorted(word_counts.items(), key = lambda x:-x[1])]#sort by frequency
    word2id = {word: idx for idx, word in enumerate(vocab)}
    return vocab, word2id

vocab, word2id = build_vocab_id(train_data)

#Convert ner_tags to ID
ner_labels = set()
for split in [train_data, val_data, test_data]:
    for data in split:
        ner_labels.update(data['ner_tags'])
ner_tag2id = {tag: idx for idx, tag in enumerate(ner_labels)}
ner_tag2id['<PAD>'] = -100
id2ner = {v: k for k, v in ner_tag2id.items()}

In [19]:
#Word2Vec
def build_w2v_matrix(word2id, w2v_model):
    embedding_w2v_matrix = np.zeros((len(vocab), Config.EMBEDDING_VEC))
    for word, idx in word2id.items():
        if word in w2v_model.wv:
            embedding_w2v_matrix[idx] = w2v_model.wv[word]
        elif word == '<PAD>':
            embedding_w2v_matrix[idx] = np.zeros(Config.EMBEDDING_VEC)
        else:
            embedding_w2v_matrix[idx] = np.random.normal(size=(Config.EMBEDDING_VEC,))
    return embedding_w2v_matrix

w2v_model = Word2Vec(train_data['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_matrix = build_w2v_matrix(word2id, w2v_model)

In [21]:
%%time
#GloVe
def load_compressed_glove(npz_path):
    data = np.load(npz_path, mmap_mode='r', allow_pickle=True)
    return data['words'], data['vectors']

glove_words, glove_vectors = load_compressed_glove(Config.GLOVE_NPZ_PATH)

def build_glove_matrix(word2id, glove_words, glove_vectors):
    embedding_glove_matrix = np.zeros((len(word2id), Config.EMBEDDING_VEC))
    for word, idx in word2id.items():
        if word in glove_words:
            embedding_glove_matrix[idx] = glove_vectors[glove_words['word']]
        elif word == '<PAD>':
            embedding_glove_matrix[idx] = np.zeros(Config.EMBEDDING_VEC)
        else:
            embedding_glove_matrix[idx] = np.random.normal(size=(Config.EMBEDDING_VEC,))
    return embedding_glove_matrix

embedding_glove_matrix = build_glove_matrix(word2id, glove_words, glove_vectors)

CPU times: total: 469 ms
Wall time: 1.15 s


In [23]:
#BERT
tokenizer = BertTokenizerFast.from_pretrained(Config.BERT_MODEL_NAME)

In [27]:
#Custom dataset
class NER_Dataset(Dataset):
    def __init__(self, data, embedding_type = None, tokenizer = None):
        self.data = data
        self.embedding_type = embedding_type
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sentence = self.data[idx]
        tokens = sentence['tokens']
        ner_tags = sentence['ner_tags']

        if self.embedding_type == 'BERT':#BERT
            encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=Config.MAX_LENGTH,
            padding=False
            )

            word_ids = encoding.word_ids()
            labels = []
            current_word_id = None
            
            for word_id in word_ids:
                if word_id is None:
                    labels.append(ner_tag2id['<PAD>'])
                else:
                    if word_id != current_word_id:
                        labels.append(ner_tag2id[ner_tags[word_id]])
                        current_word_id = word_id
                    else:
                        labels.append(ner_tag2id['<PAD>'])
    
            return {
                'input_ids': encoding['input_ids'],
                'attention_mask': encoding['attention_mask'],
                'labels': labels
            }
        else:
            token_ids = [word2id.get(token, word2id['<UNK>']) for token in tokens]
            ner_ids = [ner_tag2id[tag] for tag in ner_tags]
            
            return {
                'token_ids': torch.LongTensor(token_ids),
                'ner_ids': torch.LongTensor(ner_ids),
                'lengths': len(token_ids)
            }

#Custom collate fuction
def collate_fn(batch):
    if 'attention_mask' in batch[0]:#BERT
        max_length = max(len(x['input_ids']) for x in batch)
    
        input_ids = []
        attention_masks = []
        labels = []
        
        for x in batch:
            pad_length = max_length - len(x['input_ids'])
            input_ids.append(x['input_ids'] + [tokenizer.pad_token_id] * pad_length)
            attention_masks.append(x['attention_mask'] + [0] * pad_length)
            labels.append(x['labels'] + [ner_tag2id['<PAD>']] * pad_length)
        
        return {
            'input_ids': torch.LongTensor(input_ids).to(Config.DEVICE),
            'attention_mask': torch.LongTensor(attention_masks).to(Config.DEVICE),
            'labels': torch.LongTensor(labels).to(Config.DEVICE)
        }
    else:
        token_ids = [x['token_ids'] for x in batch]
        ner_ids = [x['ner_ids'] for x in batch]
        lengths = [x['lengths'] for x in batch]
        
        sorted_indices = np.argsort(lengths)[::-1]
        token_ids = [token_ids[i] for i in sorted_indices]
        ner_ids = [ner_ids[i] for i in sorted_indices]
        lengths = [lengths[i] for i in sorted_indices]
    
        #Fill to the same length
        token_ids = pad_sequence(token_ids, batch_first=True, padding_value=word2id['<PAD>'])
        ner_ids = pad_sequence(ner_ids, batch_first=True, padding_value=ner_tag2id['<PAD>'])
        
        return {
            'token_ids': token_ids.to(Config.DEVICE),
            'ner_ids': ner_ids.to(Config.DEVICE),
            'lengths': torch.LongTensor(lengths).to(Config.DEVICE)
        }

In [29]:
#Word2Vec&GloVe
train_dataset = NER_Dataset(train_data)
val_dataset = NER_Dataset(val_data)
test_dataset = NER_Dataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)

#BERT
train_dataset_bert = NER_Dataset(train_data, 'BERT', tokenizer)
val_dataset_bert = NER_Dataset(val_data, 'BERT', tokenizer)
test_dataset_bert = NER_Dataset(test_data, 'BERT', tokenizer)

train_loader_bert = DataLoader(train_dataset_bert, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader_bert = DataLoader(val_dataset_bert, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=Config.BATCH_SIZE, collate_fn=collate_fn)

In [31]:
#Init LSTM model
class NER_Model(nn.Module):
    def __init__(self, embedding_type, model_type='LSTM'):
        super().__init__()
        self.embedding_type = embedding_type
        self.model_type = model_type

        if self.embedding_type == 'Word2Vec':
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_w2v_matrix),
                padding_idx=word2id['<PAD>']
            )
            self.lstm = nn.LSTM(Config.EMBEDDING_VEC, Config.HIDDEN, bidirectional=True, num_layers=Config.NUM_LSTM_LAYERS, dropout=0.3)
        elif self.embedding_type == 'GloVe':
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_glove_matrix),
                padding_idx=word2id['<PAD>']
            )
            self.lstm = nn.LSTM(Config.EMBEDDING_VEC, Config.HIDDEN, bidirectional=True, num_layers=Config.NUM_LSTM_LAYERS, dropout=0.3)
        elif self.embedding_type == 'BERT':
            self.bertmodel = BertModel.from_pretrained(Config.BERT_MODEL_NAME)
            self.lstm = nn.LSTM(
            input_size=self.bertmodel.config.hidden_size,
            hidden_size=Config.HIDDEN,
            num_layers=Config.NUM_LSTM_LAYERS,
            bidirectional=True,
            batch_first=True,
            dropout=0.3
            )
            self.rnn = nn.RNN(
            input_size=self.bertmodel.config.hidden_size,
            hidden_size=Config.HIDDEN,
            num_layers=Config.NUM_RNN_LAYERS,
            bidirectional=True,
            batch_first=True,
            nonlinearity='relu',
            dropout=0.3
            )
        self.fc = nn.Linear(Config.HIDDEN*2, len(ner_tag2id))
        
    def forward(self, token_ids=None, lengths=None, input_ids=None, attention_mask=None):
        if self.embedding_type == 'BERT':
            outputs = self.bertmodel(input_ids=input_ids, attention_mask=attention_mask)
            sequence_output = outputs.last_hidden_state
            if self.model_type == 'LSTM':
                output, _ = self.lstm(sequence_output)
            else:
                output, _ = self.rnn(sequence_output)
            return self.fc(output)
        else:
            emb_tokens = self.embedding(token_ids)
            packed = pack_padded_sequence(emb_tokens, lengths.cpu(), batch_first=True, enforce_sorted=False)
            output, _ = self.lstm(packed)
            output, _ = pad_packed_sequence(output, batch_first=True)
            return self.fc(output)

In [55]:
#Init training
def training(model, model_name, embedding_type, train_loader, val_loader):
    best_f1 = 0
    for epoch in range(Config.EPOCHS):
        #Train
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            if embedding_type == 'BERT':#BERT
                logits = model(None, None, batch['input_ids'], batch['attention_mask'])
                loss = criterion(logits.view(-1, len(ner_tag2id)), batch['labels'].view(-1))
            else:
                logits = model(batch['token_ids'], batch['lengths'], None, None)
                loss = criterion(logits.view(-1, len(ner_tag2id)), batch['ner_ids'].view(-1))         
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        #Evaluate
        model.eval()
        val_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
                if embedding_type == 'BERT':#BERT
                    logits = model(None, None, batch['input_ids'], batch['attention_mask'])
                    loss = criterion(logits.view(-1, len(ner_tag2id)), batch['labels'].view(-1))
                    val_loss += loss.item()
                    
                    preds = torch.argmax(logits, dim=-1).cpu().numpy()
                    labels = batch['labels'].cpu().numpy()

                    for i in range(preds.shape[0]):
                        valid_labels = []
                        valid_preds = []
                        for j in range(preds.shape[1]):
                            if labels[i][j] != ner_tag2id['<PAD>']:
                                valid_labels.append(id2ner[labels[i][j]])
                                valid_preds.append(id2ner[preds[i][j]])
                        all_labels.append(valid_labels)
                        all_preds.append(valid_preds)
                else:
                    logits = model(batch['token_ids'], batch['lengths'], None, None)
                    loss = criterion(logits.view(-1, len(ner_tag2id)), batch['ner_ids'].view(-1)) 
                    val_loss += loss.item()
                    
                    #preds = torch.argmax(logits, dim=-1).cpu().numpy()
                    #labels = batch['ner_ids'].cpu().numpy()

                    #for i in range(preds.shape[0]):
                    #    valid_labels = []
                    #    valid_preds = []
                    #    for j in range(preds.shape[1]):
                    #        if labels[i][j] != -100:
                    #            valid_labels.append(id2ner[labels[i][j]])
                    #            valid_preds.append(id2ner[preds[i][j]])
                    #    all_labels.append(valid_labels)
                    #    all_preds.append(valid_preds)
                        
                    preds = torch.argmax(logits, dim=-1)
                    
                    for i in range(len(batch['lengths'])):
                        length = batch['lengths'][i]
                        valid_preds = preds[i, :length].cpu().numpy()
                        valid_labels = batch['ner_ids'][i, :length].cpu().numpy()
                        mask = valid_labels != ner_tag2id['<PAD>']
                        all_preds.extend(valid_preds[mask])
                        all_labels.extend(valid_labels[mask])

        #val_f1 = f1_score(all_labels, all_preds)
        if embedding_type == 'BERT':
            val_f1 = f1_score(all_labels, all_preds)
        else:
            target_names = [tag for tag in ner_tag2id if tag != '<PAD>']
            report = classification_report(all_labels, all_preds, target_names=target_names, zero_division=0)
            val_f1 = float(report.split('\n')[-2].split()[-2])
    
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), model_name)
            print('Best model saved!')
        
        print(f"Epoch {epoch+1}/{Config.EPOCHS}")
        print(f"Train Loss: {train_loss/len(train_loader):.4f}")
        print(f"Val Loss: {val_loss/len(val_loader):.4f}")
        print(f"Val F1: {val_f1:.4f}\n")

In [39]:
%%time
#BERT
print('BERT type training...')
model_bert = NER_Model('BERT').to(Config.DEVICE)
optimizer = optim.Adam(model_bert.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_bert,'best_bert_LSTM.pth', 'BERT', train_loader_bert, val_loader_bert)

BERT type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:12<00:00,  5.11it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.38it/s]


Epoch 1/5
Train Loss: 1.0955
Val Loss: 0.8944
Val F1: 0.0000



Training Epoch 2: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:12<00:00,  5.13it/s]
Validating Epoch 2: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 13.69it/s]


Epoch 2/5
Train Loss: 0.8224
Val Loss: 0.7531
Val F1: 0.0000



Training Epoch 3: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:25<00:00,  2.49it/s]
Validating Epoch 3: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.56it/s]


Best model saved!
Epoch 3/5
Train Loss: 0.6650
Val Loss: 0.6060
Val F1: 0.0843



Training Epoch 4: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:27<00:00,  2.29it/s]
Validating Epoch 4: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.21it/s]


Best model saved!
Epoch 4/5
Train Loss: 0.5390
Val Loss: 0.5135
Val F1: 0.1272



Training Epoch 5: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:27<00:00,  2.32it/s]
Validating Epoch 5: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.50it/s]


Best model saved!
Epoch 5/5
Train Loss: 0.4602
Val Loss: 0.4623
Val F1: 0.2409

CPU times: total: 1min 48s
Wall time: 2min 2s


In [41]:
%%time
#BERT
print('BERT type training...')
model_bert = NER_Model('BERT','RNN').to(Config.DEVICE)
optimizer = optim.Adam(model_bert.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_bert,'best_bert_RNN.pth', 'BERT', train_loader_bert, val_loader_bert)

BERT type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:13<00:00,  4.78it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.15it/s]


Best model saved!
Epoch 1/5
Train Loss: 0.8907
Val Loss: 0.4277
Val F1: 0.6051



Training Epoch 2: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:21<00:00,  2.98it/s]
Validating Epoch 2: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.05it/s]


Best model saved!
Epoch 2/5
Train Loss: 0.3242
Val Loss: 0.2858
Val F1: 0.7618



Training Epoch 3: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:26<00:00,  2.34it/s]
Validating Epoch 3: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.39it/s]


Best model saved!
Epoch 3/5
Train Loss: 0.2250
Val Loss: 0.2732
Val F1: 0.7783



Training Epoch 4: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:26<00:00,  2.36it/s]
Validating Epoch 4: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.46it/s]


Best model saved!
Epoch 4/5
Train Loss: 0.1889
Val Loss: 0.2690
Val F1: 0.7945



Training Epoch 5: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:26<00:00,  2.35it/s]
Validating Epoch 5: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.42it/s]


Best model saved!
Epoch 5/5
Train Loss: 0.1690
Val Loss: 0.2754
Val F1: 0.8007

CPU times: total: 2min 1s
Wall time: 2min 28s


In [61]:
%%time
#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_w2v,'best_w2v_LSTM.pth', 'Word2Vec', train_loader, val_loader)

Word2Vec type training...


Training Epoch 1: 100%|████████████████████████████████████████████████████████████████| 63/63 [00:05<00:00, 10.76it/s]
Validating Epoch 1: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 20.70it/s]


ValueError: Number of classes, 5, does not match size of target_names, 4. Try specifying the labels parameter

In [67]:
%%time
#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_glove, 'best_glove_LSTM.pth', 'GloVe', train_loader, val_loader)

GloVe type training...


Training Epoch 1: 100%|██████████████████████████████████████████████████████████████| 125/125 [00:08<00:00, 15.10it/s]
Validating Epoch 1: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.49it/s]


Best model saved!
Epoch 1/5
Train Loss: 1.5353
Val Loss: 1.5037
Val F1: 0.6300



Training Epoch 2: 100%|██████████████████████████████████████████████████████████████| 125/125 [00:08<00:00, 15.13it/s]
Validating Epoch 2: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 30.54it/s]


Epoch 2/5
Train Loss: 1.4571
Val Loss: 1.4066
Val F1: 0.6300



Training Epoch 3: 100%|██████████████████████████████████████████████████████████████| 125/125 [00:08<00:00, 15.26it/s]
Validating Epoch 3: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 30.91it/s]


Epoch 3/5
Train Loss: 1.3118
Val Loss: 1.1936
Val F1: 0.6300



Training Epoch 4: 100%|██████████████████████████████████████████████████████████████| 125/125 [00:08<00:00, 15.14it/s]
Validating Epoch 4: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 33.14it/s]


Epoch 4/5
Train Loss: 1.0251
Val Loss: 0.9120
Val F1: 0.6300



Training Epoch 5: 100%|██████████████████████████████████████████████████████████████| 125/125 [00:08<00:00, 15.18it/s]
Validating Epoch 5: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 32.72it/s]


Epoch 5/5
Train Loss: 0.8756
Val Loss: 0.8705
Val F1: 0.6300

CPU times: total: 47.5 s
Wall time: 48.8 s


In [69]:
def testing(model, model_name, test_loader):
    model.load_state_dict(torch.load(model_name))
    model.eval()
    
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            logits = model(batch['token_ids'], batch['lengths'])
            preds = torch.argmax(logits, dim=-1)
            
            for i in range(len(batch['lengths'])):
                length = batch['lengths'][i]
                valid_preds = preds[i, :length].cpu().numpy()
                valid_labels = batch['ner_ids'][i, :length].cpu().numpy()
                mask = valid_labels != ner_tag2id['<PAD>']
                all_preds.extend(valid_preds[mask])
                all_labels.extend(valid_labels[mask])
    
    target_names = [tag for tag in ner_tag2id if tag != '<PAD>']
    print('Final Test Report:')
    print(classification_report(all_labels, all_preds, target_names=target_names, zero_division=0))

In [75]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_model.pth', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_model.pth', test_loader)
print('BERT type testing...')
testing(model_glove,'best_bert_model.pth', test_loader)

Word2Vec type testing...
Final Test Report:
              precision    recall  f1-score   support

           O       0.76      1.00      0.86      7751
        B-LF       0.00      0.00      0.00       482
        B-AC       0.00      0.00      0.00       797
        I-LF       0.00      0.00      0.00      1227

    accuracy                           0.76     10257
   macro avg       0.19      0.25      0.22     10257
weighted avg       0.57      0.76      0.65     10257

GloVe type testing...
Final Test Report:
              precision    recall  f1-score   support

           O       0.76      1.00      0.86      7751
        B-LF       0.00      0.00      0.00       482
        B-AC       0.00      0.00      0.00       797
        I-LF       0.00      0.00      0.00      1227

    accuracy                           0.76     10257
   macro avg       0.19      0.25      0.22     10257
weighted avg       0.57      0.76      0.65     10257

BERT type testing...


RuntimeError: Error(s) in loading state_dict for NER_Model:
	Missing key(s) in state_dict: "bertmodel.embeddings.word_embeddings.weight", "bertmodel.embeddings.position_embeddings.weight", "bertmodel.embeddings.token_type_embeddings.weight", "bertmodel.embeddings.LayerNorm.weight", "bertmodel.embeddings.LayerNorm.bias", "bertmodel.encoder.layer.0.attention.self.query.weight", "bertmodel.encoder.layer.0.attention.self.query.bias", "bertmodel.encoder.layer.0.attention.self.key.weight", "bertmodel.encoder.layer.0.attention.self.key.bias", "bertmodel.encoder.layer.0.attention.self.value.weight", "bertmodel.encoder.layer.0.attention.self.value.bias", "bertmodel.encoder.layer.0.attention.output.dense.weight", "bertmodel.encoder.layer.0.attention.output.dense.bias", "bertmodel.encoder.layer.0.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.0.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.0.intermediate.dense.weight", "bertmodel.encoder.layer.0.intermediate.dense.bias", "bertmodel.encoder.layer.0.output.dense.weight", "bertmodel.encoder.layer.0.output.dense.bias", "bertmodel.encoder.layer.0.output.LayerNorm.weight", "bertmodel.encoder.layer.0.output.LayerNorm.bias", "bertmodel.encoder.layer.1.attention.self.query.weight", "bertmodel.encoder.layer.1.attention.self.query.bias", "bertmodel.encoder.layer.1.attention.self.key.weight", "bertmodel.encoder.layer.1.attention.self.key.bias", "bertmodel.encoder.layer.1.attention.self.value.weight", "bertmodel.encoder.layer.1.attention.self.value.bias", "bertmodel.encoder.layer.1.attention.output.dense.weight", "bertmodel.encoder.layer.1.attention.output.dense.bias", "bertmodel.encoder.layer.1.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.1.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.1.intermediate.dense.weight", "bertmodel.encoder.layer.1.intermediate.dense.bias", "bertmodel.encoder.layer.1.output.dense.weight", "bertmodel.encoder.layer.1.output.dense.bias", "bertmodel.encoder.layer.1.output.LayerNorm.weight", "bertmodel.encoder.layer.1.output.LayerNorm.bias", "bertmodel.encoder.layer.2.attention.self.query.weight", "bertmodel.encoder.layer.2.attention.self.query.bias", "bertmodel.encoder.layer.2.attention.self.key.weight", "bertmodel.encoder.layer.2.attention.self.key.bias", "bertmodel.encoder.layer.2.attention.self.value.weight", "bertmodel.encoder.layer.2.attention.self.value.bias", "bertmodel.encoder.layer.2.attention.output.dense.weight", "bertmodel.encoder.layer.2.attention.output.dense.bias", "bertmodel.encoder.layer.2.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.2.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.2.intermediate.dense.weight", "bertmodel.encoder.layer.2.intermediate.dense.bias", "bertmodel.encoder.layer.2.output.dense.weight", "bertmodel.encoder.layer.2.output.dense.bias", "bertmodel.encoder.layer.2.output.LayerNorm.weight", "bertmodel.encoder.layer.2.output.LayerNorm.bias", "bertmodel.encoder.layer.3.attention.self.query.weight", "bertmodel.encoder.layer.3.attention.self.query.bias", "bertmodel.encoder.layer.3.attention.self.key.weight", "bertmodel.encoder.layer.3.attention.self.key.bias", "bertmodel.encoder.layer.3.attention.self.value.weight", "bertmodel.encoder.layer.3.attention.self.value.bias", "bertmodel.encoder.layer.3.attention.output.dense.weight", "bertmodel.encoder.layer.3.attention.output.dense.bias", "bertmodel.encoder.layer.3.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.3.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.3.intermediate.dense.weight", "bertmodel.encoder.layer.3.intermediate.dense.bias", "bertmodel.encoder.layer.3.output.dense.weight", "bertmodel.encoder.layer.3.output.dense.bias", "bertmodel.encoder.layer.3.output.LayerNorm.weight", "bertmodel.encoder.layer.3.output.LayerNorm.bias", "bertmodel.encoder.layer.4.attention.self.query.weight", "bertmodel.encoder.layer.4.attention.self.query.bias", "bertmodel.encoder.layer.4.attention.self.key.weight", "bertmodel.encoder.layer.4.attention.self.key.bias", "bertmodel.encoder.layer.4.attention.self.value.weight", "bertmodel.encoder.layer.4.attention.self.value.bias", "bertmodel.encoder.layer.4.attention.output.dense.weight", "bertmodel.encoder.layer.4.attention.output.dense.bias", "bertmodel.encoder.layer.4.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.4.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.4.intermediate.dense.weight", "bertmodel.encoder.layer.4.intermediate.dense.bias", "bertmodel.encoder.layer.4.output.dense.weight", "bertmodel.encoder.layer.4.output.dense.bias", "bertmodel.encoder.layer.4.output.LayerNorm.weight", "bertmodel.encoder.layer.4.output.LayerNorm.bias", "bertmodel.encoder.layer.5.attention.self.query.weight", "bertmodel.encoder.layer.5.attention.self.query.bias", "bertmodel.encoder.layer.5.attention.self.key.weight", "bertmodel.encoder.layer.5.attention.self.key.bias", "bertmodel.encoder.layer.5.attention.self.value.weight", "bertmodel.encoder.layer.5.attention.self.value.bias", "bertmodel.encoder.layer.5.attention.output.dense.weight", "bertmodel.encoder.layer.5.attention.output.dense.bias", "bertmodel.encoder.layer.5.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.5.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.5.intermediate.dense.weight", "bertmodel.encoder.layer.5.intermediate.dense.bias", "bertmodel.encoder.layer.5.output.dense.weight", "bertmodel.encoder.layer.5.output.dense.bias", "bertmodel.encoder.layer.5.output.LayerNorm.weight", "bertmodel.encoder.layer.5.output.LayerNorm.bias", "bertmodel.encoder.layer.6.attention.self.query.weight", "bertmodel.encoder.layer.6.attention.self.query.bias", "bertmodel.encoder.layer.6.attention.self.key.weight", "bertmodel.encoder.layer.6.attention.self.key.bias", "bertmodel.encoder.layer.6.attention.self.value.weight", "bertmodel.encoder.layer.6.attention.self.value.bias", "bertmodel.encoder.layer.6.attention.output.dense.weight", "bertmodel.encoder.layer.6.attention.output.dense.bias", "bertmodel.encoder.layer.6.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.6.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.6.intermediate.dense.weight", "bertmodel.encoder.layer.6.intermediate.dense.bias", "bertmodel.encoder.layer.6.output.dense.weight", "bertmodel.encoder.layer.6.output.dense.bias", "bertmodel.encoder.layer.6.output.LayerNorm.weight", "bertmodel.encoder.layer.6.output.LayerNorm.bias", "bertmodel.encoder.layer.7.attention.self.query.weight", "bertmodel.encoder.layer.7.attention.self.query.bias", "bertmodel.encoder.layer.7.attention.self.key.weight", "bertmodel.encoder.layer.7.attention.self.key.bias", "bertmodel.encoder.layer.7.attention.self.value.weight", "bertmodel.encoder.layer.7.attention.self.value.bias", "bertmodel.encoder.layer.7.attention.output.dense.weight", "bertmodel.encoder.layer.7.attention.output.dense.bias", "bertmodel.encoder.layer.7.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.7.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.7.intermediate.dense.weight", "bertmodel.encoder.layer.7.intermediate.dense.bias", "bertmodel.encoder.layer.7.output.dense.weight", "bertmodel.encoder.layer.7.output.dense.bias", "bertmodel.encoder.layer.7.output.LayerNorm.weight", "bertmodel.encoder.layer.7.output.LayerNorm.bias", "bertmodel.encoder.layer.8.attention.self.query.weight", "bertmodel.encoder.layer.8.attention.self.query.bias", "bertmodel.encoder.layer.8.attention.self.key.weight", "bertmodel.encoder.layer.8.attention.self.key.bias", "bertmodel.encoder.layer.8.attention.self.value.weight", "bertmodel.encoder.layer.8.attention.self.value.bias", "bertmodel.encoder.layer.8.attention.output.dense.weight", "bertmodel.encoder.layer.8.attention.output.dense.bias", "bertmodel.encoder.layer.8.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.8.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.8.intermediate.dense.weight", "bertmodel.encoder.layer.8.intermediate.dense.bias", "bertmodel.encoder.layer.8.output.dense.weight", "bertmodel.encoder.layer.8.output.dense.bias", "bertmodel.encoder.layer.8.output.LayerNorm.weight", "bertmodel.encoder.layer.8.output.LayerNorm.bias", "bertmodel.encoder.layer.9.attention.self.query.weight", "bertmodel.encoder.layer.9.attention.self.query.bias", "bertmodel.encoder.layer.9.attention.self.key.weight", "bertmodel.encoder.layer.9.attention.self.key.bias", "bertmodel.encoder.layer.9.attention.self.value.weight", "bertmodel.encoder.layer.9.attention.self.value.bias", "bertmodel.encoder.layer.9.attention.output.dense.weight", "bertmodel.encoder.layer.9.attention.output.dense.bias", "bertmodel.encoder.layer.9.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.9.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.9.intermediate.dense.weight", "bertmodel.encoder.layer.9.intermediate.dense.bias", "bertmodel.encoder.layer.9.output.dense.weight", "bertmodel.encoder.layer.9.output.dense.bias", "bertmodel.encoder.layer.9.output.LayerNorm.weight", "bertmodel.encoder.layer.9.output.LayerNorm.bias", "bertmodel.encoder.layer.10.attention.self.query.weight", "bertmodel.encoder.layer.10.attention.self.query.bias", "bertmodel.encoder.layer.10.attention.self.key.weight", "bertmodel.encoder.layer.10.attention.self.key.bias", "bertmodel.encoder.layer.10.attention.self.value.weight", "bertmodel.encoder.layer.10.attention.self.value.bias", "bertmodel.encoder.layer.10.attention.output.dense.weight", "bertmodel.encoder.layer.10.attention.output.dense.bias", "bertmodel.encoder.layer.10.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.10.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.10.intermediate.dense.weight", "bertmodel.encoder.layer.10.intermediate.dense.bias", "bertmodel.encoder.layer.10.output.dense.weight", "bertmodel.encoder.layer.10.output.dense.bias", "bertmodel.encoder.layer.10.output.LayerNorm.weight", "bertmodel.encoder.layer.10.output.LayerNorm.bias", "bertmodel.encoder.layer.11.attention.self.query.weight", "bertmodel.encoder.layer.11.attention.self.query.bias", "bertmodel.encoder.layer.11.attention.self.key.weight", "bertmodel.encoder.layer.11.attention.self.key.bias", "bertmodel.encoder.layer.11.attention.self.value.weight", "bertmodel.encoder.layer.11.attention.self.value.bias", "bertmodel.encoder.layer.11.attention.output.dense.weight", "bertmodel.encoder.layer.11.attention.output.dense.bias", "bertmodel.encoder.layer.11.attention.output.LayerNorm.weight", "bertmodel.encoder.layer.11.attention.output.LayerNorm.bias", "bertmodel.encoder.layer.11.intermediate.dense.weight", "bertmodel.encoder.layer.11.intermediate.dense.bias", "bertmodel.encoder.layer.11.output.dense.weight", "bertmodel.encoder.layer.11.output.dense.bias", "bertmodel.encoder.layer.11.output.LayerNorm.weight", "bertmodel.encoder.layer.11.output.LayerNorm.bias", "bertmodel.pooler.dense.weight", "bertmodel.pooler.dense.bias", "embedding.weight". 
	Unexpected key(s) in state_dict: "bert.embeddings.word_embeddings.weight", "bert.embeddings.position_embeddings.weight", "bert.embeddings.token_type_embeddings.weight", "bert.embeddings.LayerNorm.weight", "bert.embeddings.LayerNorm.bias", "bert.encoder.layer.0.attention.self.query.weight", "bert.encoder.layer.0.attention.self.query.bias", "bert.encoder.layer.0.attention.self.key.weight", "bert.encoder.layer.0.attention.self.key.bias", "bert.encoder.layer.0.attention.self.value.weight", "bert.encoder.layer.0.attention.self.value.bias", "bert.encoder.layer.0.attention.output.dense.weight", "bert.encoder.layer.0.attention.output.dense.bias", "bert.encoder.layer.0.attention.output.LayerNorm.weight", "bert.encoder.layer.0.attention.output.LayerNorm.bias", "bert.encoder.layer.0.intermediate.dense.weight", "bert.encoder.layer.0.intermediate.dense.bias", "bert.encoder.layer.0.output.dense.weight", "bert.encoder.layer.0.output.dense.bias", "bert.encoder.layer.0.output.LayerNorm.weight", "bert.encoder.layer.0.output.LayerNorm.bias", "bert.encoder.layer.1.attention.self.query.weight", "bert.encoder.layer.1.attention.self.query.bias", "bert.encoder.layer.1.attention.self.key.weight", "bert.encoder.layer.1.attention.self.key.bias", "bert.encoder.layer.1.attention.self.value.weight", "bert.encoder.layer.1.attention.self.value.bias", "bert.encoder.layer.1.attention.output.dense.weight", "bert.encoder.layer.1.attention.output.dense.bias", "bert.encoder.layer.1.attention.output.LayerNorm.weight", "bert.encoder.layer.1.attention.output.LayerNorm.bias", "bert.encoder.layer.1.intermediate.dense.weight", "bert.encoder.layer.1.intermediate.dense.bias", "bert.encoder.layer.1.output.dense.weight", "bert.encoder.layer.1.output.dense.bias", "bert.encoder.layer.1.output.LayerNorm.weight", "bert.encoder.layer.1.output.LayerNorm.bias", "bert.encoder.layer.2.attention.self.query.weight", "bert.encoder.layer.2.attention.self.query.bias", "bert.encoder.layer.2.attention.self.key.weight", "bert.encoder.layer.2.attention.self.key.bias", "bert.encoder.layer.2.attention.self.value.weight", "bert.encoder.layer.2.attention.self.value.bias", "bert.encoder.layer.2.attention.output.dense.weight", "bert.encoder.layer.2.attention.output.dense.bias", "bert.encoder.layer.2.attention.output.LayerNorm.weight", "bert.encoder.layer.2.attention.output.LayerNorm.bias", "bert.encoder.layer.2.intermediate.dense.weight", "bert.encoder.layer.2.intermediate.dense.bias", "bert.encoder.layer.2.output.dense.weight", "bert.encoder.layer.2.output.dense.bias", "bert.encoder.layer.2.output.LayerNorm.weight", "bert.encoder.layer.2.output.LayerNorm.bias", "bert.encoder.layer.3.attention.self.query.weight", "bert.encoder.layer.3.attention.self.query.bias", "bert.encoder.layer.3.attention.self.key.weight", "bert.encoder.layer.3.attention.self.key.bias", "bert.encoder.layer.3.attention.self.value.weight", "bert.encoder.layer.3.attention.self.value.bias", "bert.encoder.layer.3.attention.output.dense.weight", "bert.encoder.layer.3.attention.output.dense.bias", "bert.encoder.layer.3.attention.output.LayerNorm.weight", "bert.encoder.layer.3.attention.output.LayerNorm.bias", "bert.encoder.layer.3.intermediate.dense.weight", "bert.encoder.layer.3.intermediate.dense.bias", "bert.encoder.layer.3.output.dense.weight", "bert.encoder.layer.3.output.dense.bias", "bert.encoder.layer.3.output.LayerNorm.weight", "bert.encoder.layer.3.output.LayerNorm.bias", "bert.encoder.layer.4.attention.self.query.weight", "bert.encoder.layer.4.attention.self.query.bias", "bert.encoder.layer.4.attention.self.key.weight", "bert.encoder.layer.4.attention.self.key.bias", "bert.encoder.layer.4.attention.self.value.weight", "bert.encoder.layer.4.attention.self.value.bias", "bert.encoder.layer.4.attention.output.dense.weight", "bert.encoder.layer.4.attention.output.dense.bias", "bert.encoder.layer.4.attention.output.LayerNorm.weight", "bert.encoder.layer.4.attention.output.LayerNorm.bias", "bert.encoder.layer.4.intermediate.dense.weight", "bert.encoder.layer.4.intermediate.dense.bias", "bert.encoder.layer.4.output.dense.weight", "bert.encoder.layer.4.output.dense.bias", "bert.encoder.layer.4.output.LayerNorm.weight", "bert.encoder.layer.4.output.LayerNorm.bias", "bert.encoder.layer.5.attention.self.query.weight", "bert.encoder.layer.5.attention.self.query.bias", "bert.encoder.layer.5.attention.self.key.weight", "bert.encoder.layer.5.attention.self.key.bias", "bert.encoder.layer.5.attention.self.value.weight", "bert.encoder.layer.5.attention.self.value.bias", "bert.encoder.layer.5.attention.output.dense.weight", "bert.encoder.layer.5.attention.output.dense.bias", "bert.encoder.layer.5.attention.output.LayerNorm.weight", "bert.encoder.layer.5.attention.output.LayerNorm.bias", "bert.encoder.layer.5.intermediate.dense.weight", "bert.encoder.layer.5.intermediate.dense.bias", "bert.encoder.layer.5.output.dense.weight", "bert.encoder.layer.5.output.dense.bias", "bert.encoder.layer.5.output.LayerNorm.weight", "bert.encoder.layer.5.output.LayerNorm.bias", "bert.encoder.layer.6.attention.self.query.weight", "bert.encoder.layer.6.attention.self.query.bias", "bert.encoder.layer.6.attention.self.key.weight", "bert.encoder.layer.6.attention.self.key.bias", "bert.encoder.layer.6.attention.self.value.weight", "bert.encoder.layer.6.attention.self.value.bias", "bert.encoder.layer.6.attention.output.dense.weight", "bert.encoder.layer.6.attention.output.dense.bias", "bert.encoder.layer.6.attention.output.LayerNorm.weight", "bert.encoder.layer.6.attention.output.LayerNorm.bias", "bert.encoder.layer.6.intermediate.dense.weight", "bert.encoder.layer.6.intermediate.dense.bias", "bert.encoder.layer.6.output.dense.weight", "bert.encoder.layer.6.output.dense.bias", "bert.encoder.layer.6.output.LayerNorm.weight", "bert.encoder.layer.6.output.LayerNorm.bias", "bert.encoder.layer.7.attention.self.query.weight", "bert.encoder.layer.7.attention.self.query.bias", "bert.encoder.layer.7.attention.self.key.weight", "bert.encoder.layer.7.attention.self.key.bias", "bert.encoder.layer.7.attention.self.value.weight", "bert.encoder.layer.7.attention.self.value.bias", "bert.encoder.layer.7.attention.output.dense.weight", "bert.encoder.layer.7.attention.output.dense.bias", "bert.encoder.layer.7.attention.output.LayerNorm.weight", "bert.encoder.layer.7.attention.output.LayerNorm.bias", "bert.encoder.layer.7.intermediate.dense.weight", "bert.encoder.layer.7.intermediate.dense.bias", "bert.encoder.layer.7.output.dense.weight", "bert.encoder.layer.7.output.dense.bias", "bert.encoder.layer.7.output.LayerNorm.weight", "bert.encoder.layer.7.output.LayerNorm.bias", "bert.encoder.layer.8.attention.self.query.weight", "bert.encoder.layer.8.attention.self.query.bias", "bert.encoder.layer.8.attention.self.key.weight", "bert.encoder.layer.8.attention.self.key.bias", "bert.encoder.layer.8.attention.self.value.weight", "bert.encoder.layer.8.attention.self.value.bias", "bert.encoder.layer.8.attention.output.dense.weight", "bert.encoder.layer.8.attention.output.dense.bias", "bert.encoder.layer.8.attention.output.LayerNorm.weight", "bert.encoder.layer.8.attention.output.LayerNorm.bias", "bert.encoder.layer.8.intermediate.dense.weight", "bert.encoder.layer.8.intermediate.dense.bias", "bert.encoder.layer.8.output.dense.weight", "bert.encoder.layer.8.output.dense.bias", "bert.encoder.layer.8.output.LayerNorm.weight", "bert.encoder.layer.8.output.LayerNorm.bias", "bert.encoder.layer.9.attention.self.query.weight", "bert.encoder.layer.9.attention.self.query.bias", "bert.encoder.layer.9.attention.self.key.weight", "bert.encoder.layer.9.attention.self.key.bias", "bert.encoder.layer.9.attention.self.value.weight", "bert.encoder.layer.9.attention.self.value.bias", "bert.encoder.layer.9.attention.output.dense.weight", "bert.encoder.layer.9.attention.output.dense.bias", "bert.encoder.layer.9.attention.output.LayerNorm.weight", "bert.encoder.layer.9.attention.output.LayerNorm.bias", "bert.encoder.layer.9.intermediate.dense.weight", "bert.encoder.layer.9.intermediate.dense.bias", "bert.encoder.layer.9.output.dense.weight", "bert.encoder.layer.9.output.dense.bias", "bert.encoder.layer.9.output.LayerNorm.weight", "bert.encoder.layer.9.output.LayerNorm.bias", "bert.encoder.layer.10.attention.self.query.weight", "bert.encoder.layer.10.attention.self.query.bias", "bert.encoder.layer.10.attention.self.key.weight", "bert.encoder.layer.10.attention.self.key.bias", "bert.encoder.layer.10.attention.self.value.weight", "bert.encoder.layer.10.attention.self.value.bias", "bert.encoder.layer.10.attention.output.dense.weight", "bert.encoder.layer.10.attention.output.dense.bias", "bert.encoder.layer.10.attention.output.LayerNorm.weight", "bert.encoder.layer.10.attention.output.LayerNorm.bias", "bert.encoder.layer.10.intermediate.dense.weight", "bert.encoder.layer.10.intermediate.dense.bias", "bert.encoder.layer.10.output.dense.weight", "bert.encoder.layer.10.output.dense.bias", "bert.encoder.layer.10.output.LayerNorm.weight", "bert.encoder.layer.10.output.LayerNorm.bias", "bert.encoder.layer.11.attention.self.query.weight", "bert.encoder.layer.11.attention.self.query.bias", "bert.encoder.layer.11.attention.self.key.weight", "bert.encoder.layer.11.attention.self.key.bias", "bert.encoder.layer.11.attention.self.value.weight", "bert.encoder.layer.11.attention.self.value.bias", "bert.encoder.layer.11.attention.output.dense.weight", "bert.encoder.layer.11.attention.output.dense.bias", "bert.encoder.layer.11.attention.output.LayerNorm.weight", "bert.encoder.layer.11.attention.output.LayerNorm.bias", "bert.encoder.layer.11.intermediate.dense.weight", "bert.encoder.layer.11.intermediate.dense.bias", "bert.encoder.layer.11.output.dense.weight", "bert.encoder.layer.11.output.dense.bias", "bert.encoder.layer.11.output.LayerNorm.weight", "bert.encoder.layer.11.output.LayerNorm.bias", "bert.pooler.dense.weight", "bert.pooler.dense.bias". 
	size mismatch for lstm.weight_ih_l0: copying a param with shape torch.Size([512, 768]) from checkpoint, the shape in current model is torch.Size([512, 100]).
	size mismatch for lstm.weight_ih_l0_reverse: copying a param with shape torch.Size([512, 768]) from checkpoint, the shape in current model is torch.Size([512, 100]).

In [30]:
%%time
#Experiment 3, 4000 training datasets
train_data_4000 = datasets.concatenate_datasets([train_data, train_data_ext.select(range(2000))])
vocab_4000, word2id_4000 = build_vocab_id(train_data_4000)
w2v_4000_model = Word2Vec(train_data_4000['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_4000_matrix = build_w2v_matrix(word2id, w2v_4000_model)
embedding_glove_4000_matrix = build_glove_matrix(word2id_4000, glove_words, glove_vectors)

train_4000_dataset = NER_Dataset(train_data_4000)
train_4000_loader = DataLoader(train_4000_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_w2v,'best_w2v_model_4000.pth', train_4000_loader)

#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_glove,'best_glove_model_4000.pth', train_4000_loader)

Word2Vec type training...
Epoch 1/20
Train Loss: 0.5276
Best model saved!
Epoch 2/20
Train Loss: 0.2909
Best model saved!
Epoch 3/20
Train Loss: 0.2681
Epoch 4/20
Train Loss: 0.2525
Epoch 5/20
Train Loss: 0.2382
Best model saved!
Epoch 6/20
Train Loss: 0.2307
Best model saved!
Epoch 7/20
Train Loss: 0.2201
Epoch 8/20
Train Loss: 0.2091
Epoch 9/20
Train Loss: 0.2025
Epoch 10/20
Train Loss: 0.1939
Epoch 11/20
Train Loss: 0.1823
Epoch 12/20
Train Loss: 0.1738
Epoch 13/20
Train Loss: 0.1635
Epoch 14/20
Train Loss: 0.1538
Epoch 15/20
Train Loss: 0.1477
Epoch 16/20
Train Loss: 0.1389
Epoch 17/20
Train Loss: 0.1295
Epoch 18/20
Train Loss: 0.1216
Epoch 19/20
Train Loss: 0.1134
Epoch 20/20
Train Loss: 0.1027
Validation Report:
              precision    recall  f1-score   support

        I-LF       0.78      0.77      0.77       730
           O       0.91      0.93      0.92      4460
        B-LF       0.68      0.67      0.68       306
        B-AC       0.78      0.65      0.71       508



In [32]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_model_4000.pth', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_model_4000.pth', test_loader)

Word2Vec type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.79      0.84      0.82      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.74      0.71      0.73       482
        B-AC       0.84      0.69      0.76       797

    accuracy                           0.90     10257
   macro avg       0.83      0.80      0.81     10257
weighted avg       0.90      0.90      0.90     10257

GloVe type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.78      0.83      0.80      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.71      0.73      0.72       482
        B-AC       0.86      0.64      0.74       797

    accuracy                           0.90     10257
   macro avg       0.82      0.78      0.80     10257
weighted avg       0.90      0.90      0.90     10257



In [34]:
%%time
#Experiment 3, 8000 training datasets
train_data_8000 = datasets.concatenate_datasets([train_data, train_data_ext.select(range(2000))])
vocab_8000, word2id_8000 = build_vocab_id(train_data_8000)
w2v_8000_model = Word2Vec(train_data_8000['tokens'], vector_size=Config.EMBEDDING_VEC, window=3, min_count=1, workers=4, epochs=50)
embedding_w2v_8000_matrix = build_w2v_matrix(word2id, w2v_8000_model)
embedding_glove_8000_matrix = build_glove_matrix(word2id_8000, glove_words, glove_vectors)

train_8000_dataset = NER_Dataset(train_data_8000)
train_8000_loader = DataLoader(train_8000_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

#Word2Vec
print('Word2Vec type training...')
model_w2v = NER_Model('Word2Vec').to(Config.DEVICE)
optimizer = optim.Adam(model_w2v.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_w2v,'best_w2v_model_8000.pth', train_8000_loader)

#GloVe
print('GloVe type training...')
model_glove = NER_Model('GloVe').to(Config.DEVICE)
optimizer = optim.Adam(model_glove.parameters(), lr=Config.LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ner_tag2id['<PAD>'])
training(model_glove,'best_glove_model_8000.pth', train_8000_loader)

Word2Vec type training...
Epoch 1/20
Train Loss: 0.5136
Best model saved!
Epoch 2/20
Train Loss: 0.2947
Epoch 3/20
Train Loss: 0.2680
Epoch 4/20
Train Loss: 0.2527
Best model saved!
Epoch 5/20
Train Loss: 0.2383
Epoch 6/20
Train Loss: 0.2280
Epoch 7/20
Train Loss: 0.2167
Epoch 8/20
Train Loss: 0.2083
Epoch 9/20
Train Loss: 0.2015
Epoch 10/20
Train Loss: 0.1897
Epoch 11/20
Train Loss: 0.1803
Epoch 12/20
Train Loss: 0.1703
Epoch 13/20
Train Loss: 0.1622
Best model saved!
Epoch 14/20
Train Loss: 0.1539
Epoch 15/20
Train Loss: 0.1439
Epoch 16/20
Train Loss: 0.1351
Epoch 17/20
Train Loss: 0.1267
Epoch 18/20
Train Loss: 0.1187
Epoch 19/20
Train Loss: 0.1166
Epoch 20/20
Train Loss: 0.1053
Validation Report:
              precision    recall  f1-score   support

        I-LF       0.80      0.78      0.79       730
           O       0.91      0.94      0.92      4460
        B-LF       0.69      0.67      0.68       306
        B-AC       0.78      0.58      0.67       508

    accuracy      

In [36]:
print('Word2Vec type testing...')
testing(model_w2v,'best_w2v_model_8000.pth', test_loader)
print('GloVe type testing...')
testing(model_glove,'best_glove_model_8000.pth', test_loader)

Word2Vec type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.80      0.79      0.80      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.76      0.72      0.74       482
        B-AC       0.85      0.68      0.75       797

    accuracy                           0.90     10257
   macro avg       0.83      0.79      0.81     10257
weighted avg       0.90      0.90      0.90     10257

GloVe type testing...
Final Test Report:
              precision    recall  f1-score   support

        I-LF       0.79      0.81      0.80      1227
           O       0.93      0.95      0.94      7751
        B-LF       0.73      0.72      0.72       482
        B-AC       0.83      0.69      0.75       797

    accuracy                           0.90     10257
   macro avg       0.82      0.79      0.80     10257
weighted avg       0.90      0.90      0.90     10257

