In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
# Import generate_masked_sentences from scripts/maskPrecessTest.py
from scripts.maskPrecessTest import generate_masked_sentences

In [2]:
from datasets import load_dataset

wnut = load_dataset("wnut_17")


#preprocess data
train_data = generate_masked_sentences(wnut['train'])

Found cached dataset wnut_17 (/home/malthe/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

In [55]:
max_len=32
PAD = '<PAD>'

word2idx = {PAD:0}
idx2word = [PAD]

# Generate word2idxs
for sentPos, sent in enumerate(train_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        if word not in word2idx:
            word2idx[word] = len(idx2word)
            idx2word.append(word)        

# Vocab length
vocab_dim = len(idx2word)

feats = torch.zeros((len(train_data), max_len), dtype=torch.long)
for sentPos, sent in enumerate(train_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        wordIdx = word2idx[PAD] if word not in word2idx else word2idx[word]
        feats[sentPos][wordPos] = wordIdx

# Generate labels as a tensor of booleans indicating if the masked token is a named entity
labels = torch.tensor([sent['is_ner'] for sent in train_data], dtype=torch.float)

In [16]:
# Define a simple nn model
class Model(nn.Module):
    def __init__(self, vocab_dim, emb_dim):
        # Model should predict if the masked token is a named entity
        super(Model, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, 128)        
        self.relu = nn.ReLU()

        # pool and output a single value
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.mask_output = nn.Linear(128, 1)

        # self.linear2 = nn.Linear(128, 128)


    def forward(self, x):
        x = self.word_embeddings(x)
        x = self.linear(x)
        x = self.relu(x)
        x = self.pool(x.transpose(1, 2)).squeeze(2)
        x = self.mask_output(x)
        # Use sigmoid activation function to get a value between 0 and 1
        x = torch.sigmoid(x)
        # If x is greater than 0.5, then the masked token is a named entity and we run the second linear layer
        # x = torch.where(x > 0.5, self.linear2(x), x)
        return x
    
model = Model(vocab_dim, 128)

# Define cross entropy loss function and optimizer
criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 32

# Train model
for epoch in range(10):
    for i in range(0, len(feats), batch_size):
        batch_feats = feats[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        y_pred = model(batch_feats)
        loss = criterion(y_pred, batch_labels.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss: {loss.item()}')

Epoch: 0, Loss: 0.0633123368024826
Epoch: 1, Loss: 0.046949632465839386
Epoch: 2, Loss: 0.03520117327570915
Epoch: 3, Loss: 0.007973277941346169
Epoch: 4, Loss: 0.004464718978852034
Epoch: 5, Loss: 0.0048362743109464645
Epoch: 6, Loss: 0.0018194129224866629
Epoch: 7, Loss: 0.003108053235337138
Epoch: 8, Loss: 0.0012030262732878327
Epoch: 9, Loss: 0.0005174627876840532


In [74]:
# Test model
test_data = generate_masked_sentences(wnut['test'])

feats = torch.zeros((len(test_data), max_len), dtype=torch.long)
for sentPos, sent in enumerate(test_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        wordIdx = word2idx[PAD] if word not in word2idx else word2idx[word]
        feats[sentPos][wordPos] = wordIdx

labels = torch.tensor([sent['is_ner'] for sent in test_data], dtype=torch.float)

y_pred = model(feats)

# Calculate accuracy
correct = 0
for i in range(len(y_pred)):
    if (y_pred[i] > 0.5) == labels[i]:
        correct += 1
print(f'Accuracy: {correct/len(y_pred)}')


NameError: name 'model' is not defined

In [None]:
# Make confusion matrix
confusion_matrix = torch.zeros((2, 2))
for i in range(len(y_pred)):
    if y_pred[i] > 0.5:
        if labels[i]:
            confusion_matrix[0][0] += 1
        else:
            confusion_matrix[0][1] += 1
    else:
        if labels[i]:
            confusion_matrix[1][0] += 1
        else:
            confusion_matrix[1][1] += 1

print(confusion_matrix)

tensor([[   49.,   392.],
        [ 1691., 21262.]])


### Now train the second model to detect the named entity type

In [102]:
# Start by filtering out all the sentences in train_data where the masked token is not a named entity, i.e. the is_ner field is False
filtered_train_data = [sent for sent in train_data if sent['is_ner']]

In [113]:
# Get all ner_tags from wnut_17 dataset
ner_tags = wnut['train'].features['ner_tags'].feature.names

# Create a dictionary mapping ner_tags to indices
idx2ner = {i:ner for i, ner in enumerate(ner_tags)}
num_entities = len(idx2ner)
print(num_entities)

# Create a list of one hot encoded vectors for each sentence in filtered_train_data
true_ner_tags = torch.zeros((len(filtered_train_data), num_entities), dtype=torch.float)
for sentPos, sent in enumerate(filtered_train_data):
    true_ner_tags[sentPos][sent['ner_tag']] = 1


feats = torch.zeros((len(filtered_train_data), max_len), dtype=torch.long)
for sentPos, sent in enumerate(filtered_train_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        wordIdx = word2idx[PAD] if word not in word2idx else word2idx[word]
        feats[sentPos][wordPos] = wordIdx

print(true_ner_tags.shape)

13
torch.Size([3160, 13])


In [133]:
# Create model 2
class Model2(nn.Module):
    def __init__(self, vocab_dim, emb_dim):
        # Model should predict the entity type of the masked token
        super(Model2, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, 128)
        self.relu = nn.ReLU()

        # pool and output a single value
        self.pool = nn.AdaptiveMaxPool1d(1)

        # Output a single value for each entity type
        self.output = nn.Linear(128, num_entities)



    def forward(self, x):
        x = self.word_embeddings(x)
        x = self.linear(x)
        x = self.relu(x)
        x = self.pool(x.transpose(1, 2)).squeeze(2)
        x = self.output(x)        
        x = torch.softmax(x, dim=1)
        return x
    
model2 = Model2(vocab_dim, 128)

# Define cross entropy loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)


# Train model
for epoch in range(10):
    for i in range(0, len(feats), batch_size):
        batch_feats = feats[i:i+batch_size]
        batch_labels = true_ner_tags[i:i+batch_size]
        y_pred = model2(batch_feats)        
        loss = criterion(y_pred, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss: {loss.item()}')

Epoch: 0, Loss: 2.475318193435669
Epoch: 1, Loss: 2.356250762939453
Epoch: 2, Loss: 2.3452768325805664
Epoch: 3, Loss: 2.3380355834960938
Epoch: 4, Loss: 2.3237128257751465
Epoch: 5, Loss: 2.3061845302581787
Epoch: 6, Loss: 2.3028838634490967
Epoch: 7, Loss: 2.3018319606781006
Epoch: 8, Loss: 2.30107045173645
Epoch: 9, Loss: 2.315247058868408


In [147]:
# Predict on the sentence 'I live in London'
sentence = "It 's the view from where I 'm living for two weeks . [MASK] State Building"
sentence = "Hello my name is John and I live in [MASK] State Building"
sentence = sentence.split()


with torch.no_grad():
    sentence = torch.tensor([word2idx[word] for word in sentence], dtype=torch.long)
    sentence = sentence.unsqueeze(0)
    y_pred = model2(sentence)
    print(y_pred)
    print(y_pred.shape)
    print(idx2ner[torch.argmax(y_pred).item()])

tensor([[3.4616e-05, 3.0004e-04, 2.1161e-06, 2.2064e-05, 1.2284e-04, 1.7052e-04,
         6.1553e-05, 3.7935e-02, 1.3420e-04, 9.6108e-01, 1.1349e-04, 6.2741e-06,
         1.5634e-05]])
torch.Size([1, 13])
B-person


In [128]:
# Get class distribution of ner_tags in filtered_train_data
ner_counts = {}
for sent in filtered_train_data:
    ner = idx2ner[sent['ner_tag']]
    if ner not in ner_counts:
        ner_counts[ner] = 0
    ner_counts[ner] += 1

print(ner_counts)

{'B-location': 548, 'I-location': 245, 'B-group': 264, 'B-corporation': 221, 'B-person': 660, 'B-creative-work': 140, 'B-product': 142, 'I-person': 335, 'I-creative-work': 206, 'I-corporation': 46, 'I-group': 150, 'I-product': 203}


### Now combine the two models into one so we can train them sequentially

In [None]:
# Model 1 predicts if a masked token is a named entity
# Model 2 predicts what type of named entity a masked token is. Model 2 is only run if model 1 predicts that the masked token is a named entity

# Define combined model
class CombinedModel(nn.Module):
    def __init__(self, vocab_dim, emb_dim):
        super(CombinedModel, self).__init__()
        # First part of the model is same between the two models
        self.word_embeddings = nn.Embedding(vocab_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, 128)                
        # Pool together all word embeddings after linear layer
        self.pool = nn.AdaptiveMaxPool1d(1)

        # Model 1 specific layers
        self.model1_output = nn.Linear(128, 1)

        # Add a gating mechanism to decide whether to run model 2
        self.gate = nn.Linear(1, 1)

        # Model 2 specific layers        
        # Output a single value for whether the masked token is a named entity
        self.model2_output = nn.Linear(128, num_entities)

        

    def forward(self, x):
        x = self.word_embeddings(x)
        x = self.linear(x)
        x = nn.ReLU(x)
        x = self.pool(x.transpose(1, 2)).squeeze(2)
        # Run through model 1
        model1_output = self.model1_output(x)
        model1_output = torch.sigmoid(model1_output)

        # Run through model 2 if model 1 predicts that the masked token is a named entity
        model2_output = self.model2_output(x)
        model2_output = torch.softmax(model2_output, dim=1)
        # Run through gate
        gate_output = self.gate(model1_output)        
        gate_output = nn.ReLU(gate_output)
        model2_output = model2_output * gate_output
        # The gating mechanism should be able to learn to not run model 2 if model 1 predicts that the masked token is not a named entity 

        return model1_output, model2_output
    

combined_model = CombinedModel(vocab_dim, 128)

criterion1 = nn.BCELoss()
criterion2 = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(combined_model.parameters(), lr=0.001)

for epoch in range(10):
    for i in range(0, len(feats), batch_size):
        x, y_true1, y_true2 = data
        optimizer.zero_grad()
        y1_pred, y2_pred = model(x.view(x.size(0), -1))
        loss1 = criterion1(y1_pred, y_true1.float())
        loss2 = criterion2(y2_pred, y_true2)
        loss = loss1 + loss2
        loss.backward()
        optimizer.step()