In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
# Import generate_masked_sentences from scripts/maskPrecessTest.py
from scripts.maskPrecessTest import generate_masked_sentences

In [4]:
from datasets import load_dataset

wnut = load_dataset("wnut_17")


#preprocess data
train_data = generate_masked_sentences(wnut['train'])

Found cached dataset wnut_17 (/home/malthe/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
max_len=32
PAD = '<PAD>'

word2idx = {PAD:0}
idx2word = [PAD]

# Generate word2idxs
for sentPos, sent in enumerate(train_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        if word not in word2idx:
            word2idx[word] = len(idx2word)
            idx2word.append(word)        

# Vocab length
vocab_dim = len(idx2word)

feats = torch.zeros((len(train_data), max_len), dtype=torch.long)
for sentPos, sent in enumerate(train_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        wordIdx = word2idx[PAD] if word not in word2idx else word2idx[word]
        feats[sentPos][wordPos] = wordIdx

# Generate labels as a tensor of booleans indicating if the masked token is a named entity
labels = torch.tensor([sent['is_ner'] for sent in train_data], dtype=torch.float)

In [11]:
y_pred.shape

torch.Size([62730, 32])

In [16]:
# Define a simple nn model
class Model(nn.Module):
    def __init__(self, vocab_dim, emb_dim):
        # Model should predict if the masked token is a named entity
        super(Model, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, 128)

        # pool and output a single value
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.output = nn.Linear(128, 1)            
    def forward(self, x):
        x = self.word_embeddings(x)
        x = self.linear(x)
        x = self.pool(x.transpose(1, 2)).squeeze(2)
        x = self.output(x)
        # Use sigmoid activation function to get a value between 0 and 1
        x = torch.sigmoid(x)
        return x
    
model = Model(vocab_dim, 128)

# Define cross entropy loss function and optimizer
criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 32

# Train model
for epoch in range(10):
    for i in range(0, len(feats), batch_size):
        batch_feats = feats[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        y_pred = model(batch_feats)
        loss = criterion(y_pred, batch_labels.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss: {loss.item()}')

Epoch: 0, Loss: 0.0633123368024826
Epoch: 1, Loss: 0.046949632465839386
Epoch: 2, Loss: 0.03520117327570915
Epoch: 3, Loss: 0.007973277941346169
Epoch: 4, Loss: 0.004464718978852034
Epoch: 5, Loss: 0.0048362743109464645
Epoch: 6, Loss: 0.0018194129224866629
Epoch: 7, Loss: 0.003108053235337138
Epoch: 8, Loss: 0.0012030262732878327
Epoch: 9, Loss: 0.0005174627876840532


In [17]:
# Test model
test_data = generate_masked_sentences(wnut['test'])

feats = torch.zeros((len(test_data), max_len), dtype=torch.long)
for sentPos, sent in enumerate(test_data):
    for wordPos, word in enumerate(sent['tokens'][:max_len]):
        wordIdx = word2idx[PAD] if word not in word2idx else word2idx[word]
        feats[sentPos][wordPos] = wordIdx

labels = torch.tensor([sent['is_ner'] for sent in test_data], dtype=torch.float)

y_pred = model(feats)

# Calculate accuracy
correct = 0
for i in range(len(y_pred)):
    if (y_pred[i] > 0.5) == labels[i]:
        correct += 1
print(f'Accuracy: {correct/len(y_pred)}')


Accuracy: 0.9109600752329657


In [18]:
# Make confusion matrix
confusion_matrix = torch.zeros((2, 2))
for i in range(len(y_pred)):
    if y_pred[i] > 0.5:
        if labels[i]:
            confusion_matrix[0][0] += 1
        else:
            confusion_matrix[0][1] += 1
    else:
        if labels[i]:
            confusion_matrix[1][0] += 1
        else:
            confusion_matrix[1][1] += 1

print(confusion_matrix)

tensor([[   49.,   392.],
        [ 1691., 21262.]])
