### Load the dataset

In [2]:
def read_conll_file(file_path):
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif line.startswith("-DOCSTART-"):
                continue
            else:
                token, pos, chunk, ner = line.split(' ')
                sentence.append((token, ner))
        if sentence:
            sentences.append(sentence)
    return sentences


In [3]:
train_path = 'conll2003-ner/train.txt'
valid_path = 'conll2003-ner/valid.txt'
test_path = 'conll2003-ner/test.txt'

train_data = read_conll_file(train_path)
valid_data = read_conll_file(valid_path)
test_data = read_conll_file(test_path)

In [4]:
print(train_data[0])

[('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('British', 'B-MISC'), ('lamb', 'O'), ('.', 'O')]


### Create word embeddings for the data

In order to compare the models on even footing, we will use BERT-created word embeddings for all of them.

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence

max_length = 150
batch_size = 32  # Define your batch size

padded_sentences = []
for i in range(0, len(train_data), batch_size):
    batch = train_data[i:i+batch_size]
    batch_padded = [sentence + [('PAD', 'O')] * (max_length - len(sentence)) for sentence in batch]
    padded_sentences.extend([" ".join([token for token, tag in sentence]) for sentence in batch_padded])

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', use_fast=False)
model = AutoModel.from_pretrained('bert-base-cased')

# Check if a GPU is available and move the model to GPU if it is
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)

def get_sentence_embeddings(sentences):
    encoded_inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=max_length, is_split_into_words=True)
    encoded_inputs = {key: value.to(DEVICE) for key, value in encoded_inputs.items()}
    
    with torch.no_grad():
        outputs = model(**encoded_inputs)
    # Instead of using just the [CLS] token, use all token embeddings
    return outputs.last_hidden_state

# Process data in batches and move to GPU in advance

data_embeddings = []

data_embeddings = []
for batch_sentences in tqdm(padded_sentences):
    batch_embeddings = get_sentence_embeddings([batch_sentences])  # Pass as a single-item list
    data_embeddings.append(batch_embeddings)

# Concatenate all batches and move to GPU
data_embeddings = torch.cat(data_embeddings, dim=0).to(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 14041/14041 [03:43<00:00, 62.88it/s]


### Create character embeddings (TODO, not used yet, needs fixing)
According to https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00104/43361/Named-Entity-Recognition-with-Bidirectional-LSTM, they also used character embeddings

In [6]:
import torch.nn as nn
import torch
import string

char_set = ['<PAD>', '<UNK>'] + list(string.printable)
char_to_idx = {char: idx for idx, char in enumerate(char_set)}

embedding_dim = 25
char_embeddings = nn.Embedding(num_embeddings=len(char_set), embedding_dim=embedding_dim)
char_embeddings = char_embeddings.to(DEVICE)
char_embeddings.weight.data.uniform_(-0.5, 0.5)

def get_sentence_char_embeddings(sentences):
    # Convert sentences to character indices and pad
    batch_char_indices = [[char_to_idx.get(char, char_to_idx['<UNK>']) for char in sentence] for sentence in sentences]
    batch_char_indices_padded = pad_sequence([torch.tensor(seq) for seq in batch_char_indices], batch_first=True, padding_value=char_to_idx['<PAD>'])
    
    # Move padded sequences to the same device as the model
    batch_char_indices_padded = batch_char_indices_padded.to(DEVICE)
    
    # Get character embeddings
    with torch.no_grad():
        char_embeddings_batch = char_embeddings(batch_char_indices_padded)
    
    # For simplicity, let's take the mean of the character embeddings to represent the sentence
    sentence_char_embeddings = char_embeddings_batch.mean(dim=1)
    return sentence_char_embeddings

batch_size = 32  # Define your batch size
data_char_embeddings = []

for i in tqdm(range(0, len(train_data), batch_size)):
    batch_sentences = ["".join([token for token, _ in sentence]) for sentence in train_data[i:i+batch_size]]
    batch_char_embeddings = get_sentence_char_embeddings(batch_sentences)
    data_char_embeddings.append(batch_char_embeddings)

# Concatenate all batches
data_char_embeddings = torch.cat(data_char_embeddings, dim=0)

100%|██████████| 439/439 [00:00<00:00, 1045.80it/s]


### Train the RNN

In [13]:
from torch import nn
import torch
from torch.utils.data import DataLoader, TensorDataset
torch.manual_seed(0)
RNN_HIDDEN = 30
BATCH_SIZE = 32
LEARNING_RATE = 0.01
EPOCHS = 10

class TaggerModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(TaggerModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, inputData):
        # x shape: (batch_size, seq_length, input_size)
        out, _ = self.rnn(inputData)  # out shape: (batch_size, seq_length, hidden_size)
        out = self.fc(out)    # out shape: (batch_size, seq_length, output_size)
        return out

tag_to_idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

# Model parameters
input_size = data_embeddings.size(2)  # Assuming data_embeddings is your input tensor
output_size = len(tag_to_idx)  # Number of unique NER tags

model = TaggerModel(input_size, RNN_HIDDEN, output_size).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

encoded_tags = [[tag_to_idx[tag] for token, tag in sentence] for sentence in train_data]
max_length = data_embeddings.size(1)
padded_tags = [tags + [tag_to_idx['O']] * (max_length - len(tags)) for tags in encoded_tags]
tag_tensor = torch.tensor(padded_tags, dtype=torch.long)
tag_tensor = tag_tensor.to(DEVICE)

train_dataset = TensorDataset(data_embeddings, tag_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        
        # Forward pass
        outputs = model(inputs)
        
        # Reshape outputs and targets to comply with CrossEntropyLoss requirements
        outputs = outputs.view(-1, outputs.shape[-1])  # Combined shape: (batch_size * seq_length, output_size)
        targets = targets.view(-1)  # Combined shape: (batch_size * seq_length)
        
        loss = loss_function(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # Aggregate the loss
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")
        
# set to evaluation mode
model.eval()

Epoch 1, Loss: 45544.44019699097
Epoch 2, Loss: 39099.11204910278
Epoch 3, Loss: 36933.60237503052
Epoch 4, Loss: 35415.18404960632
Epoch 5, Loss: 34777.54089927673
Epoch 6, Loss: 33819.65024185181
Epoch 7, Loss: 33227.716468811035
Epoch 8, Loss: 33430.36266326904
Epoch 9, Loss: 32971.8424949646
Epoch 10, Loss: 31661.90768814087


TaggerModel(
  (rnn): RNN(768, 30, batch_first=True)
  (fc): Linear(in_features=30, out_features=9, bias=True)
)

In [14]:
def evaluate_model(model, test_data, data_embeddings, tag_to_idx, max_length, batch_size=32):
    encoded_tags = [[tag_to_idx[tag] for token, tag in sentence] for sentence in test_data]
    
    # Get the test data embeddings from data_embeddings
    test_embeddings = data_embeddings[-len(test_data):].to(DEVICE)
    
    padded_tags = [tags + [tag_to_idx['O']] * (max_length - len(tags)) for tags in encoded_tags]
    tag_tensor = torch.tensor(padded_tags, dtype=torch.long).to(DEVICE)
    
    test_dataset = TensorDataset(test_embeddings, tag_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    model.eval()  # Set the model to evaluation mode

    total_correct, total_non_pad_tokens = 0, 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            outputs = model(inputs)
            predictions = outputs.argmax(dim=2)  # Get the index of the max log-probability

            # Compare predictions to true labels (ignoring padding tokens)
            non_pad_indices = targets != tag_to_idx['O']
            correct = (predictions[non_pad_indices] == targets[non_pad_indices]).sum().item()
            total_correct += correct
            total_non_pad_tokens += non_pad_indices.sum().item()

    accuracy = total_correct / total_non_pad_tokens
    return accuracy

# Call the function with your model and test data
accuracy = evaluate_model(model, test_data, data_embeddings, tag_to_idx, max_length)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.17036489151873768
