### Load the dataset

In [1]:
def read_conll_file(file_path):
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif line.startswith("-DOCSTART-"):
                continue
            else:
                token, pos, chunk, ner = line.split(' ')
                sentence.append((token, ner))
        if sentence:
            sentences.append(sentence)
    return sentences


In [2]:
def read_ner_file(file_path):
    """
    Reads a NER annotated file and returns a list of sentences with their annotations.

    Each sentence is represented as a list of tuples where each tuple contains
    a token (word) and its corresponding entity tag.

    Args:
    file_path (str): The path to the NER annotated file.

    Returns:
    list of list of tuples: A list where each element represents a sentence.
                            Each sentence is a list of (token, tag) tuples.
    """
    sentences = []  # List to hold sentences
    current_sentence = []  # Current sentence being processed

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            if line.startswith('# text = '):  # Start of a new sentence
                # If there's a current sentence being processed, add it to the sentences list
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []  # Reset current sentence

            elif line and not line.startswith('#'):  # Process token lines
                parts = line.split('\t')  # Split the line into parts
                if len(parts) >= 3:  # Ensure the line has at least 3 parts
                    token, tag = parts[1], parts[2]  # Extract token and tag (ignoring the first column)
                    current_sentence.append((token, tag))  # Add to the current sentence

        # Add the last sentence if the file doesn't end with a newline
        if current_sentence:
            sentences.append(current_sentence)

    return sentences

In [3]:
train_path = 'conll2003-ner/train.txt'
valid_path = 'conll2003-ner/valid.txt'
test_path = 'conll2003-ner/test.txt'

train_path1 = 'baseline-data/en_ewt-ud-train.iob2'
test_path1 = 'baseline-data/en_ewt-ud-dev.iob2'

#train_data = read_conll_file(train_path)
#valid_data = read_conll_file(valid_path)
#test_data = read_conll_file(test_path)

train_data = read_ner_file(train_path1)
test_data = read_ner_file(test_path1)

In [4]:
print(train_data[0])

[('Where', 'O'), ('in', 'O'), ('the', 'O'), ('world', 'O'), ('is', 'O'), ('Iguazu', 'B-LOC'), ('?', 'O')]


### Create word embeddings for the data

In order to compare the models on even footing, we will use BERT-created word embeddings for all of them.

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence

max_length = 165
batch_size = 32  # Define your batch size

padded_sentences = []
for i in range(0, len(train_data), batch_size):
    batch = train_data[i:i+batch_size]
    batch_padded = [sentence + [('PAD', 'O')] * (max_length - len(sentence)) for sentence in batch]
    padded_sentences.extend([" ".join([token for token, tag in sentence]) for sentence in batch_padded])

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', use_fast=False)
model = AutoModel.from_pretrained('bert-base-cased')

# Check if a GPU is available and move the model to GPU if it is
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)

def get_sentence_embeddings(sentences):
    encoded_inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=max_length, is_split_into_words=True)
    encoded_inputs = {key: value.to(DEVICE) for key, value in encoded_inputs.items()}
    
    with torch.no_grad():
        outputs = model(**encoded_inputs)
    # Instead of using just the [CLS] token, use all token embeddings
    return outputs.last_hidden_state

# Process data in batches and move to GPU in advance

data_embeddings = []

data_embeddings = []
for batch_sentences in tqdm(padded_sentences):
    batch_embeddings = get_sentence_embeddings([batch_sentences])  # Pass as a single-item list
    data_embeddings.append(batch_embeddings)

# Concatenate all batches and move to GPU
data_embeddings = torch.cat(data_embeddings, dim=0).to(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 12543/12543 [03:46<00:00, 55.31it/s]


### Create character embeddings (TODO, not used yet, needs fixing)
According to https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00104/43361/Named-Entity-Recognition-with-Bidirectional-LSTM, they also used character embeddings

In [24]:
import torch.nn as nn
import torch
import string

char_set = ['<PAD>', '<UNK>'] + list(string.printable)
char_to_idx = {char: idx for idx, char in enumerate(char_set)}

embedding_dim = 25
char_embeddings = nn.Embedding(num_embeddings=len(char_set), embedding_dim=embedding_dim)
char_embeddings = char_embeddings.to(DEVICE)
char_embeddings.weight.data.uniform_(-0.5, 0.5)

def get_sentence_char_embeddings(sentences):
    # Convert sentences to character indices and pad
    batch_char_indices = [[char_to_idx.get(char, char_to_idx['<UNK>']) for char in sentence] for sentence in sentences]
    batch_char_indices_padded = pad_sequence([torch.tensor(seq) for seq in batch_char_indices], batch_first=True, padding_value=char_to_idx['<PAD>'])
    
    # Move padded sequences to the same device as the model
    batch_char_indices_padded = batch_char_indices_padded.to(DEVICE)
    
    # Get character embeddings
    with torch.no_grad():
        char_embeddings_batch = char_embeddings(batch_char_indices_padded)
    
    # For simplicity, let's take the mean of the character embeddings to represent the sentence
    sentence_char_embeddings = char_embeddings_batch.mean(dim=1)
    return sentence_char_embeddings

#batch_size = 32  # Define your batch size
#data_char_embeddings = []

#for i in tqdm(range(0, len(train_data), batch_size)):
#    batch_sentences = ["".join([token for token, _ in sentence]) for sentence in train_data[i:i+batch_size]]
#    batch_char_embeddings = get_sentence_char_embeddings(batch_sentences)
#    data_char_embeddings.append(batch_char_embeddings)

# Concatenate all batches
#data_char_embeddings = torch.cat(data_char_embeddings, dim=0)

100%|██████████| 392/392 [00:00<00:00, 859.85it/s]


### Train the RNN

In [6]:
from torch import nn
import torch
from torch.utils.data import DataLoader, TensorDataset
torch.manual_seed(0)
RNN_HIDDEN = 30
BATCH_SIZE = 32
LEARNING_RATE = 0.01
EPOCHS = 10

class TaggerModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(TaggerModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, inputData):
        # x shape: (batch_size, seq_length, input_size)
        out, _ = self.rnn(inputData)  # out shape: (batch_size, seq_length, hidden_size)
        out = self.fc(out)    # out shape: (batch_size, seq_length, output_size)
        return out

tag_to_idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

# Model parameters
input_size = data_embeddings.size(2)  # Assuming data_embeddings is your input tensor
output_size = len(tag_to_idx)  # Number of unique NER tags

model = TaggerModel(input_size, RNN_HIDDEN, output_size).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

encoded_tags = [[tag_to_idx[tag] for token, tag in sentence] for sentence in train_data]
max_length = data_embeddings.size(1)
padded_tags = [tags + [tag_to_idx['O']] * (max_length - len(tags)) for tags in encoded_tags]
tag_tensor = torch.tensor(padded_tags, dtype=torch.long)
tag_tensor = tag_tensor.to(DEVICE)

train_dataset = TensorDataset(data_embeddings, tag_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        
        # Forward pass
        outputs = model(inputs)
        
        # Reshape outputs and targets to comply with CrossEntropyLoss requirements
        outputs = outputs.view(-1, outputs.shape[-1])  # Combined shape: (batch_size * seq_length, output_size)
        targets = targets.view(-1)  # Combined shape: (batch_size * seq_length)
        
        loss = loss_function(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # Aggregate the loss
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")
        
# set to evaluation mode
model.eval()

Epoch 1, Loss: 12668.014045238495
Epoch 2, Loss: 10558.905173420906
Epoch 3, Loss: 9980.450883865356
Epoch 4, Loss: 9350.718356609344
Epoch 5, Loss: 9089.721791028976
Epoch 6, Loss: 8581.658468961716
Epoch 7, Loss: 8482.562471091747
Epoch 8, Loss: 8339.634456396103
Epoch 9, Loss: 7920.577274382114
Epoch 10, Loss: 7648.626221656799


TaggerModel(
  (rnn): RNN(768, 30, batch_first=True)
  (fc): Linear(in_features=30, out_features=9, bias=True)
)

In [None]:
def evaluate_model(model, test_data, data_embeddings, tag_to_idx, max_length, batch_size=32):
    encoded_tags = [[tag_to_idx[tag] for token, tag in sentence] for sentence in test_data]
    
    # Get the test data embeddings from data_embeddings
    test_embeddings = data_embeddings[-len(test_data):].to(DEVICE)
    
    padded_tags = [tags + [tag_to_idx['O']] * (max_length - len(tags)) for tags in encoded_tags]
    tag_tensor = torch.tensor(padded_tags, dtype=torch.long).to(DEVICE)
    
    test_dataset = TensorDataset(test_embeddings, tag_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    model.eval()  # Set the model to evaluation mode

    total_correct, total_non_pad_tokens = 0, 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            outputs = model(inputs)
            predictions = outputs.argmax(dim=2)  # Get the index of the max log-probability

            # Compare predictions to true labels (ignoring padding tokens)
            non_pad_indices = targets != tag_to_idx['O']
            correct = (predictions[non_pad_indices] == targets[non_pad_indices]).sum().item()
            total_correct += correct
            total_non_pad_tokens += non_pad_indices.sum().item()

    accuracy = total_correct / total_non_pad_tokens
    return accuracy

# Call the function with your model and test data
accuracy = evaluate_model(model, test_data, data_embeddings, tag_to_idx, max_length)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.17036489151873768


In [9]:
def evaluate_model(model, test_data, data_embeddings, tag_to_idx, idx_to_tag, max_length, batch_size=32):
    encoded_tags = [[tag_to_idx[tag] for token, tag in sentence] for sentence in test_data]
    
    # Get the test data embeddings from data_embeddings
    test_embeddings = data_embeddings[-len(test_data):].to(DEVICE)
    
    padded_tags = [tags + [tag_to_idx['O']] * (max_length - len(tags)) for tags in encoded_tags]
    tag_tensor = torch.tensor(padded_tags, dtype=torch.long).to(DEVICE)
    
    test_dataset = TensorDataset(test_embeddings, tag_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    model.eval()  # Set the model to evaluation mode

    total_correct, total_non_pad_tokens = 0, 0

    with open('predictions.txt', 'w', encoding='utf-8') as f:
        with torch.no_grad():
            for i, (inputs, targets) in enumerate(test_loader):
                inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
                outputs = model(inputs)
                predictions = outputs.argmax(dim=2)  # Get the index of the max log-probability

                # Compare predictions to true labels (ignoring padding tokens)
                non_pad_indices = targets != tag_to_idx['O']
                correct = (predictions[non_pad_indices] == targets[non_pad_indices]).sum().item()
                total_correct += correct
                total_non_pad_tokens += non_pad_indices.sum().item()

                # Write predictions to file
                for sentence_idx in range(inputs.size(0)):
                    original_sentence = test_data[i * batch_size + sentence_idx]
                    f.write(' '.join(token for token, tag in original_sentence) + '\n')
                    pred_tags = predictions[sentence_idx][:len(original_sentence)]
                    for token_idx, pred_tag_idx in enumerate(pred_tags):
                        f.write(f'{original_sentence[token_idx][0]}: {idx_to_tag[pred_tag_idx.item()]}\n')
                    f.write('\n')  # Separate sentences by a newline

    accuracy = total_correct / total_non_pad_tokens
    return accuracy

idx_to_tag = {
  0: 'O',
  1: 'B-PER',
  2: 'I-PER',
  3: 'B-ORG',
  4: 'I-ORG',
  5: 'B-LOC',
  6: 'I-LOC',
  7: 'B-MISC',
  8: 'I-MISC'
}


# Call the function with your model and test data
accuracy = evaluate_model(model, test_data, data_embeddings, tag_to_idx, idx_to_tag, max_length)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.25


In [18]:
def convert_predictions_to_iob2(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Initialize variables
    sentences = []
    current_sentence = []
    sent_id = 1

    for line in lines:
        if line.strip() == '':
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
            continue

        # Splitting the line into word, tag parts
        parts = line.split()
        if len(parts) == 2:
            word, tag = parts
            word = word.rstrip(':')
            # Tag format conversion
            if tag.startswith('B-') or tag.startswith('I-'):
                tag_type = tag[2:]
            else:
                tag_type = 'O'
            current_sentence.append((word, tag_type))
        else:
            # Handle malformed lines gracefully
            print(f"Malformed line skipped: {line.strip()}")

    # Don't forget to add the last sentence if the file doesn't end with a newline
    if current_sentence:
        sentences.append(current_sentence)

    # Convert sentences to the desired format
    output_lines = []
    for sentence in sentences:
        output_lines.append(f"# sent_id = generated-{sent_id:04d}\n")
        output_lines.append("# text = " + " ".join([word for word, _ in sentence]) + "\n")
        for i, (word, tag) in enumerate(sentence, start=1):
            output_lines.append(f"{i}\t{word}\t{tag}\t-\t-\n")
        output_lines.append("\n")  # Sentence separator
        sent_id += 1

    # Write to output file
    with open('converted_predictions.iob2', 'w', encoding='utf-8') as f:
        f.writelines(output_lines)
    print(f"Converted content written to 'converted_predictions.iob2'")

# Replace 'predictions.txt' with the actual path to your predictions file
convert_predictions_to_iob2('predictions.txt')

Malformed line skipped: where can I get morcillas in tampa bay , I will like the argentinian type , but I will to try anothers please ?
Malformed line skipped: I searched all over the internet , but I could not find one place in Tampa Bay that sells morcillas , also known as blood pudding , black pudding and blood sausages .
Malformed line skipped: I learned that morcillas are basically impossible to find all across the North American region .
Malformed line skipped: But I did find this website , www.igourmet.com , where they sell all types of sausages , including blood sausages !
Malformed line skipped: So follow the link at the bottom and buy some blood sausages .
Malformed line skipped: I do n't know , and it is because I do n't like them , do you know that , morcillas is coagulated blood from animals , ewww
Malformed line skipped: are they just making these places up ?
Malformed line skipped: Well you say Miramar I say Piramar
Malformed line skipped: MIRAMAR
Malformed line skipped: