In [69]:
import pandas as pd

train = pd.read_csv('train/train.tsv', sep='\t', names=['label', 'text'])
dev_x = pd.read_csv('dev-0/in.tsv', sep='\t', names=['text'])
dev_y = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['label'])

In [70]:
train['tokens'] = train['text'].apply(lambda x: x.split())
train['labels'] = train['label'].apply(lambda x: x.split())

dev_x['tokens'] = dev_x['text'].apply(lambda x: x.split())
dev_y['labels'] = dev_y['label'].apply(lambda x: x.split())

print("text:", train['tokens'].iloc[0])
print("labels:", train['labels'].iloc[0])

text: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.', '</S>', 'Peter', 'Blackburn', '</S>', 'BRUSSELS', '1996-08-22', '</S>', 'The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', '</S>', 'Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '</S>', '"', 'We', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'Commission', "'s", 'chief', 'spokesman', 'Nikolaus', 'van', 'der', 'Pas', 'told', 'a', 'ne

In [71]:
for text, labels in zip(train['tokens'], train['labels']):
    if len(text) != len(labels):
        print(f"Text and labels length mismatch: {len(text)} vs {len(labels)}")

In [72]:
def build_word_to_ix(tokens_list):
    word_to_ix = {"<PAD>": 0, "<UNK>": 1}
    for tokens in tokens_list:
        for token in tokens:
            if token not in word_to_ix:
                word_to_ix[token] = len(word_to_ix)
    return word_to_ix

def build_tag_to_ix(labels_list):
    tag_to_ix = {"<PAD>": 0}
    for labels in labels_list:
        for label in labels:
            if label not in tag_to_ix:
                tag_to_ix[label] = len(tag_to_ix)
    return tag_to_ix

word_to_ix = build_word_to_ix(train['tokens'])
tag_to_ix = build_tag_to_ix(train['labels'])

In [73]:
print("Word to index len:", len(word_to_ix))
print("Tag to index len:", len(tag_to_ix))

Word to index len: 23626
Tag to index len: 10


In [74]:
import torch

def prepare_sequence(seq, to_ix):
    word_tensor = []
    for word in seq:
        word_tensor.append(to_ix.get(word, 1))
    return word_tensor

In [75]:
import copy

def pad_sequences(tensors_list):
    padded_tensor = []
    mask_tensor = []
    
    longest = len(max(tensors_list, key=len))

    for tensor in tensors_list:
        tensor = copy.deepcopy(tensor)
        mask = [1 for number in tensor]
        while len(tensor) != longest:
            tensor.append(0)
            mask.append(0)
        padded_tensor.append(tensor)
        mask_tensor.append(mask)

    return torch.tensor(padded_tensor), torch.tensor(mask_tensor)


In [76]:
import torch.nn as nn

class LSTM(torch.nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=256):
        super(LSTM, self).__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
    
    def forward(self, input_tensor):
        embeds = self.emb(input_tensor)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = torch.log_softmax(tag_space, dim=2)
        return tag_scores

In [77]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, tokens_list, labels_list, word_to_ix, tag_to_ix):
        self.tokens_list = tokens_list
        self.labels_list = labels_list
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.tokens_list)

    def __getitem__(self, idx):
        tokens = self.tokens_list[idx]
        labels = self.labels_list[idx]

        token_ids = [self.word_to_ix.get(tok, self.word_to_ix["<UNK>"]) for tok in tokens]
        label_ids = [self.tag_to_ix[label] for label in labels]

        return token_ids, label_ids

In [78]:
def collate_fn(batch):
    token_batch, label_batch = zip(*batch)

    input_tensor, input_mask = pad_sequences(token_batch)
    label_tensor, _ = pad_sequences(label_batch)

    return input_tensor, label_tensor, input_mask

In [79]:
from torch.utils.data import DataLoader

dataset = NERDataset(train['tokens'], train['labels'], word_to_ix, tag_to_ix)
loader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

dev_dataset = NERDataset(dev_x['tokens'], dev_y['labels'], word_to_ix, tag_to_ix)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

In [93]:
model = LSTM(len(word_to_ix), len(tag_to_ix))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.NLLLoss(ignore_index=0)

In [81]:
from seqeval.metrics import f1_score
from collections import Counter

def evaluation():
    model.eval()

    all_preds = []  
    all_labels = []  

    with torch.no_grad():
        for input_tensor, label_tensor, input_mask in dev_loader:
            output = model(input_tensor)  
            preds = torch.argmax(output, dim=2)  


            for i in range(input_tensor.size(0)): 
                pred_seq = []
                label_seq = []
                for j in range(input_tensor.size(1)): 
                    if input_mask[i][j] == 0:
                        continue  # pomiń <PAD>
                    pred_tag = list(tag_to_ix.keys())[list(tag_to_ix.values()).index(preds[i][j].item())]
                    true_tag = list(tag_to_ix.keys())[list(tag_to_ix.values()).index(label_tensor[i][j].item())]
                    pred_seq.append(pred_tag)
                    label_seq.append(true_tag)

                all_preds.append(pred_seq)
                all_labels.append(label_seq)            
    print("F1 score: ", f1_score(all_labels, all_preds))

In [94]:
num_epoch = 25
for epoch in range(num_epoch):
    model.train()
    total_loss = 0.0
    for input_tensor, label_tensor, input_mask in loader:
        optimizer.zero_grad()

        output = model(input_tensor)

        output = output.view(-1, len(tag_to_ix))         
        label_tensor = label_tensor.view(-1)              

        loss = loss_fn(output, label_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} / {num_epoch},  Loss: {loss.item():.4f}")
    if (epoch+1) % 5 == 0:
        evaluation()

Epoch 1 / 25,  Loss: 1.1544
Epoch 2 / 25,  Loss: 0.8228
Epoch 3 / 25,  Loss: 0.5083
Epoch 4 / 25,  Loss: 0.3103
Epoch 5 / 25,  Loss: 0.1856
F1 score:  0.4423713511886849
Epoch 6 / 25,  Loss: 0.1156
Epoch 7 / 25,  Loss: 0.0735
Epoch 8 / 25,  Loss: 0.0498
Epoch 9 / 25,  Loss: 0.0370
Epoch 10 / 25,  Loss: 0.0277
F1 score:  0.5837044367474847
Epoch 11 / 25,  Loss: 0.0172
Epoch 12 / 25,  Loss: 0.0123
Epoch 13 / 25,  Loss: 0.0079
Epoch 14 / 25,  Loss: 0.0060
Epoch 15 / 25,  Loss: 0.0050
F1 score:  0.6313031048995473
Epoch 16 / 25,  Loss: 0.0043
Epoch 17 / 25,  Loss: 0.0043
Epoch 18 / 25,  Loss: 0.0033
Epoch 19 / 25,  Loss: 0.0029
Epoch 20 / 25,  Loss: 0.0024
F1 score:  0.6200197313500796
Epoch 21 / 25,  Loss: 0.0021
Epoch 22 / 25,  Loss: 0.0019
Epoch 23 / 25,  Loss: 0.0022
Epoch 24 / 25,  Loss: 0.0016
Epoch 25 / 25,  Loss: 0.0015
F1 score:  0.638929932343106
