## Building Deep Learning Models for Named Entity Recognition (NER) using PyTorch
#### Time to run - Approximately 20 mins on GPU

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.init as init
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence,pad_sequence
import numpy as np
import torch.optim.lr_scheduler as lr_scheduler
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

The function `make_sentences` reads the data and groups all words and labels of each sentence together. It returns a list with one tuple per sentence, in the structure `[(words, labels), (words, labels), ... , (words, labels)]`, where `words` and `labels` are parallel lists.


In [None]:
def make_sentences(file_name):
    """Read a token-per-line file and group its tokens into sentences.

    Every non-blank line must hold three whitespace-separated fields
    (index, word, label). Blank or whitespace-only lines separate sentences.

    Args:
        file_name: path of the file to read.

    Returns:
        A list with one ``(words, labels)`` tuple per sentence, where
        ``words`` and ``labels`` are parallel lists of strings. Empty
        sentences (consecutive or trailing blank lines) are never emitted.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
    """
    train_data = []
    words = []
    labels = []
    with open(file_name,'r') as trainFile:
        for line in trainFile:
            fields = line.split()
            if not fields:
                # Sentence boundary: only flush when something was collected,
                # so repeated blank lines do not create empty sentences.
                if words:
                    train_data.append((words, labels))
                    words = []
                    labels = []
                continue
            _index, word, label = fields
            words.append(word)
            labels.append(label)
    # Flush the last sentence when the file does not end with a blank line.
    if words:
        train_data.append((words, labels))
    return train_data



The function `get_word_count` returns two dictionaries: `count_dict` and `label_dict`. `count_dict` stores the count for every word, which is then used to remove words with occurrences less than a threshold value. `label_dict` is used only to get all unique labels.


In [None]:
def get_word_count(data):
    """Count word occurrences and collect the distinct labels of a dataset.

    Args:
        data: iterable of ``(words, labels)`` pairs, one per sentence.

    Returns:
        ``(count_dict, label_dict)``: ``count_dict`` maps each word to the
        number of times it occurs; ``label_dict`` maps each distinct label
        to 1, with keys in order of first appearance.
    """
    count_dict = {}
    label_dict = {}
    for sentence_words, sentence_labels in data:
        for token in sentence_words:
            count_dict[token] = count_dict.get(token, 0) + 1
        for tag in sentence_labels:
            # Only the key matters; the value stays 1 for every label.
            label_dict.setdefault(tag, 1)
    return count_dict,label_dict




The function `get_word_embeddings` returns two dictionaries: `word_to_id` and `id_to_word`. `word_to_id` stores the index for every unique word, which is used for creating the vector that is fed into the model. `id_to_word` is used to retrieve the word from the model's prediction. I also define an `UNK_TOKEN` and a `PAD_TOKEN` index.


In [None]:
def get_word_embeddings(count_dict,UNK_TOKEN,PAD_TOKEN,threshold):
    """Build the word <-> index vocabularies.

    Args:
        count_dict: mapping from word to its occurrence count.
        UNK_TOKEN: token for unknown words; always gets index 0.
        PAD_TOKEN: padding token; gets the index after the last kept word.
        threshold: a word is kept only if its count is strictly greater
            than this value.

    Returns:
        ``(word_to_id, id_to_word)``, two mutually inverse dictionaries.
    """
    word_to_id = {UNK_TOKEN: 0}
    id_to_word = {0: UNK_TOKEN}

    for token in count_dict.keys():
        if token in word_to_id:
            continue
        if not count_dict[token] > threshold:
            continue
        next_id = len(word_to_id)
        id_to_word[next_id] = token
        word_to_id[token] = next_id

    # The padding token is added last.
    pad_id = len(word_to_id)
    id_to_word[pad_id] = PAD_TOKEN
    word_to_id[PAD_TOKEN] = pad_id

    return word_to_id,id_to_word




The function `get_label_embeddings` performs a similar task as above. Instead of words, the inputs here are labels.


In [None]:
def get_label_embeddings(label_dict,PAD_TOKEN):
    """Build the label <-> index dictionaries.

    Args:
        label_dict: mapping whose keys are the distinct labels.
        PAD_TOKEN: padding label; gets the index after the last real label.

    Returns:
        ``(label_to_id, id_to_label)``. Labels are numbered from 0 in the
        iteration order of ``label_dict``.
    """
    label_to_id = {}
    id_to_label = {}
    for idx, tag in enumerate(label_dict.keys()):
        id_to_label[idx] = tag
        label_to_id[tag] = idx
    pad_id = len(label_to_id)
    id_to_label[pad_id] = PAD_TOKEN
    label_to_id[PAD_TOKEN] = pad_id
    return label_to_id, id_to_label



I create the training data and initialize the tokens below. A threshold of 1 means that only words occurring more than once are kept in the vocabulary. Out of all the models tried, the one with a threshold of 1 gave the best performance.


In [None]:
# Special vocabulary tokens and the rare-word cutoff: a word is kept only if
# it occurs more than `threshold` times.
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
threshold = 1

# Dataset locations; the training split is parsed into sentences right away.
train_file = '/kaggle/input/dataset/data/train'
valid_file = '/kaggle/input/dataset/data/dev'
train_data = make_sentences(train_file)

Creating dictionaries using the functions defined above.

In [None]:
# Start from empty lookup tables ...
count_dict, label_dict = {}, {}
word_to_id, id_to_word = {}, {}
label_to_id, id_to_label = {}, {}

# ... then fill them from the training sentences: word counts and label set
# first, followed by the word and label index mappings.
count_dict, label_dict = get_word_count(train_data)
word_to_id, id_to_word = get_word_embeddings(
    count_dict, UNK_TOKEN, PAD_TOKEN, threshold
)
label_to_id, id_to_label = get_label_embeddings(label_dict, PAD_TOKEN)



The function `build_train_dataset` returns the training dataset ready to be fed into the model. It maps every word in the vocabulary to its index. Words that are not in the vocabulary (because they occur no more often than the threshold of 1, or were never seen in training) are mapped to index 0, the `<UNK>` token. Along with the words and labels, I also pass the real word list for each sentence, which is used to write files during prediction.


In [None]:
def build_train_dataset(train_data,word_to_id,label_to_id):
    """Convert sentences into index tensors.

    Args:
        train_data: iterable of ``(words, labels)`` pairs.
        word_to_id: word -> index mapping; words missing from it are
            encoded with the index stored under ``'<UNK>'``.
        label_to_id: label -> index mapping.

    Returns:
        A list with one ``(word_ids, label_ids, words)`` tuple per sentence:
        two ``torch.LongTensor`` objects and a fresh list of the original
        word strings.
    """
    def encode(sentence_words, sentence_labels):
        token_ids = [
            word_to_id[w] if w in word_to_id.keys() else word_to_id['<UNK>']
            for w in sentence_words
        ]
        tag_ids = [label_to_id[t] for t in sentence_labels]
        return torch.LongTensor(token_ids), torch.LongTensor(tag_ids), list(sentence_words)

    return [encode(ws, ls) for ws, ls in train_data]


The `collate_fn` function is passed to the DataLoaders. For every batch, it pads the input words and labels to the length of the longest sequence in the batch, using the padding index as the padding value.


In [None]:
def collate_fn(batch):
    """Pad a batch of ``(word_ids, label_ids, words)`` samples.

    Reads the module-level ``word_to_id``, ``label_to_id``, ``PAD_TOKEN`` and
    ``device``.

    Args:
        batch: list of samples as produced by ``build_train_dataset``.

    Returns:
        ``(padded_inputs, length, padded_targets, realword)``: the padded
        tensors are moved to ``device``; ``length`` is an int32 tensor of
        the unpadded lengths; ``realword`` is the list of word lists.
    """
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    realword = [item[2] for item in batch]
    length = [len(item[0]) for item in batch]
    # Look up the padding ids instead of hard-coding them (the label padding
    # was the literal 9, which is only right when there are exactly nine labels).
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=word_to_id[PAD_TOKEN])
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=label_to_id[PAD_TOKEN])
    length = torch.tensor(length, dtype=torch.int32)
    padded_inputs = padded_inputs.to(device)
    padded_targets = padded_targets.to(device)
    return padded_inputs, length, padded_targets,realword


In [None]:
# Index the training sentences and serve them in padded batches of 8;
# an incomplete final batch is dropped.
batch_size = 8
train_dataset = build_train_dataset(train_data, word_to_id, label_to_id)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    drop_last=True,
    collate_fn=collate_fn,
)



Below is the bidirectional LSTM model. The architecture of the model is as follows: Embedding → BLSTM → Linear → ELU → classifier. The parameters are defined as:
- Embedding dimension: 100
- Number of LSTM layers: 1
- LSTM hidden dimension: 256
- LSTM dropout: 0.33
- Linear output dimension: 128

The only additional feature in the model is weight initialization. After trying various methods, Xavier initialization gave the best results.


In [None]:
torch.manual_seed(1233)
class BiLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> dropout -> Linear(128) -> ELU -> classifier.

    The padding index of the embedding is read from the module-level
    ``word_to_id[PAD_TOKEN]``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        """Create the layers and initialise the linear and LSTM weights.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: hidden size of the LSTM, per direction.
            output_dim: number of output classes.
            dropout: dropout probability applied to the LSTM outputs.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx = word_to_id[PAD_TOKEN])
        self.blstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True)
        # Honour the constructor argument; it used to be ignored in favour of
        # a hard-coded 0.33.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim * 2, 128)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(128, output_dim)
        # initialize weights of linear layer
        init.xavier_uniform_(self.linear.weight)
        init.zeros_(self.linear.bias)

        # initialize weights of lstm layer
        for name, param in self.blstm.named_parameters():
            if 'weight' in name:
                init.xavier_uniform_(param)
            elif 'bias' in name:
                init.zeros_(param)


    def forward(self, text, lengths):
        """Return per-token class scores.

        Args:
            text: batch-first tensor of word indices.
            lengths: unpadded length of every sequence in ``text``.

        Returns:
            The classifier output for every position of the re-padded
            LSTM output; its last dimension is ``output_dim``.
        """
        embedded = self.embedding(text)
        # Pack so the LSTM does not run over padding positions.
        s = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        outputs, _ = self.blstm(s)
        s, _ = pad_packed_sequence(outputs, batch_first=True)
        s = self.dropout(s)
        linear_output = self.linear(s)
        activation_output = self.activation(linear_output)
        prediction = self.classifier(activation_output)

        return prediction


In [None]:
#Defining the parameters for the model
embedding_dim = 100
hidden_dim = 256
dropout = 0.33
blstm1 = BiLSTM(len(word_to_id), embedding_dim, hidden_dim, len(label_to_id), dropout)
# Ignore padded target positions in the loss. The padding label id is looked
# up rather than hard-coded as 9, which only holds for exactly nine labels.
criterion = nn.CrossEntropyLoss(ignore_index = label_to_id[PAD_TOKEN])
optimizer = torch.optim.SGD(blstm1.parameters(),lr = 0.9)
# Halve the learning rate every 14 epochs.
scheduler = lr_scheduler.StepLR(optimizer, step_size=14, gamma=0.5)
blstm1 = blstm1.to(device)
criterion = criterion.to(device)

In [None]:
#Training the model
num_epochs = 23
# Lowest mean epoch loss seen so far. It has to be initialised once, before
# the epoch loop: resetting it inside the loop made the checkpoint condition
# below always true, so every epoch overwrote the saved model.
pre_loss = float('inf')
for epoch in range(num_epochs):
    train_loss = 0.0
    blstm1.train()

    for batch in train_loader:

        inputs,lengths, labels,realword = batch
        optimizer.zero_grad()
        outputs = blstm1(inputs, lengths)
        # Flatten predictions and targets to one row per token for the loss.
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.view(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Save the model only if the mean training loss of this epoch is lower
    # than the best loss of all previous epochs.
    train_loss = train_loss/ len(train_loader)
    if train_loss < pre_loss:
        pre_loss = train_loss
        torch.save(blstm1.state_dict(), 'blstm1.pt')

    scheduler.step()
    print("Epoch: ",epoch+1, " Loss: ",train_loss)


Epoch:  1  Loss:  0.5002880737190948
Epoch:  2  Loss:  0.2504637408752971
Epoch:  3  Loss:  0.16017735704259517
Epoch:  4  Loss:  0.11480645833717183
Epoch:  5  Loss:  0.08844368781250983
Epoch:  6  Loss:  0.07152367537546
Epoch:  7  Loss:  0.05886797258144254
Epoch:  8  Loss:  0.05100310852508245
Epoch:  9  Loss:  0.042981528808318605
Epoch:  10  Loss:  0.03630846395481501
Epoch:  11  Loss:  0.036669236550931766
Epoch:  12  Loss:  0.029804354210239473
Epoch:  13  Loss:  0.027534459578828087
Epoch:  14  Loss:  0.02313673351439886
Epoch:  15  Loss:  0.017273899331148403
Epoch:  16  Loss:  0.013459274439771764
Epoch:  17  Loss:  0.012160614638177587
Epoch:  18  Loss:  0.009805573573226641
Epoch:  19  Loss:  0.00961612596136959
Epoch:  20  Loss:  0.009301256132147709
Epoch:  21  Loss:  0.008650588321013461
Epoch:  22  Loss:  0.0075916284958447745
Epoch:  23  Loss:  0.007192347489208993


In [None]:
#loading the model parameters
# map_location remaps the saved tensors onto the active device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
blstm1.load_state_dict(torch.load('blstm1.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Parse the dev split, index it with the training vocabularies and serve it
# one sentence per batch for evaluation.
valid_data = make_sentences(valid_file)
valid_dataset = build_train_dataset(valid_data, word_to_id, label_to_id)
valid_loader = DataLoader(
    valid_dataset,
    collate_fn=collate_fn,
    batch_size=1,
)

The code below generates the file used for evaluation. I use the official evaluation script `conll03eval` to evaluate the results of the model.


In [None]:
# Write one "index word gold predicted" line per token, with a blank line
# after every sentence. Each batch holds a single sentence, hence the [0].
with open('/kaggle/working/dev1-perl.txt','w') as f, torch.no_grad():
    blstm1.eval()
    for sent, length, tags, realword in valid_loader:
        gold_ids = tags.tolist()[0]
        pred_ids = blstm1(sent, length).argmax(-1).tolist()[0]
        rows = zip(realword[0], gold_ids, pred_ids)
        for i, (sent_ele, tag_ele, pred_ele) in enumerate(rows, start=1):
            f.write(f"{i} {sent_ele} {id_to_label[tag_ele]} {id_to_label[pred_ele]}\n")
        f.write("\n")


The following code generates the .out file for Named Entity Recognition on the dev data.


In [None]:
# Write one "index word predicted" line per token, with a blank line after
# every sentence. Each batch holds a single sentence, hence the [0].
with open('/kaggle/working/dev1.out','w') as f, torch.no_grad():
    blstm1.eval()
    for sent, length, tags, realword in valid_loader:
        pred_ids = blstm1(sent, length).argmax(-1).tolist()[0]
        for i, (sent_ele, pred_ele) in enumerate(zip(realword[0], pred_ids), start=1):
            f.write(f"{i} {sent_ele} {id_to_label[pred_ele]}\n")
        f.write("\n")


In [None]:
# creating sentences from the test file data
# Every non-blank line holds "index word"; blank lines separate sentences.
test_file = '/kaggle/input/dataset/data/test'
test_data = []
with open(test_file,'r') as testFile:
    words = []
    for line in testFile:
        fields = line.split()
        if not fields:
            # Sentence boundary: skip empty sentences from repeated blank lines.
            if words:
                test_data.append(words)
                words = []
            continue
        index, word = fields
        words.append(word)
    # Flush the final sentence. This used to append `[word]`, i.e. only the
    # last token read, instead of the collected sentence.
    if words:
        test_data.append(words)


In [None]:
# Turn every test sentence into (word index tensor, original words); words
# outside the vocabulary are encoded with the '<UNK>' index.
test_dataset = []
for words in test_data:
    token_ids = [
        word_to_id[w] if w in word_to_id.keys() else word_to_id['<UNK>']
        for w in words
    ]
    test_dataset.append((torch.LongTensor(token_ids), list(words)))


Below is the collate function for the test data. The only difference is that there are no labels; everything else is the same as above.

In [None]:
def collate_fn_test(batch):
    """Pad a batch of unlabeled ``(word_ids, words)`` samples.

    Reads the module-level ``word_to_id`` and ``device``.

    Returns:
        ``(padded_inputs, length, realword)``: the padded index tensor moved
        to ``device``, an int32 tensor of the unpadded lengths, and the list
        of word lists.
    """
    token_tensors = [sample[0] for sample in batch]
    sentences = [sample[1] for sample in batch]
    sizes = [len(sample[0]) for sample in batch]
    # Inputs are padded with the last vocabulary index.
    pad_value = len(word_to_id) - 1
    padded = pad_sequence(token_tensors, batch_first=True, padding_value=pad_value).to(device)
    return padded, torch.Tensor(sizes).int(), sentences
# Serve the test sentences one per batch through the unlabeled collate function.
test_loader = DataLoader(
    test_dataset,
    collate_fn=collate_fn_test,
    batch_size=1,
)



The following snippet writes the required test file

In [None]:
# Write one "index word predicted" line per test token, with a blank line
# after every sentence. Each batch holds a single sentence, hence the [0].
with open('/kaggle/working/test1.out','w') as f, torch.no_grad():
    blstm1.eval()
    for sent, length, realword in test_loader:
        pred_ids = blstm1(sent, length).argmax(-1).tolist()[0]
        for i, (sent_ele, pred_ele) in enumerate(zip(realword[0], pred_ids), start=1):
            f.write(f"{i} {sent_ele} {id_to_label[pred_ele]}\n")
        f.write("\n")


## In conclusion
For task 1, after trying a lot of different hyperparameters I got an F1 score of 77% on the dev data. The precision is 80% and recall is about 74%

### TASK 2 - Using GloVe word embeddings to improve BLSTM

In [None]:
# Special vocabulary tokens and the rare-word cutoff: a word is kept only if
# it occurs more than `threshold` times.
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
threshold = 1

# Dataset locations; the training split is parsed again for task 2.
train_file = '/kaggle/input/dataset/data/train'
valid_file = '/kaggle/input/dataset/data/dev'
train_data = make_sentences(train_file)

In [None]:
# Reset the lookup tables ...
count_dict, label_dict = {}, {}
word_to_id, id_to_word = {}, {}
label_to_id, id_to_label = {}, {}

# ... and rebuild them from the training sentences: word counts and label set
# first, followed by the word and label index mappings.
count_dict, label_dict = get_word_count(train_data)
word_to_id, id_to_word = get_word_embeddings(
    count_dict, UNK_TOKEN, PAD_TOKEN, threshold
)
label_to_id, id_to_label = get_label_embeddings(label_dict, PAD_TOKEN)


The code below creates a dictionary called `textEmbedding_glove`. It has the word as its key and the GloVe embedding as its value.

In [None]:
# Map every GloVe word to its embedding vector. Each line of the file is the
# word followed by its whitespace-separated vector components.
textEmbedding_glove = {}
# The encoding is given explicitly so that decoding the file does not depend
# on the platform default.
with open('/kaggle/input/gloveembeddings/glove.6B.100d.txt','r', encoding='utf-8') as gloveFile:
    for line in gloveFile:
        # Split once and reuse the pieces (the line used to be split twice).
        parts = line.split()
        word = parts[0]
        embedding = np.array(parts[1:]).astype('float64')
        textEmbedding_glove[word] = embedding


The following code builds `gloveValues`, which we use as the initial embedding weights of the model. For every vocabulary word I check whether its lowercased form is covered by GloVe. If it is, I use the vector of the exact word when GloVe contains it, and the vector of the lowercased word otherwise. For any word whose lowercased form is not covered by GloVe, I initialize the values randomly and uniformly between -0.01 and 0.01. The embedding size is 100, as mentioned in the assignment.


In [None]:
np.random.seed(129)
# One 100-dimensional row per vocabulary entry, in word_to_id order.
glove_rows = []
for word in word_to_id.keys():
    lowered = word.lower()
    if lowered not in textEmbedding_glove:
        # No GloVe coverage: small random vector.
        glove_rows.append(np.random.uniform(low=-0.01, high=0.01, size=100))
        continue
    # Prefer the exact spelling, otherwise use the lowercased entry.
    if word in textEmbedding_glove:
        glove_rows.append(textEmbedding_glove[word])
    else:
        glove_rows.append(textEmbedding_glove[lowered])

gloveValues = np.array(glove_rows)

The following code is used to build the training dataset. It also handles capitalization. I have introduced a list called `mask`, which holds one value per word: 1 if the word contains at least one uppercase letter and 0 otherwise. I pass this mask list to the model.


In [None]:
def build_train_dataset_task2(train_data,word_to_id,label_to_id):
    """Convert sentences into index tensors plus a capitalization mask.

    Args:
        train_data: iterable of ``(words, labels)`` pairs.
        word_to_id: word -> index mapping; words missing from it are
            encoded with the index stored under ``'<UNK>'``.
        label_to_id: label -> index mapping.

    Returns:
        A list with one ``(word_ids, label_ids, words, mask)`` tuple per
        sentence. ``mask`` is a ``torch.LongTensor`` holding 1 for every
        word that contains at least one uppercase letter and 0 otherwise.
    """
    def encode(sentence_words, sentence_labels):
        caps = [int(any(ch.isupper() for ch in w)) for w in sentence_words]
        token_ids = [
            word_to_id[w] if w in word_to_id.keys() else word_to_id['<UNK>']
            for w in sentence_words
        ]
        tag_ids = [label_to_id[t] for t in sentence_labels]
        return (
            torch.LongTensor(token_ids),
            torch.LongTensor(tag_ids),
            list(sentence_words),
            torch.LongTensor(caps),
        )

    return [encode(ws, ls) for ws, ls in train_data]


The following is the collate function for Task 2. The only difference here is the addition of mask values. I also pad the values with 0 according to the longest sequence in the batch.


In [None]:
def collate_fn_task2(batch):
    """Pad a batch of ``(word_ids, label_ids, words, mask)`` samples.

    Reads the module-level ``word_to_id``, ``label_to_id``, ``PAD_TOKEN`` and
    ``device``.

    Args:
        batch: list of samples as produced by ``build_train_dataset_task2``.

    Returns:
        ``(padded_inputs, length, padded_targets, realword, padded_mask)``:
        the padded tensors are moved to ``device``; ``length`` is an int32
        tensor of the unpadded lengths; ``realword`` is the list of word
        lists; the mask is padded with 0.
    """
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    realword = [item[2] for item in batch]
    mask = [item[3] for item in batch]
    length = [len(item[0]) for item in batch]
    # Look up the padding ids instead of hard-coding them (the label padding
    # was the literal 9, which is only right when there are exactly nine labels).
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=word_to_id[PAD_TOKEN])
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=label_to_id[PAD_TOKEN])
    padded_mask =  pad_sequence(mask, batch_first=True, padding_value=0)
    length = torch.tensor(length, dtype=torch.int32)
    padded_inputs = padded_inputs.to(device)
    padded_targets = padded_targets.to(device)
    padded_mask = padded_mask.to(device)
    return padded_inputs, length, padded_targets,realword,padded_mask


In [None]:
# Training data to be fed into the model: indexed sentences with their
# capitalization masks, served in padded batches of 8 (an incomplete final
# batch is dropped).
batch_size = 8
train_dataset = build_train_dataset_task2(train_data, word_to_id, label_to_id)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    drop_last=True,
    collate_fn=collate_fn_task2,
)

The model architecture is the same as in Task 1. The only difference here is the handling of capitalization.


In [None]:
torch.manual_seed(1234)
class NewBiLSTM(nn.Module):
    """BiLSTM tagger with GloVe-initialised embeddings and a capitalization feature.

    Each word embedding is extended by one extra feature (the capitalization
    mask) before it enters the LSTM. The padding index is read from the
    module-level ``word_to_id[PAD_TOKEN]`` and the initial embedding weights
    from the module-level ``gloveValues``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        """Create the layers and copy ``gloveValues`` into the embedding.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: hidden size of the LSTM, per direction.
            output_dim: number of output classes.
            dropout: dropout probability applied to the LSTM outputs.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx = word_to_id[PAD_TOKEN])
        # +1 input feature for the capitalization mask.
        self.blstm = nn.LSTM(embedding_dim+1, hidden_dim, num_layers=1, bidirectional=True)
        self.embedding.weight.data.copy_(torch.from_numpy(gloveValues))
        # Honour the constructor argument; it used to be ignored in favour of
        # a hard-coded 0.33.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim * 2, 128)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(128, output_dim)



    def forward(self, text, lengths,masks):
        """Return per-token class scores.

        Args:
            text: batch-first tensor of word indices.
            lengths: unpadded length of every sequence in ``text``.
            masks: tensor shaped like ``text`` with the capitalization flag
                of every word.

        Returns:
            The classifier output for every position of the re-padded
            LSTM output; its last dimension is ``output_dim``.
        """
        embedded = self.embedding(text)
        # Cast the integer mask to the embedding dtype explicitly instead of
        # relying on implicit type promotion inside torch.cat.
        mask = torch.unsqueeze(masks, dim=2).to(embedded.dtype)
        embednew =  torch.cat((embedded,mask),dim=2)
        s = pack_padded_sequence(embednew, lengths, batch_first=True, enforce_sorted=False)
        outputs, _ = self.blstm(s)
        s, _ = pad_packed_sequence(outputs, batch_first=True)
        s = self.dropout(s)
        linear_output = self.linear(s)
        activation_output = self.activation(linear_output)
        prediction = self.classifier(activation_output)

        return prediction


In [None]:
#Initializing the parameters
embedding_dim = 100
hidden_dim = 256
dropout = 0.33
blstm2 = NewBiLSTM(len(word_to_id), embedding_dim, hidden_dim, len(label_to_id), dropout)
# Ignore padded target positions in the loss. The padding label id is looked
# up rather than hard-coded as 9, which only holds for exactly nine labels.
criterion = nn.CrossEntropyLoss(ignore_index = label_to_id[PAD_TOKEN])
optimizer = torch.optim.SGD(blstm2.parameters(),lr = 0.1)
# Halve the learning rate every 14 epochs.
scheduler1 = lr_scheduler.StepLR(optimizer, step_size=14, gamma=0.5)
blstm2 = blstm2.to(device)
criterion = criterion.to(device)

In [None]:
#Training the model
num_epochs = 86

# Lowest mean epoch loss seen so far. It has to be initialised once, before
# the epoch loop: resetting it inside the loop made the checkpoint condition
# below always true, so every epoch overwrote the saved model.
pre_loss = float('inf')
for epoch in range(num_epochs):
    train_loss = 0.0
    blstm2.train()
    for batch in train_loader:

        inputs,lengths, labels,realword,mask = batch
        optimizer.zero_grad()
        outputs = blstm2(inputs, lengths,mask)
        # Flatten predictions and targets to one row per token for the loss.
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.view(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Save the model only if the mean training loss of this epoch is lower
    # than the best loss of all previous epochs.
    train_loss = train_loss/ len(train_loader)
    if train_loss < pre_loss:
        pre_loss = train_loss
        torch.save(blstm2.state_dict(), 'blstm2.pt')

    scheduler1.step()

    print("Epoch: ",epoch, " Loss: ",train_loss)


Epoch:  0  Loss:  0.39916226742078104
Epoch:  1  Loss:  0.187626430999241
Epoch:  2  Loss:  0.1325912398581522
Epoch:  3  Loss:  0.11153483425154666
Epoch:  4  Loss:  0.10042689911747307
Epoch:  5  Loss:  0.09180080379118016
Epoch:  6  Loss:  0.08543492293931187
Epoch:  7  Loss:  0.07964268960933824
Epoch:  8  Loss:  0.07601772577784512
Epoch:  9  Loss:  0.0717917988959555
Epoch:  10  Loss:  0.06759736507862396
Epoch:  11  Loss:  0.06507494565735411
Epoch:  12  Loss:  0.061725664948927735
Epoch:  13  Loss:  0.058625030751644866
Epoch:  14  Loss:  0.0556737042939451
Epoch:  15  Loss:  0.05342360034658605
Epoch:  16  Loss:  0.05168309015194484
Epoch:  17  Loss:  0.05097128996161
Epoch:  18  Loss:  0.0494417325696187
Epoch:  19  Loss:  0.04848483345842913
Epoch:  20  Loss:  0.04742912489016499
Epoch:  21  Loss:  0.04647199210288725
Epoch:  22  Loss:  0.04517862374541777
Epoch:  23  Loss:  0.044052932706180416
Epoch:  24  Loss:  0.04293764054552892
Epoch:  25  Loss:  0.04193526163328558
Ep

In [None]:
#Loading the model parameters
# map_location remaps the saved tensors onto the active device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
blstm2.load_state_dict(torch.load('blstm2.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Parse the dev split, index it (with capitalization masks) and serve it one
# sentence per batch for evaluation.
valid_data = make_sentences(valid_file)
valid_dataset = build_train_dataset_task2(valid_data, word_to_id, label_to_id)
valid_loader = DataLoader(
    valid_dataset,
    collate_fn=collate_fn_task2,
    batch_size=1,
)

Following is the code to generate the output, to be evaluated by the perl script.

In [None]:
# Write one "index word gold predicted" line per token, with a blank line
# after every sentence. Each batch holds a single sentence, hence the [0].
# The unused counter `j` and the unused list copy of `sent` were removed.
with open('/kaggle/working/dev2-Perl.txt','w') as f:
    blstm2.eval()
    with torch.no_grad():
        for (sent, length,tags,realword,mask) in valid_loader:
            prediction = blstm2(sent,length,mask).argmax(-1).tolist()
            tags = tags.tolist()

            rows = zip(realword[0],tags[0],prediction[0])
            for i, (sent_ele,tag_ele,pred_ele) in enumerate(rows, start=1):
                f.write(f"{i} {sent_ele} {id_to_label[tag_ele]} {id_to_label[pred_ele]}\n")

            f.write("\n")


In [None]:
#Generates the required dev file output
# One "index word predicted" line per token, with a blank line after every
# sentence. Each batch holds a single sentence, hence the [0]. The unused
# counter `j` and the unused list copies of `sent` and `tags` were removed.
with open('/kaggle/working/dev2.out','w') as f:
    blstm2.eval()
    with torch.no_grad():
        for (sent, length,tags,realword,mask) in valid_loader:
            prediction = blstm2(sent,length,mask).argmax(-1).tolist()

            rows = zip(realword[0],prediction[0])
            for i, (sent_ele,pred_ele) in enumerate(rows, start=1):
                f.write(f"{i} {sent_ele} {id_to_label[pred_ele]}\n")

            f.write("\n")


Generating Predictions on the Test Data

In [None]:
#Creating data from the test file
# Every non-blank line holds "index word"; blank lines separate sentences.
test_file = '/kaggle/input/dataset/data/test'
test_data = []
with open(test_file,'r') as testFile:
    words = []
    for line in testFile:
        fields = line.split()
        if not fields:
            # Sentence boundary: skip empty sentences from repeated blank lines.
            if words:
                test_data.append(words)
                words = []
            continue
        index, word = fields
        words.append(word)
    # Flush the final sentence. This used to append `[word]`, i.e. only the
    # last token read, instead of the collected sentence.
    if words:
        test_data.append(words)


In [None]:
# Turn every test sentence into (word index tensor, original words,
# capitalization mask). Words outside the vocabulary are encoded with the
# '<UNK>' index; the mask is 1 for words containing an uppercase letter.
test_dataset = []
for words in test_data:
    caps = [int(any(ch.isupper() for ch in w)) for w in words]
    token_ids = [
        word_to_id[w] if w in word_to_id.keys() else word_to_id['<UNK>']
        for w in words
    ]
    test_dataset.append((torch.LongTensor(token_ids), list(words), torch.LongTensor(caps)))


In [None]:
#Collate function for the test data
def collate_fn_test2(batch):
    """Pad a batch of unlabeled ``(word_ids, words, mask)`` samples.

    Reads the module-level ``word_to_id`` and ``device``.

    Returns:
        ``(padded_inputs, length, realword, padded_masks)``: both padded
        tensors are moved to ``device``; ``length`` is an int32 tensor of
        the unpadded lengths; ``realword`` is the list of word lists.
    """
    token_tensors = [sample[0] for sample in batch]
    sentences = [sample[1] for sample in batch]
    cap_tensors = [sample[2] for sample in batch]
    sizes = [len(sample[0]) for sample in batch]
    # Inputs are padded with the last vocabulary index, masks with 0.
    pad_value = len(word_to_id) - 1
    padded = pad_sequence(token_tensors, batch_first=True, padding_value=pad_value)
    padded_caps = pad_sequence(cap_tensors, batch_first=True, padding_value=0)
    return padded.to(device), torch.Tensor(sizes).int(), sentences, padded_caps.to(device)
# Serve the test sentences one per batch through the task 2 test collate function.
test_loader = DataLoader(
    test_dataset,
    collate_fn=collate_fn_test2,
    batch_size=1,
)



In [None]:
#Creating the required test file output
# One "index word predicted" line per test token, with a blank line after
# every sentence. Each batch holds a single sentence, hence the [0].
with open('/kaggle/working/test2.out','w') as f, torch.no_grad():
    blstm2.eval()
    for sent, length, realword, mask in test_loader:
        pred_ids = blstm2(sent, length, mask).argmax(-1).tolist()[0]
        for i, (sent_ele, pred_ele) in enumerate(zip(realword[0], pred_ids), start=1):
            f.write(f"{i} {sent_ele} {id_to_label[pred_ele]}\n")
        f.write("\n")

In [None]:
!perl /kaggle/input/conll03/conll03eval.txt < /kaggle/working/dev2-Perl.txt

processed 51578 tokens with 5942 phrases; found: 5908 phrases; correct: 5118.
accuracy:  97.66%; precision:  86.63%; recall:  86.13%; FB1:  86.38
              LOC: precision:  91.70%; recall:  90.20%; FB1:  90.94  1807
             MISC: precision:  77.73%; recall:  81.78%; FB1:  79.70  970
              ORG: precision:  78.61%; recall:  79.49%; FB1:  79.05  1356
              PER: precision:  92.45%; recall:  89.09%; FB1:  90.74  1775


## In conclusion
I handle the capitalization criterion by creating a new mask list as described above. The model achieves an F1 score of 86.38%, a precision of 86.63%, and a recall of 86.13%.