In [3]:
#importing required libraries 
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from torch import optim 
from conllu import parse
import numpy as np


In [4]:
train_file = 'en_atis-ud-train.conllu'
test_file = 'en_atis-ud-test.conllu'
dev_file = 'en_atis-ud-dev.conllu'


In [5]:
raw_datasets = {}
raw_datasets['train'] = parse(open(train_file, 'r').read())
raw_datasets['dev'] = parse(open(dev_file, 'r').read())
raw_datasets['test'] = parse(open(test_file, 'r').read())
datasets = {}
datasets['test'] = []
datasets['train'] = []



In [6]:

word2idx = {}
tag2idx = {}
train_loader = []
dev_loader = []
test_loader = []


for sentence in raw_datasets['train']:
    sentence_list =[[],[]]
    for token in sentence:  
        if token['upostag'] not in tag2idx:
            tag2idx.update({token['upostag']: len(tag2idx)})
        if token['form'] not in word2idx:
            word2idx.update({token['form']: len(word2idx)})
        sentence_list[0].append(word2idx[token['form']])
        sentence_list[1].append(tag2idx[token['upos']])
    datasets['train'].append(sentence_list)

for sentence in datasets['train']:
    train_loader.append((torch.tensor(sentence[0], dtype=torch.long), torch.tensor(sentence[1], dtype=torch.long)))


for sentence in raw_datasets['dev']:
    sentence_list =[[],[]]
    for token in sentence:  
        if token['upostag'] not in tag2idx:
            tag2idx.update({token['upostag']: len(tag2idx)})
        if token['form'] not in word2idx:
            word2idx.update({token['form']: len(word2idx)})
        sentence_list[0].append(word2idx[token['form']])
        sentence_list[1].append(tag2idx[token['upos']])
    datasets['dev'].append(sentence_list)

for sentence in raw_datasets['test']:
    sentence_list =[[],[]]
    for token in sentence:  
        if token['upostag'] not in tag2idx:
            tag2idx.update({token['upostag']: len(tag2idx)})
        if token['form'] not in word2idx:
            word2idx.update({token['form']: len(word2idx)})
        sentence_list[0].append(word2idx[token['form']])
        sentence_list[1].append(tag2idx[token['upos']])
    datasets['test'].append(sentence_list)

for sentence in datasets['dev']:
    dev_loader.append((torch.tensor(sentence[0], dtype=torch.long), torch.tensor(sentence[1], dtype=torch.long)))

for sentence in datasets['test']:
    test_loader.append((torch.tensor(sentence[0], dtype=torch.long), torch.tensor(sentence[1], dtype=torch.long)))




KeyError: 'dev'

In [7]:

class POS_tagger(nn.Module):
    def __init__(self, embedding_dim , hidden_dim , vocab_size , tagset_size ):
        super(POS_tagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
       
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
        


In [8]:
# mode1 = nn.Sequential(nn.Embedding(len(word2idx), 100), nn.LSTM(100, 100), nn.Linear(100, len(tag2idx)), nn.LogSoftmax(dim=1))
model = POS_tagger(100, 100, len(word2idx), len(tag2idx))
criterion = nn.NLLLoss()
optimizer = optim.Adamax(model.parameters(), lr=0.001)



In [9]:
model.train()
EPOCHS = 50 
STEPS = 0 
best_accuracy = 0
RUNNING_LOSS = 0
PRINT_EVERY = 1000
for epoch in range(EPOCHS):
    for sentence, tags in train_loader:
        STEPS += 1
        optimizer.zero_grad()
        tag_scores = model(sentence)
        loss = criterion(tag_scores, tags)
        loss.backward()
        optimizer.step()
        RUNNING_LOSS += loss.item()
        if STEPS % PRINT_EVERY == 0:
            model.eval()
            test_loss = 0
            accuracy = 0
            with torch.no_grad():
                for sentence, tags in dev_loader:
                    tag_scores = model(sentence)
                    test_loss += criterion(tag_scores, tags)
                    ps = torch.exp(tag_scores)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == tags.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor))
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                torch.save(model.state_dict(), 'pos_tagger.pth')
                print(f"Epoch {epoch+1}/{EPOCHS} "
                  f"Train loss: {RUNNING_LOSS/PRINT_EVERY:.3f}.. "
                  f"Test loss: {test_loss/len(dev_loader):.3f}.. "
                  f"Test accuracy: {accuracy/len(dev_loader):.3f}")
            
            model.train()
            RUNNING_LOSS = 0

KeyboardInterrupt: 

In [None]:
model.load_state_dict(torch.load('pos_tagger.pth'))
model.eval()
accuracy = 0

for sentence, tags in test_loader:
    tag_scores = model(sentence)
    ps = torch.exp(tag_scores)
    top_p, top_class = ps.topk(1, dim=1)
    equals = top_class == tags.view(*top_class.shape)
    accuracy += torch.mean(equals.type(torch.FloatTensor))
print(f"Test accuracy: {accuracy/len(test_loader):.3f}")

Test accuracy: 0.971
