In [30]:
#importing required libraries 
!pip install conllu
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from torch import optim 
from conllu import parse
import numpy as np
import random as rd 

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cpu


In [31]:
# CoNLL-U files for the train / dev / test splits, resolved relative to the
# notebook's working directory.
train_file, dev_file, test_file = (
    'en_atis-ud-train.conllu',
    'en_atis-ud-dev.conllu',
    'en_atis-ud-test.conllu',
)


In [32]:
def read_conllu(path):
    """Parse the CoNLL-U file at ``path`` and return the list of parsed sentences.

    The file is opened in a ``with`` block so the handle is closed as soon as the
    text has been read, and it is decoded as UTF-8 (the encoding the CoNLL-U
    format mandates) instead of the platform default.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return parse(handle.read())


# Parsed sentences for every split, keyed by split name.
raw_datasets = {
    'train': read_conllu(train_file),
    'dev': read_conllu(dev_file),
    'test': read_conllu(test_file),
}

# Index-encoded sentences for every split; filled in by the preprocessing cell.
datasets = {'test': [], 'train': [], 'dev': []}

In [33]:

# Word and tag vocabularies. Index 0 is reserved for unknown words so that every
# token unseen during training shares a single embedding.
word2idx = {'<UNK>': 0}
tag2idx = {}

# Seed the augmentation so re-running this cell reproduces the same data.
AUGMENT_SEED = 42
rd.seed(AUGMENT_SEED)


def mask_random_words(word_ids, unk_idx):
    """Return a randomly ``<UNK>``-masked copy of ``word_ids``, or ``None``.

    A number is drawn uniformly from 0..150 (inclusive):
      * below 30 (about 20% of the time): one random position is masked;
      * 30 to 49 (about 13% of the time): between 1 and ``len - 1`` random
        positions are drawn (with replacement) and masked;
      * otherwise no augmented copy is produced and ``None`` is returned.

    The input list is never modified. The original code masked the list in
    place, which also overwrote the clean sentence that had already been stored.
    """
    draw = rd.randint(0, 150)
    if draw >= 50 or not word_ids:
        return None

    last = len(word_ids) - 1
    masked = list(word_ids)
    if draw < 30:
        masked[rd.randint(0, last)] = unk_idx
    else:
        # max(1, ...) keeps the range valid for one-token sentences, where
        # randint(1, 0) would raise ValueError.
        n_draws = rd.randint(1, max(1, last))
        for _ in range(n_draws):
            masked[rd.randint(0, last)] = unk_idx
    return masked


def encode_split(sentences, grow_vocab):
    """Convert parsed sentences into ``[word_ids, tag_ids]`` pairs.

    Args:
        sentences: iterable of parsed sentences whose tokens expose ``'form'``
            and ``'upos'``.
        grow_vocab: when True, unseen word forms are added to ``word2idx``
            (training split). When False, unseen forms map to ``<UNK>`` so that
            dev/test words never receive an untrained embedding.

    Returns:
        A list of ``[word_ids, tag_ids]`` entries. Each sentence contributes its
        clean encoding and, at random, one additional ``<UNK>``-masked copy.
        As in the original notebook, the masking is applied to every split.

    Tags are always added to ``tag2idx`` so that every label can be encoded.
    """
    unk_idx = word2idx['<UNK>']
    encoded = []
    for sentence in sentences:
        word_ids, tag_ids = [], []
        for token in sentence:
            tag = token['upos']
            if tag not in tag2idx:
                tag2idx[tag] = len(tag2idx)
            form = token['form']
            if grow_vocab and form not in word2idx:
                word2idx[form] = len(word2idx)
            word_ids.append(word2idx.get(form, unk_idx))
            tag_ids.append(tag2idx[tag])

        encoded.append([word_ids, tag_ids])
        masked_ids = mask_random_words(word_ids, unk_idx)
        if masked_ids is not None:
            # Independent lists, so the clean entry above stays untouched.
            encoded.append([masked_ids, list(tag_ids)])
    return encoded


def to_tensor_pairs(encoded):
    """Turn ``[word_ids, tag_ids]`` entries into ``(LongTensor, LongTensor)`` tuples."""
    return [
        (torch.tensor(word_ids, dtype=torch.long), torch.tensor(tag_ids, dtype=torch.long))
        for word_ids, tag_ids in encoded
    ]


# Assign (rather than append) so that re-running the cell does not duplicate data.
# The training split must be encoded first: it is the only one that grows the vocabulary.
datasets['train'] = encode_split(raw_datasets['train'], grow_vocab=True)
datasets['dev'] = encode_split(raw_datasets['dev'], grow_vocab=False)
datasets['test'] = encode_split(raw_datasets['test'], grow_vocab=False)

train_loader = to_tensor_pairs(datasets['train'])
dev_loader = to_tensor_pairs(datasets['dev'])
test_loader = to_tensor_pairs(datasets['test'])

print(len(train_loader))

5726


In [14]:
# class textDataset(torch.utils.data.Dataset):
#     def __init__


class POS_tagger(nn.Module):
    """LSTM part-of-speech tagger: embedding -> LSTM -> linear -> log-softmax.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output tags.
    """

    def __init__(self, embedding_dim , hidden_dim , vocab_size , tagset_size ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Default nn.LSTM layout is (seq_len, batch, features).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentences):
        """Return per-token log-probabilities over the tag set.

        Args:
            sentences: LongTensor of word indices, either a single sentence of
                shape ``(seq_len,)`` or a batch of equal-length sentences of
                shape ``(batch_size, seq_len)``.

        Returns:
            ``(seq_len, tagset_size)`` for a single sentence, or
            ``(batch_size, seq_len, tagset_size)`` for a batch. The last
            dimension always holds log-probabilities. For a batch, flatten the
            first two dimensions (and the targets) before passing to ``NLLLoss``.

        Raises:
            ValueError: if ``sentences`` is neither 1-D nor 2-D.
        """
        if sentences.dim() not in (1, 2):
            raise ValueError(
                f"expected a 1-D or 2-D tensor of word indices, got {sentences.dim()}-D"
            )

        # Treat a single sentence as a batch of one so both cases share one code path.
        unbatched = sentences.dim() == 1
        if unbatched:
            sentences = sentences.unsqueeze(0)

        embeds = self.word_embeddings(sentences)             # (batch, seq_len, emb)
        lstm_out, _ = self.lstm(embeds.transpose(0, 1))      # (seq_len, batch, hidden)
        # Swap back to batch-first; the linear layer acts on the last dimension only,
        # so time steps and batch entries are never mixed together.
        tag_space = self.hidden2tag(lstm_out.transpose(0, 1))  # (batch, seq_len, tags)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores.squeeze(0) if unbatched else tag_scores
        


In [6]:
# Tagger with 512-dimensional embeddings feeding a 512-unit LSTM; the vocabulary
# and tag-set sizes come from the index maps built by the preprocessing cell.
model = POS_tagger(
    embedding_dim=512,
    hidden_dim=512,
    vocab_size=len(word2idx),
    tagset_size=len(tag2idx),
)
# NLLLoss consumes the log-probabilities produced by the tagger's log-softmax output.
criterion = nn.NLLLoss()
optimizer = optim.Adamax(model.parameters(), lr=0.001)


In [8]:
# Training configuration.
EPOCHS = 50
PRINT_EVERY = 100        # evaluate on the dev set every PRINT_EVERY optimisation steps
LR_DECAY_EVERY = 10      # halve the learning rate every LR_DECAY_EVERY epochs

# Running state.
STEPS = 0
best_accuracy = 0
RUNNING_LOSS = 0

model.train()
for epoch in range(EPOCHS):
    # Halve the learning rate in place. Re-creating the optimizer (as before) threw
    # away Adamax's running moments, and the `epoch % 10 == 0` test also fired at
    # epoch 0, so the configured learning rate was never actually used.
    if epoch > 0 and epoch % LR_DECAY_EVERY == 0:
        for group in optimizer.param_groups:
            group['lr'] /= 2

    for sentence, tags in train_loader:
        STEPS += 1
        optimizer.zero_grad()
        tag_scores = model(sentence)
        loss = criterion(tag_scores, tags)
        loss.backward()
        optimizer.step()
        RUNNING_LOSS += loss.item()

        if STEPS % PRINT_EVERY == 0:
            # Periodic evaluation on the dev split. The log line below labels
            # these numbers "Test", but they are computed on dev_loader.
            model.eval()
            test_loss = 0.0
            accuracy = 0.0
            with torch.no_grad():
                # Separate names so the outer loop's `sentence`/`tags` are not rebound.
                for dev_sentence, dev_tags in dev_loader:
                    dev_scores = model(dev_sentence)
                    test_loss += criterion(dev_scores, dev_tags).item()
                    # argmax over log-probabilities picks the same tag as top-1 of exp().
                    predicted = dev_scores.argmax(dim=1)
                    # Per-sentence token accuracy; summed here, averaged when printed.
                    accuracy += (predicted == dev_tags).float().mean().item()

            # Checkpoint and report only when the dev accuracy improves.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                torch.save(model.state_dict(), 'pos_tagger.pth')
                print(f"Epoch {epoch+1}/{EPOCHS} "
                  f"Train loss: {RUNNING_LOSS/PRINT_EVERY:.3f}.. "
                  f"Test loss: {test_loss/len(dev_loader):.3f}.. "
                  f"Test accuracy: {accuracy/len(dev_loader):.3f}")

            model.train()
            RUNNING_LOSS = 0

Epoch 1/50 Train loss: 0.602.. Test loss: 0.671.. Test accuracy: 0.834
Epoch 1/50 Train loss: 0.501.. Test loss: 0.629.. Test accuracy: 0.838
Epoch 1/50 Train loss: 0.615.. Test loss: 0.592.. Test accuracy: 0.859
Epoch 1/50 Train loss: 0.648.. Test loss: 0.545.. Test accuracy: 0.868
Epoch 1/50 Train loss: 0.621.. Test loss: 0.514.. Test accuracy: 0.877
Epoch 1/50 Train loss: 0.498.. Test loss: 0.463.. Test accuracy: 0.898
Epoch 1/50 Train loss: 0.451.. Test loss: 0.387.. Test accuracy: 0.910
Epoch 1/50 Train loss: 0.387.. Test loss: 0.350.. Test accuracy: 0.912
Epoch 1/50 Train loss: 0.474.. Test loss: 0.321.. Test accuracy: 0.920
Epoch 1/50 Train loss: 0.345.. Test loss: 0.310.. Test accuracy: 0.925
Epoch 1/50 Train loss: 0.192.. Test loss: 0.263.. Test accuracy: 0.929
Epoch 1/50 Train loss: 0.223.. Test loss: 0.254.. Test accuracy: 0.931
Epoch 1/50 Train loss: 0.386.. Test loss: 0.251.. Test accuracy: 0.937
Epoch 1/50 Train loss: 0.188.. Test loss: 0.227.. Test accuracy: 0.945
Epoch 

KeyboardInterrupt: 

In [None]:
# Restore the best checkpoint written by the training loop. map_location='cpu'
# lets a checkpoint saved on a GPU machine be read on a CPU-only one;
# load_state_dict then copies the values onto whatever device the model lives on.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load('pos_tagger.pth', map_location='cpu'))
model.eval()
accuracy = 0.0

# Evaluation needs no gradients; no_grad avoids building an autograd graph per sentence.
with torch.no_grad():
    for sentence, tags in test_loader:
        tag_scores = model(sentence)
        # argmax over log-probabilities picks the same tag as top-1 of exp().
        predicted = tag_scores.argmax(dim=1)
        # Per-sentence token accuracy; averaged over sentences below.
        accuracy += (predicted == tags).float().mean().item()
print(f"Test accuracy: {accuracy/len(test_loader):.3f}")

UnpicklingError: A load persistent id instruction was encountered,
but no persistent_load function was specified.

In [None]:
# Tag one example sentence with the trained model.
# (Named `sentence_text` / `words` so the builtin `input` is not shadowed.)
sentence_text = 'mary had a little lamb'
words = sentence_text.split()

# Map each word to its vocabulary index; out-of-vocabulary words become <UNK>.
unk_idx = word2idx['<UNK>']
input_new = torch.tensor([word2idx.get(word, unk_idx) for word in words], dtype=torch.long)
print(input_new)

model.eval()
with torch.no_grad():
    tag_scores = model(input_new)
print(tag_scores)

# Convert log-probabilities to probabilities and keep the k most likely tags per
# word (k is capped by the tag-set size so topk cannot fail on small tag sets).
ps = torch.exp(tag_scores)
top_p, top_class = ps.topk(min(5, ps.size(1)), dim=1)
print(top_p)

idxtag = {v: k for k, v in tag2idx.items()}
# Row i of top_class belongs to word i and column 0 is its most likely tag.
# The previous `top_class[0][i]` walked along the first word's top-k list instead,
# and raised IndexError for sentences longer than k words.
for i in range(len(top_class)):
    print(idxtag[top_class[i][0].item()])


tensor([ 0,  0,  6,  0,  0])
tensor([[ -1.2414,  -3.1266,  -2.3856,  -3.4544,  -3.6708,  -3.5160,
          -0.8663,  -4.7407,  -4.6933,  -5.5995,  -3.8823,  -4.8671,
          -4.0368,  -8.2040],
        [ -1.0688,  -2.1512,  -1.3650,  -3.2179,  -2.8714,  -3.8387,
          -2.1097,  -5.2123,  -4.3725,  -6.5846,  -4.6473,  -4.6300,
          -5.0260, -10.0728],
        [ -9.3179, -12.1338,  -0.0017,  -8.4055, -10.4911,  -7.1951,
          -8.5077,  -8.7644,  -8.8071, -13.6253,  -9.9282, -11.4025,
         -12.7595, -16.9372],
        [ -7.3976,  -8.1665,  -5.0877,  -0.1375,  -4.7784,  -2.4011,
          -6.5232,  -5.7348,  -4.2687,  -7.6875,  -6.4809,  -6.7223,
          -7.6439, -14.2419],
        [ -7.1834,  -6.3058,  -4.7755,  -2.3709,  -0.2588,  -4.0474,
          -2.4382,  -4.8498,  -5.7840,  -6.0946,  -6.7821,  -5.6120,
          -7.0031, -13.7856]])
tensor([[ 0.4205,  0.2890,  0.0920,  0.0439,  0.0316],
        [ 0.3434,  0.2554,  0.1213,  0.1163,  0.0566],
        [ 0.9983,  0

[(tensor([ 23, 134, 135,   2, 103,   8,   9,  10,  11,  12,  25,  15,  16,  17,
           18,  19,  20,  21]),
  tensor([0, 1, 6, 2, 8, 3, 4, 5, 4, 5, 6, 3, 7, 8, 9, 6, 3, 8])),
 (tensor([ 23, 212,   5,   8,   9, 308,  11, 335,  69, 277,  31,  32,  33, 217,
           35]),
  tensor([ 0,  6,  2,  3,  4,  5,  4,  5,  4,  6,  2, 10,  4,  7,  3])),
 (tensor([ 49,  50,   6,   7,  52,   9,  41,  11, 150,  25,  36,  63,  18,  19,
            2,  44,  56]),
  tensor([6, 0, 3, 3, 3, 4, 5, 4, 5, 6, 8, 3, 9, 6, 2, 3, 4])),
 (tensor([  0, 153, 863,  48, 155,  55,  55, 228]),
  tensor([0, 1, 5, 5, 6, 4, 4, 3])),
 (tensor([ 49,  50,  51,  52,   9,  12,  11,  30,  53, 171,  56, 169, 216,  35,
            2,  44,  56,  57]),
  tensor([ 6,  0,  2,  3,  4,  5,  4,  5,  2,  6,  4,  7, 10,  3,  2,  3,  4,  3])),
 (tensor([  0,   8,   9,  30,  11,  10, 277, 253,  29,   2,  64]),
  tensor([2, 3, 4, 5, 4, 5, 6, 8, 4, 2, 3])),
 (tensor([353,  23, 183, 864,   4,  52,   9,  72,  73,  11,  74,  75,  25,  14,
 