In [1]:
import numpy as np
import torch 
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn.functional as F
torch.manual_seed(1)

<torch._C.Generator at 0x1d538131170>

In [2]:
class TransformerToTensor():
    def __init__(self):
        pass
    def __call__(self,x):
        return torch.tensor(x,dtype=torch.long)

In [3]:
class dataset(Dataset):
    def __init__(self,corpus,transform=None):
        self.transform = transform
        self.tokens = self.tokenize_corpus(corpus)
        self.vocabulary,self.tags = self.get_vocabulary(self.tokens)
        self.word2idx = self.get_word2idx(self.vocabulary)
        self.idx2word = self.get_idx2word(self.vocabulary)
        self.tag2idx = self.get_word2idx(self.tags)
        self.idx2tag = self.get_idx2word(self.tags) 
        
        self.dataset = self.idx_pair(self.tokens)
    
    def tokenize_corpus(self,corpus):
        tokens = [(x.split(),y) for x,y in corpus]
        return tokens
    
    def get_vocabulary(self,tokens):
        vocabulary = []
        tags = []
        for sentence,tags_vocab in tokens:
            for token in sentence:
                if token not in vocabulary:
                    vocabulary.append(token)
            for tag in tags_vocab:
                if tag not in tags:
                    tags.append(tag)
        return vocabulary,tags
    
    def get_word2idx(self,vocabulary):
        return {w: idx for (idx, w) in enumerate(vocabulary)}
    
    def get_idx2word(self,vocabulary):
        return {idx: w for (idx, w) in enumerate(vocabulary)}
    
    
    def idx_pair(self,data):
        dataset = []
        for input_sent,tag_sent in data: 
            dataset.append(([self.word2idx[word] for word in input_sent],[self.tag2idx[tag] for tag in tag_sent]))
        return dataset
    
    def show_dataset(self):
        for input_word,output_word in self.dataset:
            print(self.idx2word[input_word],self.idx2word[output_word])
            
    def __getitem__(self,idx):
        x,y = self.dataset[idx]
        if(self.transform):
            x = self.transform(x)
            y = self.transform(y)
        return x,y 
        #it must be returned like to to be casting as set for dataLoader
    
    def __len__(self):
        return len(self.dataset)
    
        

In [4]:
training_data = [
    ("The dog ate the apple", ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book", ["NN", "V", "DET", "NN"])
]
pos_dataset = dataset(training_data,transform=TransformerToTensor())
print(pos_dataset[1])

(tensor([5, 6, 7, 8]), tensor([1, 2, 0, 1]))


In [11]:
class LSTMTagger(nn.Module):
    def __init__(self,embedd_dim,hidden_dim,vocab_size,tagset_size):
        super(LSTMTagger,self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size,embedd_dim)
        self.lstm = nn.LSTM(embedd_dim,hidden_dim,bidirectional=False)
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)
        
    def forward(self,x):
        inputs_seq = self.embedding(x)
        a_out , _ = self.lstm(inputs_seq.view(len(x), 1, -1)) 
        #this is not sequance model so we don't need hidden_states a,c to be accamulated
        output_tags = self.hidden2tag(a_out.view(len(x), -1))
        output_scores = F.log_softmax(output_tags,dim = 1)
        return output_scores

In [12]:
vocab_size = len(pos_dataset.word2idx)
tagset_size = len(pos_dataset.tag2idx)
embedding_dims = 6
hidden_dim = 6
model = LSTMTagger(embedding_dims,hidden_dim,vocab_size,tagset_size)
optimizer = optim.SGD(model.parameters(),lr=0.1)
criterion = nn.NLLLoss()
def train_model(model,optimizer,criterion,dataset,iter = 10):
    for i in range(iter):
        for x,y in dataset:
            yhat = model(x)
            loss = criterion(yhat,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [13]:
trained_model = train_model(model,optimizer,criterion,pos_dataset,iter = 300)

In [14]:
# See what the scores are after training
testing_data = [
    ("Everybody ate the book", ["NN","V", "DET", "NN"])
]
test_dataset = dataset(testing_data,transform=TransformerToTensor() )
for x,y in test_dataset:
    print("input = ",x," output = ",y)
with torch.no_grad():
    for x,_ in test_dataset:
        tag_scores = model(x)
        max_tag_scores_values,max_tag_scores_indices = tag_scores.max(dim=1) 
        for i in range(len(x)):
            print("word :",test_dataset.idx2word[x[i].item()]," Tag :",test_dataset.idx2tag[max_tag_scores_indices[i].item()])

input =  tensor([0, 1, 2, 3])  output =  tensor([0, 1, 2, 0])
word : Everybody  Tag : NN
word : ate  Tag : V
word : the  Tag : DET
word : book  Tag : NN
