RNN practice 1 simple exercise on POS(Parts of Speech) tagging

In [158]:
corpus = [
    ("I love NLP", ["PRON", "VERB", "NOUN"]),
    ("Python is great", ["NOUN", "VERB", "ADJ"]),
    ("She reads books", ["PRON", "VERB", "NOUN"]),
    ("They play football", ["PRON", "VERB", "NOUN"]),
    ("Machine learning is fun", ["NOUN", "NOUN", "VERB", "ADJ"]),
    ("He writes code", ["PRON", "VERB", "NOUN"])
]


In [159]:
print(corpus)

[('I love NLP', ['PRON', 'VERB', 'NOUN']), ('Python is great', ['NOUN', 'VERB', 'ADJ']), ('She reads books', ['PRON', 'VERB', 'NOUN']), ('They play football', ['PRON', 'VERB', 'NOUN']), ('Machine learning is fun', ['NOUN', 'NOUN', 'VERB', 'ADJ']), ('He writes code', ['PRON', 'VERB', 'NOUN'])]


In [160]:
from collections import Counter

In [161]:
list_of_words = [word for sentence, tags in corpus for word in sentence.split()]
list_of_pos = [tags for sentence, tags in corpus ]
print(list_of_words)

['I', 'love', 'NLP', 'Python', 'is', 'great', 'She', 'reads', 'books', 'They', 'play', 'football', 'Machine', 'learning', 'is', 'fun', 'He', 'writes', 'code']


In [162]:
input_vocab=list(set(list_of_words))
pos_vocab=list(set([pos for tags in list_of_pos for pos in tags]))

In [163]:
print(input_vocab)
print(pos_vocab)

['They', 'is', 'love', 'Python', 'football', 'Machine', 'He', 'learning', 'writes', 'NLP', 'code', 'play', 'reads', 'She', 'great', 'fun', 'I', 'books']
['PRON', 'ADJ', 'VERB', 'NOUN']


In [164]:
special_tokens = ["PAD", "UNK"]
input_vocab = special_tokens + input_vocab
pos_vocab = special_tokens + pos_vocab

In [165]:
word2idx = {word: idx for idx ,word  in enumerate(input_vocab)}
pos2idx = {pos: idx for idx, pos in enumerate(pos_vocab)}
print(word2idx)
print(pos2idx)

{'PAD': 0, 'UNK': 1, 'They': 2, 'is': 3, 'love': 4, 'Python': 5, 'football': 6, 'Machine': 7, 'He': 8, 'learning': 9, 'writes': 10, 'NLP': 11, 'code': 12, 'play': 13, 'reads': 14, 'She': 15, 'great': 16, 'fun': 17, 'I': 18, 'books': 19}
{'PAD': 0, 'UNK': 1, 'PRON': 2, 'ADJ': 3, 'VERB': 4, 'NOUN': 5}


In [166]:
list_of_tags = [tag for tags in list_of_pos for tag in tags]

In [167]:
list_of_tags

['PRON',
 'VERB',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'PRON',
 'VERB',
 'NOUN',
 'PRON',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'PRON',
 'VERB',
 'NOUN']

In [168]:
input_list = [item.split() for sentence, tags in corpus for item in sentence.split(",")]
tag_list = [tags for sentence, tags in corpus]
print(input_list)
print(tag_list)

[['I', 'love', 'NLP'], ['Python', 'is', 'great'], ['She', 'reads', 'books'], ['They', 'play', 'football'], ['Machine', 'learning', 'is', 'fun'], ['He', 'writes', 'code']]
[['PRON', 'VERB', 'NOUN'], ['NOUN', 'VERB', 'ADJ'], ['PRON', 'VERB', 'NOUN'], ['PRON', 'VERB', 'NOUN'], ['NOUN', 'NOUN', 'VERB', 'ADJ'], ['PRON', 'VERB', 'NOUN']]


In [183]:
x = [[word2idx.get(word, word2idx["UNK"]) for word in words] for words in input_list]

In [184]:
x

[[18, 4, 11], [5, 3, 16], [15, 14, 19], [2, 13, 6], [7, 9, 3, 17], [8, 10, 12]]

In [185]:
y = [[pos2idx.get(pos, pos2idx["UNK"]) for pos in tags] for tags in tag_list]

In [186]:
y

[[2, 4, 5], [5, 4, 3], [2, 4, 5], [2, 4, 5], [5, 5, 4, 3], [2, 4, 5]]

In [187]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [192]:
class POSDATASET(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.Y[idx], dtype=torch.long)

In [209]:
def collate_fn(batch):
    inputs, targets = zip(*batch)
    X_padded = pad_sequence(inputs, batch_first=True, padding_value=word2idx["PAD"])
    Y_padded = pad_sequence(targets, batch_first=True, padding_value=pos2idx["PAD"])
    return X_padded, Y_padded   

In [210]:
dataset = POSDATASET(x, y)  
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [211]:
input, label = next(iter(dataloader))
print(input.shape)
print(label)

torch.Size([2, 3])
tensor([[2, 4, 5],
        [2, 4, 5]])
