RNN practice 1 simple exercise on POS(Parts of Speech) tagging

In [40]:
corpus = [
    ("I love NLP", ["PRON", "VERB", "NOUN"]),
    ("Python is great", ["NOUN", "VERB", "ADJ"]),
    ("She reads books", ["PRON", "VERB", "NOUN"]),
    ("They play football", ["PRON", "VERB", "NOUN"]),
    ("Machine learning is fun", ["NOUN", "NOUN", "VERB", "ADJ"]),
    ("He writes code", ["PRON", "VERB", "NOUN"])
]


In [41]:
print(corpus)

[('I love NLP', ['PRON', 'VERB', 'NOUN']), ('Python is great', ['NOUN', 'VERB', 'ADJ']), ('She reads books', ['PRON', 'VERB', 'NOUN']), ('They play football', ['PRON', 'VERB', 'NOUN']), ('Machine learning is fun', ['NOUN', 'NOUN', 'VERB', 'ADJ']), ('He writes code', ['PRON', 'VERB', 'NOUN'])]


In [42]:
from collections import Counter

In [43]:
list_of_words = [word for sentence, tags in corpus for word in sentence.split()]
list_of_pos = [tags for sentence, tags in corpus ]
print(list_of_words)

['I', 'love', 'NLP', 'Python', 'is', 'great', 'She', 'reads', 'books', 'They', 'play', 'football', 'Machine', 'learning', 'is', 'fun', 'He', 'writes', 'code']


In [44]:
input_vocab=list(set(list_of_words))
pos_vocab=list(set([pos for tags in list_of_pos for pos in tags]))

In [45]:
print(input_vocab)
print(pos_vocab)

['Python', 'She', 'books', 'reads', 'fun', 'He', 'NLP', 'I', 'is', 'play', 'great', 'love', 'writes', 'Machine', 'code', 'learning', 'football', 'They']
['NOUN', 'VERB', 'ADJ', 'PRON']


In [46]:
special_tokens = ["PAD", "UNK"]
input_vocab = special_tokens + input_vocab
pos_vocab = special_tokens + pos_vocab

In [47]:
word2idx = {word: idx for idx ,word  in enumerate(input_vocab)}
pos2idx = {pos: idx for idx, pos in enumerate(pos_vocab)}
print(word2idx)
print(pos2idx)

{'PAD': 0, 'UNK': 1, 'Python': 2, 'She': 3, 'books': 4, 'reads': 5, 'fun': 6, 'He': 7, 'NLP': 8, 'I': 9, 'is': 10, 'play': 11, 'great': 12, 'love': 13, 'writes': 14, 'Machine': 15, 'code': 16, 'learning': 17, 'football': 18, 'They': 19}
{'PAD': 0, 'UNK': 1, 'NOUN': 2, 'VERB': 3, 'ADJ': 4, 'PRON': 5}


In [48]:
list_of_tags = [tag for tags in list_of_pos for tag in tags]

In [49]:
list_of_tags

['PRON',
 'VERB',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'PRON',
 'VERB',
 'NOUN',
 'PRON',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'PRON',
 'VERB',
 'NOUN']

In [50]:
input_list = [item.split() for sentence, tags in corpus for item in sentence.split(",")]
tag_list = [tags for sentence, tags in corpus]
print(input_list)
print(tag_list)

[['I', 'love', 'NLP'], ['Python', 'is', 'great'], ['She', 'reads', 'books'], ['They', 'play', 'football'], ['Machine', 'learning', 'is', 'fun'], ['He', 'writes', 'code']]
[['PRON', 'VERB', 'NOUN'], ['NOUN', 'VERB', 'ADJ'], ['PRON', 'VERB', 'NOUN'], ['PRON', 'VERB', 'NOUN'], ['NOUN', 'NOUN', 'VERB', 'ADJ'], ['PRON', 'VERB', 'NOUN']]


In [51]:
x = [[word2idx.get(word, word2idx["UNK"]) for word in words] for words in input_list]

In [52]:
x

[[9, 13, 8],
 [2, 10, 12],
 [3, 5, 4],
 [19, 11, 18],
 [15, 17, 10, 6],
 [7, 14, 16]]

In [53]:
y = [[pos2idx.get(pos, pos2idx["UNK"]) for pos in tags] for tags in tag_list]

In [54]:
y

[[5, 3, 2], [2, 3, 4], [5, 3, 2], [5, 3, 2], [2, 2, 3, 4], [5, 3, 2]]

In [55]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam


In [56]:
class POSDATASET(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.Y[idx], dtype=torch.long)

In [57]:
def collate_fn(batch):
    inputs, targets = zip(*batch)
    X_padded = pad_sequence(inputs, batch_first=True, padding_value=word2idx["PAD"])
    Y_padded = pad_sequence(targets, batch_first=True, padding_value=pos2idx["PAD"])
    return X_padded, Y_padded   

In [58]:
dataset = POSDATASET(x, y)  
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [59]:
output= next(iter(dataloader))
print(output)


(tensor([[ 7, 14, 16,  0],
        [15, 17, 10,  6]]), tensor([[5, 3, 2, 0],
        [2, 2, 3, 4]]))


In [60]:
class RNNPOSTAGGER(nn.Module):
    def __init__(self, vocab_size,tag_size, embedding_dim=32, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx["PAD"])
        self.rnn = nn.RNN(embedding_dim, hidden_dim,batch_first=True)
        self.fc = nn.Linear(hidden_dim, tag_size)
        
    def forward(self, x):
        embedded = self.embedding(x) # (batch_size, seq_len, embedding_dim)
        rnn_out, _ = self.rnn(embedded) # (batch_size, seq_len, hidden_dim)
        logits = self.fc(rnn_out) # (batch_size, seq_len, tag_size)
        return logits

In [61]:
model = RNNPOSTAGGER(vocab_size=len(word2idx), tag_size=len(pos2idx))
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx["PAD"])

In [68]:
epochs = 10

for epoch in range(epochs):
    model.train()
    for input , label in dataloader:
        optimizer.zero_grad()
        output = model(input)
        output = output.flatten(0,1)
        label = label.flatten(0,1)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 1/10, Loss: 1.2411
Epoch 2/10, Loss: 1.1175
Epoch 3/10, Loss: 1.2236
Epoch 4/10, Loss: 1.1465
Epoch 5/10, Loss: 0.8984
Epoch 6/10, Loss: 0.8524
Epoch 7/10, Loss: 0.8645
Epoch 8/10, Loss: 0.6577
Epoch 9/10, Loss: 0.6224
Epoch 10/10, Loss: 0.5444


In [69]:
with torch.no_grad():
    model.eval()
    test_sentence = "He writes code"
    test_indices = [word2idx.get(word, word2idx["UNK"]) for word in test_sentence.split()]
    test_tensor = torch.tensor(test_indices, dtype=torch.long) 
    logits = model(test_tensor)   
    predicted_tags = torch.argmax(logits, dim=-1)
    
    print(logits)
    print(predicted_tags)
    id2pos = {idx: pos for pos, idx in pos2idx.items()}
    predicted_pos_tags = [id2pos[idx.item()] for idx in predicted_tags]
    print(predicted_pos_tags)

tensor([[-0.5760, -0.5401,  0.0755,  0.7542, -0.1608,  0.8628],
        [-1.2989, -1.2050,  0.2034,  2.4106, -0.8838, -0.8145],
        [-0.5971, -0.5411,  2.4322, -1.0225,  0.1000, -0.3447]])
tensor([5, 3, 2])
['PRON', 'VERB', 'NOUN']
