# Hometask

1) Find text to train (any book)<br>
2) Build train and validation set <br>
3) Train bidirectional language model that predicts the POS of word being based on its `n_context= 3` neighbours from the left and `n_context= 3` neighbours from the right <br>
4) Evaluate the model 

In [184]:
import torch
import torch.nn.functional
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import optim
from torch import nn
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm

In [185]:
import numpy as np

In [186]:
fn = 'Siaiuchyi_shliakh.txt'

import codecs
fileObj = codecs.open( "Siaiuchyi_shliakh.txt", "r", "utf_8_sig" )
text = fileObj.read()
fileObj.close()

In [187]:
import re
from sklearn.feature_extraction.text import CountVectorizer

In [188]:
class TextDS(Dataset):
    def __init__(self, text):
        self.text = text
        self.text = re.sub(r'\[.*\]', "", self.text)
        self.text = re.sub(r'\d+', "", self.text)
        self.text = re.sub(r'«|»', "", self.text)
        vectorizer = CountVectorizer(token_pattern=r'(?u)(?:\b\w+\b|\.|\,|\!|\?|\-|\n)').fit([self.text])
        self.vocab_arr = vectorizer.get_feature_names_out()
        self.w2i = vectorizer.vocabulary_
        self.i2w = {v: k for k, v in self.w2i.items()}
        word_tokenizer = vectorizer.build_tokenizer()
        raw_tokens = word_tokenizer(text)
        tokens = sorted(set(raw_tokens))
        step = 1 # shift to build new sample
        self.contexts = []
        self.targets = []
        self.n_context = 7
        for i in range(0, len(raw_tokens) - self.n_context, step):
            context_tokens = [token.lower() for token in raw_tokens[i: i + self.n_context]]
            context_tokens.pop(self.n_context // 2)
            context_tokens = [token for token in context_tokens if token in self.w2i]
            if len(context_tokens) != self.n_context - 1:
                continue
            target_index = i + self.n_context // 2
            target_word = raw_tokens[target_index].lower()
            if target_word in self.w2i:
                self.targets.append(target_word)
                self.contexts.append(context_tokens)
            else:
                print(f"there '{target_word}' no in voc")
        self.embedding_dim = 10
        self.embedded = nn.Embedding(len(tokens), self.embedding_dim)
        self.x = []
        self.y = []
        for context in self.contexts:
            context_indices = [self.w2i[token] for token in context]
            context_embedding = self.embedded(torch.tensor(context_indices))
            self.x.append(context_embedding)
        target_indices = [self.w2i[target] for target in self.targets]
        self.y = torch.tensor(target_indices)
        #self.y = [y.unsqueeze(0) for y in self.y]
        #print({self.y.shape})
        
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]

    
    def __len__(self):
        return len(self.contexts)
    
    def preprocess(self,text):
        text = re.sub(r'\[.*\]', "", text)
        text = re.sub(r'\d+', "", text)
        text = re.sub(r'«|»', "", text)
        vectorizer = CountVectorizer(token_pattern=r'(?u)(?:\b\w+\b|\.|\,|\!|\?|\-|\n)').fit([text])
        word_tokenizer = vectorizer.build_tokenizer()
        raw_tokens = word_tokenizer(text)
        token_indices = [self.w2i[token.lower()] for token in raw_tokens if token.lower() in self.w2i]
        
        return torch.tensor(token_indices)

In [189]:
ds=TextDS(text)

there '11' no in voc
there '180' no in voc
there '2001' no in voc
there '200' no in voc
there '1111' no in voc
there '50' no in voc
there '1943' no in voc
there '60' no in voc
there '11' no in voc
there '11' no in voc
there '11' no in voc
there '1240' no in voc
there '2' no in voc
there '19' no in voc
there '1' no in voc
there '2' no in voc
there '3' no in voc
there '40' no in voc
there '16а2' no in voc
there '11' no in voc
there '15' no in voc
there '2002' no in voc
there '11' no in voc
there '11' no in voc
there '11' no in voc
there '1137' no in voc
there '11' no in voc
there '200' no in voc
there '25' no in voc
there '2' no in voc
there '1986' no in voc
there '1992' no in voc
there '1998' no in voc
there '1' no in voc
there '2' no in voc
there '3' no in voc
there '35' no in voc
there '150' no in voc
there '180' no in voc
there '145' no in voc
there '20' no in voc
there '20' no in voc
there '100' no in voc
there '7' no in voc
there '9' no in voc
there '80' no in voc
there '16' no in 

In [190]:
#print({ds[1337][0].shape}, {ds[1337][1].shape})

In [191]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ds)

In [192]:
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # 2 for bidirectional
        
    def forward(self, x):
        
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # 2 for bidirectional
        # Initialize cell state
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out
    


In [193]:
input_size = 10
seq_length = 6
hidden_size = 64
num_layers = 1
output_size = len(ds.vocab_arr)


model = BiRNN(input_size, hidden_size, num_layers, output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

batch_size = 64 
train_loader = DataLoader(train_data, batch_size=batch_size,shuffle=True)
test_loader= DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [194]:
print(device)

cuda


In [195]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch):
    model.train()
    
    tqdm_loader = tqdm(dataloader, unit="batch", desc=f"Epoch {epoch}", total=len(dataloader))
    for batch, (X, y) in enumerate(tqdm_loader):
        # Compute prediction and loss
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward(retain_graph=True)
        optimizer.step()
        optimizer.zero_grad()

        if batch % 1000 == 0:
            tqdm_loader.set_postfix(loss=loss.item())
            
def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0


    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [196]:
n_epochs = 15
for t in range(n_epochs):
    train_loop(train_loader, model, criterion, optimizer, epoch=t)
    test_loop(test_loader, model, criterion)

torch.save(model, 'model.pth')

Epoch 0: 100%|██████████| 1143/1143 [00:23<00:00, 49.65batch/s, loss=6.53]


Test Error: 
 Accuracy: 16.7%, Avg loss: 7.073682 



Epoch 1: 100%|██████████| 1143/1143 [00:14<00:00, 80.64batch/s, loss=6.34]


Test Error: 
 Accuracy: 17.0%, Avg loss: 7.326131 



Epoch 2: 100%|██████████| 1143/1143 [00:13<00:00, 82.76batch/s, loss=5.71]


Test Error: 
 Accuracy: 17.2%, Avg loss: 7.829980 



Epoch 3: 100%|██████████| 1143/1143 [00:13<00:00, 82.64batch/s, loss=5.53]


Test Error: 
 Accuracy: 16.1%, Avg loss: 8.392230 



Epoch 4: 100%|██████████| 1143/1143 [00:13<00:00, 81.79batch/s, loss=3.92]


Test Error: 
 Accuracy: 16.0%, Avg loss: 8.881456 



Epoch 5: 100%|██████████| 1143/1143 [00:14<00:00, 81.47batch/s, loss=3.95]


Test Error: 
 Accuracy: 15.3%, Avg loss: 9.296077 



Epoch 6: 100%|██████████| 1143/1143 [00:14<00:00, 80.71batch/s, loss=3.98]


Test Error: 
 Accuracy: 15.8%, Avg loss: 9.608563 



Epoch 7: 100%|██████████| 1143/1143 [00:13<00:00, 81.91batch/s, loss=2.82]


Test Error: 
 Accuracy: 15.1%, Avg loss: 9.963832 



Epoch 8: 100%|██████████| 1143/1143 [00:13<00:00, 82.43batch/s, loss=3.13]


Test Error: 
 Accuracy: 15.0%, Avg loss: 10.307096 



Epoch 9: 100%|██████████| 1143/1143 [00:13<00:00, 82.24batch/s, loss=2.97]


Test Error: 
 Accuracy: 15.4%, Avg loss: 10.457911 



Epoch 10: 100%|██████████| 1143/1143 [00:13<00:00, 83.55batch/s, loss=2.68]


Test Error: 
 Accuracy: 15.0%, Avg loss: 10.731898 



Epoch 11: 100%|██████████| 1143/1143 [00:14<00:00, 80.96batch/s, loss=2.3]


Test Error: 
 Accuracy: 14.9%, Avg loss: 10.975221 



Epoch 12: 100%|██████████| 1143/1143 [00:13<00:00, 82.73batch/s, loss=2.48]


Test Error: 
 Accuracy: 15.1%, Avg loss: 11.148243 



Epoch 13: 100%|██████████| 1143/1143 [00:13<00:00, 83.17batch/s, loss=1.84]


Test Error: 
 Accuracy: 15.1%, Avg loss: 11.306624 



Epoch 14: 100%|██████████| 1143/1143 [00:14<00:00, 81.46batch/s, loss=2.41]


Test Error: 
 Accuracy: 15.0%, Avg loss: 11.512945 

