In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights

import nltk
from collections import defaultdict
torch.cuda.is_available()


True

In [8]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Every vocabulary contains the special ``"<unk>"`` token; looking up a
    token that is not in the vocabulary returns the index of ``"<unk>"``.
    """

    def __init__(self, tokens=None):
        """Create a vocabulary from an iterable of tokens.

        Args:
            tokens: Iterable of tokens in the desired index order, or ``None``
                for a vocabulary that holds only ``"<unk>"``. If ``"<unk>"``
                is absent it is appended at the end. Repeated tokens keep the
                index of their first occurrence.
        """
        self.idx_to_token = list()
        self.token_to_idx = dict()

        # Copy into a fresh list so tuples/generators are accepted and the
        # caller's sequence is never modified.
        tokens = list(tokens) if tokens is not None else []
        if "<unk>" not in tokens:
            tokens.append("<unk>")
        for token in tokens:
            # Skip repeats: otherwise len(self) would count ids that no
            # token maps to.
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)
        # Always defined, so __getitem__ works even for Vocab().
        self.unk = self.token_to_idx["<unk>"]

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        """Build a vocabulary from tokenized sentences.

        Args:
            text: Iterable of sentences, each an iterable of tokens.
            min_freq: Minimum number of occurrences a token needs in order
                to be kept.
            reserved_tokens: Optional iterable of tokens placed right after
                ``"<unk>"`` regardless of their frequency.

        Returns:
            A ``Vocab`` ordered as ``"<unk>"``, the reserved tokens, then the
            corpus tokens in order of first appearance.
        """
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (list(reserved_tokens) if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)

    def __len__(self):
        """Return the number of distinct tokens, including ``"<unk>"``."""
        return len(self.idx_to_token)

    def __getitem__(self, token):
        """Return the index of ``token``, or the ``"<unk>"`` index if unknown."""
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its index (unknown tokens map to ``"<unk>"``)."""
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        """Map each index back to its token; raises IndexError if out of range."""
        return [self.idx_to_token[index] for index in indices]


def save_vocab(vocab, path):
    """Write the vocabulary to ``path``, one token per line in index order.

    The file is always written as UTF-8 so that it reads back identically
    through ``read_vocab`` regardless of the platform's default encoding.
    No trailing newline is written after the last token.
    """
    with open(path, 'w', encoding='utf-8') as writer:
        writer.write("\n".join(vocab.idx_to_token))


def read_vocab(path):
    """Load a vocabulary from a UTF-8 text file holding one token per line.

    The order of the lines becomes the index order of the returned ``Vocab``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # NOTE: a file that ends with a newline yields a final empty-string
        # token, because the content is split on '\n' verbatim.
        tokens = f.read().split('\n')
    return Vocab(tokens)

        
        

                

In [9]:
class NGramDataset(Dataset):
    """Fixed-window (context, next-token) training pairs built from a corpus.

    Each sentence in ``corpus`` is a list of token ids. It is wrapped with the
    ids that ``vocab`` assigns to ``BOS_TOKEN`` / ``EOS_TOKEN``, and every
    window of ``context_size`` consecutive ids is paired with the id that
    follows it.
    """

    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]

        for token_ids in tqdm(corpus, desc="Dataset Construction"):
            padded = [self.bos] + token_ids + [self.eos]
            # When the padded sentence has no more than context_size ids the
            # range below is empty, so such sentences contribute no examples.
            for end in range(context_size, len(padded)):
                window = padded[end - context_size:end]
                self.data.append((window, padded[end]))

    def __len__(self):
        """Number of (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the i-th ``(context_ids, target_id)`` pair."""
        return self.data[i]

    def collate_fn(self, examples):
        """Stack a list of pairs into ``(inputs, targets)`` long tensors."""
        contexts = [pair[0] for pair in examples]
        next_tokens = [pair[1] for pair in examples]
        return (
            torch.tensor(contexts, dtype=torch.long),
            torch.tensor(next_tokens, dtype=torch.long),
        )
    
            

In [10]:
#Model
class FeedForwardNNLM(nn.Module):
    """Feed-forward neural n-gram language model.

    The context token ids are embedded, the embeddings are flattened into a
    single vector per example, passed through one ReLU hidden layer, and
    projected to log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super().__init__()
        # Lookup table: token id -> embedding_dim-sized vector.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Hidden layer over the flattened context embeddings.
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # Output projection: one score per vocabulary entry.
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.activate = F.relu
        # Project helper imported from utils; the initialisation scheme it
        # applies is defined there.
        init_weights(self)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(inputs.shape[0], vocab_size)``."""
        batch = inputs.shape[0]
        # Flatten everything after the batch dimension into one vector.
        flat = self.embeddings(inputs).view(batch, -1)
        hidden = self.activate(self.linear1(flat))
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)
        

In [18]:
#Training
# Hyperparameters for this run.
embedding_dim = 64
context_size = 5
hidden_dim = 256
batch_size = 1024
num_epoch = 10

# Corpus and vocabulary come from the project helper; the corpus is then
# sliced into fixed-size (context, target) examples and batched.
corpus, vocab = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = get_loader(dataset, batch_size)

# The model returns log-probabilities, so NLLLoss is the matching criterion.
nll_loss = nn.NLLLoss()
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines that have no GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
total_losses = []  # one accumulated loss value per epoch
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Reported value is the sum of the per-batch losses over the epoch,
    # not an average.
    print(f"Loss : {total_loss:.2f}")
    total_losses.append(total_loss)
        

Dataset Construction:   0%|          | 0/54711 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 7999.64


Training Epoch 1:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 6616.47


Training Epoch 2:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 6003.27


Training Epoch 3:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 5565.19


Training Epoch 4:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 5270.40


Training Epoch 5:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 5076.78


Training Epoch 6:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 4935.61


Training Epoch 7:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 4821.23


Training Epoch 8:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 4724.24


Training Epoch 9:   0%|          | 0/1521 [00:00<?, ?it/s]

Loss : 4639.32


embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

Loss : 9169.07
Training Epoch 1: 100%
1681/1681 [00:14<00:00, 108.50it/s]
Loss : 7822.51
Training Epoch 2: 100%
1681/1681 [00:14<00:00, 115.62it/s]
Loss : 7340.44
Training Epoch 3: 100%
1681/1681 [00:14<00:00, 116.88it/s]
Loss : 7029.79
Training Epoch 4: 100%
1681/1681 [00:14<00:00, 116.44it/s]
Loss : 6810.32
Training Epoch 5: 100%
1681/1681 [00:14<00:00, 116.74it/s]
Loss : 6649.40
Training Epoch 6: 100%
1681/1681 [00:14<00:00, 113.97it/s]
Loss : 6529.11
Training Epoch 7: 100%
1681/1681 [00:14<00:00, 118.39it/s]
Loss : 6434.87
Training Epoch 8: 100%
1681/1681 [00:14<00:00, 108.42it/s]
Loss : 6356.45
Training Epoch 9: 100%
1681/1681 [00:14<00:00, 116.71it/s]


embedding_dim = 64
context_size = 5
hidden_dim = 128
batch_size = 1024
num_epoch = 10

Dataset Construction: 100%
54711/54711 [00:01<00:00, 43560.28it/s]
Training Epoch 0: 100%
1521/1521 [00:14<00:00, 111.04it/s]
Loss : 8213.62
Training Epoch 1: 100%
1521/1521 [00:13<00:00, 110.07it/s]
Loss : 6897.06
Training Epoch 2: 100%
1521/1521 [00:13<00:00, 110.72it/s]
Loss : 6397.75
Training Epoch 3: 100%
1521/1521 [00:13<00:00, 110.83it/s]
Loss : 6050.45
Training Epoch 4: 100%
1521/1521 [00:13<00:00, 110.56it/s]
Loss : 5788.94
Training Epoch 5: 100%
1521/1521 [00:13<00:00, 109.98it/s]
Loss : 5592.05
Training Epoch 6: 100%
1521/1521 [00:13<00:00, 103.47it/s]
Loss : 5448.94
Training Epoch 7: 100%
1521/1521 [00:13<00:00, 112.62it/s]
Loss : 5340.46
Training Epoch 8: 100%
1521/1521 [00:13<00:00, 112.47it/s]
Loss : 5252.75
Training Epoch 9: 100%
1521/1521 [00:13<00:00, 110.87it/s]

In [19]:
# Hand the vocabulary and the trained embedding-layer weights to the utils
# helper, which writes them to "ffnnlm.vec".
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Pretrained embeddings saved to: ffnnlm.vec
