<a href="https://colab.research.google.com/github/Nashema007/Word2Vec-LM/blob/main/NLP_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import torch
import nltk
import os
import torch.nn as nn
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import re
import time
import math
from collections import Counter
import torch.nn.functional as F
import torch.optim as optim
nltk.download('punkt')
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# represents the number of context words to use after the center word
CONTEXT_SIZE = 3
# embedding dim can be 50, 100, 300
EMBEDDING_DIM = 300

In [3]:
def read_file(file):  
  path = os.sys.path[1]
  txt = open((path + file))
  txtRead = txt.read()
  return txtRead
test_data_file = read_file('/nchlt_text.nr.test')
train_data_file = read_file('/nchlt_text.nr.train')
valid_data_file = read_file('/nchlt_text.nr.valid')
test_data = nltk.sent_tokenize(test_data_file)
train_data = nltk.sent_tokenize(train_data_file)
valid_data = nltk.word_tokenize(valid_data_file)

In [4]:
sentences = []
for value in train_data:
   sentences.append(nltk.word_tokenize(re.sub(pattern =r'[\!"#$%&\*+,./:;<=>?@^_`()|~=]', repl='', string = value)))

In [5]:
all_sentences = []
for i in sentences:
  all_sentences+=i

In [6]:
all_sentences_dict = dict(Counter(all_sentences))
all_sentences_dict['<unk>'] = len(all_sentences_dict) - len({x:y for x,y in all_sentences_dict.items() if y > 1})
all_sentences = [i for i in all_sentences_dict.keys()]

In [7]:
vocabulary = list(set(all_sentences_dict.keys())) 
vocabulary_size = len(vocabulary)

In [8]:
ngrams = []
for i in range(len(all_sentences) - CONTEXT_SIZE):
    tup = [all_sentences[j] for j in np.arange(i + 1 , i + CONTEXT_SIZE + 1) ]
    ngrams.append((all_sentences[i],tup))

In [9]:
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

In [10]:
class SkipgramModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(SkipgramModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, context_size * vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))  # -1 implies size inferred for that index from the size of the data
        m = nn.Dropout(p=0.8)
        embeds = m(embeds) # apply drop out on the first layer with .8% prob
        out1 = F.relu(self.linear1(embeds)) # output of first layer 
        n = nn.Dropout(p=0.2) # apply drop out for second layer with .2% prob
        out1 = n(out1)
        out2 = self.linear2(out1)           # output of second layer
        log_probs = F.log_softmax(out2, dim=1).view(CONTEXT_SIZE,-1)
        return log_probs

    def predict(self,input):
        context_idxs = torch.tensor([word2idx[input]], dtype=torch.long)
        res = self.forward(context_idxs)
        res_arg = torch.argmax(res)
        res_val, res_ind = res.sort(descending=True)
        indices = [res_ind[i][0] for i in np.arange(0,CONTEXT_SIZE)]
        for arg in indices:
          print( [ (key, val) for key,val in word2idx.items() if val == arg ])
    
    def divide_chunks(self,ngrams, chunk_size):
      # looping till length l
      for i in range(0, len(ngrams), chunk_size): 
          yield ngrams[i:i + chunk_size]

    def write_embedding_to_file(self,filename):
        for i in self.embeddings.parameters():
            weights = i.data.numpy()
        np.save(filename,weights)
    def create_log(self, text):
      outF = open("model.log", "a")
      outF.write(text)
      outF.write("\n")
      outF.close()



In [11]:
model = SkipgramModeler(vocabulary_size, EMBEDDING_DIM, CONTEXT_SIZE)

In [12]:
def calculateLoss(ngramType,ngrams, model, lr, num_epochs, batch_size, adam=False):
  valid_losses = []
  valid_perpelxity = []
  test_loss = []
  test_perp = []
  loss_function = nn.NLLLoss()
  if adam:
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=True)
  else:
    optimizer = optim.SGD(model.parameters(), lr=lr)
  data = list(model.divide_chunks(ngrams, batch_size))
  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

  b = 1

  for epoch in range(num_epochs):
    start1 = time.time()
    total_loss = 0
    for single_batch in data:
      start = time.time()
      for context, target in single_batch:
        context_idxs = torch.tensor([word2idx[context]], dtype=torch.long)
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word2idx[context]], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model.forward(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        target_list = torch.tensor([word2idx[w] for w in target], dtype=torch.long)

      
        loss = loss_function(log_probs, target_list)

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        perplexity  = torch.exp(loss)
        valid_perpelxity.append(perplexity)
      end = time.time()
      scheduler.step()
      print(ngramType+' loss')
      textLog = f'| epoch {epoch+1} | {b}/{len(data)} batches | lr {lr} | s/batch {end-start} | loss {round(loss.item(), 3)} | ppl {perplexity}'
      model.create_log(textLog)
      print(textLog)
      b += 1
      # print(total_loss)
    b = 1
    end1 = time.time()
    valid_losses.append(total_loss)
    overall_losses = sum(valid_losses)/batch_size
    overall_perpelxity = sum(valid_perpelxity)/batch_size
    test_loss.append(overall_losses)
    test_perp.append(overall_perpelxity)
    validLog = f'| end of epoch {epoch+1} | time: {end1-start1}s | valid loss {overall_losses} | valid ppl {overall_perpelxity}'
    print(validLog)
    model.create_log(validLog)
  t_p = sum(test_perp)/num_epochs
  t_l = sum(test_loss)/num_epochs
  testLog = f'| End of training | test loss {t_l} | test ppl {t_p}'
  print(testLog)
  model.create_log(testLog)



In [13]:
ngramType=''
if CONTEXT_SIZE == 1:
  ngramType='Unigram'
elif CONTEXT_SIZE == 2:
  ngramType='Bigram'
elif CONTEXT_SIZE == 3:
  ngramType='Trigram'
elif CONTEXT_SIZE == 4:
  ngramType='Four-gram'
elif CONTEXT_SIZE == 5:
  ngramType='Five-gram'
calculateLoss(ngramType=ngramType, ngrams=ngrams[:5000], model=model, lr=0.05, num_epochs=3, batch_size=200)

Trigram loss
| epoch 1 | 1/25 batches | lr 0.05 | s/batch 59.549243688583374 | loss 13.861 | ppl 1046724.6875
Trigram loss
| epoch 1 | 2/25 batches | lr 0.05 | s/batch 58.795618772506714 | loss 14.109 | ppl 1341431.875
Trigram loss
| epoch 1 | 3/25 batches | lr 0.05 | s/batch 58.77241539955139 | loss 13.378 | ppl 645394.375
Trigram loss
| epoch 1 | 4/25 batches | lr 0.05 | s/batch 59.05006718635559 | loss 12.757 | ppl 346932.1875
Trigram loss
| epoch 1 | 5/25 batches | lr 0.05 | s/batch 58.85957479476929 | loss 12.373 | ppl 236398.1875
Trigram loss
| epoch 1 | 6/25 batches | lr 0.05 | s/batch 58.44507384300232 | loss 12.783 | ppl 356155.28125
Trigram loss
| epoch 1 | 7/25 batches | lr 0.05 | s/batch 57.99445652961731 | loss 12.731 | ppl 338024.0
Trigram loss
| epoch 1 | 8/25 batches | lr 0.05 | s/batch 58.14544987678528 | loss 13.181 | ppl 530072.9375
Trigram loss
| epoch 1 | 9/25 batches | lr 0.05 | s/batch 59.191744804382324 | loss 13.877 | ppl 1063741.625
Trigram loss
| epoch 1 | 10

In [14]:
def cluster_embeddings(filename,nclusters):
    X = np.load(filename)
    kmeans = KMeans(n_clusters=nclusters, random_state=0).fit(X)
    center = kmeans.cluster_centers_
    distances = euclidean_distances(X,center)

In [15]:
#Predict the next word given n context words
model.predict('kobana')
model.write_embedding_to_file('embeddings_skipgrams.npy')
cluster_embeddings('embeddings_skipgrams.npy',5)

[('amalungelo', 28765)]
[('Ngiyamukela', 12416)]
[('ngesikhathi', 53837)]
