<a href="https://colab.research.google.com/github/GustaveRw/NLP-Fellowship/blob/master/Neural_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embeddings
Word embeddings were proposed by [Bengio et al. (2001, 2003)](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf).

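Trained on a text dataset, the neural network learns which words have similar meanings, while also preserving each word's semantic properties (e.g. the word *boy* is young, male, human) and syntactic properties (word order, part of speech, grammatical relationships). For example, the sentences below let the model learn that *cat*/*dog*, *walking*/*running* and *room*/*bedroom* appear in similar contexts: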

The cat is walking in the bedroom

A dog was running in a room

The cat is running in a room

A dog is walking in a bedroom

The dog was walking in the room

This neural network has three components:

1. An embedding layer that generates the word embeddings; its parameters are shared across words.
  * It’s a lookup table: given a word index, it returns the corresponding vector.
  * The vectors are the rows of a weight matrix that is initialized with random values and updated by backpropagation.
2. One or more hidden layers, which introduce non-linearity on top of the embeddings.
3. A softmax function that produces a probability distribution over all the words in the vocabulary.

Each word is represented as a dense vector, for example:
* dog [0.2,0.5,-1.5,2.4] (length of vector is set as a parameter)

## Step 1: Get the dataset

In [None]:
# Mount Google Drive at /content/gdrive so the dataset stored there can be read.
# The google.colab package is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
import pandas as pd
# Directory that contains the dataset CSV read by the next cell.
# It must be filled in before running: os.chdir('') raises FileNotFoundError.
path = '' # Enter your path
os.chdir(path)
!ls

In [None]:
# Load the CSV from the current working directory and preview its first 100 rows.
dataset = pd.read_csv('IMDB Dataset.csv')
dataset.head(100)

In [None]:
# Show pandas' summary statistics for the loaded DataFrame.
dataset.describe()

In [None]:
# Placeholder corpus: put your own sentences inside set() to de-duplicate them.
# As written the set is empty, so english_text is an empty list.
# Only the first 5000 unique sentences are kept.
english_text = list(set())[:5000]  # Depends on your data. Change code
print(len(english_text))

In [None]:
import re
# Patterns are compiled once at import time instead of on every preprocess() call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    # The original single range U+24C2..U+1F251 also covered CJK, Hangul, kana
    # and fullwidth letters, so real words in those scripts were deleted.
    # Only the two emoji-related ends of that range are kept.
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_PATTERN = re.compile(r'<.*?>')
# Anything that is not a word character, digit, apostrophe or whitespace.
_NON_WORD_PATTERN = re.compile(r"[^\w\d'\s]+")


def preprocess(text):
    """Normalize one raw sentence for whitespace tokenization.

    Steps, in order: lowercase the text, delete emoji, delete URLs, delete
    HTML tags, then replace every remaining run of punctuation/symbols
    (apostrophes excepted) with a single space.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned string. Whitespace is not collapsed or stripped, so callers
        are expected to tokenize with ``str.split()``.
    """
    text = text.lower()
    text = _EMOJI_PATTERN.sub('', text)
    text = _URL_PATTERN.sub('', text)
    text = _HTML_PATTERN.sub('', text)
    return _NON_WORD_PATTERN.sub(' ', text)

In [None]:
# Clean every raw sentence with preprocess(), keeping the original order.
processed_english_text = [preprocess(raw_sentence) for raw_sentence in english_text]

print(len(processed_english_text))
processed_english_text[:5]

## Step 2: Index the words

In [None]:
# Build the vocabulary and the two lookup tables (word -> index, index -> word).
UNK_symbol = "<UNK>"
all_words = {UNK_symbol}
for sentence in processed_english_text:
    all_words.update(sentence.split())  # whitespace tokenizer

# A set of strings iterates in a hash-dependent order that changes between
# interpreter sessions. Sorting makes the word/index mapping reproducible, so
# a saved checkpoint still matches its vocabulary after a restart.
# <UNK> is pinned to index 0, which is also pad_sequence's default padding
# value, so padded positions map to <UNK> rather than to an arbitrary word.
vocabulary = [UNK_symbol] + sorted(all_words - {UNK_symbol})
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = dict(enumerate(vocabulary))

n_class = len(word_to_index)  # vocabulary size

print(n_class)

In [None]:
# Display the words holding vocabulary indices 10-39, ordered by index.
sorted(word_to_index, key=word_to_index.get)[10:40]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import time

# Release any cached CUDA memory left over from a previous run of the notebook.
torch.cuda.empty_cache()
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def create_input_target(list_sentences):
    """Turn sentences into (context indices, last-word index) training pairs.

    Each sentence is split on whitespace and mapped through the module-level
    ``word_to_index`` table. All words but the last form the input sequence and
    the last word is the prediction target (a causal language model).

    Args:
        list_sentences: Iterable of already-preprocessed sentence strings.

    Returns:
        A tuple ``(input_batch, target_batch)``. ``input_batch`` is a list of
        1-D ``torch.long`` tensors (one per non-empty sentence) and
        ``target_batch`` is a list of ``int`` indices of the same length.

    Raises:
        KeyError: If a word is unknown and ``word_to_index`` has no "<UNK>" entry.
    """
    def to_index(token):
        # Unknown words fall back to <UNK>, as the similarity lookup in this file does.
        try:
            return word_to_index[token]
        except KeyError:
            return word_to_index["<UNK>"]

    input_batch = []
    target_batch = []

    for sentence in list_sentences:
        tokens = sentence.split()  # whitespace tokenizer
        if not tokens:
            # Nothing to predict; previously this raised IndexError on tokens[-1].
            continue

        *context, target = (to_index(token) for token in tokens)

        # An explicit dtype keeps one-word sentences (empty context) as int64;
        # torch.tensor([]) would otherwise default to float32.
        input_batch.append(torch.tensor(context, dtype=torch.long))
        target_batch.append(target)

    return input_batch, target_batch

## Step 3: Build the model

In [None]:
#Use this cell for testing the shape of input when passed through different layers
# X =nn.Embedding(n_class, 2)(input_batch)
# X=X.view(-1, 2 * 2)
# #tanh = torch.tanh(nn.Parameter(torch.ones(2)) + nn.Linear(2 * 2, 2, bias=False)(X))
# nn.Parameter(torch.ones(2))
# #nn.Linear(2 * 2, 2, bias=False)
# # # nn.Linear(2, n_class, bias=False)
# # # nn.Linear(2 * 2, n_class, bias=False)
# # # nn.Parameter(torch.ones(n_class))
# # #torch.LongTensor(input_batch[0])
# #tanh.shape

In [None]:


class NNLM(nn.Module):
    """Feed-forward neural language model with a direct embedding-to-output path.

    Computes ``bias + hidden3(x) + hidden2(tanh(ones + hidden1(x)))`` where ``x``
    is the concatenation of the context words' embeddings.

    Args:
        vocab_size: Number of words; defaults to the module-level ``n_class``.
        embed_dim: Embedding size; defaults to the module-level ``m``.
        context_size: Number of context words; defaults to the module-level ``n_step``.
        hidden_size: Hidden layer width; defaults to the module-level ``n_hidden``.

    The sizes are stored on the instance, so ``forward`` no longer depends on
    the globals still holding the values they had at construction time.
    Calling ``NNLM()`` with no arguments behaves as before.
    """

    def __init__(self, vocab_size=None, embed_dim=None, context_size=None, hidden_size=None):
        super().__init__()
        # Globals are only read when the matching argument is omitted.
        self.vocab_size = n_class if vocab_size is None else vocab_size
        self.embed_dim = m if embed_dim is None else embed_dim
        self.context_size = n_step if context_size is None else context_size
        self.hidden_size = n_hidden if hidden_size is None else hidden_size

        flat_dim = self.context_size * self.embed_dim

        # Attribute names are unchanged so existing state_dict checkpoints still load.
        self.embeddings = nn.Embedding(self.vocab_size, self.embed_dim)
        self.hidden1 = nn.Linear(flat_dim, self.hidden_size, bias=False)
        self.ones = nn.Parameter(torch.ones(self.hidden_size))  # hidden-layer bias
        self.hidden2 = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
        self.hidden3 = nn.Linear(flat_dim, self.vocab_size, bias=False)  # direct path
        self.bias = nn.Parameter(torch.ones(self.vocab_size))  # output bias

    def forward(self, X):
        """Return unnormalized scores over the vocabulary for each context.

        Args:
            X: Integer tensor of word indices, expected as
                ``[batch_size, context_size]``.

        Returns:
            Tensor of shape ``[batch_size, vocab_size]`` (logits; no softmax applied).
        """
        embedded = self.embeddings(X)  # [batch_size, context_size, embed_dim]

        # Concatenate the context embeddings; -1 infers the batch dimension.
        flat = embedded.view(-1, self.context_size * self.embed_dim)

        hidden = torch.tanh(self.ones + self.hidden1(flat))  # [batch_size, hidden_size]

        return self.bias + self.hidden3(flat) + self.hidden2(hidden)

# Hyper-parameters, data loaders, model, loss and optimizer for training.
if __name__ == '__main__':
    n_step = 2  # number of context words, n-1 in the paper
    n_hidden = 2  # hidden layer size, h in the paper
    m = 10  # embedding size, m in the paper

    gpu = 0  # CUDA device index used by the later cells
    input_batch, target_batch = create_input_target(processed_english_text)
    # batch_first=True yields [num_sentences, max_len], so the DataLoader
    # batches over sentences. The default layout is [max_len, num_sentences],
    # which made each "batch" a slice of time steps across all sentences.
    input_batch = pad_sequence(input_batch, batch_first=True).long()
    train_loader = DataLoader(input_batch, batch_size=10, num_workers=1)

    # Last-word targets; the training loop in this notebook does not read this loader.
    target_batch = torch.LongTensor(target_batch)
    dev_loader = DataLoader(target_batch, batch_size=10, num_workers=1)

    model = NNLM()
    # Use the device chosen earlier instead of unconditionally calling
    # .cuda(), which raises on machines without a GPU.
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
  

      

In [None]:

# Train for 50 epochs, predicting word n_step from the first n_step words of
# every padded sentence, and write one checkpoint per epoch.

# torch.save does not create missing directories.
os.makedirs('model', exist_ok=True)
# Follow whichever device the model already lives on (GPU or CPU).
model_device = next(model.parameters()).device

for epoch in range(50):
    st = time.time()

    for it, data_tensor in enumerate(train_loader):

        # Columns [0, n_step) are the context; column n_step is the target word.
        context_tensor = data_tensor[:, 0:n_step].to(model_device)
        target_tensor = data_tensor[:, n_step].to(model_device)

        optimizer.zero_grad()
        output = model(context_tensor)

        # output : [batch_size, n_class], target_tensor : [batch_size]
        loss = criterion(output, target_tensor)
        if (epoch + 1) % 5 == 0:
            # output.size() is called; the bare attribute printed a bound method.
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()), 'output size: ', output.size())

        loss.backward()
        optimizer.step()

        if it % 500 == 0:
            print("Training Iteration {} of epoch {} complete. Loss: {}; Time taken (s): {}".format(it, epoch, loss.item(), (time.time()-st)))
            st = time.time()

    # Checkpoint once per epoch. Saving inside the batch loop rewrote the same
    # file on every iteration. Despite the name, this is the latest model: no
    # validation metric is tracked to pick a "best" one.
    best_model_path = 'model/best_model_{}.dat'.format(epoch)
    torch.save(model.state_dict(), best_model_path)

In [None]:

# Reload the last checkpoint and compare the learned embeddings of a few word pairs.
best_model = NNLM()
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
best_model.load_state_dict(torch.load(best_model_path, map_location=device))
best_model.to(device)
best_model.eval()

cos = nn.CosineSimilarity(dim=0)

lm_similarities = {}
# word pairs to calculate similarity
words = {('women','wife'),('king','man'),('london','paris'),('male','husband')}

unk_index = word_to_index['<UNK>']  # fallback for words missing from the vocabulary

# ----------- Calculate LM similarities using cosine similarity ----------
with torch.no_grad():  # inference only, no autograd graph needed
    for word_pairs in words:
        w1, w2 = word_pairs
        words_tensor = torch.tensor(
            [word_to_index.get(w1, unk_index), word_to_index.get(w2, unk_index)],
            dtype=torch.long,
            device=device,
        )

        # get word embeddings from the reloaded model: [2, m]
        words_embeds = best_model.embeddings(words_tensor)
        # calculate cosine similarity between the two word vectors
        sim = cos(words_embeds[0], words_embeds[1])
        lm_similarities[word_pairs] = sim.item()

print(lm_similarities)

{('london', 'paris'): -0.20533907413482666, ('male', 'husband'): 0.5646224617958069, ('women', 'wife'): 0.06242431700229645, ('king', 'man'): -0.07820046693086624}


In [None]:
# Test the embeddings here: show the learned vector for a single word.
# Unknown words fall back to <UNK>; a bare .get() would return None and break
# tensor construction. The tensor is created on the model's own device.
query_index = word_to_index.get('london', word_to_index['<UNK>'])
query_tensor = torch.tensor([query_index], dtype=torch.long, device=best_model.embeddings.weight.device)
best_model.embeddings(query_tensor)

tensor([[ 0.7643,  1.5840, -1.5441, -1.8176, -0.9911,  0.4807,  0.5916,  0.4323,
          1.4739, -0.6284]], device='cuda:0', grad_fn=<EmbeddingBackward0>)