### Word2Vec from scratch

In [125]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential 
from tensorflow.keras.layers import Input, Dense, Reshape,Embedding,dot
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
import collections
import numpy as np

In [126]:
# For this example, let us consider the sentence from the lecture 
raw_data = "Rohan was hit by a red bus, so Kunal rushed to the scene with a red scarf, \
but Chris and Marios did not rush to the scene and instead watched a movie"

#Conver the raw_data into a list of words separated by spaces
corpus = raw_data.lower().split()

In [127]:
count=collections.Counter(corpus).most_common(len(corpus))
dictionary={}
for i,word in enumerate(count):
        dictionary[word[0]]=i
## checking the dictionary
dictionary

{'a': 0,
 'red': 1,
 'to': 2,
 'the': 3,
 'scene': 4,
 'and': 5,
 'rohan': 6,
 'was': 7,
 'hit': 8,
 'by': 9,
 'bus,': 10,
 'so': 11,
 'kunal': 12,
 'rushed': 13,
 'with': 14,
 'scarf,': 15,
 'but': 16,
 'chris': 17,
 'marios': 18,
 'did': 19,
 'not': 20,
 'rush': 21,
 'instead': 22,
 'watched': 23,
 'movie': 24}

## Part 1: Data pre-processing

### `build_dataset(text_corpus, vocabulary_size)`

We want to write a function, that takes the list of words, and your vocabulary size and returns
1. Tokenized sequences
2. Count of each word
3. Dictionary to convert word to token
4. Reverse dictionary to convert token to word


In [128]:
def build_dataset(text_corpus, vocabulary_size):
    """Process raw inputs into a dataset."""
    
    # The 'UNK' tag is a special character that is assigned token value zero. It represents 'out of vocabulary words'
    # We initialize it to a random value to begin with
    
    count = [['UNK', -1]]
    
    # For all words in sequence, get the count of the most common words
    # We can use collections.Counter and it's most_common function to extend your list
    count.extend(collections.Counter(text_corpus).most_common(vocabulary_size-1))
    
    
    # Define a dictionary called 'dictionary'
    # and for every word in 'count', add it to dictionary with a tokenization
    # Easiest way is to give it the index of the 'count' variable
    
    dictionary={}
    for i,word in enumerate(count):
        dictionary[word[0]]=i
    # Make a new list of tokens associated with words    
    data = []
    # Initialize a counter for 'UNK' values 
    unk_count = 0
    
    # For all words in corpus, find the associated token, and append to 
    # the 'data' variable defined above
    for word in text_corpus:
        if word in dictionary:
            token=dictionary[word]
        # If word is not in dictionary, it is 'out of vocabulary'
        # So we need to assign it the zero token and
        # update the count of the 'UNK' token
        else:
            token = 0  
            unk_count += 1
            
        # Append token to data 
        data.append(token)
        
    # We can now set the count of 'UNK' tokens in the corpus
    count[0][1]=unk_count
    
    # A reverse dictionary takes you from tokens to words
    reversed_dictionary = dict(zip(dictionary.values(),dictionary.keys()))
    
    return data, count, dictionary, reversed_dictionary

In [129]:
# Use the helper function to convert the corpus to sequential data
vocab_size = len(set(corpus))+1
data, count, dictionary, reverse_dictionary = build_dataset(corpus,vocab_size)

In [130]:
# print the original sentence
print('The original sentence was:   ',raw_data.lower())

The original sentence was:    rohan was hit by a red bus, so kunal rushed to the scene with a red scarf, but chris and marios did not rush to the scene and instead watched a movie


In [131]:
# Print the list of tokenized words associated with this sentence
print('The tokenized form of the sentence is :', data)

The tokenized form of the sentence is : [7, 8, 9, 10, 1, 2, 11, 12, 13, 14, 3, 4, 5, 15, 1, 2, 16, 17, 18, 6, 19, 20, 21, 22, 3, 4, 5, 6, 23, 24, 1, 25]


In [132]:
# Print the sentence transformed from the tokenized list above
reformed_sentence=" ".join([reverse_dictionary[i] for i in data ])
print('Reformed_sentence:  ', reformed_sentence)

Reformed_sentence:   rohan was hit by a red bus, so kunal rushed to the scene with a red scarf, but chris and marios did not rush to the scene and instead watched a movie


In [133]:
## checking if our function is working al right
raw_data.lower()==reformed_sentence

True

### Skipgrams

![alt text](https://storage.googleapis.com/public_colab_images/nlp/skip-gram.png)

In [9]:
# We use the `skipgrams` function from tensorflow.keras to build the training dataset
window_size = 2
couples, labels = skipgrams(data,window_size)

# Separate the target,context pairs as word_target, word_context 

word_center, word_context = zip(*couples)
print(couples[:5], labels[:5])

[[5, 4], [22, 19], [13, 2], [12, 1], [22, 4]] [1, 1, 1, 0, 1]


## Building the Word2Vec using Dense layers

In [108]:
# We build the sub-model for target words
# As a dense layer on a one-hot encoded input without bias term
# remember that the dense layer will have activation 'linear'
# and number of neurons as the embedding dimension


embedding_dim = 200         ### It will be number of neurons in the hidden layer 
word_model = Sequential()
word_model.add((Input(shape=(1,vocab_size))))
word_model.add(Dense(embedding_dim,activation='linear',use_bias=False))
word_model.add(Reshape((embedding_dim,)))


# We build the same for the context words
context_model = Sequential()
context_model.add((Input(shape=(1,vocab_size))))
context_model.add(Dense(embedding_dim,activation='linear',use_bias=False))
context_model.add(Reshape((embedding_dim,)))


# We use the `tf.keras.layers.dot` which returns the 
# dot product of two output vectors
# Read more here --> https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dot
dot_product = dot([word_model.output, context_model.output], axes=1,
                  normalize=False,name='dotproduct')

# We also add a sigmoid to ensure the outputs are between 0 & 1
# Simply call a Dense layer and on `dot_product_above`
sigmoid_dot_product = Dense(1,activation="sigmoid")(dot_product)

# Similar to the model above we create our model with inputs
# from `word_model` and `context_model` and the output from 
# the `dot_product`
w2v_model = Model(inputs=[word_model.input, context_model.input], 
              outputs=sigmoid_dot_product,name='Custom')

# Again we run the model summary to ensure we have built the
# word2vec architecture correctly
w2v_model.summary()

Model: "Custom"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_45 (InputLayer)          [(None, 1, 26)]      0           []                               
                                                                                                  
 input_46 (InputLayer)          [(None, 1, 26)]      0           []                               
                                                                                                  
 dense_57 (Dense)               (None, 1, 200)       5200        ['input_45[0][0]']               
                                                                                                  
 dense_58 (Dense)               (None, 1, 200)       5200        ['input_46[0][0]']               
                                                                                             

In [114]:
word_model = Sequential()
#word_model.add((Input(shape=(1,vocab_size))))
word_model.add(Dense(embedding_dim,activation='linear',use_bias=False,input_dim=26))
word_model.add(Reshape((embedding_dim,)))
word_model.summary()

Model: "sequential_46"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_60 (Dense)            (None, 200)               5200      
                                                                 
 reshape_40 (Reshape)        (None, 200)               0         
                                                                 
Total params: 5,200
Trainable params: 5,200
Non-trainable params: 0
_________________________________________________________________


In [109]:
# Again we compile the model using binary crossentropy and rmsprop optimizer
w2v_model.compile(loss="binary_crossentropy", optimizer="rmsprop")

In [113]:
# Lets choose a random training sample
idx = np.random.randint(0, len(labels))

# Using the index we call the input values 
# NOTE: This time, we will have to one-hot encode the input
# in order to comply with our new model
# Also, we will have to add one extra dimension to the input
# using np.expand_dims in order to avoid warnings from tf.keras API
onehot_center =  np.expand_dims(to_categorical(word_center[idx],num_classes=vocab_size).reshape(1,-1),axis=0)
onehot_context =  np.expand_dims(to_categorical(word_context[idx],num_classes=vocab_size).reshape(1,-1),axis=0)
training_label = np.array(labels[idx],dtype='float32').reshape(1,)

# We use the tf.keras `model.train_on_batch` to train on a single batch
# for demonstration that our model works
# more documentation here -> https://www.tensorflow.org/api_docs/python/tf/keras/Model?version=nightly#train_on_batch
loss = w2v_model.train_on_batch([onehot_center, onehot_context], training_label)
print(f'Loss after one batch is {loss:.2f}')

Loss after one batch is 0.70


## Retrain on bigger corpus

Retrain the above, but use a bigger corpus (randomly generated using [this GPT-2 auto-complete model](https://transformer.huggingface.co/doc/arxiv-nlp) and use a larger embedding size = 1000

In [117]:
## BONUS SEGMENT
bigger_corpus = """
Rohan was hit by a red bus, so Kunal rushed to the scene with a red scarf, 
but Chris and Marios did not rush to the scene because of the circumstances. 
Instead they decided to chase after the car. Chris and Marios were also hurt 
when a red vehicle came running down a ramp in front of the building. 
Chris and Marios were then able to find a police officer and ran into the back of the building.
"""

### *Efficient* Word2Vec Skipgram with Negative sampling Model - Using the Embedding Layer

Now rebuild the same Word2Vec architecture, but this time use the `tf.keras.layers.Embedding()` layer.

In [120]:
# The following code builds the SGNS word2vec architecture
# Use the embedding layer instead of one-hot encoding

embedding_dim = 200

word_model = Sequential()
word_model.add(Embedding(vocab_size,embedding_dim,input_length=1))
word_model.add(Reshape((embedding_dim, ))) 
# We build the same for the context words
context_model = Sequential()
context_model.add(Embedding(vocab_size,embedding_dim,input_length=1))
context_model.add(Reshape((embedding_dim, ))) 

# We use the `tf.keras.layers.dot` which returns the 
# dot product of two output vectors
# Read more here --> https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dot
dot_product = dot([word_model.output, context_model.output], axes=1,
                  normalize=False,name='dotproduct') 

# We also add a sigmoid to ensure the outputs are between 0 & 1
# Simply call a Dense layer and on `dot_product_above`
sigmoid_dot_product = Dense(1,activation="sigmoid")(dot_product)

# Similar to the model above we create our model with inputs
# from `word_model` and `context_model` and the output from 
# the `dot_product`
w2v_model =  Model(inputs=[word_model.input, context_model.input], 
              outputs=sigmoid_dot_product,name='SGNS') 


# Again we run the model summary to ensure we have built the
# word2vec architecture correctly
w2v_model.summary()

Model: "SGNS"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 embedding_2_input (InputLayer)  [(None, 1)]         0           []                               
                                                                                                  
 embedding_3_input (InputLayer)  [(None, 1)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 200)       5200        ['embedding_2_input[0][0]']      
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 200)       5200        ['embedding_3_input[0][0]']      
                                                                                               

In [122]:
# We compile the model using binary crossentropy and rmsprop optimizer
w2v_model.compile(loss="binary_crossentropy", optimizer="rmsprop") 

In [123]:
# Lets choose a random training sample
idx = np.random.randint(0, len(labels))

# Using the index we call the input values 
# NOTE: we process the input to comply with the model
# i.e changing dtype and shape
center_input = np.array(word_center[idx],dtype='float32').reshape(1,)
context_input = np.array(word_context[idx],dtype='float32').reshape(1,)
training_label = np.array(labels[idx],dtype='float32').reshape(1,)

# We use the tf.keras `model.train_on_batch` to train on a single batch
# for demonstration that our model works
loss = w2v_model.train_on_batch([center_input, context_input], training_label)
print(f'Loss after one batch is {loss:.2f}')

Loss after one batch is 0.69


# Resources:
    1.) https://jalammar.github.io/illustrated-word2vec/
    2.) Univ.AI NLP course and materials