# Word2Vec

In [37]:
import nlp_helpers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from nltk.corpus import stopwords
import os
import pickle

In [66]:
stop_words= stopwords.words('english')
vocabulary_size= 2000
embedding_size= 200
batch_size= 512
generations= 10
max_words= 100

In [39]:
# Load Movies data
texts, targets = nlp_helpers.load_movies_data()
# Normalize Text
texts= nlp_helpers.normalize_text(texts, stop_words)
# Crate Word Dictionary
word_dic= nlp_helpers.build_dictionary(texts, vocabulary_size)
# Reverse Word Dictionary
word_dic_rev= dict(zip(word_dic.values(), word_dic.keys()))
# Text to Numbers
text_data= nlp_helpers.text_to_numbers(texts, word_dic)

In [40]:
# Load Embeddings
# Skip-Grams
file= 'skip-grams-200.pickle'
with open(os.path.join('Resources', file), 'rb') as f:
    skip_grams_embeddings= pickle.load(f)

# CBOW
file= 'cbow-200.pickle'
with open(os.path.join('Resources', file), 'rb') as f:
    cbow_embeddings= pickle.load(f)

print(f"""
    Skip-Grams Embeddings:
                    {skip_grams_embeddings.shape}
    CBOW Embeddings:
                    {cbow_embeddings.shape}
""")


    Skip-Grams Embeddings:
                    (2000, 200)
    CBOW Embeddings:
                    (2000, 200)



In [41]:
#  FIll with 0s sentences with len(words) < max_words
text_data = [(row+[0]*max_words)[:max_words] for row in text_data]

### Build the Model

In [78]:
# To tensorflow constants
X_const= tf.constant(text_data, dtype= tf.int32)
y_const= tf.constant(targets, dtype= tf.float32)
embeddings= tf.constant(cbow_embeddings, dtype= tf.float32)
# To Tensors
X_tensor= tf.data.Dataset.from_tensor_slices(X_const)
y_tensor= tf.data.Dataset.from_tensor_slices(targets)
# Zip data
samples= tf.data.Dataset.zip((X_tensor, y_tensor))

In [93]:
# PARAMETERS
units= 200
# First Layer
W0= tf.Variable(tf.random.uniform(shape= [units, embedding_size], dtype= tf.float32, minval= 0.0, maxval= 1.0))
b0= tf.Variable(tf.zeros(shape= [units, 1], dtype= tf.float32))
# Second Layer
W1= tf.Variable(tf.random.uniform(shape= [1, units], dtype= tf.float32, minval= 0.0, maxval= 1.0))
b1= tf.Variable(tf.zeros(shape= [1, 1], dtype= tf.float32))
#
# Define Tensorflow Functions
#

# Function to encode embeddings
@tf.function
def encode_embeddings(X):
    return tf.reduce_mean(tf.nn.embedding_lookup(embeddings, X), axis= 1)

@tf.function
def model(X):
    A0= tf.nn.relu(tf.matmul(W0, X, transpose_b= True) + b0)
    A1= tf.nn.sigmoid(tf.matmul(W1, A0) + b1)
    return A1

# Losss Function
@tf.function
def loss_function(y_true, y_pred):
    return tf.reduce_mean(tf.losses.binary_crossentropy(y_true, y_pred))

# Optimizer
eta= 0.01
my_opt= tf.optimizers.legacy.Adam(learning_rate= eta)

# Get Score
@tf.function
def score(X, y, threshold= 0.5):
    prediction= tf.cast(model(X) > threshold, dtype= tf.float32)
    return tf.reduce_mean(tf.cast(tf.equal(y, prediction), dtype= tf.float32))

# Main lopp
for ite in range(generations):
    batches= samples.shuffle(buffer_size= len(text_data)).batch(batch_size)
    for x_rand, y_rand in batches:
        x_encoded= encode_embeddings(x_rand)
        y_rand= tf.expand_dims(y_rand, 0)
        with tf.GradientTape() as g:
            g.watch(W0)
            g.watch(b0)
            g.watch(W1)
            g.watch(b1)

            output= model(x_encoded)
            loss= loss_function(y_rand, output)
        gradients= g.gradient(loss, [W0,b0,W1,b1])
        my_opt.apply_gradients(zip(gradients, [W0,b0,W1,b1]))
    
    if ite % 1 == 0:
        X_enc= encode_embeddings(X_const)
        y_2d = tf.expand_dims(y_const, 0)
        acc= score(X_enc, y_2d)
        output= tf.cast(model(X_enc) > 0.5, dtype= tf.float32)
        loss= loss_function(y_2d, output)
        print(f"Iteration: {ite} -- Accuracy: {acc.numpy()} -- Loss: {loss.numpy()}")

Iteration: 0 -- Accuracy: 0.5 -- Loss: 7.712477684020996
Iteration: 1 -- Accuracy: 0.5 -- Loss: 7.624630928039551
Iteration: 2 -- Accuracy: 0.5 -- Loss: 7.624630928039551
Iteration: 3 -- Accuracy: 0.5 -- Loss: 7.624630928039551
Iteration: 4 -- Accuracy: 0.5 -- Loss: 7.624630928039551
Iteration: 5 -- Accuracy: 0.5 -- Loss: 7.712477684020996
Iteration: 6 -- Accuracy: 0.5 -- Loss: 7.712477684020996
Iteration: 7 -- Accuracy: 0.5 -- Loss: 7.624630928039551
Iteration: 8 -- Accuracy: 0.5 -- Loss: 7.624630928039551
Iteration: 9 -- Accuracy: 0.5 -- Loss: 7.712477684020996
