# Skip-Gram

- King - man + woman = Queen
- Indian Pale Ale - hops + malt = Stout

#### Continuos Bag of Words **CBOW**
- If we want to predict an objective word based on context (words surrounding it).

#### **Skip-Gram**
- If we want to predict the words that surround (context) an objective word.

![CBOW & Skip-Gram](Resources/CBOW_Skip-Gram.png)

### Download and Clean Data

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import collections

import requests
import io
import tarfile
import os

from nltk.corpus import stopwords
import re 

In [3]:
batch_size = 512
embedding_size = 200
vocabulary_size = 2000 # 10 000
generations = 15000 # 5 000
print_loss_every = 1000
num_sampled = int(batch_size/2)
window_size = 3
stop_words = set(stopwords.words('english'))
print_valid_every = 2000
valid_words = ['cliche', 'love', 'hate', 'silly', 'sad']

In [4]:
def load_movies_data():
    # Define paths
    save_folder_name = 'movies_data'
    pos_file = os.path.join(save_folder_name, 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polarity.neg')
    
    if os.path.exists(pos_file) and os.path.exists(neg_file):
        ## Get the data from path
        pos_data = []
        with open(pos_file, 'r') as pos_file_handler:
            for row in pos_file_handler:
                pos_data.append(row)
        neg_data = []
        with open(neg_file, 'r') as neg_file_handler:
            for row in neg_file_handler:
                neg_data.append(row)
    else:
        # Download data from url
        url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz"
        req = requests.get(url)
        # Performe Request
        if req.ok:
            stream_data = io.BytesIO(req.content)
            tmp = io.BytesIO() 
            while True:
                s = stream_data.read(16384)
                if not s:
                    break
                tmp.write(s)
            stream_data.close()
            tmp.seek(0)
        else:
            raise ConnectionError(f"Something went wrong. Code: {req.code}")
        # Extract tar File
        tar_file = tarfile.open(fileobj= tmp, mode= "r:gz")
        pos = tar_file.extractfile('rt-polaritydata/rt-polarity.pos')
        neg = tar_file.extractfile('rt-polaritydata/rt-polarity.neg')
        # Get positive reviews
        pos_data = []
        for line in pos:
            pos_data.append(line.decode("ISO-8859-1").encode('ascii', errors= 'ignore').decode())
        # Get negative reviews
        neg_data = []
        for line in neg:
            neg_data.append(line.decode('ISO-8859-1').encode('ascii', errors= 'ignore').decode())
        tar_file.close()
        # Save data
        os.makedirs(save_folder_name, exist_ok= True)
        with open(pos_file, 'w') as pos_file_handler:
            pos_file_handler.write(''.join(pos_data))
        with open(neg_file, 'w') as neg_file_handler:
            neg_file_handler.write(''.join(neg_data))
    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)
    return (texts, target)

In [5]:
texts, target = load_movies_data()

In [6]:
texts[0]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n'

In [7]:
target[0]

1

### Data Cleaning

In [8]:
def normalize_text(texts, stop):
    texts = [x.lower() for x in texts] # To lower case
    texts = [re.findall(pattern= "[a-z]+", string= x) for x in texts] # Remove Numbers and Punctuation marks
    texts = [' '.join([x for x in row if x not in stop]) for row in texts ]
    return texts

In [9]:
texts = normalize_text(texts, stop_words)

In [10]:
texts[200]

'without heavy handedness dong provides perspective intelligent grasp human foibles contradictions'

In [11]:
len(texts)

10662

In [12]:
#target = [target[i] for i, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

In [13]:
len(texts)

10425

In [14]:
def build_dictionary(sentences, vocabulary_size):
    words = []
    for sentence in sentences:
        words.extend(sentence.split())
    count= [['RARE', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    word_dict = {}
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    return word_dict

In [15]:
word_dict = build_dictionary(texts, vocabulary_size)

In [16]:
len(word_dict)

2000

In [17]:
def text_to_numbers(sentences, word_dict):
    data = []
    for sentence in sentences:
        sentence_data = []
        for word in sentence.split():
            if word in word_dict:
                word_idx = word_dict[word]
            else:
                word_idx = 0
            sentence_data.append(word_idx)
        data.append(sentence_data)
    return data

In [18]:
word_dict_rev = dict(zip(word_dict.values(), word_dict.keys())) # {idx: word}

In [19]:
text_data = text_to_numbers(texts, word_dict)
text_data[0]

[439, 0, 0, 754, 30, 0, 135, 17, 0, 7, 0, 1343, 0, 0, 0, 1569, 0, 799, 0]

In [20]:
valid_examples = [word_dict[x] for x in valid_words]
valid_examples

[986, 27, 938, 209, 371]

In [21]:
def generate_batch_data(sentences, batch_size, window_size, method= 'skip_gram'):
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        rand_idx = np.random.choice(len(sentences))
        rand_sentences = sentences[rand_idx]
        window_seq = [rand_sentences[max(ix-window_size, 0):ix+window_size+1]
                  for ix, x in enumerate(rand_sentences)]
        label_idx = [ix if ix < window_size else window_size for ix, x in enumerate(window_seq)]
        if method == 'skip_gram':
            batch_and_labels = [(x[y], x[:y]+x[y+1:]) for x,y in zip(window_seq, label_idx)]
            tuple_data = [(x,y_) for x, y in batch_and_labels for y_ in y]
        else:
            raise ValueError(f'Invalid Method {method}.')
        batch, labels = [list(x) for x in zip(*tuple_data)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array(label_data))
    return (batch_data, label_data)

### Training the Skip-Grams
Layers:
- input_layer. $[n_{words},~~ batch_n]$
- hidden layer. $[n_{embeddings}, ~~ batch_n]$
- output layer. $[n_{words},~~ batch_n]$

Weights:
* input  $~=>~$  hidden (**embeddings**). $[n_{embeddings}, ~~ n_{words]}]$
* hidden $~=>~$  output. $[n_{words}, ~~ n_{embeddings}]$

In [23]:
# Weights / Bias
embeddings = tf.Variable(tf.random.uniform(shape= [embedding_size, vocabulary_size], minval= 0, maxval= 1, dtype= tf.float32))
b0 = tf.Variable(tf.zeros(shape= [embedding_size, 1], dtype= tf.float32))
W1 = tf.Variable(tf.random.uniform(shape= [vocabulary_size, embedding_size], minval= 0, maxval= 1, dtype= tf.float32))
b1 = tf.Variable(tf.zeros(shape= [vocabulary_size, 1], dtype= tf.float32))

# Identity Matrix
identity = tf.linalg.diag(tf.ones(shape= [vocabulary_size], dtype= tf.float32))
# Function to one_hot sampes
def one_hot(samples, transpose= False):
    encoded = tf.nn.embedding_lookup(identity, samples) 
    if transpose:
        return tf.transpose(encoded)
    return encoded

# Neural Network
@tf.function
def model(X):
    A0 = tf.nn.relu(tf.matmul(embeddings, X) + b0)
    A1 = tf.nn.softmax(tf.matmul(W1, A0) + b1, axis= 0)
    return A1

# Loss Function
@tf.function
def loss_function(y_true, y_pred):
    y_pred_t = tf.transpose(y_pred)
    return tf.reduce_sum(tf.losses.categorical_crossentropy(y_true, y_pred_t)) / len(y_pred)

# Compute the Similarity Between validation words and all the vocabulary words
@tf.function
def compute_similarity(embeddings, val_idx, topk= 10):
    # Normalize Embeddings
    embeddings_t = tf.transpose(embeddings)
    norm = tf.math.sqrt(tf.reduce_sum(tf.square(embeddings_t), 1, keepdims= True))
    norm_embeddings = embeddings_t / norm # n_samples, 200
    # Get Validation words
    validation_words= tf.nn.embedding_lookup(norm_embeddings, val_idx) # n_samples, 200
    # Dot Product
    cos_similarity = tf.matmul(validation_words, norm_embeddings, transpose_b= True)
    # Get top K words
    values_K, idx_K = tf.nn.top_k(-cos_similarity, k= topk, )
    return idx_K

# Optimizer
eta = 0.1
my_opt = tf.optimizers.legacy.Adam(learning_rate= eta)

# Main Loop
loss_vect = []
for ite in range(1, generations+1):
    X_idx, y_idx = generate_batch_data(text_data, batch_size, window_size, method= 'skip_gram')
    X_input= tf.constant(one_hot(X_idx, transpose= True), dtype= tf.float32)
    y_input= tf.constant(one_hot(y_idx), dtype= tf.float32)

    with tf.GradientTape() as g:
        g.watch(embeddings)
        g.watch(b0)
        g.watch(W1)
        g.watch(b1)

        output= model(X_input)
        loss= loss_function(y_input, output)
    gradients= g.gradient(loss, [embeddings, b0, W1, b1])
    my_opt.apply_gradients(zip(gradients, [embeddings, b0, W1, b1]))

    if ite % print_loss_every == 0:
        print(f"Iterations: {ite} -- Loss: {loss.numpy()}")
    if ite % print_valid_every == 0:
        similarity_idx = compute_similarity(embeddings, valid_examples, topk= 10)
        for idx, val in enumerate(valid_words):
            print(f'Word: {val} Similarity: ', end= '')
            for k in similarity_idx[idx].numpy():
                print(f'{word_dict_rev[k]}', end= ' - ')
            print()
    loss_vect.append(loss.numpy())

Iterations: 1000 -- Loss: 1.3687118291854858
Iterations: 2000 -- Loss: 1.2327297925949097
Word: cliche Similarity: RARE - movie - film - sense - like - funny - sounds - least - kind - really - 
Word: love Similarity: RARE - movie - film - really - funny - like - sounds - could - terrific - george - 
Word: hate Similarity: RARE - film - movie - sense - like - kind - sounds - funny - could - read - 
Word: silly Similarity: RARE - movie - film - end - sounds - read - really - sense - kind - funny - 
Word: sad Similarity: RARE - read - movie - kind - funny - sense - science - like - george - lacks - 
Iterations: 3000 -- Loss: 1.1505544185638428
Iterations: 4000 -- Loss: 1.5082716941833496
Word: cliche Similarity: RARE - movie - film - sense - like - funny - sounds - least - kind - really - 
Word: love Similarity: RARE - movie - film - really - funny - like - sounds - could - terrific - george - 
Word: hate Similarity: RARE - film - movie - sense - like - kind - sounds - funny - could - rea

In [24]:
word_vects = tf.transpose(embeddings).numpy() # (vocabulary_size, 600)

In [122]:
# king - man + women
queen_vect = word_vects[word_dict['sad']] + word_vects[word_dict['happy']] + word_vects[word_dict['love']] # (600,)
queen = queen_vect.reshape(1,1,queen_vect.shape[0]) # (1,1,embeddings)

# L2 Norm
l2_norm = np.power(queen-word_vects, 2)
sum_l2_norm = np.sum(l2_norm, axis= 2) # (1,10 000)

# Get Nearest Vector
idx = np.argmin(sum_l2_norm)
word_dict_rev[idx]

'sad'

#### Save the Embeddings

In [25]:
import pickle

In [28]:
# Save the Embeddings
with open(os.path.join('Resources', 'skip-grams-200.pickle'), 'wb') as f:
    pickle.dump(word_vects, f, pickle.HIGHEST_PROTOCOL)