# Word2Vec (Skipgram)

In [1]:
import numpy as np
import torch
import torch.nn
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# show the versions of the libraries this notebook was run with
np.__version__, torch.__version__, matplotlib.__version__

('1.25.2', '2.1.0', '3.7.2')

## 1. Load data

In [6]:
# toy corpus: three sentences over {apple, banana, fruit} and three over {dog, cat, animal}
corpus = ["apple banana fruit", "banana apple fruit", "banana fruit apple",
          "dog cat animal", "cat animal dog", "cat dog animal"]

# note: with window size (ws) = 1, the embedding of banana should come out the same as those of apple and fruit

### 1.1 tokenization

In [7]:
# 1. tokenization: turn each sentence string into a list of words by splitting on single spaces
corpus = list(map(lambda sentence: sentence.split(" "), corpus))
corpus

[['apple', 'banana', 'fruit'],
 ['banana', 'apple', 'fruit'],
 ['banana', 'fruit', 'apple'],
 ['dog', 'cat', 'animal'],
 ['cat', 'animal', 'dog'],
 ['cat', 'dog', 'animal']]

### 1.2 numericalization

In [11]:
# 2. numericalization
# find the unique words
def flatten(l):
    """Return one flat list holding the items of every sublist of `l`, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# collect the unique words of the corpus; a set has no defined iteration order,
# so the position of each word in `vocabs` (and any index derived from it) may differ between runs
vocabs = list(set(flatten(corpus))) # all the words we have in the system ; <UNK> (for unknown words) is added in a later cell

In [12]:
vocabs

['dog', 'apple', 'animal', 'cat', 'fruit', 'banana']

In [13]:
# build the word -> integer lookup: every word is keyed to its position in vocabs
word2index = dict(zip(vocabs, range(len(vocabs))))

In [16]:
# sanity check: print the whole mapping, then two individual lookups
print(word2index)
print(word2index['dog'])
print(word2index['apple'])

{'dog': 0, 'apple': 1, 'animal': 2, 'cat': 3, 'fruit': 4, 'banana': 5}
0
1


In [None]:
# add <UNK> as the last vocabulary entry; its index is derived from the current
# vocabulary size instead of being hard-coded, and the guard makes the cell safe
# to re-run (no duplicate "<UNK>" appended to vocabs)
if "<UNK>" not in word2index:
    word2index["<UNK>"] = len(vocabs)
    vocabs.append("<UNK>")

In [19]:
word2index

{'dog': 0,
 'apple': 1,
 'animal': 2,
 'cat': 3,
 'fruit': 4,
 'banana': 5,
 '<UNK>': 6}

In [21]:
# reverse lookup (integer -> word), built by swapping each (word, index) pair of word2index
index2word = dict(zip(word2index.values(), word2index.keys()))

In [22]:
index2word

{0: 'dog',
 1: 'apple',
 2: 'animal',
 3: 'cat',
 4: 'fruit',
 5: 'banana',
 6: '<UNK>'}

## 2. Prepare train data

In [32]:
# create pairs of center word and outside word

def random_batch(batch_size, corpus, *, window_size=1, vocab=None):
    """Draw a random batch of skip-gram (center, outside) index pairs.

    Every word that has `window_size` neighbours on both sides is used as a
    center word and paired with each of those neighbours (left neighbours
    first, then right neighbours). `batch_size` of the resulting pairs are
    then drawn at random without replacement.

    Args:
        batch_size: number of (center, outside) pairs to return.
        corpus: tokenized documents, i.e. an iterable of lists of words.
        window_size: number of neighbours taken on each side of the center
            word. The default of 1 pairs each center word with its direct
            left and right neighbour.
        vocab: word -> integer mapping. When omitted, the module-level
            `word2index` is used. Words missing from the mapping fall back
            to the index of "<UNK>" when the mapping has that entry.

    Returns:
        A tuple `(inputs, labels)` of two integer arrays of shape
        `(batch_size, 1)`: the center-word indices and the matching
        outside-word indices.

    Raises:
        ValueError: if `window_size` is smaller than 1, or if `batch_size`
            is larger than the number of pairs the corpus yields.
        KeyError: if a word is not in the mapping and the mapping has no
            "<UNK>" entry.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    mapping = word2index if vocab is None else vocab
    unk_index = mapping.get("<UNK>")

    def encode(word):
        # unknown words map to <UNK> when available, otherwise fail loudly
        index = mapping.get(word, unk_index)
        if index is None:
            raise KeyError(word)
        return index

    skipgrams = []
    for doc in corpus:
        # only positions with a full window on both sides act as center word
        for i in range(window_size, len(doc) - window_size):
            center = encode(doc[i])
            context = doc[i - window_size:i] + doc[i + 1:i + window_size + 1]
            skipgrams.extend([center, encode(word)] for word in context)

    if batch_size > len(skipgrams):
        raise ValueError(
            f"batch_size={batch_size} exceeds the {len(skipgrams)} available "
            "(center, outside) pairs"
        )

    # reshape keeps the (n, 2) layout even when no pair was produced
    pairs = np.array(skipgrams, dtype=int).reshape(-1, 2)
    random_index = np.random.choice(len(pairs), batch_size, replace=False)
    batch = pairs[random_index]

    # column 0 = center words, column 1 = outside words; keep the (batch, 1) shape
    return batch[:, :1].copy(), batch[:, 1:].copy()
            

In [33]:
random_batch (2, corpus)

(array([[0],
        [2]]),
 array([[3],
        [0]]))

## 3. Model

## 4. Training

## 5. Plot the embeddings

## 6. Cosine Similarity