# CBOW (Continuous Bag of Words)

### Main idea: given the surrounding context words, predict the centre word

In [1]:
import re
import numpy as np
import string
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

# Copy wordcloud's STOPWORDS collection into a new set; used by the word-cloud demo
stopwords = set(STOPWORDS)

In [5]:
# Throwaway text used only for the word-cloud demo. It has its own name so it is
# not confused with `data`, which later holds the (context, target) training pairs.
sample_text = """wut lol idk whats this but im still going to do this
           because im clueless, read that again , you didnt understand ha,
           same here fam lololol."""

wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    random_state=42,  # seed passed to WordCloud so the rendering is repeatable
).generate(sample_text)

plt.imshow(wordcloud)
plt.axis('off');  # trailing semicolon keeps the return value out of the cell output

# Dataset

In [6]:
# Raw training corpus: a short English passage kept as one multi-line string.
# It is cleaned and tokenised in the following cells.
sentences = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""

In [7]:
# Replace every run of non-alphanumeric characters with a single space
sentences = re.sub('[^A-Za-z0-9]+', ' ', sentences)

# Remove one-character tokens. Filtering the whitespace-separated tokens, rather than
# using a regex that consumes the separating space, also drops back-to-back
# single-letter words (e.g. "a b"), where the regex would leave the second one behind.
# The result is single-space separated with no leading or trailing whitespace.
sentences = ' '.join(token for token in sentences.split() if len(token) > 1)

# Lower-case everything
sentences = sentences.lower()

In [9]:
# Tokenise on whitespace; the vocabulary is the set of distinct tokens
words = sentences.split()
vocab = set(words) 

vocab_size = len(vocab) # 43 distinct words for the corpus above
embed_dim = 10  # width of each word's embedding vector
context_size = 2  # context words on each side of the target (theta has 2 * context_size * embed_dim rows)

In [10]:
# Assign indices in sorted order. Iteration order of a set of strings can differ
# between interpreter runs (string hash randomisation), which would silently change
# which row/column belongs to which word after a kernel restart.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}
# Built from word_to_ix so the two mappings are exact inverses of each other
ix_to_word = {i: word for word, i in word_to_ix.items()}

In [13]:
# data: list of (context_words, target_word) pairs.
# The window is derived from `context_size` (the same constant that sizes theta)
# instead of a hard-coded 2, so the two cannot drift apart.
data = []
for i in range(context_size, len(words) - context_size):
    # `context_size` words to the left, then `context_size` words to the right of position i
    context = words[i - context_size:i] + words[i + 1:i + context_size + 1]
    target = words[i]
    data.append((context, target))

In [18]:
# Peek at the first two (context, target) training pairs
data[:2]

In [20]:
# Random embedding table: one row per vocabulary word, `embed_dim` columns,
# values drawn uniformly from the half-open interval [0.0, 1.0).
# A dedicated seeded generator makes the table identical on every run.
embeddings = np.random.RandomState(42).random_sample((vocab_size, embed_dim))

In [22]:
# Should be (vocab_size, embed_dim)
embeddings.shape

In [25]:
def linear(m, theta):
    """Bias-free linear layer: return the dot product of `m` with the weights `theta`."""
    return m.dot(theta)

In [26]:
def log_softmax(x):
    """Numerically stable log-softmax along the last axis.

    Computes ``x - max - log(sum(exp(x - max)))`` rather than ``log(softmax(x))``.
    The latter returns ``-inf`` whenever a softmax entry underflows to zero
    (logits far below the row maximum).

    Normalisation is per row (last axis). For 1-D input or a single-row 2-D
    input this matches normalising over the whole array.

    Parameters
    ----------
    x : array_like
        Logits. A 0-d input is treated as a single logit.

    Returns
    -------
    numpy.ndarray
        Log-probabilities with the same shape as `x`.
    """
    x = np.asarray(x)
    # A 0-d input has no last axis; reduce over everything in that case
    axis = -1 if x.ndim else None
    # Subtracting the max keeps exp() from overflowing
    shifted = x - np.max(x, axis=axis, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))

In [27]:
def NLLLoss(logs, targets):
    """Mean negative log-likelihood of the target classes.

    `logs` is indexed as ``logs[row, class]``. For each row ``i`` the entry at
    column ``targets[i]`` is picked, and the negated average of the picked
    values is returned.
    """
    row_ids = np.arange(len(targets))
    picked = logs[row_ids, targets]
    return -picked.sum() / len(picked)

In [28]:
def log_softmax_crossentropy_with_logits(logits, target):
    """Gradient of the mean softmax cross-entropy loss with respect to `logits`.

    Despite the name, this returns the gradient, not the loss value:
    ``(softmax(logits) - one_hot(target)) / n_rows``.

    Parameters
    ----------
    logits : numpy.ndarray
        2-D array indexed as ``[row, class]``.
    target : array_like of int
        Target class index for each row.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `logits`.
    """
    one_hot = np.zeros_like(logits)
    one_hot[np.arange(len(logits)), target] = 1

    # Shift each row by its maximum before exponentiating. Without the shift,
    # exp() overflows for large logits and the division yields NaN.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused
    softmax = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

    return (softmax - one_hot) / logits.shape[0]

In [74]:
def forward(context_idxs, theta):
    """Forward pass for one context window.

    Looks up the rows of the module-level `embeddings` table for `context_idxs`,
    flattens them into a single row vector, applies `linear` with `theta`, and
    then applies `log_softmax`.

    Returns a 3-tuple ``(hidden, logits, log_probs)``; `backward` reuses the
    first two.
    """
    # Selected embedding rows laid out side by side as one (1, k) row
    hidden = embeddings[context_idxs].reshape(1, -1)
    logits = linear(hidden, theta)
    return hidden, logits, log_softmax(logits)

In [30]:
def backward(preds, theta, target_idxs):
    """Gradient of the loss with respect to the linear-layer weights.

    `preds` is the 3-tuple returned by `forward`; only its first two entries
    (the layer input and the logits) are used. `theta` is accepted but not read.
    """
    hidden, logits, _ = preds

    # Gradient w.r.t. the logits, then pulled back through the linear layer
    grad_logits = log_softmax_crossentropy_with_logits(logits, target_idxs)
    return hidden.T.dot(grad_logits)

In [31]:
def optimize(theta, grad, lr=0.03):
    """Take one gradient-descent step.

    `theta` is updated in place (``theta -= lr * grad``) and the same object is
    returned.
    """
    step = grad * lr
    theta -= step
    return theta

In [32]:
# Linear-layer weights: 2 * context_size * embed_dim rows by vocab_size columns,
# drawn uniformly from [-1, 1).
# A dedicated seeded generator makes the starting weights repeatable across runs.
theta = np.random.RandomState(43).uniform(-1, 1, (2 * context_size * embed_dim, vocab_size))
# Preview the first two rows only, rather than dumping the whole matrix
theta[:2]

In [70]:
# Should be (2 * context_size * embed_dim, vocab_size)
theta.shape

In [75]:
# Training log: epoch index -> list of per-example losses recorded during that epoch
epoch_losses = {}

# 80 passes over the training pairs, with one parameter update per (context, target) pair
for epoch in range(80):
    losses = []
    
    for context, target in data:
        # Forward pass on the indices of the context words
        context_idxs = np.array([word_to_ix[w] for w in context])
        preds = forward(context_idxs, theta)
        
        # The loss is computed from the last element of preds (the log-softmax output)
        target_idxs = np.array([word_to_ix[target]])
        loss = NLLLoss(preds[-1], target_idxs)
        
        losses.append(loss)
        
        # Gradient for theta followed by the update step; only theta is assigned in this loop
        grad = backward(preds, theta, target_idxs)
        theta = optimize(theta, grad, lr=0.03)
        
    epoch_losses[epoch] = losses

In [43]:
# One x position per recorded epoch, taken from the training log rather than a hard-coded 80
ix = np.arange(len(epoch_losses))

fig = plt.figure()
fig.suptitle('Epoch/Losses', fontsize=20)
# Plot the mean loss over all training pairs in each epoch.
# The loss of only the first pair is not representative of the epoch.
plt.plot(ix, [np.mean(epoch_losses[i]) for i in ix])
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Losses', fontsize=12);  # trailing semicolon hides the Text repr

In [58]:
def predict(words):
    """Return the vocabulary word with the highest log-probability for a context.

    `words` is a sequence of context words. Each one is looked up in
    `word_to_ix`, so a word that is not in the vocabulary raises KeyError.
    Uses the module-level `theta`.
    """
    context_ids = np.array([word_to_ix[token] for token in words])
    # forward returns (m, n, o); only the final element, the log-probabilities, is needed
    *_, log_probs = forward(context_ids, theta)
    return ix_to_word[np.argmax(log_probs)]

In [59]:
# The two words on each side of 'about' in the corpus ("We are about to study ...")
predict(['we', 'are', 'to', 'study'])

In [66]:
def accuracy():
    """Fraction of pairs in the module-level `data` whose target `predict` gets right.

    Computed as one minus the share of mispredicted pairs.
    """
    misses = sum(1 for context, target in data if predict(context) != target)
    return 1 - (misses / len(data))

# Evaluated on `data`, the same pairs used for training, so this is training accuracy
accuracy()