# 1. Data Preparation

In [1]:
text = "natural language processing and machine learning is fun and exciting"

# Note the .lower() as upper and lowercase does not matter in our implementation
# [['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']]
corpus = [[word.lower() for word in text.split()]]
print(corpus)

[['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']]


# 2. Hyperparameters


In [2]:
settings = {
    'window_size': 2, # context window +- center word
    'n': 10, # dimensions of word embeddings, also refer to size of hidden layer
    'epochs': 50, # number of training epochs
    'learning_rate': 0.01 # learning rate
}

# 3. Generate Training Data


In [3]:
from collections import defaultdict
import numpy as np

class word2vec():
  def __init__(self):
    self.n = settings['n']
    self.lr = settings['learning_rate']
    self.epochs = settings['epochs']
    self.window = settings['window_size']

  def generate_training_data(self, settings, corpus):
    # Find unique word counts using dictonary
    word_counts = defaultdict(int)
    for row in corpus:
      for word in row:
        word_counts[word] += 1
    ## How many unique words in vocab? 9
    self.v_count = len(word_counts.keys())
    # Generate Lookup Dictionaries (vocab)
    self.words_list = list(word_counts.keys())
    # Generate word:index
    self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
    # Generate index:word
    self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

    training_data = []
    # Cycle through each sentence in corpus
    for sentence in corpus:
      sent_len = len(sentence)
      # Cycle through each word in sentence
      for i, word in enumerate(sentence):
        # Convert target word to one-hot
        w_target = self.word2onehot(sentence[i])
        # Cycle through context window
        w_context = []
        # Note: window_size 2 will have range of 5 values
        for j in range(i - self.window, i + self.window+1):
          # Criteria for context word 
          # 1. Target word cannot be context word (j != i)
          # 2. Index must be greater or equal than 0 (j >= 0) - if not list index out of range
          # 3. Index must be less or equal than length of sentence (j <= sent_len-1) - if not list index out of range 
          if j != i and j <= sent_len-1 and j >= 0:
            # Append the one-hot representation of word to w_context
            w_context.append(self.word2onehot(sentence[j]))
            # print(sentence[i], sentence[j]) 
            # training_data contains a one-hot representation of the target word and context words
        training_data.append([w_target, w_context])
    return np.array(training_data)

  def word2onehot(self, word):
    # word_vec - initialise a blank vector
    word_vec = [0 for i in range(0, self.v_count)] # Alternative - np.zeros(self.v_count)
    # Get ID of word from word_index
    word_index = self.word_index[word]
    # Change value from 0 to 1 according to ID of the word
    word_vec[word_index] = 1
    return word_vec

  def train(self, training_data):
    # Initialising weight matrices
    # Both s1 and s2 should be randomly initialised but for this demo, we pre-determine the arrays (getW1 and getW2)
    # getW1 - shape (9x10) and getW2 - shape (10x9)
#         self.w1 = np.array(getW1)
#         self.w2 = np.array(getW2)
    self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
    self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
    
      ##Removed##
  
    # Cycle through each epoch
    for i in range(self.epochs):
      # Intialise loss to 0
      self.loss = 0

      # Cycle through each training sample
      # w_t = vector for target word, w_c = vectors for context words
      for w_t, w_c in training_data:
        # Forward pass - Pass in vector for target word (w_t) to get:
        # 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
        y_pred, h, u = self.forward_pass(w_t)


              # Calculate error
      # 1. For a target word, calculate difference between y_pred and each of the context words
      # 2. Sum up the differences using np.sum to give us the error for this particular target word
        EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

      # Backpropagation
      # We use SGD to backpropagate errors - calculate loss on the output layer 
        self.backprop(EI, h, w_t)

      # Calculate loss
      # There are 2 parts to the loss function
      # Part 1: -ve sum of all the output +
      # Part 2: length of context words * log of sum for all elements (exponential-ed) in the output layer before softmax (u)
      # Note: word.index(1) returns the index in the context word vector with value 1
      # Note: u[word.index(1)] returns the value of the output layer before softmax
        self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
        #print('Epoch:', i, "Loss:", self.loss)
        
        ##Removed##
    
    return np.array(training_data)

        
  def forward_pass(self, x):
    # x is one-hot vector for target word, shape - 9x1
    # Run through first matrix (w1) to get hidden layer - 10x9 dot 9x1 gives us 10x1
    h = np.dot(self.w1.T, x)
    # Dot product hidden layer with second matrix (w2) - 9x10 dot 10x1 gives us 9x1
    u = np.dot(self.w2.T, h)
    # Run 1x9 through softmax to force each element to range of [0, 1] - 1x8
    y_c = self.softmax(u)
    return y_c, h, u
  
  def softmax(self, x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

  def backprop(self, e, h, x):
    # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
    # Column vector EI represents row-wise sum of prediction errors across each context word for the current center word
    # Going backwards, we need to take derivative of E with respect of w2
    # h - shape 10x1, e - shape 9x1, dl_dw2 - shape 10x9
    dl_dw2 = np.outer(h, e)
    # x - shape 1x8, w2 - 5x8, e.T - 8x1
    # x - 1x8, np.dot() - 5x1, dl_dw1 - 8x5
    dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
    # Update weights
    self.w1 = self.w1 - (self.lr * dl_dw1)
    self.w2 = self.w2 - (self.lr * dl_dw2)

In [4]:
# Initialise object
w2v = word2vec()
# Numpy ndarray with one-hot representation for [target_word, context_words]
training_data = w2v.generate_training_data(settings, corpus)



# 4. Model Training

In [5]:
# Training
w2v.train(training_data)

Epoch: 0 Loss: 4.403464455608226
Epoch: 0 Loss: 14.22888112209453
Epoch: 0 Loss: 25.710650251832696
Epoch: 0 Loss: 33.053651783239076
Epoch: 0 Loss: 43.50295106433417
Epoch: 0 Loss: 50.42062172290266
Epoch: 0 Loss: 60.104306633568484
Epoch: 0 Loss: 69.10682351499196
Epoch: 0 Loss: 79.99040287137477
Epoch: 0 Loss: 83.26320025523344
Epoch: 1 Loss: 4.169039789614281
Epoch: 1 Loss: 13.497212317398446
Epoch: 1 Loss: 24.701941707874326
Epoch: 1 Loss: 31.982941863912338
Epoch: 1 Loss: 42.22144719775979
Epoch: 1 Loss: 49.05208359046988
Epoch: 1 Loss: 58.61860694639615
Epoch: 1 Loss: 67.43858061085025
Epoch: 1 Loss: 77.90250214153049
Epoch: 1 Loss: 81.10019726160365
Epoch: 2 Loss: 3.955467839657018
Epoch: 2 Loss: 12.864266009701016
Epoch: 2 Loss: 23.823603418049053
Epoch: 2 Loss: 31.07386533765685
Epoch: 2 Loss: 41.126001708977185
Epoch: 2 Loss: 47.87807575488845
Epoch: 2 Loss: 57.33671705459984
Epoch: 2 Loss: 65.99123834378531
Epoch: 2 Loss: 76.09066581253178
Epoch: 2 Loss: 79.22027919608088
E

array([[list([1, 0, 0, 0, 0, 0, 0, 0, 0]),
        list([[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]])],
       [list([0, 1, 0, 0, 0, 0, 0, 0, 0]),
        list([[1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0]])],
       [list([0, 0, 1, 0, 0, 0, 0, 0, 0]),
        list([[1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0]])],
       [list([0, 0, 0, 1, 0, 0, 0, 0, 0]),
        list([[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0]])],
       [list([0, 0, 0, 0, 1, 0, 0, 0, 0]),
        list([[0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0]])],
       [list([0, 0, 0, 0, 0, 1, 0, 0, 0]),
        list([[0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0]])],
       [list([0, 0, 0, 0, 0, 0,

# 5. Inferencing


In [6]:
# Get vector for word
vec = w2v.word_vec("machine")

class word2vec():
  ## Removed ##
  
  # Get vector from word
  def word_vec(self, word):
    w_index = self.word_index[word]
    v_w = self.w1[w_index]
    return v_w

AttributeError: 'word2vec' object has no attribute 'word_vec'

# Gensim Article

In [2]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'html.parser')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [4]:
article_text;

In [5]:
# Cleaning the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [7]:
all_words;

In [8]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)

In [11]:
# Show words that appear at least twice
vocabulary = word2vec.wv.vocab
#print(vocabulary)

In [12]:
# Show the corresponding vector for a word
v1 = word2vec.wv['artificial']
v1

array([-3.4275716e-03, -4.5426958e-03, -1.0319741e-03,  5.3739091e-03,
        1.3346088e-03,  2.6749505e-03,  2.3683738e-03,  4.5259180e-03,
        7.4983359e-04,  4.6483026e-04,  3.1671436e-03, -2.9444511e-03,
        2.2884756e-03,  2.5514776e-03,  8.9689746e-04,  5.2988110e-03,
        3.5735765e-03,  5.0671184e-03,  3.5097853e-03,  3.1882184e-04,
        3.8150158e-03,  3.4713263e-03,  4.4909916e-03, -3.6571559e-03,
        1.1781120e-03, -3.7262079e-03,  5.1380042e-03, -1.4140581e-04,
       -1.6374971e-03,  2.8830691e-05,  3.3130441e-03,  5.5346108e-04,
       -1.9603034e-03, -3.4262862e-03,  1.6257574e-03, -5.1931730e-03,
       -1.8518420e-03, -3.8676381e-03,  4.2401184e-04,  2.5461868e-03,
        3.9011026e-03, -6.9584360e-04,  5.6364303e-03,  3.8105107e-03,
       -2.8897370e-03, -6.2121586e-03,  1.3195871e-03,  4.2332853e-03,
       -1.8012434e-03, -1.8020233e-03,  1.8336568e-03, -2.1448592e-03,
       -6.9063255e-03, -2.5056889e-03,  5.9005762e-03,  1.4972986e-03,
      

In [14]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('machine', 0.3812499940395355),
 ('ai', 0.3750268816947937),
 ('classical', 0.36905476450920105),
 ('hard', 0.3496473431587219),
 ('goals', 0.32975834608078003),
 ('problems', 0.32730886340141296),
 ('use', 0.3059532642364502),
 ('creating', 0.29804661870002747),
 ('assess', 0.2958163321018219),
 ('could', 0.2956176996231079)]