<a href="https://colab.research.google.com/github/Harshj1/Word2Vec/blob/master/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from collections import defaultdict

In [0]:
# Raw example sentence; the following cell lower-cases and splits it into tokens.
text = (
    "natural language processing and machine learning "
    "is fun and exciting"
)

In [3]:
# A corpus is a list of sentences; here it holds one sentence, given as the
# lower-cased whitespace-separated tokens of `text`.
corpus = [list(map(str.lower, text.split()))]
corpus

[['natural',
  'language',
  'processing',
  'and',
  'machine',
  'learning',
  'is',
  'fun',
  'and',
  'exciting']]

In [0]:
# Hyperparameters read by the word2vec class when it is constructed.
settings = dict(
    window_size=2,       # context words taken on each side of the centre word
    n=10,                # embedding dimensions (also the hidden-layer size)
    epochs=50,           # number of training epochs
    learning_rate=0.01,  # learning rate
)

In [0]:
class word2vec():
  """Minimal skip-gram word2vec model written with NumPy.

  Attributes set by ``__init__``:
    n:      embedding dimensionality (hidden-layer size).
    lr:     learning rate used for the weight updates in ``train``.
    epochs: number of passes over the training pairs.
    window: number of context words taken on each side of the centre word.

  Attributes set by ``generate_training_data``:
    words_list: unique words in first-seen order.
    v_count:    vocabulary size, ``len(words_list)``.
    word_index: mapping word -> integer index.
    index_word: mapping integer index -> word.

  Attributes set by ``train``:
    w1:   input->hidden weights, shape ``(v_count, n)``.
    w2:   hidden->output weights, shape ``(n, v_count)``.
    loss: accumulated loss of the most recent epoch.
  """

  def __init__(self, config=None):
    """Read the hyperparameters.

    Args:
      config: optional dict with the keys ``'n'``, ``'learning_rate'``,
        ``'epochs'`` and ``'window_size'``. When omitted, the module-level
        ``settings`` dict is used, so ``word2vec()`` keeps working.

    Raises:
      KeyError: if one of the four keys is missing.
    """
    if config is None:
      config = settings
    self.n = config['n']
    self.lr = config['learning_rate']
    self.epochs = config['epochs']
    self.window = config['window_size']

  def generate_training_data(self, settings, corpus):
    """Build the vocabulary and the (target, context) training pairs.

    Args:
      settings: accepted for interface compatibility; not read here. The
        window size comes from ``self.window`` set in ``__init__``.
      corpus: list of sentences, each a list of word strings.

    Returns:
      Object array of shape ``(num_words, 2)``. Column 0 holds the one-hot
      vector of the centre word, column 1 a list of one-hot vectors for the
      words within ``self.window`` positions of it in the same sentence.
    """
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    vocabulary = dict.fromkeys(word for row in corpus for word in row)
    self.words_list = list(vocabulary)
    self.v_count = len(self.words_list)
    self.word_index = {word: i for i, word in enumerate(self.words_list)}
    self.index_word = {i: word for i, word in enumerate(self.words_list)}

    pairs = []
    for sentence in corpus:
      sent_len = len(sentence)
      for i, word in enumerate(sentence):
        w_target = self.word2onehot(word)
        # Clip the window to the sentence boundaries.
        start = max(0, i - self.window)
        stop = min(sent_len, i + self.window + 1)
        w_context = [self.word2onehot(sentence[j])
                     for j in range(start, stop) if j != i]
        pairs.append((w_target, w_context))

    # The pairs are ragged (context lists differ in length), and recent NumPy
    # versions refuse to build an array from ragged nested sequences. Fill an
    # object array explicitly instead.
    training_data = np.empty((len(pairs), 2), dtype=object)
    for k, (w_target, w_context) in enumerate(pairs):
      training_data[k, 0] = w_target
      training_data[k, 1] = w_context
    return training_data

  def word2onehot(self, word):
    """Return the one-hot vector (length ``v_count``) for ``word``.

    Raises:
      KeyError: if ``word`` is not in the vocabulary.
    """
    word_vec = np.zeros(self.v_count)
    word_vec[self.word_index[word]] = 1
    return word_vec

  def train(self, training_data):
    """Train the embeddings with per-pair gradient descent.

    Weights are re-initialised uniformly in [-1, 1) on every call. For each
    (target, context) pair the softmax prediction error is summed over the
    context words and back-propagated. ``self.loss`` is reset at the start
    of each epoch and ends up holding the last epoch's accumulated loss.

    Args:
      training_data: iterable of ``(w_target, w_context)`` pairs as produced
        by ``generate_training_data``.
    """
    self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
    self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

    for _ in range(self.epochs):
      # Initialise loss to 0
      self.loss = 0

      for w_t, w_c in training_data:
        y_pred, h, u = self.forward_pass(w_t)

        n_context = len(w_c)
        if n_context == 0:
          # A one-word sentence has no context: nothing to learn from.
          continue
        context = np.asarray(w_c, dtype=float)  # (n_context, v_count)

        # Sum over context words of (y_pred - one_hot(context word)).
        error = n_context * y_pred - context.sum(axis=0)
        self._backprop(error, h, w_t)

        # loss = -sum_c u[c] + n_context * log(sum(exp(u))), with the
        # log-sum-exp computed in a shifted, overflow-safe form.
        u_max = np.max(u)
        log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
        self.loss += -np.sum(context @ u) + n_context * log_norm

  def _backprop(self, error, h, x):
    """Apply one gradient-descent step to ``w1`` and ``w2``."""
    dl_dw2 = np.outer(h, error)
    # Uses w2 from before this step's update.
    dl_dw1 = np.outer(x, np.dot(self.w2, error))
    self.w1 -= self.lr * dl_dw1
    self.w2 -= self.lr * dl_dw2

  def forward_pass(self, x):
    """Run the network on input vector ``x``.

    Returns:
      Tuple ``(y_c, h, u)``: softmax output, hidden-layer activation
      ``w1.T @ x`` and raw output scores ``w2.T @ h``.
    """
    h = np.dot(self.w1.transpose(), x)
    u = np.dot(self.w2.transpose(), h)
    y_c = self.softmax(u)
    return y_c, h, u

  def softmax(self, x):
    """Softmax along axis 0.

    The maximum is subtracted before exponentiating, which leaves the result
    mathematically unchanged but avoids overflow for large scores.
    """
    x = np.asarray(x, dtype=float)
    exps = np.exp(x - np.max(x, axis=0, keepdims=True))
    return exps / np.sum(exps, axis=0)

In [0]:
# Seed NumPy's global RNG so the random weight initialisation inside train()
# gives the same result on every "Restart & Run All".
np.random.seed(42)

# Initialise object
w2v = word2vec()
# Build the (target, context) pairs, then train on them.
training_data = w2v.generate_training_data(settings, corpus)
w2v.train(training_data)