In [0]:
from google.colab import drive
import numpy as np
import tensorflow as tf

In [0]:
import zipfile
import collections
import math
import random

In [0]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def read_words():
  with zipfile.ZipFile(file='/content/gdrive/My Drive/SentimentAnalysisTensorFlow/SampleText.zip') as myZip:
    firstFile = myZip.namelist()[0]
    filestring = tf.compat.as_str((myZip.read(firstFile)))
    words = filestring.split()

  return words

In [0]:
vocabulary = read_words()
len(vocabulary)

17005207

In [0]:
# building the dataset in useful format for word2vec embeddings
# generating embeddings for only the top 'n' most frequently used words

def buildDataset(words, n_words):
  # A 2D array which holds the word and its frequency
  word_counts = [['UNKNOWN', -1]]

  counter = collections.Counter(words)
  word_counts.extend(counter.most_common(n_words-1))
  dictionary = dict()

  for word, _ in word_counts:
    # Assign unique indices to words
    # The more common the word, the lower its index value
    dictionary[word] = len(dictionary)

  word_indices = list()

  unknown_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0
      unknown_count += 1
    word_indices.append(index)

  word_counts[0][1] = unknown_count
  reversed_dict = dict(zip(dictionary.values(), dictionary.keys()))
  return word_counts, word_indices, dictionary, reversed_dict

In [0]:
def buildDataset2(words, n_words):
  counter = collections.Counter(words)
  word_frequency = dict(counter.most_common(n_words-1))
  top_words_indices = dict()
  word_indices = list()

  unknow_count = 0
  top_words_indices['UNKNOWN'] = 0

  for word in word_frequency:
    top_words_indices[word] = len(top_words_indices)

  for word in words:
    if word in top_words_indices:
      index = top_words_indices[word]
    else:
      index = 0
      unknow_count += 1
    word_indices.append(index)

  word_frequency['UNKNOWN'] = unknow_count
  reversed_dict = dict(zip(top_words_indices.values(), top_words_indices.keys()))

  return word_frequency, word_indices, top_words_indices, reversed_dict

In [0]:
VOCAB_SIZE = 5000
word_counts, word_indices, dictionary, reversed_dict = buildDataset(
    words=vocabulary,
    n_words=VOCAB_SIZE
)
word_indices[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [0]:
global_index = 0
# return a new batch of data for every iteration
def generate_batch(word_indices, batch_size, num_skips, skip_window_size):
  global global_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window_size

  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

  # size of context window
  # +1, so that it includes the input word
  span = 2 * skip_window_size + 1

  # text within the context window
  buffer = collections.deque(maxlen=span)

  for _ in range(span):
    buffer.append(word_indices[global_index])
    global_index = (global_index + 1) % len(word_indices)

  # print(buffer)

  for i in range(batch_size // num_skips):
    target = skip_window_size
    targets_to_avoid = [skip_window_size]

    for j in range(num_skips):
      while target in targets_to_avoid:
        # choose a random index from span
        # and add it to targets_to_avoid
        # that is, if it already isn't there
        target = random.randint(0, span-1)

      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window_size]
      labels[i * num_skips + j, 0] = buffer[target]
    
    buffer.append(word_indices[global_index])
    global_index = (global_index + 1) % len(word_indices)
  
  global_index = (global_index + len(word_indices) - span) % len(word_indices)
  return batch, labels

In [0]:
batch, labels = generate_batch(
    word_indices=word_indices,
    batch_size=10,
    num_skips=2,
    skip_window_size=5
)

deque([0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156, 128], maxlen=11)


In [0]:
print(batch)
print(labels)

[   2    2 3134 3134   46   46   59   59  156  156]
[[ 12]
 [195]
 [  6]
 [128]
 [128]
 [  6]
 [742]
 [156]
 [134]
 [477]]


In [0]:
# input word is used to print two target words from its context window
for i in range(9):
  print(reversed_dict[batch[i]], ": ", reversed_dict[labels[i][0]])

of :  as
of :  term
abuse :  a
abuse :  early
first :  early
first :  a
used :  working
used :  against
against :  including


In [0]:
# within 100 words, pick 16 at random
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [0]:
# no. of input words
batch_size = 128
# hidden layer will have 50 neurons
embedding_size = 50
skip_window_size = 2
num_skips = 2

In [0]:
tf.reset_default_graph()

In [0]:
train_inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size])
train_labels = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [0]:
# The embeddings are generated using training dataset
# It contains an embedding of shape 1X50 for every word
embeddings = tf.Variable(
    initial_value=tf.random_uniform(
        shape=[VOCAB_SIZE, embedding_size],
        minval=-1.0,
        maxval=1.0
    )
)

In [0]:
# For every iteration in training, we can only generate
# or train embeddings for the words in that particular batch

# The word inputs in every training batch will look up the
# embeddings for those words in the embedding matrix

# 'train_inputs' placeholder contains the unique word indices
# in this batch and they are looked up in the embeddings matrix
embed = tf.nn.embedding_lookup(params=embeddings, ids=train_inputs)

In [0]:
# set up a hidden layer by using ops y = Wx + b
weights = tf.Variable(tf.truncated_normal(
    shape=[VOCAB_SIZE, embedding_size],
    stddev=1.0 / math.sqrt(embedding_size)
    ))

biases = tf.Variable(initial_value=tf.zeros(shape=[VOCAB_SIZE]))

# A neural network with no activation function
# that is, a linear layer
hidden_output = tf.matmul(embed, tf.transpose(weights)) + biases

In [0]:
train_one_hot = tf.one_hot(train_labels, VOCAB_SIZE)
loss = tf.reduce_mean(
    input_tensor=tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=hidden_output,
        labels=train_one_hot
    )
)

In [0]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)

In [0]:
l2_norm = tf.linalg.norm(ord='euclidean', tensor=embeddings)
normalized_embeddings = embeddings /l2_norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [0]:
# No of corrupted sample pairs
# to be fed into NCE
num_samples = 64
num_steps = 20001

nce_weights = tf.Variable(tf.truncated_normal(
    shape=[VOCAB_SIZE, embedding_size],
    stddev=1.0 / math.sqrt(embedding_size)
    ))

nce_biases = tf.Variable(initial_value=tf.zeros(shape=[VOCAB_SIZE]))

nce_loss = tf.reduce_mean(
    input_tensor=tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=train_labels,
        inputs=embed,
        num_sampled=num_samples,
        num_classes=VOCAB_SIZE
    )
)

In [0]:
nce_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(nce_loss)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [0]:
init = tf.global_variables_initializer()

In [0]:
with tf.Session() as sess:
  sess.run(init)

  avg_loss = 0
  for step in range(num_steps):
    batch_inputs, batch_labels = generate_batch(
        word_indices,
        batch_size,
        num_skips,
        skip_window_size
    )
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
    _, loss_val = sess.run([nce_optimizer, nce_loss], feed_dict=feed_dict)
    avg_loss += loss_val

    if step % 2000 == 0 and step != 0:
      avg_loss /= 2000
      print("Avg loss at step", step, ': ', avg_loss)
      avg_loss=0
    
    if step % 10000 == 0:
      sim = similarity.eval()

      for i in range(valid_size):
        valid_word = reversed_dict[valid_examples[i]]
        top_k = 8

        nearest = (-sim[i, :]).argsort()[1: top_k + 1]
        log_str = 'Nearest to %s' % valid_word

        for k in range(top_k):
          close_word = reversed_dict[nearest[k]]
          log_str = '%s %s, ' % (log_str, close_word)
        print(log_str)
      print("\n")
  final_embeddings = normalized_embeddings.eval()

Nearest to not stones,  mostly,  exile,  levels,  du,  faster,  masters,  printed, 
Nearest to were infant,  birds,  pakistan,  unable,  hotel,  wrong,  brazil,  warner, 
Nearest to be mine,  evening,  fell,  crusade,  put,  milton,  hot,  jurisdiction, 
Nearest to will george,  concept,  leaving,  bread,  foundations,  merely,  production,  pierre, 
Nearest to during salt,  los,  ira,  digital,  fundamental,  ohio,  painting,  arrangement, 
Nearest to i dialects,  assumed,  physician,  substances,  unified,  needed,  playing,  branches, 
Nearest to which laser,  attributes,  regular,  editions,  approach,  hindu,  split,  occupied, 
Nearest to d and,  communism,  industrial,  worked,  officer,  manufactured,  unit,  dynamic, 
Nearest to three operators,  norwegian,  wisconsin,  leo,  script,  indo,  simple,  destroyed, 
Nearest to between father,  bringing,  heavily,  successor,  songwriter,  requires,  comments,  motion, 
Nearest to known chris,  rear,  role,  consequences,  cult,  c

In [0]:
def save_embeddings(embeddings, word_counts):
  np.save('embeddings.npy', embeddings)
  words = [x[0] for x in word_counts]
  np.save('words.npy', words)

save_embeddings(final_embeddings, word_counts)