# Another implementation of word2vec: this is the first version that I saw

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import inspect
import string
import time
from word2vec import Config, DataHolder
from data_process import batch_generator

In [2]:
filename = 'pt96.txt'
# Resolve "<directory of the executing code>/data/<filename>".
# NOTE(review): inside a notebook kernel the current frame has no real source
# file, so this presumably ends up pointing at the working directory - confirm.
base_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
file_path = os.path.join(base_dir, 'data', filename)

# Corpus wrapper from the project's word2vec module, and the vocabulary size
# that is handed to it when the dataset is built.
my_data = DataHolder(file_path)
vocab_size = 50000

In [3]:
# Build the dataset from the corpus, passing `vocab_size` to the holder.
# The call yields four values:
#   data          - sequence whose items are used as keys/indices into index_to_word
#   count         - presumably per-word frequency information (TODO confirm in DataHolder)
#   word2index    - lookup from a word to its integer id
#   index_to_word - lookup from an integer id back to its word
data, count, word2index, index_to_word = my_data.build_data(vocab_size)

In [4]:
# Sanity check: decode the first 100 entries of the dataset back into words and
# print them on a single line, separated by spaces.
first_tokens = data[:100]
for token_id in first_tokens:
    print(index_to_word[token_id], end=" ")

Conto Contos UNK 1870 Contos UNK Textofonte Obra Completa Machado de Assis vol II Rio de Janeiro Nova Aguilar 1994 Publicado originalmente pela Editora Garnier Rio de Janeiro em 1870 ÍNDICE UNK UNK UNK UNK A UNK DE UNK O UNK DE UNK UNK DE UMA UNK UNK UNK UNK E UNK UNK FREI UNK UNK UNK ÍNDICE Capítulo Primeiro Capítulo II Capítulo iii Capítulo UNK Capítulo v Capítulo UNK Capítulo UNK CAPÍTULO VIII CAPÍTULO PRIMEIRO Era conveniente ao romance que o leitor ficasse muito tempo sem saber quem era Miss Dollar Mas por outro lado sem a apresentação de 

In [5]:
# Hyper-parameters of the skip-gram model.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# NOTE(review): `random` is not seeded, so the validation words differ between runs.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64  # Number of negative examples to sample.

graph = tf.Graph()
with graph.as_default():
    # Input data: one word id per example, one target id per example, and the
    # fixed ids of the validation words.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables: the embedding matrix being learned plus the weights and biases
    # of the (sampled) softmax output layer.
    embeddings = tf.Variable(
        tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocab_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocab_size]))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    # The arguments are passed by keyword on purpose: the positional order of
    # `inputs` and `labels` is not the same across TensorFlow releases, so a
    # positional call can silently feed the labels as inputs and vice versa.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocab_size))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Compute the similarity between the validation words and all embeddings.
    # Rows are scaled to unit length first, so the matrix product below yields
    # cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

In [None]:
num_steps = 100001
data_index = 0

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    ts = time.time()  # training start; read again by the duration cell
    print("Initialized")
    average_loss = 0
    for step in range(num_steps):
        # batch_generator hands back the advanced corpus position together
        # with the next batch of inputs and labels.
        data_index, batch_data, batch_labels = batch_generator(
            batch_size, num_skips, skip_window, data_index, data)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, batch_loss = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += batch_loss

        if step % 2000 == 0:
            # The average loss is an estimate of the loss over the last 2000
            # batches. At step 0 only one batch has been accumulated, so the
            # raw value is reported as is.
            if step > 0:
                average_loss /= 2000
            print("Average loss at step", step, ":", average_loss)
            average_loss = 0

        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            top_k = 8  # number of nearest neighbors
            for i in range(valid_size):
                valid_word = index_to_word[valid_examples[i]]
                # Sort by decreasing similarity; position 0 is skipped
                # (expected to be the validation word itself).
                nearest = np.argsort(-sim[i])[1:top_k + 1]
                report = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    report = "%s %s," % (report, index_to_word[nearest[k]])
                print(report)
    te = time.time()  # training end
    # Snapshot of the unit-length embeddings, used by the evaluation cells.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0 : 7.48771047592
Nearest to parte: Sur·, sol, 20131, onde, SP88, colocar, genuínas, Consegue,
Nearest to para: atrasou, Miz, corrija, Mariz, poderio, Coco, Yzerman, posteridade,
Nearest to dia: doutrinas, verificamos, demarcada, originouse, perfazem, encontram, Friday, Demo,
Nearest to dois: Carlos, 58°, indenizações, estabeleceram, Stravinsky, lembrase, excede, grava,
Nearest to das: Johnson·, extensão, vicariato, apurada, Patton, cuidando, Atlanta, polegar,
Nearest to e: gruta, súplicas, reconstrução, confiadas, interna, reportou, começaram, transpiração,
Nearest to tempo: Rhythm, hospedeiros, Image, SocialDemocrata, Diva, Dolly, matador, ALTENER,
Nearest to É: suspense, Resolveu, mistério, subjetividade, forçaram, Tele, satisfeita, Wendy,
Nearest to está: Sul3, cedido, tão, Mandato, Praticamente, Andrei, trato, alla,
Nearest to até: chorava, Heian, corrigido, veria, Itsuki, colônias, desfile, monástico,
Nearest to na: Pride, encarceramento, patrick,

In [None]:
# Wall-clock time of the training loop in seconds: `ts` and `te` are the
# time.time() readings taken before and after training in the previous cell.
print("duration= ", te-ts)

In [None]:
from scipy import spatial

def analogy(w1, w2, w3, index_to_word, word2index, embeddings):
    """Answer the analogy "w1 is to w2 as w3 is to ?" with a single word.

    Every row of ``embeddings`` is scored by its dot product with
    ``embeddings[w2] - embeddings[w1] + embeddings[w3]``; the word of the
    best-scoring row that is not one of the three query words is returned.

    Args:
        w1, w2, w3: query words; each must be a key of ``word2index``.
        index_to_word: lookup from row index to word (indexed with an int).
        word2index: lookup from word to row index.
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        The predicted word.

    Raises:
        KeyError: if a query word is missing from ``word2index``.
        ValueError: if no row is left once the query words are excluded.
    """
    matrix = np.asarray(embeddings)
    query_ids = [word2index[w1], word2index[w2], word2index[w3]]
    a, b, c = (matrix[idx] for idx in query_ids)

    # One matrix-vector product scores the whole vocabulary at once:
    # x.(b - a + c) == b.x - a.x + c.x for every row x.
    scores = matrix.dot(b - a + c)

    # The query words themselves are never valid answers.
    is_candidate = np.ones(len(matrix), dtype=bool)
    is_candidate[query_ids] = False
    candidate_ids = np.flatnonzero(is_candidate)
    if candidate_ids.size == 0:
        raise ValueError("no candidate words left after excluding the query words")

    candidate_scores = scores[candidate_ids]
    # Rows whose score is NaN can never win.
    candidate_scores = np.where(np.isnan(candidate_scores), -np.inf, candidate_scores)
    # Among equal best scores the highest row index wins, which is the order a
    # descending sort of (score, index) pairs produces.
    winners = np.flatnonzero(candidate_scores == candidate_scores.max())
    best = candidate_ids[winners[-1]]
    return index_to_word[int(best)]

def top_k_sim(vector, index_to_word, embeddings,k=10):
    """Rank the rows of ``embeddings`` by cosine similarity to ``vector``.

    Args:
        vector: query vector.
        index_to_word: lookup from row index to word.
        embeddings: iterable of row vectors, compared one by one.
        k: ranking cut-off; note that ``k + 1`` entries are returned.

    Returns:
        A list of ``(word, similarity)`` pairs, most similar first, holding at
        most ``k + 1`` entries.
    """
    scored = []
    for row_id, row in enumerate(embeddings):
        # scipy returns the cosine *distance*; turn it back into a similarity.
        cosine_similarity = 1 - spatial.distance.cosine(vector, row)
        scored.append((cosine_similarity, row_id))
    ranked = sorted(scored, reverse=True)
    return [(index_to_word[row_id], score) for score, row_id in ranked[:k + 1]]

In [None]:
# Fetch the trained embedding of the word 'mulher' (Portuguese for "woman").
mulher = final_embeddings[word2index['mulher']]
# Cell output: the rows of final_embeddings ranked most similar to that vector.
top_k_sim(mulher,index_to_word,final_embeddings)

In [None]:
eval_file = "AnalogiesBr_little.txt"
# Resolve "<directory of the executing code>/evaluation/<eval_file>".
# Note that this rebinds `file_path`, which previously pointed at the corpus.
this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(this_file)
prefix = os.path.join(currentdir, "evaluation")
file_path = os.path.join(prefix, eval_file)

# Counters reported by the evaluation cell; all of them start at zero.
valid_tests = correct_answer = total_lines = total_loss = 0


In [None]:
import time
import sys

# Evaluate the embeddings on an analogy file. Each usable line is expected to
# hold four whitespace-separated words: the three query words followed by the
# expected answer.
# The counters are initialised in the previous cell, so re-running only this
# cell keeps adding to them.
initial_time = time.time()
# NOTE(review): no encoding is given, so the platform default is used - confirm
# it matches the encoding of the analogy file.
with open(file_path) as inputfile:
    for line in inputfile:
        total_lines += 1
        initial = time.time()
        list_line = line.strip().split()
        # Blank or truncated lines cannot be evaluated. Without this check an
        # empty line passes the vocabulary test below (all() of nothing is
        # True) and then fails with IndexError on list_line[0].
        if len(list_line) < 4:
            continue
        # Only lines whose words are all in the vocabulary count as tests.
        if all(word in word2index for word in list_line):
            valid_tests += 1
            analogue = analogy(list_line[0],
                               list_line[1],
                               list_line[2],
                               index_to_word,
                               word2index,
                               final_embeddings)
            print("\nAnalogy -->", list_line)
            print("prediction -->", analogue)
            if analogue == list_line[3]:
                correct_answer += 1
                print("\nYESSSSSSSSSSSSSSSSSSSSS\n")
            current_time = time.time() - initial
            # "\r" rewrites the progress line in place.
            sys.stdout.write('\rcurrent_line:{}, duration = {}'.format(total_lines,
                                                                        current_time))
            sys.stdout.flush()

print("\ntotal_lines = {}".format(total_lines))
print("valid_tests = {}".format(valid_tests))
print("correct_answer = {}".format(correct_answer))
# total_loss is never updated in this cell, so the value printed here is
# whatever it was initialised to.
print("total_loss = {}".format(total_loss))
print("duration = {}".format(time.time() - initial_time))


26/342