In [1]:
import os
import tensorflow as tf
import numpy as np
import time, random

from sklearn.metrics.pairwise import cosine_similarity
from model import Word2vec
from util import Generator

In [2]:
# ---- data ----
corpus_path = "imdb.corpus"
vocab_size = 10_000        # number of words kept in the vocabulary
window_size = 4            # context window size before and after the centre word
batch_size = 128           # batch size
shuffle = False            # whether to shuffle the data

# ---- model ----
emb_dim = 128              # word embedding dimension
mode = "nce"               # sampled-softmax or nce_loss
num_samples = 5            # number of samples for sampled softmax or NCE

# ---- optimisation ----
learning_rate = 1e-3       # learning rate
use_clip = True            # whether to use gradient clipping
iters = 100_000            # number of training iterations

In [3]:
## Input pipeline: the training loop pulls (inputs, labels) batches from it via data.next()
data = Generator(
    corpus_path,
    vocab_size,
    batch_size,
    window_size,
    shuffle,
)

## Model graph: exposes the loss / update ops and the embedding matrix used below
w2v = Word2vec(
    "word2vec-nce",
    emb_dim,
    vocab_size,
    num_samples,
    use_clip,
    learning_rate,
    mode,
)

Now building vocab
corpus : imdb.corpus vocab_size : 10000


In [4]:
def similar_words(word="good", top_k=9):
    """Print the `top_k` vocabulary words whose embeddings are closest to `word`.

    Closeness is the cosine similarity between the row of `w2v.emb_w` that
    belongs to `word` and every other row. The embedding matrix is fetched
    through the notebook-global `sess`, so this must be called while a
    session bound to that name is open.

    Args:
        word: Query word; it is looked up in `data.word2idx`.
        top_k: How many neighbours to print. Defaults to 9, the number that
            used to be hard-coded.
    """
    idx = data.word2idx[word]
    total_emb = np.array(sess.run(w2v.emb_w))              # (N, D)
    query_emb = total_emb[idx][None, :]                    # (1, D)

    scores = cosine_similarity(query_emb, total_emb)[0]    # (N,)

    # Rank once, in descending order, and drop the query word by *index*.
    # Skipping "position 0" instead is only correct when no other row ties
    # with the query at similarity 1.0; with a tie the query word itself
    # could be printed as its own neighbour.
    order = np.argsort(-1 * scores)
    order = order[order != idx]

    print("\nSimilar words with [{}]".format(word))
    for rank, neighbour in enumerate(order[:top_k], start=1):
        print("{:2d} - {}: {:.4f}".format(rank, data.idx2word[neighbour], scores[neighbour]))

# Training driver: runs `iters` optimisation steps, logging the mean loss every
# `it_log` steps, printing nearest neighbours of "man" every `it_sample` steps
# and checkpointing every `it_save` steps (plus once more after the last step).
start_time = time.time()
data_point = 0
avg_loss, it_log, it_save, it_sample = .0, 100, 5000, 1000

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    # Number of losses summed into avg_loss since the last log line. The first
    # window spans it = 0..it_log (one step more than later windows), so the
    # mean must divide by the real count rather than by it_log.
    steps_in_window = 0
    # Iteration of the most recent checkpoint, used to avoid a duplicate final save.
    last_saved_it = None

    for it in range(0, iters):
        inputs, labels = data.next()
        loss, update = sess.run([w2v.loss, w2v.update],
                                feed_dict={w2v.inputs: inputs, w2v.labels: labels})
        avg_loss += loss
        steps_in_window += 1

        if it % it_log == 0 and it:
            print(" it: {:4d} | loss: {:.3f} - {:.2f}s".format(
                it, avg_loss / steps_in_window, time.time() - start_time))
            avg_loss = .0
            steps_in_window = 0

        if it % it_sample == 0:
            # similar_words reads the notebook-global `sess` bound by this `with`.
            similar_words(word="man")
        if it % it_save == 0 and it > 0:
            w2v.save(sess)
            last_saved_it = it

    # The periodic save above never fires on the last step unless iters - 1
    # happens to be a multiple of it_save; persist the final weights so the
    # trailing updates are not lost.
    if iters > 0 and last_saved_it != iters - 1:
        w2v.save(sess)


Similar words with [man]
 1 - buildup: 0.3132
 2 - spirit: 0.3120
 3 - luis: 0.3037
 4 - somewhat: 0.3036
 5 - apart: 0.2994
 6 - possession: 0.2971
 7 - reminded: 0.2796
 8 - foreign: 0.2757
 9 - void: 0.2757
 it:  100 | loss: 27.113 - 3.44s
 it:  200 | loss: 25.977 - 6.32s
 it:  300 | loss: 25.745 - 9.55s
 it:  400 | loss: 25.619 - 12.62s
 it:  500 | loss: 25.598 - 15.70s
 it:  600 | loss: 24.699 - 18.65s
 it:  700 | loss: 25.083 - 21.54s
 it:  800 | loss: 24.081 - 24.54s
 it:  900 | loss: 23.459 - 27.56s
 it: 1000 | loss: 24.114 - 30.58s

Similar words with [man]
 1 - <unk>: 0.9755
 2 - film: 0.9748
 3 - movie: 0.9715
 4 - really: 0.9706
 5 - movies: 0.9701
 6 - three: 0.9689
 7 - made: 0.9688
 8 - films: 0.9686
 9 - little: 0.9684
 it: 1100 | loss: 24.293 - 33.59s
 it: 1200 | loss: 23.455 - 36.77s
 it: 1300 | loss: 22.205 - 39.83s
 it: 1400 | loss: 21.600 - 42.89s
 it: 1500 | loss: 22.060 - 45.97s
 it: 1600 | loss: 22.204 - 49.10s
 it: 1700 | loss: 21.897 - 52.24s
 it: 1800 | loss

 it: 14100 | loss: 12.699 - 492.77s
 it: 14200 | loss: 11.585 - 496.32s
 it: 14300 | loss: 10.329 - 499.88s
 it: 14400 | loss: 10.875 - 503.50s
 it: 14500 | loss: 11.794 - 507.04s
 it: 14600 | loss: 12.271 - 510.61s
 it: 14700 | loss: 13.078 - 514.18s
 it: 14800 | loss: 12.199 - 517.72s
 it: 14900 | loss: 11.494 - 521.25s
 it: 15000 | loss: 11.109 - 524.81s

Similar words with [man]
 1 - there's: 0.9880
 2 - played: 0.9867
 3 - world: 0.9865
 4 - around: 0.9859
 5 - us: 0.9859
 6 - come: 0.9855
 7 - enough: 0.9853
 8 - shot: 0.9852
 9 - rest: 0.9852
 * model saved at 'model_word2vec-nce/w2v_model'
 it: 15100 | loss: 11.164 - 528.54s
 it: 15200 | loss: 11.178 - 531.95s
 it: 15300 | loss: 11.350 - 535.38s
 it: 15400 | loss: 12.686 - 538.92s
 it: 15500 | loss: 11.280 - 542.47s
 it: 15600 | loss: 12.381 - 546.01s
 it: 15700 | loss: 10.673 - 549.54s
 it: 15800 | loss: 11.321 - 553.07s
 it: 15900 | loss: 11.278 - 556.79s
 it: 16000 | loss: 11.591 - 560.49s

Similar words with [man]
 1 - ther

 it: 29100 | loss: 10.251 - 1031.24s
 it: 29200 | loss: 8.542 - 1034.78s
 it: 29300 | loss: 8.264 - 1038.40s
 it: 29400 | loss: 8.804 - 1042.12s
 it: 29500 | loss: 8.576 - 1045.79s
 it: 29600 | loss: 9.854 - 1049.51s
 it: 29700 | loss: 8.554 - 1053.15s
 it: 29800 | loss: 8.530 - 1056.73s
 it: 29900 | loss: 7.970 - 1060.29s
 it: 30000 | loss: 8.109 - 1063.82s

Similar words with [man]
 1 - woman: 0.9914
 2 - world: 0.9904
 3 - life: 0.9903
 4 - show: 0.9900
 5 - course: 0.9900
 6 - young: 0.9900
 7 - girl: 0.9897
 8 - house: 0.9897
 9 - guy: 0.9897
 * model saved at 'model_word2vec-nce/w2v_model'
 it: 30100 | loss: 8.516 - 1067.67s
 it: 30200 | loss: 8.030 - 1071.24s
 it: 30300 | loss: 8.056 - 1074.79s
 it: 30400 | loss: 9.524 - 1078.36s
 it: 30500 | loss: 8.873 - 1082.01s
 it: 30600 | loss: 9.090 - 1085.55s
 it: 30700 | loss: 9.349 - 1089.08s
 it: 30800 | loss: 8.295 - 1092.64s
 it: 30900 | loss: 8.665 - 1096.18s
 it: 31000 | loss: 8.618 - 1099.73s

Similar words with [man]
 1 - woman:

 it: 44000 | loss: 8.162 - 1572.92s

Similar words with [man]
 1 - woman: 0.9925
 2 - world: 0.9915
 3 - two: 0.9912
 4 - playing: 0.9912
 5 - guy: 0.9910
 6 - role: 0.9910
 7 - girl: 0.9910
 8 - gets: 0.9909
 9 - killer: 0.9907
 it: 44100 | loss: 8.039 - 1576.47s
 it: 44200 | loss: 7.039 - 1580.03s
 it: 44300 | loss: 8.618 - 1583.62s
 it: 44400 | loss: 7.738 - 1587.33s
 it: 44500 | loss: 6.831 - 1591.13s
 it: 44600 | loss: 8.035 - 1595.02s
 it: 44700 | loss: 7.417 - 1598.69s
 it: 44800 | loss: 6.551 - 1602.31s
 it: 44900 | loss: 7.825 - 1605.95s
 it: 45000 | loss: 7.254 - 1609.59s

Similar words with [man]
 1 - woman: 0.9919
 2 - girl: 0.9917
 3 - two: 0.9914
 4 - gets: 0.9909
 5 - playing: 0.9908
 6 - role: 0.9907
 7 - world: 0.9906
 8 - house: 0.9905
 9 - god: 0.9903
 * model saved at 'model_word2vec-nce/w2v_model'
 it: 45100 | loss: 7.713 - 1613.47s
 it: 45200 | loss: 6.357 - 1617.02s
 it: 45300 | loss: 7.945 - 1620.59s
 it: 45400 | loss: 7.070 - 1624.17s
 it: 45500 | loss: 7.751 -

 it: 58300 | loss: 6.734 - 2094.29s
 it: 58400 | loss: 8.140 - 2097.85s
 it: 58500 | loss: 7.578 - 2101.44s
 it: 58600 | loss: 6.706 - 2105.02s
 it: 58700 | loss: 8.073 - 2108.58s
 it: 58800 | loss: 7.912 - 2112.35s
 it: 58900 | loss: 7.141 - 2116.04s
 it: 59000 | loss: 7.442 - 2119.72s

Similar words with [man]
 1 - woman: 0.9919
 2 - young: 0.9902
 3 - whose: 0.9901
 4 - evil: 0.9897
 5 - role: 0.9894
 6 - death: 0.9893
 7 - played: 0.9891
 8 - girl: 0.9889
 9 - lady: 0.9889
 it: 59100 | loss: 7.591 - 2123.43s
 it: 59200 | loss: 7.542 - 2127.10s
 it: 59300 | loss: 8.368 - 2130.82s
 it: 59400 | loss: 7.543 - 2134.59s
 it: 59500 | loss: 7.878 - 2138.40s
 it: 59600 | loss: 7.993 - 2141.96s
 it: 59700 | loss: 7.141 - 2145.52s
 it: 59800 | loss: 7.370 - 2149.04s
 it: 59900 | loss: 6.999 - 2152.62s
 it: 60000 | loss: 6.931 - 2156.20s

Similar words with [man]
 1 - woman: 0.9919
 2 - young: 0.9911
 3 - whose: 0.9903
 4 - evil: 0.9902
 5 - girl: 0.9901
 6 - played: 0.9900
 7 - plays: 0.9893


 it: 73100 | loss: 6.209 - 2640.82s
 it: 73200 | loss: 7.816 - 2644.50s
 it: 73300 | loss: 8.290 - 2648.19s
 it: 73400 | loss: 7.031 - 2651.86s
 it: 73500 | loss: 7.130 - 2655.54s
 it: 73600 | loss: 6.862 - 2659.23s
 it: 73700 | loss: 7.117 - 2662.92s
 it: 73800 | loss: 6.191 - 2666.62s
 it: 73900 | loss: 7.944 - 2670.34s
 it: 74000 | loss: 6.875 - 2674.03s

Similar words with [man]
 1 - woman: 0.9907
 2 - girl: 0.9899
 3 - women: 0.9898
 4 - dead: 0.9898
 5 - world: 0.9895
 6 - played: 0.9894
 7 - city: 0.9891
 8 - leads: 0.9891
 9 - death: 0.9890
 it: 74100 | loss: 6.958 - 2677.76s
 it: 74200 | loss: 7.909 - 2681.43s
 it: 74300 | loss: 7.471 - 2685.11s
 it: 74400 | loss: 7.434 - 2688.85s
 it: 74500 | loss: 7.722 - 2692.53s
 it: 74600 | loss: 6.750 - 2696.20s
 it: 74700 | loss: 7.288 - 2699.91s
 it: 74800 | loss: 7.659 - 2703.66s
 it: 74900 | loss: 6.349 - 2707.38s
 it: 75000 | loss: 7.879 - 2711.10s

Similar words with [man]
 1 - girl: 0.9907
 2 - woman: 0.9906
 3 - women: 0.9898
 4 

 it: 87900 | loss: 7.975 - 3190.64s
 it: 88000 | loss: 6.586 - 3194.40s

Similar words with [man]
 1 - girl: 0.9898
 2 - woman: 0.9891
 3 - young: 0.9879
 4 - leading: 0.9877
 5 - mr: 0.9871
 6 - car: 0.9870
 7 - killer: 0.9870
 8 - death: 0.9867
 9 - women: 0.9866
 it: 88100 | loss: 7.261 - 3198.08s
 it: 88200 | loss: 7.475 - 3201.75s
 it: 88300 | loss: 7.959 - 3205.44s
 it: 88400 | loss: 6.544 - 3209.13s
 it: 88500 | loss: 7.062 - 3212.91s
 it: 88600 | loss: 7.568 - 3216.66s
 it: 88700 | loss: 7.841 - 3220.35s
 it: 88800 | loss: 6.974 - 3224.05s
 it: 88900 | loss: 7.241 - 3227.74s
 it: 89000 | loss: 7.318 - 3231.45s

Similar words with [man]
 1 - girl: 0.9889
 2 - woman: 0.9884
 3 - young: 0.9879
 4 - leading: 0.9872
 5 - car: 0.9867
 6 - wife: 0.9862
 7 - killer: 0.9862
 8 - whose: 0.9861
 9 - death: 0.9860
 it: 89100 | loss: 8.219 - 3235.31s
 it: 89200 | loss: 6.878 - 3239.18s
 it: 89300 | loss: 6.639 - 3242.99s
 it: 89400 | loss: 6.893 - 3246.79s
 it: 89500 | loss: 7.051 - 3250.46

In [5]:
def calc_words(word1, word2, word3, top_k=9):
    """Print the words closest to the analogy vector `word1 - word2 + word3`.

    The three word embeddings are combined into one query vector, which is
    compared by cosine similarity with every row of `w2v.emb_w`. The
    embedding matrix is fetched through the notebook-global `sess`, so this
    must be called while a session bound to that name is open.

    The three input words are removed from the ranking, and the ranking
    starts at the best remaining match. The query is a composite vector, so
    its top hit is not "the query word itself" and must not be skipped.

    Args:
        word1: Word whose embedding is added.
        word2: Word whose embedding is subtracted.
        word3: Word whose embedding is added.
        top_k: How many candidates to print. Defaults to 9, the number that
            used to be hard-coded.
    """
    idx1 = data.word2idx[word1]
    idx2 = data.word2idx[word2]
    idx3 = data.word2idx[word3]

    total_emb = np.array(sess.run(w2v.emb_w))              # (N, D)
    word_emb_1 = total_emb[idx1][None, :]                  # (1, D)
    word_emb_2 = total_emb[idx2][None, :]                  # (1, D)
    word_emb_3 = total_emb[idx3][None, :]                  # (1, D)

    query_emb = word_emb_1 - word_emb_2 + word_emb_3

    scores = cosine_similarity(query_emb, total_emb)[0]    # (N,)

    # Descending order, with the operands filtered out: they tend to dominate
    # the ranking of their own linear combination and are never the answer.
    operands = {idx1, idx2, idx3}
    ranked = [i for i in np.argsort(-1 * scores) if i not in operands]

    print(f"\nSimilar words with [{word1} - {word2} + {word3}]")
    for rank, candidate in enumerate(ranked[:top_k], start=1):
        print("{:2d} - {}: {:.4f}".format(rank, data.idx2word[candidate], scores[candidate]))

In [7]:
# Open a fresh session, load the saved model and inspect neighbours of "king".
# The session must be bound to the name `sess`, because similar_words() reads
# that notebook-global rather than taking the session as an argument.
with tf.Session() as sess:
    # Initialise every variable first; w2v.restore() then loads the saved values.
    sess.run(tf.global_variables_initializer())
    w2v.restore(sess)
    similar_words(word="king")

 - Restoring variables...
INFO:tensorflow:Restoring parameters from model_word2vec-nce/w2v_model
 * model restored 

Similar words with [king]
 1 - water: 0.9927
 2 - heroine: 0.9924
 3 - friendship: 0.9923
 4 - angry: 0.9923
 5 - artist: 0.9921
 6 - al: 0.9921
 7 - animal: 0.9921
 8 - cat: 0.9920
 9 - queen: 0.9919
