In [1]:
import os
import tensorflow as tf
import numpy as np
import time, random

from sklearn.metrics.pairwise import cosine_similarity
from model import Word2vec
from util import Generator

In [2]:
corpus_path = "imdb.corpus"

batch_size = 128          # 배치 사이즈
window_size = 4           # 앞 뒤 window size
vocab_size = 10000        # vocab 수
shuffle = False            # 데이터 shuffle
emb_dim = 128             # 단어 embedding dimension
num_samples = 1024        # samples numbers for sampled softmax or NCE
learning_rate = 0.001     # Learning rate
mode = "sampled-softmax"  # sampled-softmax or nce_loss
iters = 100000            # iteration 수
use_clip = True           # Gradient clipping 쓸지 여부

## Define

In [3]:
## Data input pipeline
data = Generator(corpus_path, vocab_size, batch_size, window_size, shuffle)
## Model graph
w2v = Word2vec("word2vec-sampled-softmax", emb_dim, vocab_size, num_samples, use_clip, learning_rate, mode)

Now building vocab
corpus : imdb.corpus vocab_size : 10000
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [4]:
def similar_words(word="good"):
    idx = data.word2idx[word]
    total_emb = np.array(sess.run(w2v.emb_w))              # (N, D)
    query_emb = total_emb[idx][None, :]                    # (1, D)
    
    sim_matrix = cosine_similarity(query_emb, total_emb)   # (1, N)
    scores = sim_matrix[0]
    ids = np.argsort(-1 * scores)                          # 내림 차순
    scores = sorted(scores, reverse=True)                  # 내림 차순

    print("\nSimilar words with [{}]".format(word))
    for i in range(1, 10):
        cosine_sim, id = scores[i], ids[i]
        print("{:2d} - {}: {:.4f}".format(i, data.idx2word[id], cosine_sim))

start_time = time.time()
data_point = 0
avg_loss, it_log, it_save, it_sample = .0, 100, 5000, 1000

with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    for it in range(0, iters):
        inputs, labels = data.next()
        loss, update = sess.run([w2v.loss, w2v.update],
                                feed_dict={w2v.inputs: inputs, w2v.labels: labels})
        avg_loss += loss

        if it % it_log == 0 and it:
            print(" it: {:4d} | loss: {:.3f} - {:.2f}s".format(
                it, avg_loss / it_log, time.time() - start_time))
            avg_loss = .0

        if it % it_sample == 0:
            similar_words(word="man")
        if it % it_save == 0 and it > 0:
            w2v.save(sess)


Similar words with [man]
 1 - louis: 0.3274
 2 - sea: 0.2901
 3 - sometime: 0.2895
 4 - resembles: 0.2785
 5 - attributed: 0.2778
 6 - mess: 0.2763
 7 - russell: 0.2728
 8 - records: 0.2725
 9 - psychopath: 0.2718
 it:  100 | loss: 8.343 - 4.01s
 it:  200 | loss: 8.074 - 7.56s
 it:  300 | loss: 7.971 - 11.44s
 it:  400 | loss: 7.679 - 15.29s
 it:  500 | loss: 7.671 - 19.19s
 it:  600 | loss: 7.418 - 22.92s
 it:  700 | loss: 7.466 - 27.31s
 it:  800 | loss: 7.258 - 32.56s
 it:  900 | loss: 7.075 - 37.97s
 it: 1000 | loss: 7.139 - 43.19s

Similar words with [man]
 1 - worst: 0.9180
 2 - without: 0.9179
 3 - world: 0.9159
 4 - seems: 0.9127
 5 - know: 0.9121
 6 - black: 0.9103
 7 - felt: 0.9102
 8 - getting: 0.9094
 9 - gore: 0.9091
 it: 1100 | loss: 7.022 - 48.16s
 it: 1200 | loss: 6.947 - 52.73s
 it: 1300 | loss: 6.955 - 57.18s
 it: 1400 | loss: 6.848 - 61.65s
 it: 1500 | loss: 6.823 - 66.12s
 it: 1600 | loss: 6.845 - 71.07s
 it: 1700 | loss: 6.818 - 75.89s
 it: 1800 | loss: 6.753 - 80

 it: 14100 | loss: 6.410 - 674.16s
 it: 14200 | loss: 6.471 - 678.80s
 it: 14300 | loss: 6.365 - 683.42s
 it: 14400 | loss: 6.368 - 688.03s
 it: 14500 | loss: 6.377 - 692.71s
 it: 14600 | loss: 6.433 - 697.40s
 it: 14700 | loss: 6.447 - 702.17s
 it: 14800 | loss: 6.427 - 706.79s
 it: 14900 | loss: 6.386 - 711.45s
 it: 15000 | loss: 6.443 - 716.07s

Similar words with [man]
 1 - husband: 0.8066
 2 - wife: 0.8014
 3 - father: 0.8007
 4 - young: 0.7931
 5 - woman: 0.7862
 6 - named: 0.7817
 7 - town: 0.7712
 8 - meets: 0.7598
 9 - lady: 0.7560
 * model saved at 'model_word2vec-sampled-softmax/w2v_model'
 it: 15100 | loss: 6.324 - 721.06s
 it: 15200 | loss: 6.425 - 725.81s
 it: 15300 | loss: 6.447 - 730.32s
 it: 15400 | loss: 6.400 - 734.77s
 it: 15500 | loss: 6.414 - 739.24s
 it: 15600 | loss: 6.271 - 743.87s
 it: 15700 | loss: 6.368 - 748.50s
 it: 15800 | loss: 6.365 - 753.13s
 it: 15900 | loss: 6.391 - 757.77s
 it: 16000 | loss: 6.399 - 762.44s

Similar words with [man]
 1 - father: 0.7

 it: 28900 | loss: 6.337 - 1349.80s
 it: 29000 | loss: 6.278 - 1354.25s

Similar words with [man]
 1 - woman: 0.7170
 2 - who's: 0.6417
 3 - father: 0.6391
 4 - married: 0.6331
 5 - girl: 0.6269
 6 - runs: 0.6187
 7 - meets: 0.6175
 8 - wife: 0.6123
 9 - kills: 0.6111
 it: 29100 | loss: 6.373 - 1358.74s
 it: 29200 | loss: 6.370 - 1363.21s
 it: 29300 | loss: 6.344 - 1367.66s
 it: 29400 | loss: 6.350 - 1372.12s
 it: 29500 | loss: 6.304 - 1376.58s
 it: 29600 | loss: 6.285 - 1381.02s
 it: 29700 | loss: 6.342 - 1385.48s
 it: 29800 | loss: 6.330 - 1389.97s
 it: 29900 | loss: 6.310 - 1394.46s
 it: 30000 | loss: 6.205 - 1398.92s

Similar words with [man]
 1 - woman: 0.7336
 2 - married: 0.6729
 3 - meets: 0.6637
 4 - father: 0.6620
 5 - kills: 0.6365
 6 - boy: 0.6342
 7 - wife: 0.6246
 8 - accused: 0.6192
 9 - murdered: 0.6190
 * model saved at 'model_word2vec-sampled-softmax/w2v_model'
 it: 30100 | loss: 6.221 - 1403.68s
 it: 30200 | loss: 6.310 - 1408.16s
 it: 30300 | loss: 6.351 - 1412.64s


 it: 43100 | loss: 6.328 - 1986.16s
 it: 43200 | loss: 6.409 - 1990.58s
 it: 43300 | loss: 6.382 - 1995.04s
 it: 43400 | loss: 6.252 - 1999.50s
 it: 43500 | loss: 6.284 - 2003.99s
 it: 43600 | loss: 6.399 - 2008.42s
 it: 43700 | loss: 6.398 - 2012.89s
 it: 43800 | loss: 6.336 - 2017.35s
 it: 43900 | loss: 6.304 - 2021.82s
 it: 44000 | loss: 6.368 - 2026.26s

Similar words with [man]
 1 - woman: 0.5908
 2 - girl: 0.5350
 3 - boy: 0.5244
 4 - father: 0.5096
 5 - who's: 0.5061
 6 - guy: 0.5054
 7 - marry: 0.4844
 8 - doctor: 0.4740
 9 - loves: 0.4729
 it: 44100 | loss: 6.389 - 2030.71s
 it: 44200 | loss: 6.332 - 2035.10s
 it: 44300 | loss: 6.475 - 2039.41s
 it: 44400 | loss: 6.276 - 2043.78s
 it: 44500 | loss: 6.236 - 2048.06s
 it: 44600 | loss: 6.286 - 2052.62s
 it: 44700 | loss: 6.148 - 2057.19s
 it: 44800 | loss: 6.235 - 2062.35s
 it: 44900 | loss: 6.208 - 2067.23s
 it: 45000 | loss: 6.381 - 2071.87s

Similar words with [man]
 1 - woman: 0.6498
 2 - girl: 0.5633
 3 - guy: 0.5549
 4 - b

 it: 57900 | loss: 6.183 - 2651.60s
 it: 58000 | loss: 6.260 - 2656.05s

Similar words with [man]
 1 - woman: 0.6183
 2 - named: 0.5866
 3 - girl: 0.5168
 4 - guy: 0.5065
 5 - father: 0.4892
 6 - boy: 0.4812
 7 - brother: 0.4774
 8 - wife: 0.4741
 9 - whose: 0.4702
 it: 58100 | loss: 6.314 - 2660.53s
 it: 58200 | loss: 6.288 - 2664.98s
 it: 58300 | loss: 6.249 - 2669.47s
 it: 58400 | loss: 6.267 - 2673.93s
 it: 58500 | loss: 6.359 - 2678.39s
 it: 58600 | loss: 6.297 - 2682.85s
 it: 58700 | loss: 6.002 - 2687.32s
 it: 58800 | loss: 6.249 - 2691.78s
 it: 58900 | loss: 6.317 - 2696.23s
 it: 59000 | loss: 6.276 - 2700.69s

Similar words with [man]
 1 - woman: 0.5932
 2 - named: 0.5755
 3 - guy: 0.5364
 4 - girl: 0.5178
 5 - father: 0.4911
 6 - boy: 0.4876
 7 - brother: 0.4764
 8 - lady: 0.4749
 9 - criminal: 0.4705
 it: 59100 | loss: 6.274 - 2705.14s
 it: 59200 | loss: 6.372 - 2709.44s
 it: 59300 | loss: 6.452 - 2713.77s
 it: 59400 | loss: 6.174 - 2718.23s
 it: 59500 | loss: 6.277 - 2722.7

 it: 72100 | loss: 6.425 - 3288.40s
 it: 72200 | loss: 6.206 - 3292.95s
 it: 72300 | loss: 6.235 - 3297.42s
 it: 72400 | loss: 6.320 - 3301.85s
 it: 72500 | loss: 6.257 - 3306.34s
 it: 72600 | loss: 6.262 - 3310.82s
 it: 72700 | loss: 6.299 - 3315.31s
 it: 72800 | loss: 6.169 - 3319.74s
 it: 72900 | loss: 6.183 - 3324.00s
 it: 73000 | loss: 6.372 - 3328.41s

Similar words with [man]
 1 - woman: 0.5691
 2 - girl: 0.5630
 3 - boy: 0.5308
 4 - named: 0.5000
 5 - lady: 0.4970
 6 - friend's: 0.4647
 7 - brother: 0.4267
 8 - father: 0.4215
 9 - priest: 0.4146
 it: 73100 | loss: 6.203 - 3332.75s
 it: 73200 | loss: 6.115 - 3337.27s
 it: 73300 | loss: 6.234 - 3341.91s
 it: 73400 | loss: 6.316 - 3346.49s
 it: 73500 | loss: 6.157 - 3350.95s
 it: 73600 | loss: 6.244 - 3355.42s
 it: 73700 | loss: 6.297 - 3359.92s
 it: 73800 | loss: 5.918 - 3364.47s
 it: 73900 | loss: 6.270 - 3368.98s
 it: 74000 | loss: 6.304 - 3373.45s

Similar words with [man]
 1 - woman: 0.6345
 2 - girl: 0.5355
 3 - boy: 0.5317


 it: 86600 | loss: 6.173 - 3940.91s
 it: 86700 | loss: 6.255 - 3945.78s
 it: 86800 | loss: 6.286 - 3950.43s
 it: 86900 | loss: 6.183 - 3955.08s
 it: 87000 | loss: 6.203 - 3959.75s

Similar words with [man]
 1 - woman: 0.5863
 2 - boy: 0.5506
 3 - girl: 0.5448
 4 - guy: 0.4804
 5 - lady: 0.4794
 6 - man's: 0.4546
 7 - priest: 0.4489
 8 - accused: 0.4437
 9 - son: 0.4349
 it: 87100 | loss: 6.102 - 3964.35s
 it: 87200 | loss: 6.181 - 3968.81s
 it: 87300 | loss: 6.203 - 3973.28s
 it: 87400 | loss: 6.210 - 3977.72s
 it: 87500 | loss: 6.270 - 3982.19s
 it: 87600 | loss: 6.269 - 3986.64s
 it: 87700 | loss: 6.216 - 3991.11s
 it: 87800 | loss: 6.316 - 3995.58s
 it: 87900 | loss: 6.222 - 4000.13s
 it: 88000 | loss: 6.186 - 4004.89s

Similar words with [man]
 1 - woman: 0.5855
 2 - boy: 0.5703
 3 - girl: 0.5501
 4 - lady: 0.5074
 5 - guy: 0.4554
 6 - daughter: 0.4424
 7 - friend's: 0.4354
 8 - man's: 0.4328
 9 - loretta: 0.4307
 it: 88100 | loss: 6.145 - 4009.55s
 it: 88200 | loss: 6.169 - 4014.1

In [5]:
def calc_words(word1, word2, word3):
    idx1 = data.word2idx[word1]
    idx2 = data.word2idx[word2]
    idx3 = data.word2idx[word3]
    
    total_emb = np.array(sess.run(w2v.emb_w))              # (N, D)
    word_emb_1 = total_emb[idx1][None, :]                  # (1, D)
    word_emb_2 = total_emb[idx2][None, :]                  # (1, D)
    word_emb_3 = total_emb[idx3][None, :]                  # (1, D)
    
    query_emb = word_emb_1 - word_emb_2 + word_emb_3
    
    sim_matrix = cosine_similarity(query_emb, total_emb)   # (1, N)
    scores = sim_matrix[0]
    ids = np.argsort(-1 * scores)                          # 내림 차순
    scores = sorted(scores, reverse=True)                  # 내림 차순

    print(f"\nSimilar words with [{word1} - {word2} + {word3}]")
    for i in range(1, 10):
        cosine_sim, id = scores[i], ids[i]
        print("{:2d} - {}: {:.4f}".format(i, data.idx2word[id], cosine_sim))

In [11]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    w2v.restore(sess)
    calc_words("father", "man", "woman")
    calc_words("brother", "man", "woman")
    calc_words("son", "man", "woman")

 - Restoring variables...
INFO:tensorflow:Restoring parameters from model_word2vec-sampled-softmax/w2v_model
 * model restored 

Similar words with [father - man + woman]
 1 - woman: 0.5935
 2 - son: 0.5519
 3 - mother: 0.5391
 4 - daughter: 0.5023
 5 - husband: 0.4940
 6 - wife: 0.4678
 7 - sister: 0.4476
 8 - brother: 0.4407
 9 - pregnant: 0.4258

Similar words with [brother - man + woman]
 1 - sister: 0.6779
 2 - woman: 0.5811
 3 - father: 0.5192
 4 - mother: 0.5188
 5 - boyfriend: 0.5134
 6 - dad: 0.5038
 7 - daughter: 0.5024
 8 - younger: 0.4947
 9 - son: 0.4721

Similar words with [son - man + woman]
 1 - woman: 0.5892
 2 - daughter: 0.5778
 3 - father: 0.5697
 4 - mother: 0.5355
 5 - girlfriend: 0.5075
 6 - wife: 0.4992
 7 - sister: 0.4623
 8 - lover: 0.4314
 9 - husband: 0.4259
