In [1]:
import os
import tensorflow as tf
import numpy as np
import time, random

from sklearn.metrics.pairwise import cosine_similarity
from model import Word2vec
from util import Generator

In [None]:
tf.nn.nce_loss()

In [2]:
corpus_path = "imdb.corpus"

batch_size = 128        # 배치 사이즈
window_size = 4         # 앞 뒤 window size
vocab_size = 10000      # vocab 수
shuffle = True          # 데이터 shuffle
emb_dim = 128           # 단어 embedding dimension
num_samples = 1024      # samples numbers for sampled softmax or NCE
learning_rate = 0.001   # Learning rate
iters = 100000     # iteration 수
use_clip = True        # Gradient clipping 쓸지 여부

## Define

In [3]:
## Data input pipeline
data = Generator(corpus_path, vocab_size, batch_size, window_size, shuffle)
## Model graph
w2v = Word2vec("word2vec", emb_dim, vocab_size, num_samples, use_clip, learning_rate)

Now building vocab
corpus : imdb.corpus vocab_size : 10000
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [4]:
def similar_words(word="good"):
    idx = data.word2idx[word]
    total_emb = np.array(sess.run(w2v.emb_w))              # (N, D)
    query_emb = total_emb[idx][None, :]                    # (1, D)
    
    sim_matrix = cosine_similarity(query_emb, total_emb)   # (1, N)
    scores = sim_matrix[0]
    ids = np.argsort(-1 * scores)                          # 내림 차순
    scores = sorted(scores, reverse=True)                  # 내림 차순

    print("\nSimilar words with [{}]".format(word))
    for i in range(1, 10):
        cosine_sim, id = scores[i], ids[i]
        print("{:2d} - {}: {:.4f}".format(i, data.idx2word[id], cosine_sim))

start_time = time.time()
data_point = 0
avg_loss, it_log, it_save, it_sample = .0, 100, 5000, 1000

with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    for it in range(0, iters):
        inputs, labels = data.next()
        loss, update = sess.run([w2v.loss, w2v.update],
                                feed_dict={w2v.inputs: inputs, w2v.labels: labels})
        avg_loss += loss

        if it % it_log == 0 and it:
            print(" it: {:4d} | loss: {:.3f} - {:.2f}s".format(
                it, avg_loss / it_log, time.time() - start_time))
            avg_loss = .0

        if it % it_sample == 0:
            similar_words(word="man")
        if it % it_save == 0 and it > 0:
            w2v.save(sess)


Similar words with [man]
 1 - question: 0.3067
 2 - alongside: 0.2905
 3 - harder: 0.2898
 4 - suspect: 0.2782
 5 - '70s: 0.2719
 6 - welch: 0.2687
 7 - meaningless: 0.2680
 8 - noises: 0.2677
 9 - emotional: 0.2672
 it:  100 | loss: 8.339 - 4.16s
 it:  200 | loss: 8.073 - 7.41s
 it:  300 | loss: 7.977 - 11.02s
 it:  400 | loss: 7.682 - 14.56s
 it:  500 | loss: 7.674 - 18.10s
 it:  600 | loss: 7.425 - 21.61s
 it:  700 | loss: 7.475 - 25.30s
 it:  800 | loss: 7.266 - 29.21s
 it:  900 | loss: 7.075 - 32.74s
 it: 1000 | loss: 7.137 - 36.43s

Similar words with [man]
 1 - hollywood: 0.9218
 2 - sense: 0.9149
 3 - version: 0.9122
 4 - interesting: 0.9106
 5 - gore: 0.9101
 6 - meet: 0.9100
 7 - violence: 0.9095
 8 - 10: 0.9095
 9 - used: 0.9085
 it: 1100 | loss: 7.028 - 40.16s
 it: 1200 | loss: 6.944 - 44.45s
 it: 1300 | loss: 6.952 - 48.87s
 it: 1400 | loss: 6.849 - 53.45s
 it: 1500 | loss: 6.812 - 58.17s
 it: 1600 | loss: 6.841 - 62.55s
 it: 1700 | loss: 6.827 - 66.80s
 it: 1800 | loss: 

 it: 14200 | loss: 6.469 - 582.31s
 it: 14300 | loss: 6.361 - 586.67s
 it: 14400 | loss: 6.365 - 590.82s
 it: 14500 | loss: 6.373 - 594.96s
 it: 14600 | loss: 6.426 - 599.10s
 it: 14700 | loss: 6.444 - 603.25s
 it: 14800 | loss: 6.427 - 607.66s
 it: 14900 | loss: 6.382 - 612.53s
 it: 15000 | loss: 6.443 - 617.40s

Similar words with [man]
 1 - wife: 0.8076
 2 - charles: 0.8039
 3 - meets: 0.7980
 4 - husband: 0.7940
 5 - named: 0.7896
 6 - woman: 0.7880
 7 - whose: 0.7868
 8 - finds: 0.7827
 9 - young: 0.7799
 * model saved at 'models/w2v_model'
 it: 15100 | loss: 6.325 - 622.25s
 it: 15200 | loss: 6.423 - 626.59s
 it: 15300 | loss: 6.447 - 630.87s
 it: 15400 | loss: 6.399 - 635.08s
 it: 15500 | loss: 6.412 - 639.23s
 it: 15600 | loss: 6.271 - 643.41s
 it: 15700 | loss: 6.366 - 647.73s
 it: 15800 | loss: 6.361 - 652.19s
 it: 15900 | loss: 6.393 - 656.83s
 it: 16000 | loss: 6.398 - 661.21s

Similar words with [man]
 1 - wife: 0.7705
 2 - sent: 0.7634
 3 - husband: 0.7584
 4 - charles: 0

 it: 29100 | loss: 6.372 - 1256.93s
 it: 29200 | loss: 6.372 - 1261.32s
 it: 29300 | loss: 6.344 - 1265.66s
 it: 29400 | loss: 6.349 - 1269.76s
 it: 29500 | loss: 6.305 - 1273.77s
 it: 29600 | loss: 6.280 - 1278.17s
 it: 29700 | loss: 6.343 - 1283.00s
 it: 29800 | loss: 6.329 - 1287.88s
 it: 29900 | loss: 6.308 - 1293.04s
 it: 30000 | loss: 6.207 - 1297.72s

Similar words with [man]
 1 - woman: 0.7426
 2 - wife: 0.6939
 3 - married: 0.6920
 4 - meets: 0.6793
 5 - finds: 0.6578
 6 - living: 0.6342
 7 - marry: 0.6247
 8 - murdered: 0.6163
 9 - son: 0.6160
 * model saved at 'models/w2v_model'
 it: 30100 | loss: 6.223 - 1303.25s
 it: 30200 | loss: 6.308 - 1308.20s
 it: 30300 | loss: 6.349 - 1312.99s
 it: 30400 | loss: 6.358 - 1317.13s
 it: 30500 | loss: 6.239 - 1321.28s
 it: 30600 | loss: 6.360 - 1325.53s
 it: 30700 | loss: 6.453 - 1329.79s
 it: 30800 | loss: 6.390 - 1333.89s
 it: 30900 | loss: 6.192 - 1337.89s
 it: 31000 | loss: 6.385 - 1341.87s

Similar words with [man]
 1 - woman: 0.701

 it: 43900 | loss: 6.303 - 1864.35s
 it: 44000 | loss: 6.367 - 1868.49s

Similar words with [man]
 1 - woman: 0.6079
 2 - doctor: 0.5359
 3 - marry: 0.5347
 4 - loves: 0.5336
 5 - boy: 0.5298
 6 - who's: 0.5271
 7 - possessed: 0.5111
 8 - young: 0.5022
 9 - girl: 0.4988
 it: 44100 | loss: 6.387 - 1872.58s
 it: 44200 | loss: 6.335 - 1876.69s
 it: 44300 | loss: 6.470 - 1880.77s
 it: 44400 | loss: 6.277 - 1884.90s
 it: 44500 | loss: 6.237 - 1888.95s
 it: 44600 | loss: 6.287 - 1892.87s
 it: 44700 | loss: 6.146 - 1896.85s
 it: 44800 | loss: 6.238 - 1901.05s
 it: 44900 | loss: 6.202 - 1905.23s
 it: 45000 | loss: 6.381 - 1909.42s

Similar words with [man]
 1 - woman: 0.6827
 2 - rich: 0.5809
 3 - doctor: 0.5692
 4 - marry: 0.5580
 5 - who's: 0.5563
 6 - whose: 0.5480
 7 - possessed: 0.5463
 8 - girl: 0.5426
 9 - boy: 0.5415
 * model saved at 'models/w2v_model'
 it: 45100 | loss: 6.382 - 1913.84s
 it: 45200 | loss: 6.228 - 1918.02s
 it: 45300 | loss: 6.181 - 1922.01s
 it: 45400 | loss: 6.123 -

 it: 58200 | loss: 6.285 - 2462.65s
 it: 58300 | loss: 6.249 - 2466.87s
 it: 58400 | loss: 6.265 - 2470.97s
 it: 58500 | loss: 6.361 - 2475.03s
 it: 58600 | loss: 6.299 - 2479.15s
 it: 58700 | loss: 6.001 - 2483.24s
 it: 58800 | loss: 6.248 - 2487.38s
 it: 58900 | loss: 6.319 - 2492.17s
 it: 59000 | loss: 6.276 - 2497.13s

Similar words with [man]
 1 - woman: 0.6284
 2 - named: 0.5343
 3 - guy: 0.5277
 4 - finds: 0.5024
 5 - girl: 0.4811
 6 - married: 0.4704
 7 - boy: 0.4636
 8 - loves: 0.4636
 9 - calls: 0.4474
 it: 59100 | loss: 6.276 - 2502.12s
 it: 59200 | loss: 6.367 - 2506.97s
 it: 59300 | loss: 6.451 - 2511.62s
 it: 59400 | loss: 6.175 - 2516.19s
 it: 59500 | loss: 6.275 - 2520.67s
 it: 59600 | loss: 6.368 - 2524.92s
 it: 59700 | loss: 6.422 - 2529.08s
 it: 59800 | loss: 6.173 - 2533.09s
 it: 59900 | loss: 6.271 - 2537.09s
 it: 60000 | loss: 6.250 - 2541.12s

Similar words with [man]
 1 - woman: 0.6455
 2 - named: 0.5383
 3 - finds: 0.5147
 4 - guy: 0.5012
 5 - girl: 0.4882
 6 -

 it: 73000 | loss: 6.374 - 3103.84s

Similar words with [man]
 1 - woman: 0.6258
 2 - boy: 0.4999
 3 - lady: 0.4708
 4 - named: 0.4596
 5 - girl: 0.4520
 6 - finds: 0.4167
 7 - guy: 0.4133
 8 - handsome: 0.4096
 9 - brother: 0.4077
 it: 73100 | loss: 6.207 - 3108.10s
 it: 73200 | loss: 6.115 - 3112.32s
 it: 73300 | loss: 6.238 - 3116.60s
 it: 73400 | loss: 6.320 - 3121.04s
 it: 73500 | loss: 6.156 - 3125.52s
 it: 73600 | loss: 6.246 - 3129.86s
 it: 73700 | loss: 6.296 - 3134.12s
 it: 73800 | loss: 5.918 - 3138.36s
 it: 73900 | loss: 6.272 - 3142.44s
 it: 74000 | loss: 6.311 - 3146.52s

Similar words with [man]
 1 - woman: 0.6765
 2 - boy: 0.4978
 3 - named: 0.4512
 4 - handsome: 0.4408
 5 - husband: 0.4258
 6 - girl: 0.4237
 7 - lady: 0.4181
 8 - mysterious: 0.4095
 9 - brother: 0.3920
 it: 74100 | loss: 6.264 - 3150.78s
 it: 74200 | loss: 6.230 - 3155.04s
 it: 74300 | loss: 6.257 - 3159.27s
 it: 74400 | loss: 6.243 - 3163.58s
 it: 74500 | loss: 6.350 - 3168.02s
 it: 74600 | loss: 6.13

 it: 87100 | loss: 6.098 - 3727.05s
 it: 87200 | loss: 6.182 - 3731.41s
 it: 87300 | loss: 6.203 - 3735.67s
 it: 87400 | loss: 6.211 - 3739.85s
 it: 87500 | loss: 6.268 - 3743.99s
 it: 87600 | loss: 6.264 - 3748.13s
 it: 87700 | loss: 6.217 - 3752.34s
 it: 87800 | loss: 6.317 - 3756.70s
 it: 87900 | loss: 6.222 - 3761.05s
 it: 88000 | loss: 6.187 - 3765.35s

Similar words with [man]
 1 - woman: 0.6129
 2 - girl: 0.5339
 3 - father: 0.5184
 4 - lady: 0.5171
 5 - boy: 0.4882
 6 - son: 0.4485
 7 - daughter's: 0.4474
 8 - daughter: 0.4473
 9 - pregnant: 0.4418
 it: 88100 | loss: 6.149 - 3769.70s
 it: 88200 | loss: 6.172 - 3774.10s
 it: 88300 | loss: 6.379 - 3778.42s
 it: 88400 | loss: 6.227 - 3782.71s
 it: 88500 | loss: 6.333 - 3787.03s
 it: 88600 | loss: 6.275 - 3791.36s
 it: 88700 | loss: 6.129 - 3795.85s
 it: 88800 | loss: 6.239 - 3800.70s
 it: 88900 | loss: 6.250 - 3805.83s
 it: 89000 | loss: 6.406 - 3811.09s

Similar words with [man]
 1 - woman: 0.5947
 2 - girl: 0.5537
 3 - lady: 0.5

In [5]:
def calc_words(word1, word2, word3):
    idx1 = data.word2idx[word1]
    idx2 = data.word2idx[word2]
    idx3 = data.word2idx[word3]
    
    total_emb = np.array(sess.run(w2v.emb_w))              # (N, D)
    word_emb_1 = total_emb[idx1][None, :]                  # (1, D)
    word_emb_2 = total_emb[idx2][None, :]                  # (1, D)
    word_emb_3 = total_emb[idx3][None, :]                  # (1, D)
    
    query_emb = word_emb_1 - word_emb_2 + word_emb_3
    
    sim_matrix = cosine_similarity(query_emb, total_emb)   # (1, N)
    scores = sim_matrix[0]
    ids = np.argsort(-1 * scores)                          # 내림 차순
    scores = sorted(scores, reverse=True)                  # 내림 차순

    print(f"\nSimilar words with [{word1} - {word2} + {word3}]")
    for i in range(1, 10):
        cosine_sim, id = scores[i], ids[i]
        print("{:2d} - {}: {:.4f}".format(i, data.idx2word[id], cosine_sim))

In [15]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    w2v.restore(sess)
    calc_words("father", "man", "woman")
    calc_words("brother", "man", "woman")
    calc_words("son", "man", "woman")
    calc_words("husband", "man", "woman")

 - Restoring variables...
INFO:tensorflow:Restoring parameters from models/w2v_model
 * model restored 

Similar words with [father - man + woman]
 1 - woman: 0.6776
 2 - mother: 0.6346
 3 - son: 0.5731
 4 - wife: 0.5570
 5 - husband: 0.5481
 6 - daughter: 0.5050
 7 - sister: 0.4965
 8 - pregnant: 0.4462
 9 - relationship: 0.4399

Similar words with [brother - man + woman]
 1 - sister: 0.6767
 2 - woman: 0.6400
 3 - mother: 0.5450
 4 - wife: 0.5153
 5 - father: 0.5038
 6 - elder: 0.4924
 7 - son: 0.4897
 8 - boyfriend: 0.4856
 9 - boy: 0.4842

Similar words with [son - man + woman]
 1 - father: 0.6215
 2 - mother: 0.6093
 3 - woman: 0.5991
 4 - wife: 0.5660
 5 - daughter: 0.5493
 6 - sister: 0.5149
 7 - adopted: 0.5098
 8 - brother: 0.4653
 9 - husband: 0.4648

Similar words with [husband - man + woman]
 1 - woman: 0.6052
 2 - wife: 0.5863
 3 - mother: 0.5787
 4 - father: 0.5535
 5 - married: 0.5523
 6 - friend's: 0.5496
 7 - pregnant: 0.5484
 8 - returns: 0.5330
 9 - abusive: 0.5202
