# PTB 테이터 불러오기

In [None]:
import os

if 'ptb.train.txt' in os.listdir():
    with open("./ptb.train.txt", 'r') as f:
        text = f.read()        
else:
    from urllib.request import urlopen
    url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'
    html = urlopen(url)
    text = html.read().decode()

    with open("./ptb.train.txt", 'w') as f:
        f.write(text)
words = text.replace('\n', '<eos>')

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000, filters='')
tokenizer.fit_on_texts([words])
sequences = tokenizer.texts_to_sequences([words])[0]

In [None]:
import numpy as np
def create_contexts_target(corpus, window_size=1):
    target = corpus[window_size:-window_size]
    contexts = []
    
    for idx in range(window_size, len(corpus)-window_size):
        cs = []
        for t in range(-window_size, window_size+1):
            if t == 0:
                continue
            cs.append(corpus[idx+t])
        contexts.append(cs)
        
    return np.array(contexts), np.array(target)

window_size = 5
contexts, target = create_contexts_target(sequences, window_size)

In [None]:
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.Embedding(10000, 100, input_shape=(10,)),
    keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)),
    keras.layers.Dense(10000, activation='softmax')
])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 100)           1000000   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 10000)             1010000   
Total params: 2,010,000
Trainable params: 2,010,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='sparse_categorical_crossentropy')
model.fit(contexts, target, batch_size=128, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f16d50228d0>

In [None]:
embedding_layer = model.layers[0]
wordvecs = embedding_layer.get_weights()[0]
wordvecs.shape

(10000, 100)

In [None]:
def most_similar(query, word_to_id, similarity_matirx, top=5):
    id_to_word = {id_:word for word, id_ in word_to_id.items()}
    if query not in word_to_id:
        print('%s(을)를 찾을 수 없습니다.' % query)
        return
    
    print('\n[query] ' + query)
    query_id = word_to_id[query]
    similarity = similarity_matirx[query_id]
    
    # 코사인 유사도 기준으로 내림차순으로 출력
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        
        count += 1
        if count >= top:
            return

from sklearn.metrics.pairwise import cosine_similarity
similarity_matirx = cosine_similarity(wordvecs)

querys = ['korea', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, tokenizer.word_index, similarity_matirx)


[query] korea
 carolina: 0.8236491
 taiwan: 0.82193553
 africa: 0.8071295
 gardens: 0.72406775
 african: 0.7103499

[query] year
 month: 0.82229835
 week: 0.81950843
 spring: 0.7382311
 summer: 0.73065233
 decade: 0.6150091

[query] car
 building: 0.81746835
 domestic: 0.8134144
 computer: 0.8111441
 product: 0.8103106
 line: 0.80804193

[query] toyota
 printing: 0.7315754
 honda: 0.7289441
 rivals: 0.7222565
 nissan: 0.71424437
 hitachi: 0.6996551


In [31]:
def analogy(w1, w2, w3):
  a = tokenizer.word_index[w1]
  b = tokenizer.word_index[w2]
  c = tokenizer.word_index[w3]
  s = wordvecs[b] + wordvecs[c] - wordvecs[a]

  dist = -cosine_similarity(wordvecs, s.reshape(1, -1)).ravel()
  id_to_word = {id_:word for word, id_ in tokenizer.word_index.items()}

  for i in dist.argsort()[:5]:
      print(' %s: %s' % (id_to_word[i], dist[i]))

analogy("king", "man", "queen")

 man: -0.8297522
 father: -0.7546481
 wife: -0.6914324
 psychological: -0.6879203
 artist: -0.6876626


In [32]:
analogy("take", "took", "go")

 went: -0.78825736
 took: -0.7797737
 goes: -0.7756917
 became: -0.7514231
 started: -0.7483656


In [43]:
analogy("started", "start", "grew")

 slid: -0.78581274
 grow: -0.7752774
 cigna: -0.76858586
 slipped: -0.75407887
 bond-equivalent: -0.7518454


# Gensim

훈련하고 싶은 텍스트를 문서 리스트 안의 토큰 리스트 형식으로 넘기면 된다.

In [None]:
from gensim.models import Word2Vec

model = Word2Vec([words.split()], iter=100)

In [None]:
model.wv.most_similar("korea")

  if np.issubdtype(vec.dtype, np.int):


[('south', 0.9646520614624023),
 ('opened', 0.9176843166351318),
 ('taiwan', 0.916343092918396),
 ('establishing', 0.9022989273071289),
 ('vowed', 0.8992795348167419),
 ('diplomatic', 0.8976802825927734),
 ('enact', 0.8959468603134155),
 ('applied', 0.892693817615509),
 ('johnson', 0.879311203956604),
 ('control', 0.8779106736183167)]

In [None]:
model.wv.most_similar('year')

  if np.issubdtype(vec.dtype, np.int):


[('unsuccessfully', 0.9074409008026123),
 ('getting', 0.8797416687011719),
 ('triple', 0.8701896071434021),
 ('week', 0.8668280839920044),
 ('spring', 0.8605564832687378),
 ('win', 0.8502823114395142),
 ('earlier', 0.8499799370765686),
 ('units', 0.8388738632202148),
 ('germans', 0.8373819589614868),
 ('february', 0.8328527212142944)]

In [None]:
model.wv.most_similar(positive=['queen', 'man'], negative=['king'])

  if np.issubdtype(vec.dtype, np.int):


[('hero', 0.8398237228393555),
 ('unusually', 0.8334057331085205),
 ('recommend', 0.8330715894699097),
 ('brief', 0.8311742544174194),
 ('fiber', 0.8297237157821655),
 ('western', 0.8273922204971313),
 ('reality', 0.8273218870162964),
 ('sweet', 0.8270956873893738),
 ('cases', 0.8245373964309692),
 ('stadium', 0.8243252038955688)]

In [None]:
tokenizer.word_index['king']

2241

In [None]:
wordvecs[2241]

array([-1.20836906e-01, -2.80994058e-01,  3.51361856e-02, -4.37688008e-02,
        1.67382255e-01, -7.45799392e-03,  2.09649622e-01, -2.03219861e-01,
       -1.37367696e-01, -1.75746351e-01, -2.06780300e-01, -2.26434246e-01,
        3.06367964e-01, -4.53574836e-01,  2.27138326e-01,  1.80569440e-01,
        3.79854627e-02, -2.11192757e-01,  7.43130967e-02, -1.01363264e-01,
        4.59703207e-01,  3.17494243e-01, -2.48105928e-01, -2.48108491e-01,
       -9.67620015e-02,  1.44218490e-01,  3.88319314e-01,  1.90000206e-01,
        1.18373118e-01, -1.85312167e-01, -3.69629949e-01, -3.35750252e-01,
       -3.77131999e-01, -3.15516561e-01, -2.38127887e-01,  3.35058309e-02,
       -1.28123641e-01,  1.30529162e-02,  3.52896117e-02, -5.82961999e-02,
       -1.53089399e-02, -5.58379404e-02,  3.64413392e-03, -1.27347648e-01,
       -9.45928879e-03,  3.47996861e-01,  1.55066118e-01, -2.37420961e-01,
       -1.67009741e-01, -4.79563661e-02,  3.59300703e-01,  4.39852566e-01,
        1.20820953e-02, -