### SkipGram예제

In [None]:
import collections
import math
import random
import numpy as np
import tensorflow as tf
import sg_util

In [None]:
# 파일을 가져옴
filename=sg_util.maybe_download()

# 파일을 읽어서 모든 단어들을 words에 list 형태로 집어넣음
words = sg_util.read_data(filename)
vocabulary_size = 50000

# data - 전체 문서에서 각 단어들의 id를 list로 나열
# count - 각 단어들이 몇번 나왔는지
# dicionary와 reverse_dictionary는 id-word를 매핑한 부분
data, count, dictionary, reverse_dictionary = sg_util.build_dataset(words,vocabulary_size)

# 나중에 embedding을 마치고 테스트 해볼 갯수와 그 index 최댓값
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.randint(1,valid_window, valid_size)

In [None]:
def generate_batch(batch_size, skip_window, data_index):
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # 아무것도 안들어있는 deque를 생성
    generated_batch = 0
    while True:
        if data_index + span > len(data):  # data의 길이에 끝에 다다르면 data의 첫지점으로 되돌아감
            data_index = 0
        for _ in range(span):  # span만큼 deque에 삽입
            buffer.append(data[data_index])
            data_index = data_index + 1
        data_index = data_index - span + 1  # data index를 원래대로 돌려놓고, 한칸 움직임
        for i in range(1,skip_window+1):  # deque에 들어가있는 data로 학습 데이터 생성
            batch[generated_batch] = buffer[skip_window]
            labels[generated_batch] = buffer[skip_window - i]
            generated_batch += 1
            batch[generated_batch] = buffer[skip_window]
            labels[generated_batch] = buffer[skip_window + i]
            generated_batch += 1
            if generated_batch == batch_size:
                break
        if generated_batch == batch_size:
            break
    return batch, labels, data_index

In [None]:
batch_size=100
skip_window=2
embedding_size=100
n_iteration = 100000 #주어졌던 기본 코드는 1000000

X = tf.placeholder(tf.int32,[None])
y_ = tf.placeholder(tf.int32,(None,1))

embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size], -1,1))
embedded = tf.nn.embedding_lookup(embeddings,X)
weights = tf.Variable(tf.truncated_normal(shape=(vocabulary_size,embedding_size),stddev=1.0/math.sqrt(embedding_size)))
bias = tf.zeros(vocabulary_size)

loss=tf.reduce_mean(tf.nn.nce_loss(weights=weights, biases=bias, labels=y_, inputs=embedded, num_sampled=64, num_classes=vocabulary_size))
optimizer=tf.train.GradientDescentOptimizer(1.0).minimize(loss)

In [None]:
embeddings

In [None]:
embedded

In [None]:
X

In [None]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    average_loss = 0
    data_index = 0
    step = 0
    for epoch in range(n_iteration):
        step += 1

        batch_input, batch_output, data_index = generate_batch(batch_size, skip_window, data_index)

        _, loss_eval = sess.run([optimizer, loss], feed_dict={X: batch_input, y_: batch_output})
        average_loss += loss_eval

        if step % 10000 == 0:
            average_loss /= 10000
            print('average loss at step ' + str(step) + ': ' + str(average_loss) + ' epoch is ' + str(epoch))
            sg_util.closest_words(sess, embeddings.eval(), reverse_dictionary, valid_examples)
            average_loss = 0

In [None]:
valid_examples

In [None]:
reverse_dictionary[1]

In [None]:
average_loss

# 구분선

### CBOW 예제

In [None]:
import collections
import math
import random
import numpy as np
import tensorflow as tf
import sg_util

In [None]:
# 파일을 가져옴
filename=sg_util.maybe_download()

# 파일을 읽어서 모든 단어들을 words에 list 형태로 집어넣음
words = sg_util.read_data(filename)
vocabulary_size = 50000

# data - 전체 문서에서 각 단어들의 id를 list로 나열
# count - 각 단어들이 몇번 나왔는지
# dicionary와 reverse_dictionary는 id-word를 매핑한 부분
data, count, dictionary, reverse_dictionary = sg_util.build_dataset(words,vocabulary_size)

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.randint(1,valid_window, valid_size)

In [None]:
def generate_batch(batch_size, skip_window, data_index):
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # 아무것도 안들어있는 deque를 생성
    generated_batch = 0
    while True:
        if data_index + span > len(data):  # data의 길이에 끝에 다다르면 data의 첫지점으로 되돌아감
            data_index = 0
        for _ in range(span):  # span만큼 deque에 삽입
            buffer.append(data[data_index])
            data_index = data_index + 1
        data_index = data_index - span + 1  # data index를 원래대로 돌려놓고, 한칸 움직임
        for i in range(1,skip_window+1):  # deque에 들어가있는 data로 학습 데이터 생성
            # CBOW 형태로 batch 생성 및 학습
            batch[generated_batch] = buffer[skip_window - i]
            labels[generated_batch] = buffer[skip_window]
            generated_batch += 1
            batch[generated_batch] = buffer[skip_window + i]
            labels[generated_batch] = buffer[skip_window]
            generated_batch += 1
            if generated_batch == batch_size:
                break
        if generated_batch == batch_size:
            break
    return batch, labels, data_index

In [None]:
batch_size=100
skip_window=2
embedding_size=100
n_iteration = 10000

X = tf.placeholder(tf.int32,[None])
y_ = tf.placeholder(tf.int32,(None,1))

embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size], -1,1))
embedded = tf.nn.embedding_lookup(embeddings,X)
weights = tf.Variable(tf.truncated_normal(shape=(vocabulary_size,embedding_size),stddev=1.0/math.sqrt(embedding_size)))
bias = tf.zeros(vocabulary_size)

loss=tf.reduce_mean(tf.nn.nce_loss(weights=weights, biases=bias, labels=y_, inputs=embedded, num_sampled=64, num_classes=vocabulary_size))
optimizer=tf.train.GradientDescentOptimizer(1.0).minimize(loss)

In [None]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    average_loss = 0
    data_index = 0
    step = 0
    for epoch in range(n_iteration):
        step += 1

        batch_input, batch_output, data_index = generate_batch(batch_size, skip_window, data_index)

        _, loss_eval = sess.run([optimizer, loss], feed_dict={X: batch_input, y_: batch_output})
        average_loss += loss_eval

        if step % 10000 == 0:
            average_loss /= 10000
            print('average loss at step ' + str(step) + ': ' + str(average_loss) + ' epoch is ' + str(epoch))
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings.eval()), 1, keep_dims=True))
            normalized_embeddings = embeddings.eval() / norm
            # 행렬에 트레이닝 데이터를 지정
            valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_examples)
            similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
            nearests = tf.nn.top_k(similarity,5)[1]     
            for word,nearest_words in enumerate(nearests.eval()):
                print('nearests to ',reverse_dictionary[valid_examples[word]],' :',end=' ')
                for nearest_word in nearest_words[1:]:
                    print(reverse_dictionary[nearest_word],end=', ')
                print()
            average_loss = 0

In [None]:
nearest_words

In [None]:
word

In [None]:
reverse_dictionary[51]

### CBOW 예제 - Use List

In [1]:
import collections
import math
import random
import numpy as np
import tensorflow as tf
import sg_util

In [2]:
# 파일을 가져옴
filename=sg_util.maybe_download()

# 파일을 읽어서 모든 단어들을 words에 list 형태로 집어넣음
words = sg_util.read_data(filename)
vocabulary_size = 50000

# data - 전체 문서에서 각 단어들의 id를 list로 나열
# count - 각 단어들이 몇번 나왔는지
# dicionary와 reverse_dictionary는 id-word를 매핑한 부분
data, count, dictionary, reverse_dictionary = sg_util.build_dataset(words,vocabulary_size)

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.randint(1,valid_window, valid_size)

veryfied


In [10]:
def generate_batch(batch_size, skip_window, data_index):
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    batch = np.ndarray(shape=(batch_size, skip_window*2), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
    buffer = collections.deque(maxlen=span)  # 아무것도 안들어있는 deque를 생성
    generated_batch = 0
    while True:
        if data_index + span > len(data):  # data의 길이에 끝에 다다르면 data의 첫지점으로 되돌아감
            data_index = 0
        for _ in range(span):  # span만큼 deque에 삽입
            buffer.append(data[data_index])
            data_index = data_index + 1
        data_index = data_index - span + 1  # data index를 원래대로 돌려놓고, 한칸 움직임
        for i in range(1,skip_window+1):  # deque에 들어가있는 data로 학습 데이터 생성
            # CBOW 형태로 batch 생성 및 학습
            buffer_list = list(buffer)
            labels[generated_batch] = buffer_list.pop(skip_window)
            batch[generated_batch] = buffer_list
            generated_batch += 1
            if generated_batch == batch_size:
                break
        if generated_batch == batch_size:
            break
    return batch, labels, data_index

data_index = 0
batch_input, batch_output, data_index = generate_batch(batch_size, skip_window, data_index)
batch_output

array([[   12],
       [   12],
       [    6],
       [    6],
       [  195],
       [  195],
       [    2],
       [    2],
       [ 3134],
       [ 3134],
       [   46],
       [   46],
       [   59],
       [   59],
       [  156],
       [  156],
       [  128],
       [  128],
       [  742],
       [  742],
       [  477],
       [  477],
       [10572],
       [10572],
       [  134],
       [  134],
       [    1],
       [    1],
       [27350],
       [27350],
       [    2],
       [    2],
       [    1],
       [    1],
       [  103],
       [  103],
       [  855],
       [  855],
       [    3],
       [    3],
       [    1],
       [    1],
       [15068],
       [15068],
       [    0],
       [    0],
       [    2],
       [    2],
       [    1],
       [    1],
       [  151],
       [  151],
       [  855],
       [  855],
       [ 3581],
       [ 3581],
       [    1],
       [    1],
       [  195],
       [  195],
       [   11],
       [   11],
       [

In [12]:
batch_size=100
skip_window=2
embedding_size=100
n_iteration = 100000

X = tf.placeholder(tf.int32, shape=[batch_size, skip_window*2])
y_ = tf.placeholder(tf.int32,(None,1))

embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size], -1,1))
#embedded = tf.nn.embedding_lookup(embeddings,X)
# embedded는 3차원
embedded = tf.nn.embedding_lookup(embeddings,X)
print(embedded.shape)
embedded = tf.reduce_mean(embedded,1)
print(embedded.shape)
weights = tf.Variable(tf.truncated_normal(shape=(vocabulary_size,embedding_size),stddev=1.0/math.sqrt(embedding_size)))
bias = tf.zeros(vocabulary_size)

loss=tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=weights, biases=bias, labels=y_, inputs=embedded, num_sampled=64, num_classes=vocabulary_size))
optimizer=tf.train.GradientDescentOptimizer(1.0).minimize(loss)


(100, 4, 100)
(100, 100)


In [None]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    average_loss = 0
    data_index = 0
    step = 0
    for epoch in range(n_iteration):
        step += 1

        batch_input, batch_output, data_index = generate_batch(batch_size, skip_window, data_index)

        _, loss_eval = sess.run([optimizer, loss], feed_dict={X: batch_input, y_: batch_output})
        average_loss += loss_eval

        if step % 10000 == 0:
            average_loss /= 10000
            print('average loss at step ' + str(step) + ': ' + str(average_loss) + ' epoch is ' + str(epoch))
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings.eval()), 1, keep_dims=True))
            normalized_embeddings = embeddings.eval() / norm
            # 행렬에 트레이닝 데이터를 지정
            valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_examples)            
            similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
            nearests = tf.nn.top_k(similarity,5)[1]     
            for word,nearest_words in enumerate(nearests.eval()):
                print('nearests to ',reverse_dictionary[valid_examples[word]],' :',end=' ')
                for nearest_word in nearest_words[1:]:
                    print(reverse_dictionary[nearest_word],end=', ')
                print()
            average_loss = 0

In [None]:
embedded

In [None]:
X