In [1]:
import collections
import math
import os
import zipfile
import random
import tensorflow as tf
import numpy as np

In [2]:
def read_data(filename):
    """Read the first member of a zip archive and split it into words.

    Args:
        filename: Path to a zip archive whose first member holds UTF-8 text.

    Returns:
        list of str: The whitespace-separated tokens of that member, in order.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain UTF-8 decoding; no TensorFlow helper is needed for this step.
        text = archive.read(first_member).decode("utf-8")
    return text.split()
# Load the corpus once, then preview only the first 100 tokens instead of
# echoing the entire list as cell output.
words = read_data("text8")
words[:100]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the',
 'organization',
 'of',
 'society',
 'it',
 'has',
 'also',
 'been',
 'taken',
 'up',
 'as',
 'a',
 'positive',
 'label',
 'by',
 'self',
 'defined',
 'anarchists',
 'the',
 'word',
 'anarchism',
 'is',
 'derived',
 'from',
 'the',
 'greek',
 'without',
 'archons',
 'ruler',
 'chief',
 'king',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'is',
 'the',
 'belief',
 'that',
 'rulers',
 'are',
 'unnecessary',
 'and',
 'should',
 'be',
 'abolished',
 'although',
 'there',
 'are',
 'differing',
 '

In [3]:
def build_dataset(words, vocabulary_size):
    """Map a token list onto integer ids ordered by descending frequency.

    Args:
        words: Sequence of string tokens.
        vocabulary_size: Total number of table entries, counting the 'UNK'
            placeholder that occupies id 0.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
          data: list with the id of every token in ``words``; tokens that are
              not in ``dictionary`` get id 0.
          count: ``['UNK', n_unknown]`` followed by ``(word, frequency)``
              tuples for the ``vocabulary_size - 1`` most common words.
          dictionary: word -> id.
          reverse_dictionary: id -> word.
    """
    # Frequency table; the UNK tally is a placeholder until the corpus is scanned.
    count = [['UNK', -1]]
    count += collections.Counter(words).most_common(vocabulary_size - 1)

    # Ids are handed out in table order, so id 0 is 'UNK' and id 1 the most
    # frequent word.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Encode the corpus; anything outside the table collapses to id 0.
    data = [dictionary.get(token, 0) for token in words]
    unk_count = sum(1 for token in words if token not in dictionary)
    count[0][1] = unk_count

    reverse_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

In [4]:
# Size of the id table: 'UNK' plus the 4999 most frequent words.
vocabulary_size = 5000
data, count, dictionary, reverse_dictionary = build_dataset(
    words, vocabulary_size)
# Drop the raw token list; the cells that follow work from `data`.
del words
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[idx] for idx in data[:10]])

Most common words (+UNK) [['UNK', 2735459], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [0, 3084, 12, 6, 195, 2, 3136, 46, 59, 156] ['UNK', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [5]:
# Cursor into `data`. generate_batch declares it global, reads it, and leaves
# it advanced after every call, so successive batches continue through the corpus.
data_index = 0

In [6]:
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram batch from the module-level `data` list.

    A window of ``2 * skip_window + 1`` ids slides over `data`. For every
    window, the centre id is paired with `num_skips` distinct context ids
    drawn at random from the rest of the window.

    Reads and updates the global cursor `data_index`; it wraps around at the
    end of `data`.

    Args:
        batch_size: Number of (centre, context) pairs to return. Must be a
            multiple of `num_skips`.
        num_skips: Number of pairs generated per centre word. Must not exceed
            ``2 * skip_window``; when smaller, a random subset of the window
            is used.
        skip_window: Number of words considered on each side of the centre.

    Returns:
        (batch, labels): int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding the centre ids and their context ids.

    Raises:
        AssertionError: If either constraint on `num_skips` is violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    # Sliding window; `maxlen` makes every later append drop the oldest id.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Window offsets of all context positions (everything but the centre).
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        # Distinct context positions in random order. random.sample draws
        # without replacement, so no retry loop is needed.
        chosen = random.sample(context_positions, num_skips)
        for j, target in enumerate(chosen):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Appending to the bounded deque shifts the window one word right.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Rewind by one window so the next call's first centre word directly
    # follows this call's last one.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [7]:
# Draw one small batch and print every (centre -> context) pair, both as ids
# and as words, followed by where the global cursor ended up.
batch, labels = generate_batch(batch_size=16, num_skips=4, skip_window=2)
for i, center in enumerate(batch):
    context = labels[i, 0]
    print(center, reverse_dictionary[center], '->', context, reverse_dictionary[context])
print("data_index", data_index)

12 as -> 3084 originated
12 as -> 195 term
12 as -> 0 UNK
12 as -> 6 a
6 a -> 195 term
6 a -> 12 as
6 a -> 2 of
6 a -> 3084 originated
195 term -> 6 a
195 term -> 12 as
195 term -> 2 of
195 term -> 3136 abuse
2 of -> 3136 abuse
2 of -> 195 term
2 of -> 46 first
2 of -> 6 a
data_index 4


In [8]:
# Number of (centre, context) pairs per training step; also fixes the
# placeholder shapes in the graph.
batch_size = 128
# Length of each embedding vector.
embedding_size = 128
# Words considered on each side of the centre word.
skip_window = 1
# Pairs generated per centre word (here equal to 2 * skip_window).
num_skips = 2

# Validation words used for the nearest-neighbour printout: valid_size
# distinct ids drawn from 0..valid_window-1, i.e. from the head of the
# frequency-ordered id table built by build_dataset.
# NOTE: the draw is unseeded, so the chosen words differ between runs.
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace = False)
# Passed to tf.nn.nce_loss as `num_sampled`.
num_sampled = 64

In [9]:
# Dedicated graph; the model ops are added to it inside `graph.as_default()`
# and the training session is opened on it.
graph = tf.Graph()

In [10]:
with graph.as_default():
    # Input data: centre-word ids, their context-word ids, and the fixed
    # validation ids used for the similarity printout.
    train_inputs = tf.placeholder(tf.int32, shape = [batch_size])
    train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype = tf.int32)
    
    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # One embedding row per vocabulary id, initialised uniformly in [-1, 1).
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        # Construct the variables for the NCE loss: one weight row and one
        # bias per vocabulary id.
        # NOTE (translated from the original author): what the NCE parameters
        # represent is not yet understood.
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev = 1.0/math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each time we evaluate the loss.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,
                                        biases = nce_biases,
                                        labels = train_labels,
                                        inputs = embed,
                                        num_sampled = num_sampled,
                                        num_classes = vocabulary_size))
    # Plain SGD with a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # Compute the cosine similarity between the validation words and all
    # embeddings: rows are scaled to unit L2 norm, so the matrix product below
    # yields one row of cosines per validation word.
    # NOTE(review): `keep_dims` is the older spelling of this argument; later
    # TensorFlow 1.x releases prefer `keepdims` - confirm against the
    # installed version before changing it.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims = True))
    normalized_embeddings = embeddings /norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, 
                          normalized_embeddings,
                          transpose_b = True)
    init = tf.global_variables_initializer()

In [11]:
# Number of iterations of the training loop (steps 0 through 100000, so the
# every-2000 and every-10000 reports also fire on the final step).
num_steps = 100001

In [12]:
from six.moves import xrange

In [13]:
# Train the embeddings. Every 2000 steps the mean loss is printed; every
# 10000 steps the nearest neighbours of the validation words are printed.
# After the loop, the unit-normalised embedding matrix is fetched once into
# `final_embeddings`.
with tf.Session(graph = graph) as sess:
    init.run()
    print("Initialized!")
    average_loss = 0
    top_k = 8 # number of nearest neighbors
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # One optimisation step; also fetch the loss of this batch.
        _, loss_val = sess.run([optimizer, loss], feed_dict = feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over last 2000 batches
            # (at step 0 it is just the loss of the first batch).
            print("Average loss at step: ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps).
        # (Original author's note, translated: "I don't understand this part.")
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity and skip rank 0, which is the
                # validation word itself (cosine with itself is the maximum).
                nearest = (-sim[i, :]).argsort()[1: top_k + 1]
                log_str = "Nearest to %s" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," %(log_str, close_word)
                print(log_str)
    # Fetch the trained embeddings once, after the last step. This must stay
    # inside the `with` block so the session is still the default one.
    final_embeddings = normalized_embeddings.eval()

Initialized!
Average loss at step:  0 :  198.105102539
Nearest to its variety, continent, jacques, region, linguistic, businesses, solid, lebanon,
Nearest to which bands, side, broken, tourism, jimmy, having, offering, describe,
Nearest to by shore, you, contributed, t, connecticut, forum, receive, someone,
Nearest to system influential, museum, radical, diamonds, begins, like, billy, abolished,
Nearest to while hiv, finite, connected, theology, stage, covenant, component, names,
Nearest to to message, june, christianity, analog, atlantic, aspects, pilots, opposition,
Nearest to has though, lose, ca, uss, surfaces, rank, doesn, electoral,
Nearest to there truth, is, pool, recognised, wish, cross, community, asked,
Nearest to are upon, hindi, circumcision, reform, run, agreements, disorders, drive,
Nearest to one past, took, consecutive, volcanic, manufacturing, singapore, achieved, performed,
Nearest to a this, subset, sydney, finally, levels, removed, soon, spoke,
Nearest to time arch

Average loss at step:  52000 :  4.25143530774
Average loss at step:  54000 :  4.32992819965
Average loss at step:  56000 :  4.30169002521
Average loss at step:  58000 :  4.31986963356
Average loss at step:  60000 :  4.30205368316
Nearest to its their, his, the, sex, read, python, our, describes,
Nearest to which that, this, who, also, it, and, there, broken,
Nearest to by with, voyage, from, be, was, assumed, eight, after,
Nearest to system radical, decimal, weight, effects, berkeley, limits, diamonds, iceland,
Nearest to while when, completion, after, finite, however, states, names, although,
Nearest to to would, could, nine, demand, for, can, monopoly, in,
Nearest to has had, have, is, was, though, agave, rank, makes,
Nearest to there they, it, he, which, said, asked, still, pool,
Nearest to are were, is, have, be, provide, shares, agreements, bird,
Nearest to one two, three, six, UNK, four, eight, seven, five,
Nearest to a the, another, drawn, agave, eve, vs, their, layers,
Nearest 

In [83]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points with a text label each and save the figure.

    Relies on the module-level name `plt` (matplotlib.pyplot) being bound at
    call time.

    Args:
        low_dim_embs: Array indexed as ``low_dim_embs[i, :]`` giving the
            ``(x, y)`` position of point ``i``; needs at least
            ``len(labels)`` rows.
        labels: Sequence of label strings; ``labels[i]`` annotates row ``i``.
        filename: Path the finished figure is written to.

    Raises:
        AssertionError: If there are more labels than rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    # Write the file once, after every point has been drawn, instead of
    # re-saving the growing figure on each iteration.
    plt.savefig(filename)

# Optional visualisation: project embeddings to 2-D with t-SNE and plot them.
# If scikit-learn or matplotlib cannot be imported, a hint is printed instead
# of raising.
try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    
    # NOTE(review): the accepted keyword names of TSNE (e.g. `n_iter`) depend
    # on the installed scikit-learn version - confirm before upgrading.
    tsne = TSNE(perplexity = 30, n_components = 2, init = 'pca', n_iter = 5000)
    # Only the first 500 rows are plotted, i.e. ids 0..499 of the
    # frequency-ordered id table.
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[: plot_only, :])
    # NOTE: this rebinds `labels`, which earlier held the demo batch labels.
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)
except ImportError:
    print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

1. 知道怎么从本地加载文件，并建立batch
2. 了解word2vec的过程
3. 会构建训练网络
4. 会根据相似度进行evaluation  # 这个还不会的样子
5. 会可视化   # 还有这个也没有弄懂的说