In [1]:
import collections
import glob
from itertools import chain
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
np.random.seed(1)

In [2]:
files = glob.glob('./res/writer_user_doc.txt')

words = []
for f in files:
    file = open(f)
    words.append(file.read())
    file.close()

words = list(chain.from_iterable(words))
words = ''.join(words)[:-1]
sentences = words.split('\n')

In [3]:
sentences_df = pd.DataFrame(sentences)

In [11]:
sentences_df['user'] = sentences_df[0].apply(lambda x : x.split()[0])
sentences_df['words'] = sentences_df[0].apply(lambda x : ' '.join(x.split()[1:]))

In [13]:
sentences_df.head()

Unnamed: 0,0,user,words
0,@nicemerry 학교 휴직 영화 프라이팬 일상 끄적 일상 요리 환갑 일상 쉼 목...,@nicemerry,학교 휴직 영화 프라이팬 일상 끄적 일상 요리 환갑 일상 쉼 목표 충전 일상 여행 ...
1,@dltjtks 응급실 죽음 간호사,@dltjtks,응급실 죽음 간호사
2,@diversityinlife 생명 다양 성 재단 그림일기 윈 저성 활동가 활동 생...,@diversityinlife,생명 다양 성 재단 그림일기 윈 저성 활동가 활동 생명 도 토리 생명 다양 성 재단...
3,@hyokyoungko 기다림 기 도 믿 음 엄마 어버이날 생일 concert 생일...,@hyokyoungko,기다림 기 도 믿 음 엄마 어버이날 생일 concert 생일 파티 기부 친구 사람 ...
4,@erish2150 프로그래밍 개발 생각 생각 프로젝트 인간,@erish2150,프로그래밍 개발 생각 생각 프로젝트 인간


In [52]:
sentences_df_indexed = sentences_df.reset_index().set_index('user')

In [18]:
vocabulary_size = 40000

def build_dataset(sentences):
    words = ''.join(sentences).split()
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    
    unk_count = 0
    sent_data = []
    for sentence in sentences:
        data = []
        for word in sentence.split():
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count = unk_count + 1
            data.append(index)
        sent_data.append(data)
    
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return sent_data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(sentences_df_indexed['words'].tolist())
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:2])
# del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 19141], ('여행', 133392), ('에', 86381), ('세이', 76152), ('사랑', 60928)]
Sample data [[122, 409, 5, 10034, 13, 6639, 13, 75, 7447, 13, 3185, 270, 1930, 13, 1, 13, 295, 1], [982, 208, 538]]


In [19]:
skip_window = 5
instances = 0

# Pad sentence with skip_windows
for i in range(len(data)):
    data[i] = [vocabulary_size]*skip_window+data[i]+[vocabulary_size]*skip_window

# Check how many training samples that we get    
for sentence  in data:
    instances += len(sentence)-2*skip_window
print(instances)

5539467


In [29]:
context = np.zeros((instances,skip_window*2+1),dtype=np.int32)
labels = np.zeros((instances,1),dtype=np.int32)
doc = np.zeros((instances,1),dtype=np.int32)

k = 0
for doc_id, sentence  in enumerate(data):
    for i in range(skip_window, len(sentence)-skip_window):
        context[k] = sentence[i-skip_window:i+skip_window+1] # Get surrounding words
        labels[k] = sentence[i] # Get target variable
        doc[k] = doc_id
        k += 1
        
context = np.delete(context,skip_window,1) # delete the middle word        
        
shuffle_idx = np.random.permutation(k)
labels = labels[shuffle_idx]
doc = doc[shuffle_idx]
context = context[shuffle_idx]

In [30]:
batch_size = 256
context_window = 2*skip_window
embedding_size = 50 # Dimension of the embedding vector.
softmax_width = embedding_size # +embedding_size2+embedding_size3
num_sampled = 5 # Number of negative examples to sample.
sum_ids = np.repeat(np.arange(batch_size),context_window)

len_docs = len(data)

graph = tf.Graph()

with graph.as_default(): # , tf.device('/cpu:0')
    # Input data.
    train_word_dataset = tf.placeholder(tf.int32, shape=[batch_size*context_window])
    train_doc_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

    segment_ids = tf.constant(sum_ids, dtype=tf.int32)

    word_embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
    word_embeddings = tf.concat([word_embeddings,tf.zeros((1,embedding_size))],0)
    doc_embeddings = tf.Variable(tf.random_uniform([len_docs,embedding_size],-1.0,1.0))

    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, softmax_width],
                             stddev=1.0 / np.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    embed_words = tf.segment_mean(tf.nn.embedding_lookup(word_embeddings, train_word_dataset),segment_ids)
    embed_docs = tf.nn.embedding_lookup(doc_embeddings, train_doc_dataset)
    embed = (embed_words+embed_docs)/2.0#+embed_hash+embed_users

    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(tf.nn.nce_loss(softmax_weights, softmax_biases, train_labels, 
                                         embed, num_sampled, vocabulary_size))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(0.5).minimize(loss)
        
    norm = tf.sqrt(tf.reduce_sum(tf.square(doc_embeddings), 1, keep_dims=True))
    normalized_doc_embeddings = doc_embeddings / norm

In [31]:

############################
# Chunk the data to be passed into the tensorflow Model
###########################
data_idx = 0
def generate_batch(batch_size):
    global data_idx

    if data_idx+batch_size<instances:
        batch_labels = labels[data_idx:data_idx+batch_size]
        batch_doc_data = doc[data_idx:data_idx+batch_size]
        batch_word_data = context[data_idx:data_idx+batch_size]
        data_idx += batch_size
    else:
        overlay = batch_size - (instances-data_idx)
        batch_labels = np.vstack([labels[data_idx:instances],labels[:overlay]])
        batch_doc_data = np.vstack([doc[data_idx:instances],doc[:overlay]])
        batch_word_data = np.vstack([context[data_idx:instances],context[:overlay]])
        data_idx = overlay
    batch_word_data = np.reshape(batch_word_data,(-1,1))

    return batch_labels, batch_word_data, batch_doc_data

In [33]:
num_steps = 200001
step_delta = int(num_steps/20)


with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_labels, batch_word_data, batch_doc_data\
        = generate_batch(batch_size)
        feed_dict = {train_word_dataset : np.squeeze(batch_word_data),
                     train_doc_dataset : np.squeeze(batch_doc_data),
                     train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % step_delta == 0:
            if step > 0:
                average_loss = average_loss / step_delta
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
    save_path = tf.train.Saver().save(session, "./doc2vec_model")    

    # Get the weights to save for later
    final_word_embeddings = word_embeddings.eval()
    final_word_embeddings_out = softmax_weights.eval()
    final_doc_embeddings = normalized_doc_embeddings.eval()

Initialized
Average loss at step 0: 39.481388
Average loss at step 10000: 16.504855
Average loss at step 20000: 10.137205
Average loss at step 30000: 7.782632
Average loss at step 40000: 6.338275
Average loss at step 50000: 5.252970
Average loss at step 60000: 4.594733
Average loss at step 70000: 4.111236
Average loss at step 80000: 3.755370
Average loss at step 90000: 3.412666
Average loss at step 100000: 3.143643
Average loss at step 110000: 2.901804
Average loss at step 120000: 2.884862
Average loss at step 130000: 2.625388
Average loss at step 140000: 2.487165
Average loss at step 150000: 2.382428
Average loss at step 160000: 2.302203
Average loss at step 170000: 2.210721
Average loss at step 180000: 2.177297
Average loss at step 190000: 2.111760
Average loss at step 200000: 2.025514


In [36]:
rand_doc = np.random.randint(len_docs)
dist = final_doc_embeddings.dot(final_doc_embeddings[rand_doc][:,None])
closest_doc = np.argsort(dist,axis=0)[-10:][::-1]
furthest_doc = np.argsort(dist,axis=0)[0][::-1]

for idx in closest_doc:
    print(dist[idx][0][0])

0.9999999
0.6036673
0.603291
0.59949213
0.5988153
0.59659284
0.5926226
0.59009594
0.5883219
0.58178824


In [102]:
def most_similar(user_id, size):
    user_index = sentences_df_indexed.loc[user_id]['index']
    dist = final_doc_embeddings.dot(final_doc_embeddings[user_index][:,None])
    closest_doc = np.argsort(dist,axis=0)[-size:][::-1]
    furthest_doc = np.argsort(dist,axis=0)[0][::-1]

    result = []
    for idx, item in enumerate(closest_doc):
        user = sentences[closest_doc[idx][0]].split()[0]
        dist_value = dist[item][0][0]
        result.append([user, dist_value])
    return result

In [99]:
type(final_doc_embeddings)

numpy.ndarray

In [86]:
sim_doc(5, 10)

[['@sangjunleeinfo', 1.0000001],
 ['@mylife4iu', 0.5428308],
 ['@johnbird', 0.51903224],
 ['@ongrim', 0.51678944],
 ['@hash-on', 0.5149888],
 ['@kimikim', 0.5143247],
 ['@yerinirenekang', 0.5140254],
 ['@hillitoot', 0.5125634],
 ['@wkwn71', 0.5020619],
 ['@bookspause', 0.50160414]]

In [93]:
sentences_df_indexed.loc['#d6866a498157771069fdf15361cb012b']['index']

21422

In [103]:
most_similar('#d6866a498157771069fdf15361cb012b', 10)

[['#d6866a498157771069fdf15361cb012b', 1.0000001],
 ['#ac8fc562dac1a5c6ef86dfc616db6b45', 0.6216059],
 ['@hwiki', 0.6086082],
 ['@aidfox', 0.6038921],
 ['#29a089a9401ad7e26b20d5df89f10332', 0.60082906],
 ['@thsgmlgns2', 0.59521574],
 ['@kierstenlee', 0.5884473],
 ['@wkdanswjd102', 0.57278204],
 ['@seongminyoo', 0.5682178],
 ['#2afdcfd40f7d105c79ff68b7c3c52f65', 0.5680978]]

In [95]:
most_similar('#87a6479c91e4276374378f1d28eb307c', 10)

[['#87a6479c91e4276374378f1d28eb307c', 0.9999999],
 ['@parkkimsoojin', 0.570564],
 ['@ttwakid', 0.5545074],
 ['#7f4912b56ff1e9cf0bfff5067321c9d9', 0.5463936],
 ['@wavejdh', 0.5359304],
 ['@hyunikun', 0.5332689],
 ['@chogh31', 0.52276725],
 ['@jonylee', 0.5210302],
 ['@parkhyungsik', 0.5201211],
 ['@machinoh', 0.51795894]]

In [97]:
most_similar('#a0df5bd0e5a5bbc28b87f8c64462667c', 10)

[['#a0df5bd0e5a5bbc28b87f8c64462667c', 1.0000001],
 ['@hss730', 0.65026903],
 ['@joy2003bqji', 0.6284451],
 ['@inqatar', 0.61798096],
 ['@brunch9d21', 0.61397386],
 ['@miriaemoon', 0.61324745],
 ['@elara1020', 0.6099669],
 ['@ike', 0.607323],
 ['@myfriendjesus', 0.5987311],
 ['#ce81bac9c227d00bb554a2463bcb117e', 0.5972982]]

In [107]:
def similar(user_id, writer_id):
    user_index = sentences_df_indexed.loc[user_id]['index']
    writer_index = sentences_df_indexed.loc[writer_id]['index']
    dist = final_doc_embeddings[user_index].dot(final_doc_embeddings[writer_index])
    #print('{} - {} : {}'.format(user_id, writer_id, dist))
    return dist

In [109]:
similar('#d6866a498157771069fdf15361cb012b', '@brunch')

#d6866a498157771069fdf15361cb012b - @brunch : 0.16178107261657715


In [108]:
similar('#d6866a498157771069fdf15361cb012b', '@seochogirl')

#d6866a498157771069fdf15361cb012b - @seochogirl : 0.28863343596458435


In [111]:
np.save('doc_embeddings', final_doc_embeddings)