In [256]:
import collections
import glob
from itertools import chain
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
np.random.seed(1)

In [257]:
# Load the keyword corpus. Each line is later parsed as "<user-id> <word> <word> ...".
files = glob.glob('./res/writer_user_sentences_keyword.txt')

raw_texts = []
for f in files:
    # The context manager closes the handle even if read() raises.
    with open(f) as file:
        raw_texts.append(file.read())

words = ''.join(raw_texts)
# Drop only a trailing newline so split() does not yield an empty last entry.
# (An unconditional [:-1] would chop a real character off a file that does
# not end with a newline.)
if words.endswith('\n'):
    words = words[:-1]
sentences = words.split('\n')

In [258]:
# Single-column frame (column label 0) holding one raw corpus line per row.
sentences_df = pd.DataFrame(sentences)

In [259]:
# Tokenise every raw line once: the first token is the user id, the remaining
# tokens (re-joined with single spaces) are that user's words.
_line_tokens = sentences_df[0].map(lambda line: line.split())
sentences_df['user'] = _line_tokens.map(lambda tokens: tokens[0])
sentences_df['words'] = _line_tokens.map(lambda tokens: ' '.join(tokens[1:]))

In [260]:
# One row per corpus line; columns are the raw line (0), 'user' and 'words'.
sentences_df.shape

(32359, 3)

In [261]:
# reset_index() copies each row's positional number into an 'index' column
# before 'user' becomes the index, so a user id can be mapped back to its row
# position (which is how the embedding matrix is addressed further down).
sentences_df_indexed = sentences_df.reset_index().set_index('user')

In [262]:
# Pre-computed document embeddings; row i is expected to belong to sentences[i].
final_doc_embeddings = np.load('./doc_embeddings_keyword.npy')

# most_similar() and similar() address this matrix by sentence row number, so a
# row-count mismatch would pair users with the wrong vectors. Fail fast here.
assert final_doc_embeddings.shape[0] == len(sentences), (
    'embedding rows ({}) != corpus lines ({})'.format(
        final_doc_embeddings.shape[0], len(sentences)
    )
)

In [263]:
# (rows, embedding size); the row count should match the number of corpus lines.
final_doc_embeddings.shape

(32359, 50)

In [264]:
def most_similar(user_id, size):
    """Return the `size` corpus rows whose embeddings score highest against `user_id`.

    The score is the raw dot product between embedding rows, so larger means
    more similar. It equals cosine similarity only if the embeddings are
    L2-normalised (the self-score of ~1.0 in the outputs suggests they are,
    but this function does not enforce it).

    Args:
        user_id: Id to look up in ``sentences_df_indexed``'s index.
            NOTE(review): assumes each id occurs on exactly one corpus line.
        size: Maximum number of rows to return. The query row itself is not
            excluded, so it normally comes back as the first result.

    Returns:
        A list of ``[user, score]`` pairs ordered from highest to lowest score;
        an empty list when ``size`` is not positive; ``None`` when ``user_id``
        is not in the index.
    """
    if user_id not in sentences_df_indexed.index:
        return None
    if size <= 0:
        # `[-0:]` would select *every* row and a negative size would slice from
        # the wrong end, so handle "nothing requested" explicitly.
        return []

    user_index = sentences_df_indexed.loc[user_id, 'index']
    # One score per embedding row: shape (n_rows,).
    scores = final_doc_embeddings.dot(final_doc_embeddings[user_index])
    # argsort is ascending: keep the last `size` positions and flip to best-first.
    closest_rows = np.argsort(scores)[-size:][::-1]

    # The first whitespace-separated token of a corpus line is the user id.
    return [[sentences[row].split()[0], scores[row]] for row in closest_rows]

In [265]:
from scipy import spatial

def similar(user_id, writer_id):
    """Print and return the cosine distance between two ids' embeddings.

    Despite the name, the value comes from ``scipy.spatial.distance.cosine``,
    i.e. a *distance* (1 - cosine similarity): smaller means closer, and values
    above 1 mean the vectors point in opposing directions.

    Args:
        user_id: First id to look up in ``sentences_df_indexed``'s index.
        writer_id: Second id to look up in the same index.

    Returns:
        The cosine distance, or ``None`` (printing nothing) when either id is
        missing from the index.
    """
    known_ids = sentences_df_indexed.index
    if not (user_id in known_ids and writer_id in known_ids):
        return None

    # The 'index' column holds each id's row position in the embedding matrix.
    user_vector = final_doc_embeddings[sentences_df_indexed.loc[user_id, 'index']]
    writer_vector = final_doc_embeddings[sentences_df_indexed.loc[writer_id, 'index']]

    distance = spatial.distance.cosine(user_vector, writer_vector)
    print('{} - {} : {}'.format(user_id, writer_id, distance))
    return distance

In [266]:
# Top-5 rows by dot product; the query id itself is not filtered out of the candidates.
most_similar('#87a6479c91e4276374378f1d28eb307c', 5)

[['#87a6479c91e4276374378f1d28eb307c', 1.0000002],
 ['#3fe808daf9829225707fb77739858636', 0.7260888],
 ['@jh2019', 0.59609675],
 ['@dkfdkfdl', 0.57348394],
 ['@skyline1019', 0.54642665]]

In [267]:
# Same top-5 lookup for a second user id.
most_similar('#a0df5bd0e5a5bbc28b87f8c64462667c', 5)

[['#a0df5bd0e5a5bbc28b87f8c64462667c', 0.9999999],
 ['@realplan1', 0.67656094],
 ['@aviationusa', 0.64471817],
 ['#c92cec1183fbea36203d8390cb9fdf26', 0.62998474],
 ['@sanjang', 0.6293229]]

In [268]:
# Spot-check user/writer pairs. Each call prints and returns SciPy's cosine
# *distance* (1 - cosine similarity), so smaller numbers mean closer embeddings.
# A pair with an id missing from the index prints nothing (similar() returns None).
# Only the last call's return value is echoed as the cell result.
similar('#d6866a498157771069fdf15361cb012b', '@seochogirl')
similar('#d6866a498157771069fdf15361cb012b', '@brunch')
similar('#87a6479c91e4276374378f1d28eb307c', '@begintalk')
similar('#87a6479c91e4276374378f1d28eb307c', '@tnrud572')
similar('#a0df5bd0e5a5bbc28b87f8c64462667c', '@kimmh12728xrf')
similar('#a0df5bd0e5a5bbc28b87f8c64462667c', '@brunch')
similar('#ec0fb734ba02a29c62c64e7ac7a8f13e', '@sethahn')
similar('#ec0fb734ba02a29c62c64e7ac7a8f13e', '@nomadesk')

#d6866a498157771069fdf15361cb012b - @seochogirl : 0.9860673602670431
#d6866a498157771069fdf15361cb012b - @brunch : 0.8184942752122879
#87a6479c91e4276374378f1d28eb307c - @begintalk : 0.9809975810348988
#87a6479c91e4276374378f1d28eb307c - @tnrud572 : 0.7063705027103424
#a0df5bd0e5a5bbc28b87f8c64462667c - @kimmh12728xrf : 0.7746954560279846
#a0df5bd0e5a5bbc28b87f8c64462667c - @brunch : 0.6628093719482422
#ec0fb734ba02a29c62c64e7ac7a8f13e - @sethahn : 0.919483095407486
#ec0fb734ba02a29c62c64e7ac7a8f13e - @nomadesk : 1.0463214404881


1.0463214404881