In [1]:
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import dok_matrix
from math import log10
import numpy as np
import networkx as nx
import pickle

In [2]:
# Load the preprocessed corpus bundle.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open("./preprocessed_bitcoin.pkl", 'rb') as f:
    data = pickle.load(f)

# Vocabulary as stored in the bundle, plus the reverse lookup
# word -> column index used when building term-document matrices.
voca = data['voca']
voca2idx = {word: idx for idx, word in enumerate(voca)}

In [3]:
# Score every user with HITS on the user network. nx.hits returns a pair
# (hubs, authorities); the second element is used as the per-user score.
# Alternative scoring kept for reference:
# user_score = nx.pagerank(data['user_network'], tol=1e-8, max_iter=200)
hubs, user_score = nx.hits(data['user_network'], max_iter=500)
total_user_num = len(data['user_network'].nodes())
# All users ordered from the highest score to the lowest.
top_users = sorted(user_score, key=user_score.get, reverse=True)

# Dump one row per user (rank, log10 of rank, name, score and activity counts).
# The `with` block guarantees the file is flushed and closed even if one of
# the per-user dictionary lookups raises part-way through the loop.
with open('hits_user_score_post_num.tsv', 'w') as fwrite:
    fwrite.write("id\tlog id\tuser\tscore\tpost num\tget comment num\twrite comment num\n")
    for i, user in enumerate(top_users):
        fwrite.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            i + 1, log10(i + 1), user, user_score[user], data['user_posts_num'][user],
            data['get_comment_num'][user], data['write_comment_num'][user]))
# The total number of posts differs from (top-user posts + low-user posts)
# because some posts belong to users who never interacted with anyone.

In [4]:
# Analysis over all users.
# Build a (posts x vocabulary) count matrix: one row per post, one column
# per vocabulary word, each cell holding how often the word occurs.
tdm = dok_matrix((len(data['posts']), len(voca)), dtype=np.float32)
for row, tokens in enumerate(data['posts']):
    for token in tokens:
        col = voca2idx[token]
        tdm[row, col] = tdm[row, col] + 1

# Scale each row with sklearn's `normalize` (default settings) and keep the
# result in CSR form for the factorization step.
tdm = normalize(tdm).tocsr()
print(tdm.shape)

(390572, 12021)


In [5]:
# Number of topics extracted by every NMF run in this notebook.
K = 10
# Factorize the all-users matrix; `fit` returns the estimator itself, so the
# fitted model is bound to `nmf` directly.
# NOTE(review): the `alpha` argument was deprecated in scikit-learn 1.0 in
# favor of `alpha_W` / `alpha_H` -- confirm against the installed version.
# NOTE(review): no `random_state` is given, so results may vary between runs.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500).fit(tdm)
# Topic-by-word weight matrix (one row per topic).
H_total = nmf.components_

In [6]:
# Top-user analysis.
# Walk the users from the highest score down, accumulating each user's share
# of the total score, and stop at the first rank where the running share
# exceeds 0.8. That rank becomes `top_index` (it stays 0 if never exceeded).
score_sum = sum(user_score.values())
acc_sum = 0
top_index = 0
# The ranking is rebuilt from `user_score` rather than read from `top_users`
# so the cell gives the same answer when re-run after `top_users` was cut.
for rank, top_user in enumerate(sorted(user_score, key=user_score.get, reverse=True)):
    acc_sum += user_score[top_user]/score_sum
    if acc_sum <= 0.8:
        continue
    top_index = rank
    break

print(top_index)
# Keep the first `top_index` users; the user at which the threshold was
# crossed is not included by this slice.
top_users = top_users[:top_index]

3636


In [7]:
# Flatten the posts of the selected top users into a single list, keeping
# the users in ranking order and each user's posts in their stored order.
user_posts = []
for user in top_users:
    user_posts.extend(data['user_posts'][user])

In [8]:
# Build the (posts x vocabulary) count matrix for the posts currently held
# in `user_posts`: one row per post, one column per vocabulary word.
tdm = dok_matrix((len(user_posts), len(voca)), dtype=np.float32)
for row, tokens in enumerate(user_posts):
    for token in tokens:
        col = voca2idx[token]
        tdm[row, col] = tdm[row, col] + 1

# Scale each row with sklearn's `normalize` (default settings) and keep the
# result in CSR form for the factorization step.
tdm = normalize(tdm).tocsr()
print(tdm.shape)

(192292, 12021)


In [9]:
# Factorize the top-users matrix with the same settings as the all-users run;
# `fit` returns the estimator itself.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favor of
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500).fit(tdm)
# Topic-by-word weight matrix for the top users (one row per topic).
H_top = nmf.components_

In [10]:
# Low-user analysis.
# Rank users from the lowest score upward and drop the last `top_index`
# entries (the highest-scoring users). An explicit end index is used because
# `[:-top_index]` would return an empty list when `top_index` is 0
# (since -0 == 0), instead of keeping every user.
low_users = sorted(user_score, key=user_score.get, reverse=False)
low_users = low_users[:len(low_users) - top_index]

# Flatten the posts of the low-ranked users into a single list.
user_posts = []
for user in low_users:
    user_posts.extend(data['user_posts'][user])

In [11]:
# Build the (posts x vocabulary) count matrix for the posts currently held
# in `user_posts`: one row per post, one column per vocabulary word.
tdm = dok_matrix((len(user_posts), len(voca)), dtype=np.float32)
for row, tokens in enumerate(user_posts):
    for token in tokens:
        col = voca2idx[token]
        tdm[row, col] = tdm[row, col] + 1

# Scale each row with sklearn's `normalize` (default settings) and keep the
# result in CSR form for the factorization step.
tdm = normalize(tdm).tocsr()
print(tdm.shape)

(197344, 12021)


In [12]:
# Factorize the low-users matrix with the same settings as the other runs;
# `fit` returns the estimator itself.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favor of
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500).fit(tdm)
# Topic-by-word weight matrix for the low users (one row per topic).
H_low = nmf.components_

In [13]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import linear_sum_assignment

# Match every all-users topic to one top-users topic so that the summed
# cosine distance over all pairs is minimal (one-to-one assignment).
# Rows of the distance matrix are all-users topics, columns are group topics,
# so `top_indices[k]` is the top-users topic paired with all-users topic k.
top_distances = pairwise_distances(H_total, H_top, metric='cosine')
_, top_indices = linear_sum_assignment(top_distances)

# Same matching between the all-users topics and the low-users topics.
low_distances = pairwise_distances(H_total, H_low, metric='cosine')
_, low_indices = linear_sum_assignment(low_distances)

# Running sums of the per-topic similarities; divided by K after the loop.
top_similarity_average = 0
low_similarity_average = 0
for k in range(K):
    # Cosine similarity between all-users topic k and its matched partner.
    top_similarity = cosine_similarity(H_top[top_indices[k]].reshape(1, -1), H_total[k].reshape(1, -1))[0, 0]
    low_similarity = cosine_similarity(H_low[low_indices[k]].reshape(1, -1), H_total[k].reshape(1, -1))[0, 0]
    top_similarity_average += top_similarity
    low_similarity_average += low_similarity

    # For each of the three topics print its 20 highest-weighted words.
    print(f"total users: {k}th topic")
    for i in H_total[k, :].argsort()[::-1][:20]:
        print(voca[i], end=' ')
    print()

    print(f"top users: {top_indices[k]}th topic, similarity - {top_similarity}")
    for i in H_top[top_indices[k]].argsort()[::-1][:20]:
        print(voca[i], end=' ')
    print()

    # This line reports the low-users topic; it was previously mislabeled
    # as "top users", which made the two groups indistinguishable in the output.
    print(f"low users: {low_indices[k]}th topic, similarity - {low_similarity}")
    for i in H_low[low_indices[k]].argsort()[::-1][:20]:
        print(voca[i], end=' ')
    print()

    print()

top_similarity_average /= K
low_similarity_average /= K

# Mean matched-topic similarity: top users first, low users second.
print(top_similarity_average, low_similarity_average)

total users: 0th topic
bitcoin payment country use value accept world future cash using currency user buy government new news year gold accepting first 
top users: 0th topic, similarity - 0.9998469460740503
bitcoin use payment currency accept using world value country user buy cash future government new accepting news fiat used business 
top users: 0th topic, similarity - 0.9998326214435246
bitcoin country value future payment world use cash accept buy user year government currency using gold new fork first already 

total users: 1th topic
time now good year right make block need day new long take first idea mining work look sure back already 
top users: 3th topic, similarity - 0.9688464324195369
time now good make need year right new block take work day idea long first mining look sure change network 
top users: 9th topic, similarity - 0.7231921399591761
time long take year block day first every last new next need back mining ago around month right wait hour 

total users: 2th topic
b