In [1]:
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer
from datetime import datetime
import numpy as np
import networkx as nx
import pickle

In [2]:
# Load the preprocessed corpus.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this path must only ever point at a trusted, locally produced pickle.
with open("./preprocessed_bitcoin.pkl", 'rb') as f:
    data = pickle.load(f)

# Vocabulary, plus the reverse lookup from each word to its position in it.
voca = data['voca']
voca2idx = dict(zip(voca, range(len(voca))))

In [14]:
def _parse_number(text):
    """Convert a number written with thousands separators (e.g. '1,234.5') to float."""
    return float(text.replace(',', ''))


# Read the price table: one header row, then tab-separated
# date / open / high / low / close / volume / market-cap columns.
prices_list = []
with open("price_bitcoin.tsv") as f:
    next(f)  # skip the header row
    for line in f:
        # Strip only the line terminator. Slicing off the last character would
        # truncate the market-cap value when the final line has no newline.
        line = line.rstrip('\n')
        if not line:
            continue  # tolerate blank lines, e.g. at the end of the file
        date, open_, high, low, close, volume, market_cap = line.split('\t')
        price = {
            'date': datetime.strptime(date, '%b %d, %Y').date(),
            'open': _parse_number(open_),
            'high': _parse_number(high),
            'low': _parse_number(low),
            'close': _parse_number(close),
        }
        try:
            price['volume'] = _parse_number(volume)
        except ValueError:
            # The first row whose volume is not numeric ends the read. Only
            # ValueError is caught so that unrelated failures (and Ctrl-C) are
            # no longer silently swallowed.
            # NOTE(review): presumably these are old rows without volume data - confirm.
            break
        price['market_cap'] = _parse_number(market_cap)
        prices_list.append(price)

prices_list.reverse()
# Fixed window over the reversed rows; per the original note it runs through
# Feb 02, 2018. The offsets are tied to the exact contents of the TSV file.
prices_list = prices_list[521:-27]
print(len(prices_list))

978


In [4]:
# Topic analysis over all posts

In [5]:
# Days (from 2015-06-01 onwards) on which at least one post exists, in order.
first_day = datetime(2015, 6, 1).date()
time_list = sorted(day for day in data['time_posts'].keys() if day >= first_day)

# Time-by-term count matrix: one row per day, one column per vocabulary word.
tdm = np.zeros((len(time_list), len(voca)), dtype=np.float32)
for row, day in enumerate(time_list):
    for post in data['time_posts'][day]:
        for word in post:
            tdm[row, voca2idx[word]] += 1

# Rescale every row with sklearn's normalize (default settings).
tdm = normalize(tdm)

In [6]:
K = 10  # number of topics to extract
# Fix the seed: NMF's initialisation is randomised, so without it the topic
# numbering and weights can change from one run to the next, which would make
# the printed topic numbers below impossible to reproduce.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500, random_state=0)
W = nmf.fit_transform(tdm).T  # transposed: W[k] is topic k's weight for each row of tdm
H = nmf.components_           # H[k] is topic k's weight for each column (word) of tdm

In [17]:
from scipy.stats import pearsonr

# Pair each topic-weight column with the daily high of the same calendar day.
# Matching on the date (instead of relying on both series happening to have the
# same length) keeps the two vectors aligned even when a day is missing from
# either the price table or the post timeline.
high_by_date = {p['date']: p['high'] for p in prices_list}
matched_rows = [row for row, day in enumerate(time_list) if day in high_by_date]
prices = np.array([high_by_date[time_list[row]] for row in matched_rows])

topic_time_correlation = []
for k in range(K):
    r_value, p_value = pearsonr(prices, W[k, matched_rows])
    topic_time_correlation.append((r_value, p_value))

topic_time_correlation = np.array(topic_time_correlation)  # columns: Pearson r, p-value
# Rank topics by the strength of the correlation, regardless of its sign.
top_topic = np.argsort(np.abs(topic_time_correlation[:, 0]))[::-1]

for k in top_topic[:5]:
    r_value, p_value = topic_time_correlation[k]
    print(f"{k+1}th topic, correlation: {r_value:f}, p-value: {p_value:.2e}")
    # The 30 highest-weighted vocabulary words of topic k.
    for i in H[k, :].argsort()[::-1][:30]:
        print(voca[i], end=' ')
    print()

5th topic, correlation: 0.632263, p-value: 2.69e-110
bitcoin price btc coin now market exchange money time crypto currency buy year know value sell make good day get mining cash cryptocurrency new country future trading china wallet profit 
6th topic, correlation: -0.450832, p-value: 3.95e-50
bitcoin get btc good money know time use site need make idea work buy now exchange pay using account service new card better business website first game take give payment 
9th topic, correlation: -0.375215, p-value: 4.69e-34
block bitcoin miner size node time need mining network transaction make year limit blockchain increase core new get now work chain change every system day power number full know take 
10th topic, correlation: -0.187180, p-value: 3.67e-09
address bitcoin coin key transaction btc private know wallet get send time number possible blockchain public use output exchange used using mining amount find million question never sent make value 
2th topic, correlation: 0.164816, p-value: 2

In [13]:
from pyecharts import Line
from bokeh.palettes import Set1

# Styling shared by every topic series: filled, stacked, smoothed areas drawn
# against a time axis.
series_options = dict(
    is_fill=True, is_stack=True, is_symbol_show=False,
    line_width=0.2, area_opacity=0.4,
    label_color=Set1[7], is_smooth=True, x_axis_type='time',
)

line = Line()
# One series for each of the three topics ranked first in top_topic.
for k in top_topic[:3]:
    line.add(f"topic {k+1}", time_list, W[k], **series_options)

line.height = 500
line.width = 800
line

In [14]:
# Network analysis: score every user in the user graph with HITS.
# Alternative kept for reference:
# user_score = nx.pagerank(data['user_network'], tol=1e-8, max_iter=200)
user_graph = data['user_network']
hits_result = nx.hits(user_graph, max_iter=500)  # (hubs, authorities)
user_score = hits_result[1]  # keep the authority scores only
total_user_num = user_graph.number_of_nodes()

In [15]:
# Top-user analysis
# Rank users from highest to lowest score.
ranked_users = sorted(user_score, key=user_score.get, reverse=True)
score_sum = sum(user_score.values())

# Walk down the ranking, accumulating each user's share of the total score,
# and remember the position at which the running share first exceeds 0.8.
top_index = 0
acc_sum = 0
for position, user in enumerate(ranked_users):
    acc_sum += user_score[user]/score_sum
    if acc_sum > 0.8:
        top_index = position
        break

# Users ranked strictly before that position (the crossing user is excluded).
top_users = ranked_users[:top_index]

In [16]:
# Every day (from 2015-06-01 onwards) on which at least one top user posted.
first_day = datetime(2015, 6, 1).date()
posting_days = {
    day
    for top_user in top_users
    for day in data['user_time_posts'][top_user].keys()
    if day >= first_day
}
time_list = sorted(posting_days)

# Day -> row index in the matrix built from time_list.
time_list_dict = {day: row for row, day in enumerate(time_list)}
print(len(time_list))
print(time_list[0], time_list[-1])

977
2015-06-01 2018-02-01


In [17]:
# Time-by-term count matrix restricted to posts written by the top users.
tdm = np.zeros((len(time_list), len(voca)), dtype=np.float32)
for top_user in top_users:
    for day, posts in data['user_time_posts'][top_user].items():
        row = time_list_dict.get(day)
        if row is None:
            # Day is outside the analysed period.
            continue
        for post in posts:
            for word in post:
                tdm[row, voca2idx[word]] += 1

# Rescale every row with sklearn's normalize (default settings).
tdm = normalize(tdm)

In [19]:
# Factorise the top-user matrix into K topics (K is set in the first NMF cell).
# Fix the seed: NMF's initialisation is randomised, so without it the topic
# numbering and weights can change from one run to the next.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500, random_state=0)
W = nmf.fit_transform(tdm).T  # transposed: W[k] is topic k's weight for each row of tdm
H = nmf.components_           # H[k] is topic k's weight for each column (word) of tdm

In [20]:
from scipy.stats import pearsonr

# Pair each topic-weight column with the daily high of the same calendar day.
# Filtering only the prices by post date is not enough: a post day without a
# price would leave W one entry longer than prices and make pearsonr fail.
# Keeping just the days present in both series guarantees equal lengths.
high_by_date = {p['date']: p['high'] for p in prices_list}
matched_rows = [row for row, day in enumerate(time_list) if day in high_by_date]
prices = np.array([high_by_date[time_list[row]] for row in matched_rows])

topic_time_correlation = []
for k in range(K):
    r_value, p_value = pearsonr(prices, W[k, matched_rows])
    topic_time_correlation.append((r_value, p_value))

topic_time_correlation = np.array(topic_time_correlation)  # columns: Pearson r, p-value
# Rank topics by the strength of the correlation, regardless of its sign.
top_topic = np.argsort(np.abs(topic_time_correlation[:, 0]))[::-1]

for k in top_topic[:5]:
    r_value, p_value = topic_time_correlation[k]
    print(f"{k+1}th topic, correlation: {r_value:f}, p-value: {p_value:.2e}")
    # The 30 highest-weighted vocabulary words of topic k.
    for i in H[k, :].argsort()[::-1][:30]:
        print(voca[i], end=' ')
    print()

2th topic, correlation: 0.584437, p-value: 1.46e-90
bitcoin price currency exchange now market value time year cryptocurrency crypto day gold buy government china money world trading mining new investment investor digital big high future country news make 
9th topic, correlation: 0.191275, p-value: 1.67e-09
coin bitcoin money get know now exchange make good time take crypto world back keep sell need buy alt give mean everyone never new lose right since first fiat start 
1th topic, correlation: -0.162797, p-value: 3.12e-07
block size transaction node bitcoin miner network time limit mining new increase chain change number minute full version add bip every consensus first reward system code fix mined need total 
7th topic, correlation: -0.119300, p-value: 1.86e-04
satoshi know nakamoto wright time blockchain craig bitcoin claim technology now key work group year world munity get project address make first forum trying node back gavin give might news 
6th topic, correlation: 0.114232, p-v

In [21]:
from pyecharts import Line
from bokeh.palettes import Set1

# Styling shared by every topic series: filled, stacked, smoothed areas drawn
# against a time axis.
series_options = dict(
    is_fill=True, is_stack=True, is_symbol_show=False,
    line_width=0.2, area_opacity=0.4,
    label_color=Set1[7], is_smooth=True, x_axis_type='time',
)

line = Line()
# One series for each of the three topics ranked first in top_topic.
for k in top_topic[:3]:
    line.add(f"topic {k+1}", time_list, W[k], **series_options)

line.height = 500
line.width = 800
line

In [22]:
# Lower-ranked user analysis: every user that was not selected as a top user,
# ordered from lowest to highest score.
# Earlier variant kept for reference (lowest 80% of users by head-count):
# low_users = sorted(user_score, key=user_score.get, reverse=False)[:int(total_user_num*0.8)]
#
# The complement is taken explicitly rather than slicing with [:-top_index]:
# that slice is empty when top_index == 0, and with tied scores at the boundary
# it does not drop exactly the users that were picked as top users.
top_user_set = set(top_users)
low_users = [
    user
    for user in sorted(user_score, key=user_score.get, reverse=False)
    if user not in top_user_set
]

In [23]:
# Every day (from 2015-06-01 onwards) on which at least one low user posted.
first_day = datetime(2015, 6, 1).date()
posting_days = {
    day
    for low_user in low_users
    for day in data['user_time_posts'][low_user].keys()
    if day >= first_day
}
time_list = sorted(posting_days)

# Day -> row index in the matrix built from time_list.
time_list_dict = {day: row for row, day in enumerate(time_list)}
print(len(time_list))
print(time_list[0], time_list[-1])

735
2015-06-01 2018-02-01


In [24]:
# Time-by-term count matrix restricted to posts written by the low users.
tdm = np.zeros((len(time_list), len(voca)), dtype=np.float32)
for low_user in low_users:
    for day, posts in data['user_time_posts'][low_user].items():
        row = time_list_dict.get(day)
        if row is None:
            # Day is outside the analysed period.
            continue
        for post in posts:
            for word in post:
                tdm[row, voca2idx[word]] += 1

# Rescale every row with sklearn's normalize (default settings).
tdm = normalize(tdm)

In [25]:
# Factorise the low-user matrix into K topics (K is set in the first NMF cell).
# Fix the seed: NMF's initialisation is randomised, so without it the topic
# numbering and weights can change from one run to the next.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500, random_state=0)
W = nmf.fit_transform(tdm).T  # transposed: W[k] is topic k's weight for each row of tdm
H = nmf.components_           # H[k] is topic k's weight for each column (word) of tdm

In [26]:
from scipy.stats import pearsonr

# Pair each topic-weight column with the daily high of the same calendar day.
# Filtering only the prices by post date is not enough: a post day without a
# price would leave W one entry longer than prices and make pearsonr fail.
# Keeping just the days present in both series guarantees equal lengths.
high_by_date = {p['date']: p['high'] for p in prices_list}
matched_rows = [row for row, day in enumerate(time_list) if day in high_by_date]
prices = np.array([high_by_date[time_list[row]] for row in matched_rows])

topic_time_correlation = []
for k in range(K):
    r_value, p_value = pearsonr(prices, W[k, matched_rows])
    topic_time_correlation.append((r_value, p_value))

topic_time_correlation = np.array(topic_time_correlation)  # columns: Pearson r, p-value
# Rank topics by the strength of the correlation, regardless of its sign.
top_topic = np.argsort(np.abs(topic_time_correlation[:, 0]))[::-1]

for k in top_topic[:5]:
    r_value, p_value = topic_time_correlation[k]
    print(f"{k+1}th topic, correlation: {r_value:f}, p-value: {p_value:.2e}")
    # The 30 highest-weighted vocabulary words of topic k.
    for i in H[k, :].argsort()[::-1][:30]:
        print(voca[i], end=' ')
    print()

8th topic, correlation: 0.650151, p-value: 1.62e-89
price coin market now bitcoin money currency exchange crypto time year make value bank new cryptocurrency buy day world get country trading sell china long government dollar future usd fork 
0th topic, correlation: 0.491694, p-value: 5.30e-46
bitcoin new currency good idea cash now buy future mining world payment used thanks business satoshi store place get make need day help user network question fork wondering point source 
1th topic, correlation: 0.243260, p-value: 2.32e-11
btc buy day get coin need exchange fork time know coinbase bcc cash got sent trade now owner thanks using usd question last buying around payment send big price trading 
3th topic, correlation: 0.147920, p-value: 5.68e-05
transaction fee time unconfirmed now sent confirmed confirmation hour amount money send network high first take btc pay paid mean small miner dust payment output segwit show example future made 
2th topic, correlation: 0.119618, p-value: 1.16e-

In [27]:
from pyecharts import Line
from bokeh.palettes import Set1

# Styling shared by every topic series: filled, stacked, smoothed areas drawn
# against a time axis.
series_options = dict(
    is_fill=True, is_stack=True, is_symbol_show=False,
    line_width=0.2, area_opacity=0.4,
    label_color=Set1[7], is_smooth=True, x_axis_type='time',
)

line = Line()
# One series for each of the three topics ranked first in top_topic.
for k in top_topic[:3]:
    line.add(f"topic {k+1}", time_list, W[k], **series_options)

line.height = 500
line.width = 800
line