In [17]:
!pip install jsonlines

import numpy as np
import pandas as pd
import jsonlines
import networkx as nx
import operator
from collections import Counter
from numpy import linalg as LA



In [18]:
# Toy link graph with 11 pages, stored as an n_pages x n_pages 0/1 matrix.
n_pages = 11

# Every (row, col) pair below marks one cell of M_counts with a 1.
# The columns are what gets normalised to sum to 1 later in the notebook,
# so `col` is the page a link leaves from and `row` the page it points to.
link_cells = [
    (2, 1),
    (1, 2),
    (0, 3), (1, 3),
    (1, 4), (3, 4), (5, 4),
    (1, 5), (4, 5),
    (1, 6), (4, 6),
    (1, 7), (4, 7),
    (1, 8), (4, 8),
    (4, 9),
    (4, 10),
]

M_counts = np.zeros((n_pages, n_pages))

# Column 0 is filled with ones in every row.
# NOTE(review): presumably page 0 has no out-links and is therefore treated
# as linking to every page -- confirm against the intended graph.
M_counts[:, 0] = 1

for row, col in link_cells:
    M_counts[row, col] = 1

print(M_counts)

[[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]
 [1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [19]:
# Scale every column of the link-count matrix by its own total so that each
# column of M sums to 1. Every column of M_counts holds at least one link,
# so no total is zero here.
column_totals = M_counts.sum(axis=0)
M = M_counts / column_totals  # broadcasts: column j is divided by column_totals[j]

# Show three decimals when printing arrays (a process-wide numpy setting).
np.set_printoptions(precision=3)

print(M)

[[0.091 0.    0.    0.5   0.    0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    1.    0.5   0.333 0.5   0.5   0.5   0.5   0.    0.   ]
 [0.091 1.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.333 0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.    0.5   0.5   0.5   0.5   1.    1.   ]
 [0.091 0.    0.    0.    0.333 0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.091 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]]


In [20]:
def page_rank(M, d=0.85, square_error=1e-6, max_iter=1000):
    """Estimate PageRank scores by power iteration.

    Parameters
    ----------
    M : array-like, shape (n, n)
        Transition matrix. It is multiplied from the left onto the score
        vector, so column ``j`` is expected to hold the out-link distribution
        of page ``j`` (each column summing to 1). All-zero columns are not
        given any special treatment.
    d : float, default 0.85
        Damping factor: weight of the link structure versus the uniform
        "random jump" term ``(1 - d) / n``.
    square_error : float, default 1e-6
        Iteration stops once the sum of squared differences between two
        successive score vectors is at most this value.
    max_iter : int, default 1000
        Upper bound on the number of multiplications, so the call always
        terminates even if the stopping criterion is never met.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The score vector after the last iteration.

    Raises
    ------
    ValueError
        If ``M`` is not a square two-dimensional matrix.
    """
    M = np.asarray(M, dtype=float)
    if M.ndim != 2 or M.shape[0] != M.shape[1]:
        raise ValueError("M must be a square 2-D matrix")

    n_pages = M.shape[0]
    if n_pages == 0:
        return np.zeros(0)

    # Adding the scalar broadcasts to every cell, which is the same as adding
    # (1 - d) / n_pages times an all-ones matrix.
    M_hat = d * M + (1 - d) / n_pages

    # Start from the uniform distribution rather than a random vector: with a
    # loose tolerance the result depends on the starting point, so a fixed
    # start makes repeated runs return exactly the same scores.
    v = np.full(n_pages, 1.0 / n_pages)

    for _ in range(max_iter):
        last_v = v
        v = M_hat.dot(v)
        if np.square(v - last_v).sum() <= square_error:
            break

    return v

# Score the 11-page toy graph using the default damping factor (d=0.85).
page_rank(M)

array([0.033, 0.384, 0.343, 0.039, 0.081, 0.039, 0.016, 0.016, 0.016,
       0.016, 0.016])

In [21]:
def create_tweet_graph_from_file(filename):
    """Build a weighted, directed retweet graph from a JSON Lines file.

    Every record is read as a tweet object. For a retweet, an edge is drawn
    from the retweeting user (``user.id``) to the author of the original
    tweet (``retweeted_status.user.id``). The edge's ``weight`` attribute is
    the number of times that ordered pair occurs in the file.

    Records in which a user retweets themselves are ignored. Records without
    a ``retweeted_status`` entry (i.e. not retweets) are skipped rather than
    aborting the whole load with a ``KeyError``.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON Lines file to read.

    Returns
    -------
    networkx.DiGraph
        Graph whose nodes are user ids and whose edges carry a ``weight``.
    """
    # Counter keeps keys in first-seen order, so edges are added to the graph
    # in the order the pairs first appear in the file.
    retweet_counts = Counter()

    with jsonlines.open(filename, 'r') as reader:
        for tweet in reader:
            original = tweet.get("retweeted_status")
            if not original:
                # Not a retweet: contributes no edge.
                continue

            rt_user_id = tweet["user"]["id"]
            source_user_id = original["user"]["id"]
            if rt_user_id != source_user_id:
                retweet_counts[(rt_user_id, source_user_id)] += 1

    tweet_graph = nx.DiGraph()
    for (retweeter, author), weight in retweet_counts.items():
        tweet_graph.add_edge(retweeter, author, weight=weight)

    return tweet_graph

In [22]:
# Build the retweet graph from the HITS.json dump. Called without a weight
# argument, size() reports the number of edges in the graph.
tweet_graph = create_tweet_graph_from_file("HITS.json")
tweet_graph.size()

6177

In [23]:
def _unit_vector(vec):
    """Scale ``vec`` to unit Euclidean norm; an all-zero vector is returned as is."""
    norm = LA.norm(vec)
    return vec / norm if norm > 0 else vec


def hits(graph, iter_count = 20):
    """Compute HITS hub and authority scores by power iteration.

    The weighted adjacency matrix ``M`` of ``graph`` is used (``M[i, j]`` is
    the weight of the edge from node ``i`` to node ``j``). Starting from
    all-ones vectors, each iteration applies ``M @ M.T`` to the hub scores and
    ``M.T @ M`` to the authority scores and rescales both to unit Euclidean
    norm.

    Parameters
    ----------
    graph : networkx graph
        Graph to analyse; edge ``weight`` attributes are used when present.
    iter_count : int, default 20
        Number of power-iteration steps. With ``0`` the initial all-ones
        scores are returned unchanged.

    Returns
    -------
    (dict, dict)
        ``(hubs, authorities)``, each mapping node -> score. Both are empty
        for a graph without nodes.
    """
    nodes = list(graph.nodes())
    nodes_count = len(nodes)
    if nodes_count == 0:
        return {}, {}

    # nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array returns a
    # plain ndarray, so products below use the explicit @ operator.
    adjacency = nx.to_numpy_array(graph, nodelist=nodes)

    hubs_score = np.ones(nodes_count)
    auth_score = np.ones(nodes_count)

    for _ in range(iter_count):
        # Two matrix-vector products per score give the same result as
        # multiplying by the precomputed n x n matrices M @ M.T and M.T @ M,
        # without ever forming those dense products.
        hubs_score = _unit_vector(adjacency @ (adjacency.T @ hubs_score))
        auth_score = _unit_vector(adjacency.T @ (adjacency @ auth_score))

    hubs = dict(zip(nodes, hubs_score))
    authorities = dict(zip(nodes, auth_score))

    return hubs, authorities

In [24]:
def get_top_k_hubs(graph, k = 10):
    """Return the ``k`` nodes of ``graph`` with the highest hub scores.

    The result is a list of ``(node, hub_score)`` pairs, best score first.
    """
    hub_scores, _ = hits(graph)
    ranked = sorted(hub_scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

def get_top_k_authorities(graph, k = 10):
    """Return the ``k`` nodes of ``graph`` with the highest authority scores.

    The result is a list of ``(node, authority_score)`` pairs, best score first.
    """
    _, authority_scores = hits(graph)
    ranked = sorted(authority_scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

In [25]:
# Ten highest hub scores in the retweet graph (k defaults to 10). Edges run
# from the retweeting user to the original author, so hub scores are driven
# by a user's outgoing retweet edges.
top_10_tweet_hubs = get_top_k_hubs(tweet_graph)
top_10_tweet_hubs

[(3068706044, 0.6228962788346416),
 (3093940760, 0.2960833772615782),
 (2194518394, 0.25979684894330446),
 (2862783698, 0.20250708715416685),
 (3092183276, 0.17046401522271867),
 (3029724797, 0.16693938874412695),
 (2990704188, 0.14781712484957882),
 (3001500121, 0.1450694492814583),
 (3086921438, 0.12911896850758386),
 (3042686360, 0.12523755718547333)]

In [26]:
# Ten highest authority scores in the retweet graph (k defaults to 10). Edges
# point at the original author, so authority scores are driven by how a user
# is retweeted by others.
top_10_tweet_auth = get_top_k_authorities(tweet_graph)
top_10_tweet_auth

[(3042570996, 0.5445084183897634),
 (3065514742, 0.4929955772989205),
 (1638625987, 0.4437589239003996),
 (3077733683, 0.28651236641693445),
 (3039321886, 0.2242627827645839),
 (3077695572, 0.1218423014626474),
 (3019659587, 0.11321175872457184),
 (1358345766, 0.09894209162744073),
 (3061155846, 0.09396927090171626),
 (3092580049, 0.09366139118323492)]