# A simple PageRank implementation

In [1]:
import json
import numpy as np

In [2]:
def read_graph(filename):
    """Load a graph description from a JSON file.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file. ``compute_R`` in this
        notebook uses it as a dict mapping each node to a list of the nodes
        it links to.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding, which differs between operating systems.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [4]:
# Load the example graph. compute_R below indexes it as a dict whose keys are
# the nodes and whose values are the lists of nodes each one links to.
G = read_graph('collections/example.json')

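PageRank is built from two ingredients: a transition matrix $R$, which describes how a random surfer follows the links of the graph, and a jump vector $J$, which gives the probability of teleporting to each node.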

Computation of the transition matrix given by the graph:

In [5]:
def compute_R(graph):
    """Build the row-stochastic transition matrix of a graph.

    Parameters
    ----------
    graph : dict
        Maps each node to a sized iterable (e.g. a list) of the nodes it
        links to. Every destination must itself be a key of ``graph``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, n)`` where ``n`` is the number of keys. Entry
        ``[i, j]`` is the probability of moving from the i-th key to the
        j-th key (keys taken in the dict's iteration order). Every row sums
        to 1.

    Raises
    ------
    KeyError
        If a destination node is not a key of ``graph``.
    """
    nodes = list(graph.keys())
    # number of elements in the graph
    n = len(nodes)
    key_to_pos = {node: pos for pos, node in enumerate(nodes)}
    R = np.zeros((n, n))
    for i, source in enumerate(nodes):
        successors = graph[source]
        # need to compute the out-degree
        out_deg = len(successors)
        if out_deg == 0:
            # Dangling node: without out-links the row would stay all-zero and
            # probability mass would leak out of the chain at every step.
            # Treat it as linking to every node with equal probability.
            R[i, :] = 1 / n
            continue
        # for each connected node
        for dest in successors:
            # Accumulate instead of assigning so that a destination listed
            # several times still leaves the row summing to 1.
            R[i, key_to_pos[dest]] += 1 / out_deg
    return R

In [7]:
# Build the transition matrix of the example graph and display it.
# Row i is the source node and column j the destination node, both in the
# order of the graph's keys.
R = compute_R(G)
R

array([[0.        , 0.33333333, 0.33333333, 0.33333333, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.5       ,
        0.5       ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

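PageRank can be solved analytically, but more often it is computed iteratively: starting from any probability vector $x$, repeatedly apply $x' = xP$ with $P = \alpha \mathbf{1} J + (1 - \alpha) R$ until $x$ stops changing.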

In [8]:
def PageRank_iteration(x, R, J, alpha):
    """Perform one PageRank power-iteration step, ``x' = x P``.

    The step matrix is ``P = alpha * 1 J + (1 - alpha) * R``, where ``1`` is a
    column of ones, so every row of ``1 J`` equals the jump vector.

    Parameters
    ----------
    x : array-like
        Current score vector with ``n`` entries; either a flat vector or a
        ``1 x n`` row (such as the value returned by a previous call).
    R : array-like
        ``n x n`` transition matrix.
    J : array-like
        Jump vector with ``n`` entries.
    alpha : float
        Weight of the jump vector in ``P``; ``1 - alpha`` is the weight of
        ``R``.

    Returns
    -------
    numpy.matrix
        The updated scores as a ``1 x n`` row matrix.
    """
    R = np.asarray(R)
    # Take n from R rather than len(x): len() of a 1 x n matrix is 1, which is
    # what x becomes after the first iteration.
    n = R.shape[0]
    x_row = np.asarray(x).reshape(1, -1)
    J_row = np.asarray(J).reshape(1, -1)
    # (n, 1) column of ones times the (1, n) jump row broadcasts to the n x n
    # matrix whose rows all equal J.
    P = alpha * np.ones((n, 1)) * J_row + (1 - alpha) * R
    # np.mat was removed in NumPy 2.0; np.asmatrix keeps the 1 x n matrix
    # return type that callers index as x[0, i].
    return np.asmatrix(x_row @ P)

In [11]:
def compute_PageRank(graph, alpha, epsilon, seed=None, max_iter=10_000):
    """Compute PageRank scores of a graph by power iteration.

    Starting from a random probability vector, ``PageRank_iteration`` is
    applied until the L1 distance between two successive vectors is at most
    ``epsilon``. The error of every step and the final scores are printed.

    Parameters
    ----------
    graph : dict
        Graph in the format accepted by ``compute_R``.
    alpha : float
        Weight given to the (uniform) jump vector in each step.
    epsilon : float
        Convergence threshold on the summed absolute change of the scores.
    seed : int, optional
        Seed for the random starting vector. When ``None`` (default) the
        vector is drawn from NumPy's global random state, as before.
    max_iter : int, optional
        Upper bound on the number of iterations; a ``RuntimeWarning`` is
        issued if it is reached before convergence.

    Returns
    -------
    The final score vector as returned by ``PageRank_iteration`` (a ``1 x n``
    row), or the flat starting vector if no iteration was performed.
    """
    import warnings

    n = len(graph.keys())
    R = compute_R(graph)
    # jump vector --> we jump with equal probability
    J = np.ones(n) / n  # needs to be changed for topic specific page rank
    # The start is random (a uniform start, np.ones(n) / n, would work as
    # well); an explicit seed makes the printed error trace reproducible.
    if seed is None:
        x = np.random.rand(n)
    else:
        x = np.random.default_rng(seed).random(n)
    x = x / x.sum()
    err = np.inf
    n_iter = 0
    while err > epsilon:
        if n_iter >= max_iter:
            # Guard against inputs for which the iteration never settles
            # (for example alpha == 0 on a periodic graph).
            warnings.warn(
                f"PageRank did not converge within {max_iter} iterations",
                RuntimeWarning,
            )
            break
        x_new = PageRank_iteration(x, R, J, alpha)
        err = (abs(x_new - x)).sum()
        print(f"Error: {err}")
        x = x_new
        n_iter += 1
    print("PageRank scores: ")
    # Flatten for printing so this works for both a 1 x n row and a flat vector.
    for k, score in zip(graph.keys(), np.asarray(x).ravel()):
        print(f"{k}: {score}")
    return x

In [12]:
# alpha --> how important the jump vector is: it is the weight of the jump
# vector in each iteration step, the remaining 1 - alpha going to the
# transition matrix. The last argument is the convergence threshold.
# The starting vector is random, so seed NumPy's global random state to make
# the printed error trace reproducible across runs.
np.random.seed(42)
compute_PageRank(G, 0.1, 0.01)

Error: 1.3550848156265085
Error: 1.2195763340638577
Error: 0.9269288195814747
Error: 0.7472273904448014
Error: 0.6304177925964727
Error: 0.5017387772340801
Error: 0.38350367532570695
Error: 0.3040016534881473
Error: 0.2493932877652729
Error: 0.20008015576136626
Error: 0.15620836748841202
Error: 0.12344810001887507
Error: 0.09967998834826308
Error: 0.07998796545277038
Error: 0.06315880765631122
Error: 0.049982781299125936
Error: 0.040034360442340114
Error: 0.03206160788730361
Error: 0.025455616243241094
Error: 0.02018931074100605
Error: 0.01611080297047751
Error: 0.012876066993947916
Error: 0.010247630455591988
Error: 0.00814217891603998
PageRank scores: 
a: 0.30260013611958203
b: 0.10810777165546351
c: 0.10810777165546351
d: 0.2547270322865427
e: 0.16150963287887854
f: 0.06494765540407064


matrix([[0.30260014, 0.10810777, 0.10810777, 0.25472703, 0.16150963,
         0.06494766]])