# Pagerank

Authored by:
* Emil Riis Hansen 
* Jonas Brusokas
* Kashif Rabbani

References:
* PageRank documentation from NetworkX - https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html
* General algorithm and references to relevant material - https://en.wikipedia.org/wiki/PageRank

## Installing Python prerequisites
* networkx - library for working with graphs
* matplotlib - plotting library


In [1]:
%%capture
# Fetch the workshop repository and copy its contents (except .git) into the working directory
!git clone https://github.com/joerg84/Graph_Powered_ML_Workshop.git
!rsync -av Graph_Powered_ML_Workshop/ ./ --exclude=.git
# Install the third-party libraries this notebook imports
!pip3 install networkx
!pip3 install matplotlib

In [2]:
import networkx as nx
import matplotlib.pyplot as plt

## Pagerank algorithm implementation

In [3]:
def map_tuple_pos(list_of_tuples, pos):
  """Return a list holding the element at index `pos` of every tuple in `list_of_tuples`."""
  return [entry[pos] for entry in list_of_tuples]

def pagerank(orig_dg: nx.DiGraph, d: float = 0.85, t: int = 100):
  """Calculate PageRank on the given directed graph by fixed-count iteration.

  orig_dg - directed graph; it is copied first and never modified
  d - damping parameter
  t - number of iterations; exactly t update rounds are run, with no
      convergence check

  Returns a dict mapping every node to its PageRank value. An empty graph
  yields an empty dict. Edge weights are not consulted.
  """
  # Work on a copy to prevent modification of the original
  dg = orig_dg.copy()
  nodes = list(dg.nodes)
  N = len(nodes)

  # Dangling nodes (no outgoing edges) are connected to every node, including
  # themselves, so their rank is spread evenly instead of being lost
  dangling = [node for node in nodes if dg.out_degree(node) == 0]
  for node in dangling:
    dg.add_edges_from((node, node_to) for node_to in nodes)

  # The graph no longer changes, so look these up once instead of rebuilding
  # edge lists for every node on every iteration
  out_degree = {node: dg.out_degree(node) for node in nodes}
  predecessors = {node: list(dg.predecessors(node)) for node in nodes}

  # Initial PR 1/N
  pr_dict = {node: 1. / N for node in nodes}

  # Number of iterations / timesteps [0..t)
  for _ in range(t):
    # Every update in a round reads only the previous round's values
    old_pr_dict = pr_dict
    pr_dict = {
      node: (1. - d) / N + d * sum(
        old_pr_dict[in_node] / out_degree[in_node]
        for in_node in predecessors[node]
      )
      for node in nodes
    }

  return pr_dict

## Test code
- Generates random graphs, computes PageRank on each with both the NetworkX reference implementation and the notebook implementation, and compares the results

In [4]:
# Parameters passed unchanged to both implementations so the results are comparable
common_d = 0.85
common_t = 100

# One (graph, summed absolute error) entry per generated graph
graph_error_tuples = []

for n in range(10):
  for p in [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
    # Random directed graph with n nodes and edge probability p, fixed seed
    gg = nx.gnp_random_graph(n, p, seed=1, directed=True).to_directed()
    my_pr = pagerank(gg, common_d, common_t)
    ref_pr = nx.pagerank(gg, alpha=common_d, max_iter=common_t)
    # Total absolute difference over all nodes of the reference result
    error = sum(abs(my_pr[key] - ref_pr[key]) for key in ref_pr)
    graph_error_tuples.append((gg, error))

# Keep only the error component of each entry for the summary
error_list = [graph_error[1] for graph_error in graph_error_tuples]
smallest_error = min(error_list)
largest_error = max(error_list)
mean_error = sum(error_list) / len(error_list)

print("Error between networkx reference implementation and the notebook implementation")
print(f"Minimum error: {smallest_error}")
print(f"Maximum error: {largest_error}")
print(f"Average error: {mean_error}")

Error between networkx reference implementation and the notebook implementation
Minimum error: 0
Maximum error: 3.362070030051012e-06
Average error: 6.3476070791494e-07
