## Read Files 

### hw3dataset
Each row represents a directed edge (link) between nodes separated by a comma.
The direction of a edge is from the first node to the second node.
```
    graph_1.txt: 6 nodes, 5 edges
    graph_2.txt: 5 nodes, 5 edges (a circle)
    graph_3.txt: 4 nodes, 6 edges
    graph_4.txt: 7 nodes, 18 edges (the example in Lecture3, p29)
    graph_5.txt: 469 nodes, 1102 edges
    graph_6.txt: 1228 nodes, 5220 edges
```
### Transaction Dataset (in hw1)

In [1]:

import os
from collections import defaultdict 

import numpy as np 

from easydict import EasyDict as edict
from utils import timer 
# No self loops 

filedir ='./data/'
edges_data = {} 
for filename in os.listdir(filedir):
    edges = [] 
    filepath = os.path.join(filedir, filename) 
    print(f'reading {filepath}...')
    filepreff= filename.split('.')[0] 
    if filename.startswith('graph'):
        with open(filepath, 'r') as f: 
            for line in f.readlines():
                line = line.strip()
                edge = line.split(',')
                edges.append(edge)  
        edges_data[filepreff] = edges
                
    elif filename.startswith('ibm'):
        with open(filepath, 'r') as f: 
            for line in f.readlines():
                line = line.strip()
                edge = line.split()[1:]
                edges.append(edge)
        edges_data[filepreff] = edges
    
    
    
                    

class Graph:
    def __init__(self, edges):
        self.out_neighbors = defaultdict(list)
        self.in_neighbors = defaultdict(list)
        self.edges = edges 
        nodes = set()
        for u, v in edges:
            nodes.add(u); nodes.add(v) 
        nodes = sorted(nodes)
        print(nodes[:10])    
        nodesmap = {node:nodeidx for nodeidx, node in enumerate(nodes)}
        for u, v in edges:
            u, v = nodesmap[u], nodesmap[v]
            self.out_neighbors[u].append(v)
            self.in_neighbors[v].append(u)  
        self.N = len(nodes)
edges_data = sorted(edges_data.items(), key = lambda x:x[0])

reading ./data/graph_4.txt...
reading ./data/graph_5.txt...
reading ./data/graph_6.txt...
reading ./data/graph_2.txt...
reading ./data/.DS_Store...
reading ./data/graph_3.txt...
reading ./data/graph_1.txt...
reading ./data/ibm-5000.txt...


In [19]:
# damping factor 
D = 0.15 
# decay factor 
C = 0.9
# number of iterations
T = 100 

In [3]:
from pprint import pprint
Graphs = {}
for fname, edges in edges_data:
    G = Graph(edges)
    Graphs[fname] = G
    print(fname, f': graph with {G.N} nodes')
    # pprint(G.out_neighbors)

['1', '2', '3', '4', '5', '6']
graph_1 : graph with 6 nodes
['1', '2', '3', '4', '5']
graph_2 : graph with 5 nodes
['1', '2', '3', '4']
graph_3 : graph with 4 nodes
['1', '2', '3', '4', '5', '6', '7']
graph_4 : graph with 7 nodes
['1', '10', '100', '101', '102', '103', '104', '105', '106', '107']
graph_5 : graph with 469 nodes
['1', '10', '100', '1000', '1001', '1002', '1003', '1004', '1005', '1006']
graph_6 : graph with 1228 nodes
['1', '10', '100', '101', '102', '103', '104', '105', '106', '107']
ibm-5000 : graph with 836 nodes


In [4]:

def nodeid(node:str):
    return int(node)-1 

def PageRank(G:Graph, 
            max_iters:int, 
            damping_factor:float):
    """
    Args:
        G (networkx.classes.graph.Graph): 
        max_iters (int): number of iters 
        damping_factor (float): 
        The PageRank theory holds that an imaginary surfer who is randomly clicking on links will eventually stop clicking. The probability, at any step, that the person will continue is a damping factor d.
    Note that index 0 is null
    Note that links from a page to itself are ignored 
    """
    N = G.N
    if N == 0:
        raise ValueError('Empty Graph')
    PageRanksHistory = []  
    d = damping_factor
    # initialization 
    PageRanks = np.full(N, 1/N)
    for iter in range(max_iters):
        newPageRanks = np.zeros(N)
        for i in range(N):
            for n in G.in_neighbors[i]:
                newPageRanks[i] += PageRanks[n] / len(G.out_neighbors[n])
        PageRanks =  d/N + (1-d) * newPageRanks
    PageRanks = PageRanks / (PageRanks.sum())
    return PageRanks

In [12]:
# Outputs = defaultdict(lambda:{})
# for filename, g in Graphs.items():
#     # print(filename)
#     pageranks = PageRank(G = g, 
#                         max_iters = T, 
#                         damping_factor = D)
#     # print(pageranks[:10]) 
#     Outputs[filename]['pg'] = pageranks 
    

In [13]:


from typing import Tuple
def HITS(G:Graph, 
    max_iters:int, 
    denominator = 'sum')-> Tuple[np.array, np.array]:
    """
    HITS(Hyperlink-induced topic search)
    Authority: Providing valuable infor on certain topic 
    Hub: Give good supports to those pages with high authority
    - A good hub increases the authority weight of the pages it points. 
    - A good authority increases the hub weight of the pages that point to it. 
    The idea is then to apply the two operations above alternatively until equilibrium values for the hub and authority weights are reached.
    Args:
        G (Graph): _description_
    Returns:
        Tuple(np.array, np.array): Auth, Hub Vectors 
            Auth: shape (N, ) Auth[n] is the authority score of node n
            Hub: shape (N, )  Similarly, Hub[n] is the hub score of node n
    """
    auths = np.ones(G.N)
    hubs = np.ones(G.N) 
    def get_update_Auth(n):
        # authority: the node being pointed to 越多人指向他越高分
        return hubs[G.in_neighbors[n]].sum()
    def get_update_Hub(n):
        return auths[G.out_neighbors[n]].sum()
    
    for _ in range(max_iters):
        new_auths = np.zeros_like(auths)
        new_hubs = np.zeros_like(hubs)
        for n in range(G.N):
            new_auths[n] = get_update_Auth(n)
            new_hubs[n] = get_update_Hub(n)
        if denominator == 'sum':
            auths = new_auths / np.sum(new_auths)
            hubs = new_hubs / np.sum(new_hubs)
        else: # root of sum of squares
            # wiki 上面的正規做法
            # https://en.wikipedia.org/wiki/HITS_algorithm
            auths = new_auths / np.sqrt(np.sum(new_auths**2))
            hubs = new_hubs / np.sqrt(np.sum(new_hubs**2))
    
    return auths, hubs 


In [14]:
# for filename, g in Graphs.items():
#     auths, hubs = HITS(g, max_iters=100)
#     Outputs[filename]['auths'] = auths
#     Outputs[filename]['hubs'] = hubs

In [17]:
'''

def simrank_algorithm(self,iter,C):
        
        simrank_matrix = np.eye( self.n )

        for _ in range(iter):
            # create new simrank matrix
            new_simrank_matrix = np.eye( self.n )
            # update every node of a:b
            for node_a in range(self.n):
                for node_b in range(self.n):
                    if len(self.in_neighbors[node_a]) == 0:
                        break  
                    if len(self.in_neighbors[node_b]) == 0 or node_a == node_b: 
                        continue
                    new_simrank_matrix[node_a][node_b] = self.get_simrank_score( node_a,node_b,simrank_matrix,C=C )

            simrank_matrix = new_simrank_matrix.copy()
            
        return simrank_matrix
        
def get_simrank_score( self,node_a,node_b,old_simrank_matrix , C)->float:

        simrank_sum = 0.0
        for a in self.in_neighbors[node_a]:
            for b in self.in_neighbors[node_b]:
                simrank_sum += old_simrank_matrix[a][b]

        simrank_sum = (simrank_sum * C) / (len( self.in_neighbors[node_a])*len(self.in_neighbors[node_b])) 
        return simrank_sum


'''




@timer 
def SimRank(G: Graph, 
            max_iters:int, 
            decay_factor:float):
    # SimRank_sum = the sum of SimRank value of all in-neighbor pairs (SimRank value is from the previous iteration)
    C = decay_factor 
    def get_update_simrank(
                    a:int, 
                    b:int, 
                    simRank: np.array):
        if a == b: 
            return 1    
        a_in_neighbors = G.in_neighbors[a] # I_i(a)
        b_in_neighbors = G.in_neighbors[b] # I_j(b)
        a_in_size, b_in_size = len(a_in_neighbors), len(b_in_neighbors)
        if not a_in_size or not b_in_size:
            return 0
        temp = 0 
        for i in a_in_neighbors:
            for j in b_in_neighbors:
                temp += simRank[i, j]
        # scaling the simRank 
        return C * temp / (a_in_size * b_in_size) 
                        
    simRank = np.zeros((G.N, G.N))
    for iter in range(max_iters):
        newSimRank = np.zeros_like(simRank)
        for a in range(G.N):
            for b in range(a, G.N):
                newSimRank[a, b] = newSimRank[b, a] = get_update_simrank(a, b, simRank)
        simRank = newSimRank.copy() 
    return simRank    

In [None]:
for filename, g in Graphs.items():
    # print(filename)
    simranks = SimRank(g, max_iters=100, decay_factor = C)
    # print(simranks)

## Stardard Output 
2021 version 

In [32]:
# Which year's hyperparams to use 
# hyperparams 
Hyperparams ={2021: {'D': 0.15, 'C': 0.9, 'T': 100}, 
              2022: {'D':0.1, 'C': 0.7, 'T': 30}}
filedir ='./data/'

def save_and_display(year_hyper):
    result_root = f'./results/{year_hyper}'
    os.makedirs(result_root, exist_ok=True)
    prec = 3 
    D = Hyperparams[year_hyper]['D']
    C = Hyperparams[year_hyper]['C']
    T = Hyperparams[year_hyper]['T']
    
    np.set_printoptions(precision=prec)

    for gname, g in Graphs.items():
        file_prefix = gname.split('.')[0]
        gpath = os.path.join(result_root, file_prefix)
        os.makedirs(gpath, exist_ok=True)

        
        print(f'==== Graph: {gname} ===')
        filename = f'{gname}_PageRank.txt'
        pagerank = PageRank(g, max_iters = T, damping_factor = D)
        np.savetxt(os.path.join(gpath, filename), pagerank, fmt = f'%.{prec}f')
        print('🪬 pagerank:\n', pagerank)

        filename = f'{gname}_HITS'
        auths, hubs = HITS(g, max_iters = T)
        np.savetxt(os.path.join(gpath, filename+'_authority.txt'), auths, fmt=f'%.{prec}f')
        np.savetxt(os.path.join(gpath, filename+'_hub.txt'), hubs, fmt=f'%.{prec}f')
        print('🪬 auths:\n', auths)
        print('🪬 hubs:\n', hubs)

        # avoid running SimRank on large graphs 
        if gname.startswith('ibm') or gname.startswith('graph_6'):
            continue 

        filename = f'{gname}_SimRank.txt'
        simrank = SimRank(g, max_iters = T, decay_factor = C)
        np.savetxt(os.path.join(gpath, filename), simrank, fmt=f'%.{prec}f')
        print('🪬 simrank:\n', simrank)


In [33]:
save_and_display(year_hyper = 2021)

==== Graph: graph_1 ===
🪬 pagerank:
 [0.061 0.112 0.156 0.193 0.225 0.252]
🪬 auths:
 [0.  0.2 0.2 0.2 0.2 0.2]
🪬 hubs:
 [0.2 0.2 0.2 0.2 0.2 0. ]
SimRank Done in 0.00 seconds.
🪬 simrank:
 [[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]
==== Graph: graph_2 ===
🪬 pagerank:
 [0.2 0.2 0.2 0.2 0.2]
🪬 auths:
 [0.2 0.2 0.2 0.2 0.2]
🪬 hubs:
 [0.2 0.2 0.2 0.2 0.2]
SimRank Done in 0.00 seconds.
🪬 simrank:
 [[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
==== Graph: graph_3 ===
🪬 pagerank:
 [0.175 0.325 0.325 0.175]
🪬 auths:
 [0.191 0.309 0.309 0.191]
🪬 hubs:
 [0.191 0.309 0.309 0.191]
SimRank Done in 0.00 seconds.
🪬 simrank:
 [[1.    0.    0.818 0.   ]
 [0.    1.    0.    0.818]
 [0.818 0.    1.    0.   ]
 [0.    0.818 0.    1.   ]]
==== Graph: graph_4 ===
🪬 pagerank:
 [0.28  0.159 0.139 0.108 0.184 0.061 0.069]
🪬 auths:
 [0.139 0.178 0.201 0.14  0.201 0.056 0.084]
🪬 hubs:
 [

In [34]:
save_and_display(year_hyper = 2022)

==== Graph: graph_1 ===
🪬 pagerank:
 [0.056 0.107 0.152 0.193 0.23  0.263]
🪬 auths:
 [0.  0.2 0.2 0.2 0.2 0.2]
🪬 hubs:
 [0.2 0.2 0.2 0.2 0.2 0. ]
SimRank Done in 0.00 seconds.
🪬 simrank:
 [[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]
==== Graph: graph_2 ===
🪬 pagerank:
 [0.2 0.2 0.2 0.2 0.2]
🪬 auths:
 [0.2 0.2 0.2 0.2 0.2]
🪬 hubs:
 [0.2 0.2 0.2 0.2 0.2]
SimRank Done in 0.00 seconds.
🪬 simrank:
 [[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
==== Graph: graph_3 ===
🪬 pagerank:
 [0.172 0.328 0.328 0.172]
🪬 auths:
 [0.191 0.309 0.309 0.191]
🪬 hubs:
 [0.191 0.309 0.309 0.191]
SimRank Done in 0.00 seconds.
🪬 simrank:
 [[1.    0.    0.538 0.   ]
 [0.    1.    0.    0.538]
 [0.538 0.    1.    0.   ]
 [0.    0.538 0.    1.   ]]
==== Graph: graph_4 ===
🪬 pagerank:
 [0.288 0.161 0.139 0.107 0.183 0.055 0.066]
🪬 auths:
 [0.139 0.178 0.201 0.14  0.201 0.056 0.084]
🪬 hubs:
 [

## Graph Visualization 

In [23]:
import networkx as nx 
import matplotlib.pyplot as plt

def save_graph(raw_g: Graph, gname:str):
    G = nx.DiGraph()        
    for edge in raw_g.edges:
        # print(edge)
        G.add_edge(*edge)
    nx.draw(G, with_labels=True)
    # plt.show(block=False)
    plt.savefig(f"{gname}.png", format="PNG")
    plt.clf() # clean plots

    

In [None]:
for idx, (gname, g) in enumerate(Graphs.items()):
    g_prefix = gname.split('.')[0]
    save_graph(raw_g = g, gname = gname)
    if int(g_prefix.split('_')[1]) == 5: 
        continue 
    if g_prefix.split('_')[0] == "ibm":
        continue


## Find A Way 
