The file contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Every row indicates an edge, the vertex label in first column is the tail and the vertex label in second column is the head (recall the graph is directed, and the edges are directed from the first column vertex to the second column vertex). So for example, the 11^{th}11 
th
  row looks liks : "2 47646". This just means that the vertex with label 2 has an outgoing edge to the vertex with label 47646

Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.

Output Format: You should output the sizes of the 5 largest SCCs in the given graph, in decreasing order of sizes, separated by commas (avoid any spaces). So if your algorithm computes the sizes of the five largest SCCs to be 500, 400, 300, 200 and 100, then your answer should be "500,400,300,200,100" (without the quotes). If your algorithm finds less than 5 SCCs, then write 0 for the remaining terms. Thus, if your algorithm computes only 3 SCCs whose sizes are 400, 300, and 100, then your answer should be "400,300,100,0,0" (without the quotes). (Note also that your answer should not have any spaces in it.)

WARNING: This is the most challenging programming assignment of the course. Because of the size of the graph you may have to manage memory carefully. The best way to do this depends on your programming language and environment, and we strongly suggest that you exchange tips for doing this on the discussion forums.

In [214]:
import os
import sys
from collections import deque
from operator import itemgetter

In [5]:
data_folder = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'data')
fname_input = data_folder +"/SCC.txt"

In [139]:
# chnage to two adjacent list for efficiency refactoring
lst_tails, lst_heads = [], []
reversed_graph = {}


with open(fname_input, "r") as f:
    prev_tail = 0
    head_list = []
    for line in f:
        tail, head = [int(v) for v in line.split()]
        reversed_graph.setdefault(head, []).append(tail)
        if tail == prev_tail:
            head_list.append(head)
        else:
            if prev_tail:
                lst_tails.append(prev_tail)
                lst_heads.append(head_list)
            head_list=[head]
            prev_tail = tail
print("double check on the input read:", len(lst_tails), len(lst_heads), len(reversed_graph))

double check on the input read: 739453 739453 714547


In [140]:
reversed_tails, reversed_heads = [], []
for tail, head_list in reversed_graph.items():
    reversed_tails.append(tail)
    reversed_heads.append(head_list)
print("check first run pass with reversed edges:", len(reversed_tails), len(reversed_heads))

check first run pass with reversed edges: 714547 714547


In [211]:
def DFS_Loop(lst_tails, lst_nodes, sorted_lst):
    global time
    time = 0
    global source_vertex
    source_vertex = None
    global finished_nodes
    finished_nodes = []
    global leaders
    leaders = {}
    global visited
    visited = set()
    for node_i in sorted_lst:
        if node_i in visited:
            continue
        source_vertex = node_i
        DFS(lst_tails, lst_nodes, node_i)


In [210]:
def DFS(lst_tails, lst_nodes, node_i):
    global visited
    global source_vertex
    global leaders
    visited.add(node_i)
    
    leaders.setdefault(source_vertex, []).append(node_i)
    try:
        pos_node = lst_tails.index(node_i)
        arc = lst_nodes[pos_node]
        #print((node_i, arc))
    except ValueError:
        arc = []
    for node_j in arc:
        if node_j not in visited:
            DFS(lst_tails, lst_nodes, node_j)
    
    global time
    time += 1
    global finished_nodes
    finished_nodes.append(node_i)
    #print(finished_nodes)

In [184]:
test_tails = [1, 2, 3,4,]
test_heads = [[2], [3], [1]]

r_test_tails = [1,2,3]
r_test_heads = [[3],[1], [2]]

In [208]:
r_t2_tails = [1,2,3,4,5,6,7,8,9]
r_t2_heads = [[7], [5],[9],[1],[8],[3,8],[4,9],[2],[6]]
t2_tails = [1,2,3,4,5,6,7,8,9]
t2_heads = [[4],[8],[6],[7],[2],[9],[1],[5,6],[3,7]]

In [215]:
leaders = {}
finished_nodes = []
DFS_Loop(r_t2_tails, r_t2_heads, reversed(r_t2_tails))
print(finished_nodes)
print("---------")
DFS_Loop(t2_tails, t2_heads, reversed(finished_nodes))
print(leaders)

[3, 5, 2, 8, 6, 9, 1, 4, 7]
---------
{7: [7, 1, 4], 9: [9, 3, 6], 8: [8, 5, 2]}


In [186]:
leaders = {}
finished_nodes = []
DFS_Loop(r_test_tails, r_test_heads)
DFS_Loop(finished_nodes, test_heads)
print(finished_nodes)
print(leaders)

[2, 1, 3]
{3: [3, 1, 2]}


In [None]:
sys.setrecursionlimit(1000000)
leaders = {}
finished_nodes = []
DFS_Loop(reversed_tails, reversed_heads, reversed(reversed_tails))
DFS_Loop(lst_tails, lst_heads, reversed(finished_nodes))
print(len(leaders))

In [31]:
def scc_finder(leaders):
    # from the highest 
    lst_scc_size = []
    for leader_node in leaders:
        lst_scc_size.append((leader_node, len(leaders[leader_node])))
    sorted_scc_size = sorted(lst_scc_size, key = itemgetter(1), reverse=True)
    return sorted_scc_size

695495