In [1]:
import os

from pyspark import SparkContext
from datetime import datetime
import numpy as np
import pandas as pd

In [2]:
sc = SparkContext()

In [3]:
def add_log(df, filepath, algo_type, loop_counter, command, timestamp, accum):
    
    single_row = pd.DataFrame(columns=['filename', 'edge_per_node', 'loop_counter', 'command', 'end_of_command', 'accum'])
    single_row.loc[0] = [filepath, algo_type, loop_counter, command, timestamp, accum]
    with_row = pd.concat([df, single_row])
    return with_row

def processGraphMin(filepath, df_logs, edge_per_node):
  
    def reduce_ccf_min(x):
        key = x[0]
        values = x[1]
        min_value = values.pop(np.argmin(values))
        ret = []
        if min_value < key:
            ret.append((key, min_value))
            accum.add(len(values))
            for value in values:
                ret.append((value, min_value))
        return (ret)

    text_file = sc.textFile(filepath)
    text_file = text_file.filter(lambda x: "#" not in x)
    
    df_logs = add_log(df_logs, filepath, edge_per_node, 0, "start", datetime.now(), 0)
    text_file_split = text_file.map(lambda x: x.split())
    input = text_file_split.map(lambda x: (int(x[0]), int(x[1])))

    accum = sc.accumulator(1)
    loop_counter = 1
    while accum.value > 0:

        accum.value = 0
        print(f"----------\nStart loop at {datetime.now()}", accum.value)

        # CCF-Iterate
        it_map = input.flatMap(lambda x: ((x[0], x[1]), (x[1], x[0])))
        it_map.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "it_map", datetime.now(), accum.value)
        
        it_groupby = it_map.groupByKey().mapValues(list)
        it_groupby.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "it_groupby", datetime.now(), accum.value)
        
        it_reduce = it_groupby.flatMap(lambda x: reduce_ccf_min(x))
        it_reduce.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "it_reduce", datetime.now(), accum.value)

        # CCF-Dedup
        ded_map = it_reduce.map(lambda x: ((x[0], x[1]), None))
        ded_map.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "ded_map", datetime.now(), accum.value)
        
        ded_groupby = ded_map.groupByKey().mapValues(list)
        ded_groupby.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "ded_groupby", datetime.now(), accum.value)
        
        input = ded_groupby.map(lambda x: (x[0][0], x[0][1]))        
        viz = input.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "ded_reduce", datetime.now(), accum.value)

        print(f"End loop at {datetime.now()}, accum_value is {accum.value}")
        loop_counter += 1

    print(f"----------\nProcessed file at {datetime.now()}\n----------")
    return df_logs

In [4]:
user_inputs = [3, 7, 11]

df_logs = pd.DataFrame(columns=['filename', 'edge_per_node', 'loop_counter', 'command', 'end_of_command', 'accum'])

for v in user_inputs:
    
    k = "10M"
    filename = "graph_" + k + "_" + str(v) +".txt"
    print(filename)
    rel_filepath = os.path.join("..", "data", filename)
    !cd ../../generator && make re
    !cd ../../generator && ./graph_generator $k $rel_filepath $v
    
    now = datetime.now()
    print(f"Launching min at {now}")    
    df_logs = processGraphMin(os.path.join('..', rel_filepath), df_logs, v)

graph_10M_3.txt
make re graph_generator
make clean graph_generator
rm -f .o files
make fclean graph_generator
rm -f .o files
rm -f libft.a
rm -f ft_printf
make libft.a
ar: creating archive libft.a
make graph_generator
Start program at Wed Mar 17 15:05:19 2021
Has parsed user argument
Has converted user argument to bytes
Final number of nodes is 266282
Final number of edges approximation is 798847
End program at Wed Mar 17 15:05:19 2021
Launching min at 2021-03-17 15:05:20.072523
----------
Start loop at 2021-03-17 15:05:21.172890 0
End loop at 2021-03-17 15:06:09.769468, accum_value is 3446652
----------
Start loop at 2021-03-17 15:06:09.769586 0
End loop at 2021-03-17 15:07:16.838224, accum_value is 6686442
----------
Start loop at 2021-03-17 15:07:16.838329 0
End loop at 2021-03-17 15:11:32.961554, accum_value is 13131825
----------
Start loop at 2021-03-17 15:11:32.961665 0
End loop at 2021-03-17 15:14:50.436403, accum_value is 23640354
----------
Start loop at 2021-03-17 15:14:50.4

In [5]:
df_logs

Unnamed: 0,filename,edge_per_node,loop_counter,command,end_of_command,accum
0,../../data/graph_10M_3.txt,3,0,start,2021-03-17 15:05:21.156220,0
0,../../data/graph_10M_3.txt,3,1,it_map,2021-03-17 15:05:25.460289,0
0,../../data/graph_10M_3.txt,3,1,it_groupby,2021-03-17 15:05:33.341390,0
0,../../data/graph_10M_3.txt,3,1,it_reduce,2021-03-17 15:05:37.534281,1148884
0,../../data/graph_10M_3.txt,3,1,ded_map,2021-03-17 15:05:42.940203,2297768
...,...,...,...,...,...,...
0,../../data/graph_10M_11.txt,11,5,it_groupby,2021-03-17 15:26:10.918901,0
0,../../data/graph_10M_11.txt,11,5,it_reduce,2021-03-17 15:26:11.626995,0
0,../../data/graph_10M_11.txt,11,5,ded_map,2021-03-17 15:26:12.390065,0
0,../../data/graph_10M_11.txt,11,5,ded_groupby,2021-03-17 15:26:13.853901,0


In [6]:
df_logs.to_csv('~/Downloads/comparison_edges_per_nodes2.csv', index=False)

In [8]:
user_inputs = {
    "4M": 3,
    "10M": 7,
    "15M": 11
}

df_logs = pd.DataFrame(columns=['filename', 'edge_per_node', 'loop_counter', 'command', 'end_of_command', 'accum'])

for k, v in user_inputs.items():
    
    filename = "graph_" + k + "_" + str(v) +".txt"
    print(filename)
    rel_filepath = os.path.join("..", "data", filename)
    !cd ../../generator && make re
    !cd ../../generator && ./graph_generator $k $rel_filepath $v
    
    now = datetime.now()
    print(f"Launching min at {now}")    
    df_logs = processGraphMin(os.path.join('..', rel_filepath), df_logs, v)
    
df_logs.to_csv('~/Downloads/comparison_edges_per_nodes4.csv', index=False)

graph_4M_3.txt
make re graph_generator
make clean graph_generator
rm -f .o files
make fclean graph_generator
rm -f .o files
rm -f libft.a
rm -f ft_printf
make libft.a
ar: creating archive libft.a
make graph_generator
Start program at Wed Mar 17 17:13:10 2021
Has parsed user argument
Has converted user argument to bytes
Final number of nodes is 115426
Final number of edges approximation is 346279
End program at Wed Mar 17 17:13:11 2021
Launching min at 2021-03-17 17:13:11.149308
----------
Start loop at 2021-03-17 17:13:11.202376 0
End loop at 2021-03-17 17:13:29.131498, accum_value is 1507323
----------
Start loop at 2021-03-17 17:13:29.131599 0
End loop at 2021-03-17 17:13:58.795912, accum_value is 2921124
----------
Start loop at 2021-03-17 17:13:58.796022 0
End loop at 2021-03-17 17:14:40.492212, accum_value is 5728992
----------
Start loop at 2021-03-17 17:14:40.492324 0
End loop at 2021-03-17 17:15:12.536872, accum_value is 9529485
----------
Start loop at 2021-03-17 17:15:12.5369