In [1]:
import os

from pyspark import SparkContext
from datetime import datetime
import numpy as np
import pandas as pd

In [2]:
sc = SparkContext()

In [3]:
def add_log(df, filepath, algo_type, loop_counter, command, timestamp, accum):
    
    single_row = pd.DataFrame(columns=['filename', 'edge_per_node', 'loop_counter', 'command', 'end_of_command', 'accum'])
    single_row.loc[0] = [filepath, algo_type, loop_counter, command, timestamp, accum]
    with_row = pd.concat([df, single_row])
    return with_row

def processGraphMin(filepath, df_logs, edge_per_node):
  
    def reduce_ccf_min(x):
        key = x[0]
        values = x[1]
        min_value = values.pop(np.argmin(values))
        ret = []
        if min_value < key:
            ret.append((key, min_value))
            accum.add(len(values))
            for value in values:
                ret.append((value, min_value))
        return (ret)

    text_file = sc.textFile(filepath)
    text_file = text_file.filter(lambda x: "#" not in x)
    
    df_logs = add_log(df_logs, filepath, "min", 0, "start", datetime.now(), 0)
    text_file_split = text_file.map(lambda x: x.split())
    input = text_file_split.map(lambda x: (int(x[0]), int(x[1])))

    accum = sc.accumulator(1)
    loop_counter = 1
    while accum.value > 0:

        accum.value = 0
        print(f"----------\nStart loop at {datetime.now()}", accum.value)

        # CCF-Iterate
        it_map = input.flatMap(lambda x: ((x[0], x[1]), (x[1], x[0])))
        it_map.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "it_map", datetime.now(), accum.value)
        
        it_groupby = it_map.groupByKey().mapValues(list)
        it_groupby.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "it_groupby", datetime.now(), accum.value)
        
        it_reduce = it_groupby.flatMap(lambda x: reduce_ccf_min(x))
        it_reduce.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "it_reduce", datetime.now(), accum.value)

        # CCF-Dedup
        ded_map = it_reduce.map(lambda x: ((x[0], x[1]), None))
        ded_map.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "ded_map", datetime.now(), accum.value)
        
        ded_groupby = ded_map.groupByKey().mapValues(list)
        ded_groupby.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "ded_groupby", datetime.now(), accum.value)
        
        input = ded_groupby.map(lambda x: (x[0][0], x[0][1]))        
        viz = input.collect()
        df_logs = add_log(df_logs, filepath, edge_per_node, loop_counter, "ded_reduce", datetime.now(), accum.value)

        print(f"End loop at {datetime.now()}, accum_value is {accum.value}")
        loop_counter += 1

    print(f"----------\nProcessed file at {datetime.now()}\n----------")
    return df_logs

In [4]:
user_inputs = {
    "10M": 3,
    "10M": 7,
    "10M": 11,
}

df_logs = pd.DataFrame(columns=['filename', 'edge_per_node', 'loop_counter', 'command', 'end_of_command', 'accum'])

for k, v in user_inputs.items():
  
    filename = "graph_" + k + ".txt"
    rel_filepath = os.path.join("..", "data", filename)
    !cd ../generator && make re
    !cd ../generator && ./graph_generator $k $rel_filepath $v

    now = datetime.now()
    print(f"\n\nLaunching sorted at {now}")
    df_logs = processGraphSorted(rel_filepath, df_logs, v)
    
    now = datetime.now()
    print(f"Launching min at {now}")    
    df_logs = processGraphMin(rel_filepath, df_logs, v)
    
    now = datetime.now()
    print(f"Launching zip at {now}")    
    df_logs = processGraphZip(rel_filepath, df_logs, v)

make re graph_generator
make clean graph_generator
rm -f .o files
make fclean graph_generator
rm -f .o files
rm -f libft.a
rm -f ft_printf
make libft.a
ar: creating archive libft.a
make graph_generator
Start program at Tue Mar 16 18:41:30 2021
Has parsed user argument
Has converted user argument to bytes
Final number of nodes is 89227
Final number of edges approximation is 446137
End program at Tue Mar 16 18:41:31 2021


Launching sorted at 2021-03-16 18:41:31.472020
----------
Start loop at 2021-03-16 18:41:32.760930, accum_value is 0
End loop at 2021-03-16 18:42:05.564894, accum_value is 2193393
----------
Start loop at 2021-03-16 18:42:05.565010, accum_value is 0
End loop at 2021-03-16 18:42:48.178407, accum_value is 4335951
----------
Start loop at 2021-03-16 18:42:48.178581, accum_value is 0
End loop at 2021-03-16 18:43:36.239465, accum_value is 8506668
----------
Start loop at 2021-03-16 18:43:36.239577, accum_value is 0
End loop at 2021-03-16 18:44:00.483830, accum_value is 6774

End loop at 2021-03-16 20:52:09.103369, accum_value is 18976641
----------
Start loop at 2021-03-16 20:52:09.103495 0
End loop at 2021-03-16 21:02:36.327081, accum_value is 37634628
----------
Start loop at 2021-03-16 21:02:36.327296 0
End loop at 2021-03-16 21:08:47.231256, accum_value is 47268825
----------
Start loop at 2021-03-16 21:08:47.231367 0
End loop at 2021-03-16 21:09:13.855360, accum_value is 1555602
----------
Start loop at 2021-03-16 21:09:13.855486 0
End loop at 2021-03-16 21:09:33.147774, accum_value is 0
----------
Processed file at 2021-03-16 21:09:33.147880
----------


In [8]:
df_logs

Unnamed: 0,filename,algo_type,loop_counter,command,end_of_command,accum
0,../data/graph_5M.txt,sorted,0,start,2021-03-16 18:41:32.740990,0
0,../data/graph_5M.txt,sorted,1,it_map,2021-03-16 18:41:36.198681,0
0,../data/graph_5M.txt,sorted,1,it_groupby,2021-03-16 18:41:40.967153,0
0,../data/graph_5M.txt,sorted,1,it_reduce,2021-03-16 18:41:43.039086,731131
0,../data/graph_5M.txt,sorted,1,ded_map,2021-03-16 18:41:45.704459,1462262
...,...,...,...,...,...,...
0,../data/graph_25M.txt,zip,6,it_groupby,2021-03-16 21:09:21.448939,0
0,../data/graph_25M.txt,zip,6,it_reduce,2021-03-16 21:09:23.062968,0
0,../data/graph_25M.txt,zip,6,ded_map,2021-03-16 21:09:25.401246,0
0,../data/graph_25M.txt,zip,6,ded_groupby,2021-03-16 21:09:31.733760,0


In [9]:
df_logs.to_csv('~/Downloads/comparison_5_15_25.csv', index=False)