### Notebook for aggregating a collection of HPC tasks on GraphWorld
Given a type of experiment, this notebook takes all the json result files of a collection of HPC tasks and moves them into a single file in the `processed` directory. It also maintains a summary file in the same folder for all files for the experiment. Finally it loads the result files and prints basic statistics not part of the summary file (see last cell of this file).

Set `RAW_DIR` to the raw experiments you want to process.

Set `PROCESSED_DIR` to where the processed results should be stored.
The processed results will be stored in shards. Each time this notebook is ran, 1 shard is created. E.g. the shard size depends on the contents of the `RAW_DIR`.

The processing assumes that the raw results come from our HPC experimental setup.

In [1]:
mode = '-2-3-marg'
RUN_TO_PROCESS = 'p_to_q_ratio_2'
RAW_DIR = f'/home/data_shares/scara/graphworld/results/mode{mode}/raw/{RUN_TO_PROCESS}'
PROCESSED_DIR = f'/home/data_shares/scara/graphworld/results/mode{mode}/processed'

In [2]:
import os
import pandas as pd
import json
import ast
import re
import math

PROCESSED_SHARDS = f'{PROCESSED_DIR}/shards'

if not os.path.exists(PROCESSED_SHARDS):
    os.makedirs(PROCESSED_SHARDS)

# Read (existing) summary file for experiment
try:
    with open(f'{PROCESSED_DIR}/summary.json', 'r') as f:
        summary = json.load(f)
except FileNotFoundError:
    summary = {
        'N_GRAPHS': 0,
        'N_RUNS': 0,
        'RUN_GRAPHS': [],
        'RUN_MARG': [],
        'RAW_FILES': []
    }

if RUN_TO_PROCESS in summary['RAW_FILES']:
    raise Exception(f'WARNING: {RUN_TO_PROCESS} has already been processed!')

summary['N_RUNS'] += 1
summary['RAW_FILES'] += [RUN_TO_PROCESS]

# Here we read the json shards of each HPC task, 
# aggregate them and store everything in one file in the processed folder
lines = []
results_file_regex = r'results\.ndjson-(\d{5})-of-(\d{5})'
successful_runs = []

for sub_dir in next(os.walk(RAW_DIR))[1]:
	sub_dir_full = os.path.join(RAW_DIR, sub_dir)
	is_successful = False
	result_files = filter(lambda file: re.match(results_file_regex, file), os.listdir(sub_dir_full))
	for result_file in result_files:
		with open(os.path.join(sub_dir_full, result_file)) as f:
			lines.extend(f.readlines())
		is_successful = True
	if is_successful:
		successful_runs += [sub_dir]
                
with open(f'{PROCESSED_DIR}/shards/{summary["N_RUNS"]}.ndjson', "w") as dst:
  for line in lines:
    dst.write(line) # Write all graph experiments to same file

# Load lines dataframe for printing statistics
records = map(json.loads, lines)
results_df = pd.DataFrame.from_records(records)

# Getting running times
times = []

for task in next(os.walk(RAW_DIR))[1]:
  if not task in successful_runs:
     continue
  with open(f'{RAW_DIR}/slurm_{task}.out', 'r') as f:
    last_line = lines[-1].split(" ")[1]
    match = re.search(r'\d+', last_line)

    if match:
      lines = f.readlines()
      times.append(int(match.group()) // 60)
    else:
        print(f)
    
if len(times) == 0:
    times = [math.nan]

# Getting basic statistics of raw data
N_GRAPHS = len(results_df)
N_METHODS = len([col for col in results_df if 'encoder_hidden_channels' in col])
N_TASKS = len(next(os.walk(RAW_DIR))[1])

AVG_TIME = sum(times) / len(times)
MAX_TIME = max(times)
MIN_TIME = min(times)

# Getting methods that have crashed / are skipped
skipped_methods = {}
for s_col in [col for col in results_df if '_skipped' in col]:
    count = results_df[s_col].sum()
    if count > 0:
        skipped_methods.update({s_col.removesuffix('_skipped'): count})

# Update summary file
summary['N_GRAPHS'] += N_GRAPHS
summary['RUN_GRAPHS'].append(N_GRAPHS)
marg = results_df['marginal_param'].astype(str).unique()
if len(marg) > 1:
  summary['RUN_MARG'].append("mixed")
elif len(marg) == 0:
  summary['RUN_MARG'].append([])
else:
  summary['RUN_MARG'].append(ast.literal_eval(marg[0]))

with open(f'{PROCESSED_DIR}/summary.json', 'w') as s:
  s.write(json.dumps(summary))


# Printing statistics
print('------- Task/Graph statistics -------')
print(f'Total processed tasks: {N_TASKS}')
print(f'Total processed graphs: {N_GRAPHS}')
print(f'Graphs per task: {N_GRAPHS / N_TASKS}')
print(f'Avg task runtime (min): {AVG_TIME} ({AVG_TIME / (N_GRAPHS / N_TASKS)} per graph)')
print(f'Max task runtime (min): {MAX_TIME} ({MAX_TIME / (N_GRAPHS / N_TASKS)} per graph)')
print(f'Min task runtime (min): {MIN_TIME} ({MIN_TIME / (N_GRAPHS / N_TASKS)} per graph)\n')

print('------- Skipped (crashed) methods -------')
for k,v in skipped_methods.items():
    print(f'{k} skipped {v} times')

------- Task/Graph statistics -------
Total processed tasks: 100
Total processed graphs: 442
Graphs per task: 4.42
Avg task runtime (min): 292.04 (66.07239819004525 per graph)
Max task runtime (min): 804 (181.9004524886878 per graph)
Min task runtime (min): 6 (1.3574660633484164 per graph)

------- Skipped (crashed) methods -------
