In [1]:
# !pip install tensorboard tensorflow pandas

"""
If tensorboard is not installed (or other dependencies, such as tensorflow and pandas),
uncomment the command in top and re-run. This needs only to be run once in a Jupyter kernel.
"""

%load_ext tensorboard

from tensorflow.python.summary.summary_iterator import summary_iterator
import pandas as pd
from pathlib import Path
import subprocess
import yaml
import os
import json
from collections.abc import MutableMapping



In [None]:
!tensorboard dev upload --logdir \
    '../logging'

In [None]:
"""
Change the LOG_DIR argument to point to the correct directory, you may want to use an
absolute path if you run into issues.
"""
# !!kill 22140
%tensorboard --logdir ./logging

In [2]:
def logs_to_pandas(path: str) -> pd.DataFrame:
    """convert single tensorflow log file to pandas DataFrame
    Parameters
    ----------
    path : str
        path to tensorflow log file
    Returns
    -------
    pd.DataFrame
        converted dataframe
    """

    runlog_data = pd.DataFrame({"metric": [], "value": [], "step": [], "wall_time": []})
    try:
        event_acc = summary_iterator(path)
        for event in list(event_acc)[1:]:
            step, wall_time = event.step, pd.to_datetime(event.wall_time, unit='s')
            simple_extractor = [{"metric": v.tag, "value": v.simple_value, "step": step, 'wall_time': wall_time} for v in event.summary.value]
            event_r = pd.DataFrame(simple_extractor)
            runlog_data = pd.concat([runlog_data, event_r])
    #Dirty catch of DataLossError
    except Exception as e:
        raise(e)
        print("Event file possibly corrupt: {}".format(path))
        print(e)
    return runlog_data

In [8]:
print("collecting log results..")
# creates dictionary: <id, event log dataframe>
dict_log_results = {}
logs = subprocess.getoutput(f'kubectl get pods -n test -l "app.kubernetes.io/name=fltk.extractor" -o jsonpath="{{.items[0].metadata.name}}"')
print(subprocess.getoutput("rm -rf logging"))
print("getting output.. ", subprocess.getoutput(f"kubectl cp -n test {logs}:logging ./logging"))
for path in Path('./logging').rglob('*events.out*'):
    path_iid = path.name.split("trainjob-")[1].split("-master")[0]
    dict_log_results[path_iid] = logs_to_pandas(f"./{path}")
    
print("\n finished creating log dictionary: DICT_LOG_RESULTS")

collecting log results..

getting output..  
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`

 finished creating log dictionary: DICT_LOG_RESULTS


In [10]:
def flatten_dict(d: MutableMapping, sep: str= '.') -> MutableMapping:
    [flat_dict] = pd.json_normalize(d, sep=sep).to_dict(orient='records')
    return flat_dict

try: 
    os.mkdir("./logging/configmaps") 
except OSError as error: 
    print(error)
    
def job_to_id(job_name: str):
    return job_name.split("master-")[1][:-2]

job_ids = [i for i in subprocess.getoutput("kubectl get configmap --all-namespaces").split(" ") if "master" in i]
job_ids = [j for j in job_ids if job_to_id(j) in dict_log_results.keys()] # union with log results
      
print(f"collecting {len(job_ids)} configmaps ..")
dict_configmaps =  {}
for j in job_ids:
    try:
        # fetching from cloud and saving in logging/configmaps/-id-.txt
        config_map = subprocess.getoutput(f"kubectl get configmaps {j} -o yaml -n test")
        text_file = open(f"./logging/configmaps/{j}.txt", "w")
        n = text_file.write(config_map)
        text_file.close()
        print("fetched! caching..")
    except Exception as e:
        # in case of error (e.g. not in cloud), fetch from cache
        print("error, finding in cache.. ")
        with open(f"./logging/configmaps/{j}.txt", 'r') as file:
            config_map = file.read().replace('\n', '')
        
    dict_configmaps[job_to_id(j)] = flatten_dict(yaml.safe_load(yaml.safe_load(config_map)['data']['node.config.yaml']))
    
print("finished collecting configmaps, see DICT_LOG_RESULTS")

[Errno 17] File exists: './logging/configmaps'
collecting 20 configmaps ..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
fetched! caching..
finished collecting configmaps, see DICT_LOG_RESULTS


In [11]:
# --- Creating clusters ---
experiments = [(idd, set(value.items())) for idd, value in dict_configmaps.items()]
clusters = []
diff_attr_total = set() # set of total different attr values found between experiments
while len(experiments) > 0:
    next_exp = experiments[0]
    cluster = list([next_exp])
    for e in experiments[1:]:
        diff_attr = next_exp[1].difference(e[1])
        diff_attr_total = diff_attr_total.union(set([s[0] for s in diff_attr]))
        if len(diff_attr) <= 1:
            print(next_exp[1].difference(e[1])) # should only be seed difference!!!
            cluster.append(e)
    for e in cluster:
        experiments.remove(e)
    clusters.append(cluster)

# --- Combining dataframes of clusters ---
cluster_ids = [[e[0] for e in exp_l] for exp_l in clusters]
id_archieved_metric = {}
for k,v in dict_log_results.items():
    df = v.drop_duplicates('metric', keep='last')
    df.insert(0, 'exp_id', k)
    for attr in diff_attr_total:
        df.insert(0, attr, dict_configmaps[k][attr])
    id_archieved_metric[k] = df

cluster_to_metrics = [(ids, pd.concat([id_archieved_metric[i] for i in ids])) for ids in cluster_ids]

cluster_to_metrics
# diff_attr_total

# dict_configmaps
# cluster_ids

{('seed', 539797574)}
{('seed', 698416725)}
{('seed', 1977273656)}
{('seed', 1977273656)}
{('seed', 391183769)}
{('seed', 2469606724)}
{('seed', 4006973926)}
{('seed', 4006973926)}
{('seed', 3828286053)}
{('seed', 3393082523)}
{('seed', 3393082523)}


[(['11dc762c-0781-45f6-b16f-8f0b739493e4',
   '505a1dd5-09ae-46ad-8d82-039016c0c1bb'],
     optimizer_args.lr  test_batch_size        seed  batch_size  \
  0              0.003              100   539797574         100   
  0              0.003              100   539797574         100   
  0              0.003              100   539797574         100   
  0              0.003              100  4283453153         100   
  0              0.003              100  4283453153         100   
  0              0.003              100  4283453153         100   
  
     service_time_budget  model_size  parallel  learning_rate optimizer  \
  0                  180           2         2          0.003      Adam   
  0                  180           2         2          0.003      Adam   
  0                  180           2         2          0.003      Adam   
  0                  180           2         2          0.003      Adam   
  0                  180           2         2          0.003     