In [None]:
from datetime import datetime
import os
import pandas as pd
import re
import time
from IPython.display import clear_output, display
import pickle

In [None]:
# Folder name that fetch_results() looks for among the directories under raw/.
EXPERIMENT_ID = 'circymxx' # Enter the ID of the experiment (name of the folder) to export
# Pause between two polls of the monitoring loop (passed to time.sleep).
interval = 60 # in seconds for monitoring

In [None]:
def experiment_to_csv(path, experiment):
    """Collect the per-job results of one experiment from its logs and export them as CSV.

    The experiment folder must contain a tab-separated ``design.csv`` whose first
    column is the job number. For every job with a non-negative number the matching
    ``<n>-model...-executor_cores....log`` file is scanned for ``accuracy: <x>`` lines
    and for the final ``Wall clock time is <x> ms`` line.

    Args:
        path: Unused; kept so existing callers keep working.
        experiment: Object exposing a ``path`` attribute that points at the
            experiment folder (e.g. an ``os.DirEntry``).

    Returns:
        Tuple ``(jobs, csv_name, found, not_existing)``:
        ``jobs`` is the design frame extended with ``accuracy`` (last reported
        value), ``epochs`` (number of accuracy lines) and ``time`` (wall clock in
        seconds); ``csv_name`` is the file the frame was written to (in the current
        working directory); ``found`` counts jobs with a finished log and
        ``not_existing`` counts jobs whose log is missing or does not yet contain a
        wall-clock line. Those jobs get ``-1`` in all three result columns.
    """
    jobs = pd.read_csv(f'{experiment.path}/design.csv', sep='\t')
    # Rename the first column through the public API instead of mutating the
    # Index's backing array in place.
    jobs.columns = ['Index', *jobs.columns[1:]]
    # .copy() so the column assignments below act on an independent frame rather
    # than on a filtered view of the raw design.
    jobs = jobs[jobs['Index'] >= 0].copy()

    jobs['accuracy'] = 0.0
    jobs['epochs'] = 0
    # Created up front so the column also exists when the design has no jobs.
    jobs['time'] = 0.0

    # Raw strings: "\d" in a normal string literal is an invalid escape sequence.
    pat = re.compile(r"accuracy: (\d+\.\d+)")
    pat_clock = re.compile(r"Wall clock time is (\d+\.\d+) ms")
    found, not_existing = 0, 0
    for idx, row in jobs.iterrows():
        filename = f"{int(row['Index'])}-model{row['model']}-epochs{row['max_epochs']}-executor_memory{row['executor_memory']}-executor_cores{row['executor_cores']}.log"

        try:
            with open(f'{experiment.path}/{filename}', 'r') as file:
                content = file.read()
        except FileNotFoundError:
            content = None

        clock_hits = pat_clock.findall(content) if content is not None else []
        if not clock_hits:
            # Log missing, or present but without the closing wall-clock line
            # (job still running): report the job as not available yet instead of
            # failing with an IndexError.
            jobs.at[idx, 'time'] = -1
            jobs.at[idx, 'accuracy'] = -1
            jobs.at[idx, 'epochs'] = -1
            not_existing += 1
            continue

        accuracies = pat.findall(content)
        jobs.at[idx, 'time'] = float(clock_hits[-1]) / 1000  # ms -> s
        # Convert the regex match to float; storing the raw string would turn the
        # float column into an object column.
        jobs.at[idx, 'accuracy'] = float(accuracies[-1]) if accuracies else 0.0
        jobs.at[idx, 'epochs'] = len(accuracies)
        found += 1

    # The export is named after the design file's modification time.
    # NOTE(review): the ':' in the name is not a valid filename character on Windows.
    mtime = os.stat(f'{experiment.path}/design.csv').st_mtime
    csv_name = f'ex-{datetime.fromtimestamp(mtime).strftime("%Y-%m-%d_%H:%M")}.csv'
    jobs.to_csv(csv_name)
    return (jobs, csv_name, found, not_existing)

In [None]:
def fetch_results():
    """Export the configured experiment and report whether all of its logs are in.

    Looks for a non-hidden directory named ``EXPERIMENT_ID`` under ``raw/``, runs
    ``experiment_to_csv`` on it and reads the exported CSV back.

    Returns:
        Tuple ``(df, done)``: ``df`` holds the columns ``max_epochs``,
        ``executor_memory``, ``executor_cores``, ``model``, ``accuracy`` and
        ``time``; ``done`` is True when ``experiment_to_csv`` reported no
        outstanding logs.

    Raises:
        AssertionError: if no matching experiment folder exists under ``raw/``.
        Exception: if the experiment folder has no ``design.csv``.
    """
    # The context manager closes the directory iterator deterministically.
    with os.scandir('raw/') as entries:
        experiments = [
            entry for entry in entries
            if entry.is_dir() and not entry.name.startswith('.') and entry.name == EXPERIMENT_ID
        ]
    assert len(experiments) > 0, f'The folder {EXPERIMENT_ID} does not exist!'

    for experiment in experiments:
        if not os.path.exists(f'{experiment.path}/design.csv'):
            raise Exception(f'{experiment.path}/design.csv not found!')

        _jobs, path, found, not_existing = experiment_to_csv(experiment.path, experiment)
        # Print the folder name rather than the DirEntry repr.
        print(f'Results of {experiment.name} stored in {path}')
        print(f'{found} of the {found + not_existing} expected log files are present. The other experiments are probably missing.\n')
        df = pd.read_csv(path)
        df = df[['max_epochs', 'executor_memory', 'executor_cores', 'model', 'accuracy', 'time']]

    # Directory names are unique, so at most one experiment matched above.
    done = not_existing == 0
    return df, done

In [None]:
# One-off snapshot of the results collected so far; the completion flag is discarded here.
df, _ = fetch_results()
df

In [None]:
# Poll the experiment folder until fetch_results() reports that nothing is outstanding,
# refreshing the displayed table after every poll.
while True:
    df, done = fetch_results()
    clear_output(wait=True)
    print(f'Last update: {datetime.now().strftime("%H:%M:%S")}')
    display(df)
    if done:
        # Leave right away; sleeping another interval after the final poll only delays the export.
        break
    time.sleep(interval)

# Persist the completed result table for the ANOVA section; create data/ on first use.
os.makedirs('data', exist_ok=True)
with open(f'data/{EXPERIMENT_ID}.pickle', 'wb') as handle:
    pickle.dump(df, handle)

# ANOVA

In [175]:
import os
import researchpy as rp

import statsmodels.api as sm
from statsmodels.formula.api import ols

RESULT_COLUMNS = ['max_epochs', 'executor_memory', 'executor_cores', 'model', 'accuracy', 'time']

# Collect every pickled result table stored under data/. The context manager
# closes the directory iterator, and the builtin dir() is no longer shadowed.
with os.scandir('data/') as entries:
    experiments = [entry for entry in entries if entry.name.endswith(".pickle")]

frames = []
for experiment in experiments:
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles written by this notebook.
    with open(experiment.path, 'rb') as dffile:
        read_df = pickle.load(dffile)
    frames.append(read_df[RESULT_COLUMNS])

# Concatenate once instead of re-copying the growing frame on every iteration.
# pd.concat rejects an empty list, so fall back to an empty frame with the expected columns.
df = pd.concat(frames) if frames else pd.DataFrame(columns=RESULT_COLUMNS)


def _response_frame(results, response):
    """Return `response` as column ``y`` next to the four design factors under short names."""
    return pd.DataFrame({'y': results[response],
                         'epochs': results["max_epochs"],
                         'memory': results["executor_memory"],
                         'cores': results["executor_cores"],
                         'model': results["model"]})


acc_df = _response_frame(df, "accuracy")
time_df = _response_frame(df, "time")

print(f"Imported {len(experiments)} experiments!")

Imported 5 experiments!


## DF Summaries

In [176]:
# Summary statistics of the accuracy response (column 'y') for each model/epochs/cores/memory group.
summary_acc_df = rp.summary_cont(acc_df.groupby(['model', 'epochs', 'cores', 'memory']))['y']
summary_acc_df





Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,epochs,cores,memory,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bi-rnn,20,8,4,5,0.3995,0.0106,0.0048,0.3863,0.4127
bi-rnn,20,8,32,5,0.4163,0.0203,0.0091,0.3911,0.4416
bi-rnn,20,16,4,5,0.4103,0.0139,0.0062,0.3931,0.4275
bi-rnn,20,16,32,5,0.4074,0.0075,0.0033,0.3981,0.4167
bi-rnn,60,8,4,5,0.7198,0.0334,0.0149,0.6784,0.7613
bi-rnn,60,8,32,5,0.7261,0.0327,0.0146,0.6855,0.7668
bi-rnn,60,16,4,5,0.7159,0.0595,0.0266,0.642,0.7897
bi-rnn,60,16,32,5,0.7497,0.0336,0.015,0.708,0.7914
lenet5,20,8,4,5,0.9323,0.0023,0.001,0.9294,0.9351
lenet5,20,8,32,5,0.9351,0.0034,0.0015,0.9309,0.9392


In [177]:
# Summary statistics of the time response (column 'y') for each model/epochs/cores/memory group.
summary_time_df = rp.summary_cont(time_df.groupby(['model', 'epochs', 'cores', 'memory']))['y']
summary_time_df





Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,N,Mean,SD,SE,95% Conf.,Interval
model,epochs,cores,memory,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bi-rnn,20,8,4,5,347.3691,28.084,12.5596,312.4982,382.24
bi-rnn,20,8,32,5,345.7391,20.0969,8.9876,320.7855,370.6927
bi-rnn,20,16,4,5,308.655,34.6991,15.5179,265.5704,351.7397
bi-rnn,20,16,32,5,308.172,38.2285,17.0963,260.705,355.639
bi-rnn,60,8,4,5,996.7127,64.3048,28.758,916.8677,1076.5576
bi-rnn,60,8,32,5,1008.8503,65.058,29.0948,928.0701,1089.6304
bi-rnn,60,16,4,5,888.7607,91.2986,40.83,775.3985,1002.1229
bi-rnn,60,16,32,5,895.521,114.3337,51.1316,753.557,1037.485
lenet5,20,8,4,5,174.9143,11.3552,5.0782,160.8149,189.0136
lenet5,20,8,32,5,175.6735,10.5155,4.7027,162.6167,188.7302


## ANOVA analysis

In [178]:
def anova_analysis(input_df: pd.DataFrame,
                   alpha: float = 0.05,
                   formula: str = 'y ~ C(model)*C(epochs)*C(cores)*C(memory)') -> pd.DataFrame:
    """Fit an OLS model on ``input_df`` and return its ANOVA table with a significance flag.

    Args:
        input_df: Frame providing every variable named in ``formula``.
        alpha: Threshold the ``PR(>F)`` column is compared against.
        formula: Model formula handed to ``ols``; the default treats all four
            factors as categorical and includes all of their interactions.

    Returns:
        The table produced by ``sm.stats.anova_lm(model, typ=2)`` with one extra
        boolean column named ``PR(>F) < <alpha>`` (``PR(>F) < 0.05`` by default).
        Rows whose p-value is NaN compare as False.
    """
    model = ols(formula, input_df).fit()

    res = sm.stats.anova_lm(model, typ=2)
    res[f'PR(>F) < {alpha}'] = res['PR(>F)'] < alpha
    return res

In [179]:
# Run the ANOVA on the accuracy response and show the resulting table.
acc_res = anova_analysis(acc_df)
print("Accuracy ANOVA Analysis")
acc_res

Accuracy ANOVA Analysis


Unnamed: 0,sum_sq,df,F,PR(>F),PR(>F) < 0.05
C(model),2.868766,1.0,5961.048481,6.893245e-65,True
C(epochs),0.5947628,1.0,1235.865788,1.448051e-43,True
C(cores),0.0001839211,1.0,0.382172,0.5386369,False
C(memory),0.00103033,1.0,2.140937,0.1483084,False
C(model):C(epochs),0.4325476,1.0,898.796544,2.169089e-39,True
C(model):C(cores),0.0001121011,1.0,0.232936,0.6310017,False
C(epochs):C(cores),8.302813e-05,1.0,0.172525,0.6792665,False
C(model):C(memory),0.0008070851,1.0,1.677053,0.1999688,False
C(epochs):C(memory),0.0002383951,1.0,0.495364,0.484097,False
C(cores):C(memory),5.61125e-07,1.0,0.001166,0.9728668,False


In [180]:
# Run the ANOVA on the time response and show the resulting table.
time_res = anova_analysis(time_df)
print("Time ANOVA Analysis")
time_res

Time ANOVA Analysis


Unnamed: 0,sum_sq,df,F,PR(>F),PR(>F) < 0.05
C(model),1827953.0,1.0,777.463445,1.623388e-37,True
C(epochs),4437158.0,1.0,1887.208631,3.252469e-49,True
C(cores),30900.26,1.0,13.142477,0.0005729783,True
C(memory),147.9361,1.0,0.06292,0.8027424,False
C(model):C(epochs),443778.3,1.0,188.747436,9.335448999999999e-21,True
C(model):C(cores),24617.67,1.0,10.470369,0.00192191,True
C(epochs):C(cores),7269.732,1.0,3.091957,0.08346086,False
C(model):C(memory),43.60082,1.0,0.018544,0.8921085,False
C(epochs):C(memory),156.3117,1.0,0.066482,0.7973563,False
C(cores):C(memory),3.406068,1.0,0.001449,0.9697572,False
