In [33]:
import logging as log
import os
import pprint

import pandas as pd
import yaml

In [34]:
import plotly.graph_objects as go
# Smoke test: render a minimal three-bar chart to confirm that plotly
# figures display in this notebook.
fig = go.Figure(
    data=go.Bar(
        y=[2, 3, 1],
    ),
)
fig.show()

In [35]:
# Parse the experiment YAML file describing the regression tests.
experiments_path="./experiments/regression_test.yaml"

# Get experiment information from the YAML file.
# The file is opened in a `with` block so the handle is closed right after
# parsing, and the Loader is passed explicitly because PyYAML deprecated
# (5.1) and later rejected (6.0) calling yaml.load() without one.
# NOTE(security): yaml.Loader can construct arbitrary Python objects, so this
# must only be used on trusted, local experiment files.
with open(experiments_path, 'r') as experiments_file:
    experiment_params = yaml.load(experiments_file, Loader=yaml.Loader)

# Expand environment variables embedded in the configured paths.
regression_tests_dir = os.path.expandvars(experiment_params['regression_tests_dir'])
params_dir = os.path.expandvars(experiment_params['params_dir'])
dataset_dir = os.path.expandvars(experiment_params['dataset_dir'])
executable_path = os.path.expandvars(experiment_params['executable_path'])

datasets_to_run = experiment_params['datasets_to_run']
regression_params = experiment_params['regression_parameters']

# Build dictionary from parameter name to list of parameter values.
param_name_to_values = dict()
for regression_param in regression_params:
    param_name_to_values[regression_param['name']] = regression_param['values']

In [119]:
# Retrieve the aggregated stats. If the cached file does not exist, collect
# the per-run results files and cache them for future use.
full_stats_path = os.path.join(regression_tests_dir, "all_stats.yaml")
stats = dict()
if os.path.isfile(full_stats_path):
    log.info("Found existent stats. Opening full stats from:" + full_stats_path)
    # NOTE(security): yaml.Loader can construct arbitrary Python objects; only
    # load stats files produced by this pipeline. Later cells read attributes
    # off the loaded results, so the file presumably holds serialized Python
    # objects and a restricted loader would likely not be enough -- confirm
    # before switching to yaml.safe_load.
    with open(full_stats_path, 'r') as stats_file:
        stats = yaml.load(stats_file, Loader=yaml.Loader)
else:
    log.info("Collecting full stats.")
    # Collect all yaml results for a given parameter name:
    for regression_param in regression_params:
        param_name = regression_param['name']
        stats[param_name] = dict()
        for param_value in regression_param['values']:
            # Results live under <regression_tests_dir>/<param_name>/<param_value>.
            results_dir = os.path.join(regression_tests_dir, param_name, str(param_value))
            # Redirect to modified params_dir
            # NOTE(review): this overwrites the notebook-level params_dir and
            # is not read anywhere in this cell -- confirm it is still needed.
            params_dir = os.path.join(results_dir, 'params')
            stats[param_name][param_value] = dict()
            for dataset in datasets_to_run:
                dataset_name = dataset['name']
                pipelines_to_run = dataset['pipelines']
                stats[param_name][param_value][dataset_name] = dict()
                for pipeline in pipelines_to_run:
                    results_file = os.path.join(results_dir, dataset_name, pipeline, "results.yaml")
                    if os.path.isfile(results_file):
                        # Close each results file as soon as it is parsed
                        # instead of leaking one handle per run.
                        with open(results_file, 'r') as results_stream:
                            stats[param_name][param_value][dataset_name][pipeline] = yaml.load(
                                results_stream, Loader=yaml.Loader)
                    else:
                        log.warning("Could not find results file: {}. Adding cross to boxplot...".format(results_file))
                        # False marks a run with no results available.
                        stats[param_name][param_value][dataset_name][pipeline] = False

    # Save all stats in regression tests root directory for future usage.
    with open(full_stats_path, 'w') as outfile:
        outfile.write(yaml.dump(stats))

    # TODO: Push to the cloud?!

I0727 12:46:19.117278 15219 <ipython-input-119-5c1f4edd58c2>:5] Found existent stats. Opening full stats from:/home/tonirv/Code/spark_vio_evaluation/regression_tests/all_stats.yaml


In [37]:
# Display plots for that result
# pprint.pprint(stats)

In [62]:
import plotly.graph_objects as go

def plot(param_name, dataset_name, x_data, y_data):
    """Display one box per (label, samples) pair for a parameter and dataset.

    Args:
        param_name: Name of the regression parameter; shown in the title.
        dataset_name: Name of the dataset; shown in the title.
        x_data: Iterable of box labels, one per box.
        y_data: Iterable of sample collections, paired with `x_data` by
            position.

    Returns:
        None. The figure is rendered with `fig.show()`.
    """
    fig = go.Figure()

    # zip() stops at the shorter input, so unpaired labels/samples are ignored.
    for xd, yd in zip(x_data, y_data):
        fig.add_trace(go.Box(
            y=yd,
            name=xd,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker_size=2,
            line_width=1)
        )

    fig.update_layout(
        # format() also accepts non-string names, unlike '+' concatenation.
        title='Parameter: {}, dataset: {}'.format(param_name, dataset_name),
        yaxis=dict(
            autorange=True,
            showgrid=True,
            zeroline=True,
            dtick=5,
            gridcolor='rgb(255, 255, 255)',
            gridwidth=1,
            zerolinecolor='rgb(255, 255, 255)',
            zerolinewidth=2,
        ),
        margin=dict(
            l=40,
            r=30,
            b=80,
            t=100,
        ),
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        showlegend=False
    )

    fig.show()
    
def plot2():
    """Display a demo grouped box plot with three traces over two days.

    Every trace shares the same twelve x labels (six 'day 1' followed by six
    'day 2'), and `boxmode='group'` places the traces side by side per label.
    Returns None; the figure is rendered with `fig.show()`.
    """
    day_labels = ['day 1'] * 6 + ['day 2'] * 6

    # (trace name, marker colour, y samples), in the order they are added.
    series = [
        ('kale', '#3D9970',
         [0.2, 0.2, 0.6, 1.0, 0.5, 0.4, 0.2, 0.7, 0.9, 0.1, 0.5, 0.3]),
        ('radishes', '#FF4136',
         [0.6, 0.7, 0.3, 0.6, 0.0, 0.5, 0.7, 0.9, 0.5, 0.8, 0.7, 0.2]),
        ('carrots', '#FF851B',
         [0.1, 0.3, 0.1, 0.9, 0.6, 0.6, 0.9, 1.0, 0.3, 0.6, 0.8, 0.5]),
    ]

    fig = go.Figure()
    for trace_name, trace_color, samples in series:
        box = go.Box(
            y=samples,
            x=day_labels,
            name=trace_name,
            marker_color=trace_color
        )
        fig.add_trace(box)

    # Group together the boxes of the different traces for each value of x.
    fig.update_layout(yaxis_title='normalized moisture', boxmode='group')
    fig.show()

In [64]:
# Render the grouped box-plot demo.
plot2()


In [124]:
# Organize stats as for Dataset, Param Name, Param values (x_data)
# TODO use tidy panda... for storing errors

# for each dataset and param_name -> one plot
# for each param_value -> one x_axis
# for each pipeline -> one boxplot
# for each dataset, param_name, param_value and pipeline -> errors


# Flatten the nested stats dictionary
# (param name -> param value -> dataset -> pipeline) into `stats_list`, one
# record per run, while tracking the parameter values seen per parameter.
x_data = dict()
y_data = dict()
stats_list = []
for param_name, stats_per_value in stats.items():
    x_data[param_name] = []
    y_data[param_name] = dict()
    for param_value, stats_per_dataset in stats_per_value.items():
        x_data[param_name].append(param_value)
        y_data[param_name][param_value] = dict()
        for dataset_name, stats_per_pipeline in stats_per_dataset.items():
            y_data[param_name][param_value][dataset_name] = dict()
            for pipeline, pipeline_stats in stats_per_pipeline.items():
                # Runs whose results file was missing are stored as False when
                # the stats are collected; indexing them would raise, so skip.
                if not pipeline_stats:
                    log.warning("No results for param {}={}, dataset {}, pipeline {}. Skipping.".format(
                        param_name, param_value, dataset_name, pipeline))
                    continue
                result = pipeline_stats['absolute_errors']
                stats_list.append([param_name, param_value, dataset_name, pipeline, result.np_arrays['error_array']])

#plot(param_name, dataset_name, x_data, y_data)

In [147]:
# Build a table with one row per (parameter, value, dataset, pipeline) run,
# indexed by those four keys and holding the error array in 'ATE errors'.
index_columns = ['Param Name', 'Param Value', 'Dataset Name', 'Pipe Type']
# Passing the column names at construction also works when stats_list is
# empty, where assigning df.columns afterwards would raise a length mismatch.
df = pd.DataFrame.from_records(stats_list, columns=index_columns + ['ATE errors'])
df = df.set_index(index_columns)
# The call form of print works on both Python 2 and Python 3.
print(df)

                                                                                           ATE errors
Param Name      Param Value Dataset Name Pipe Type                                                   
smartNoiseSigma 2.0         V1_01_easy   S          [0.10997670458005289, 0.10517572913663793, 0.1...
                                         SPR        [0.13134755458566505, 0.1262207963699808, 0.12...
                            MH_01_easy   S          [0.2151444667711187, 0.2157468309814195, 0.216...
                                         SPR        [0.27152230985699444, 0.2716027138288078, 0.27...
                3.0         V1_01_easy   S          [0.10524378348464582, 0.10130184371967772, 0.0...
                                         SPR        [0.142529469669964, 0.13837671696192647, 0.134...
                            MH_01_easy   S          [0.2621647134833297, 0.26193382036350643, 0.26...
                                         SPR        [0.22163350964595124, 0.221520