In [10]:
import yaml
import os
import logging as log
import pprint
import plotly.express as px
import pandas as pd

In [11]:
# Parse the experiment description: directories, datasets to run and the
# regression parameters (each with a name and the list of values to sweep).
experiments_path = "../experiments/regression_test.yaml"

# NOTE(review): yaml.Loader is able to construct arbitrary Python objects and
# must only be used on trusted files. If this file is plain YAML, prefer
# yaml.safe_load. The Loader is passed explicitly because recent PyYAML
# releases refuse to run yaml.load without one.
with open(experiments_path) as experiments_file:
    experiment_params = yaml.load(experiments_file, Loader=yaml.Loader)

# Directories may contain environment variables (e.g. $HOME); expand them.
regression_tests_dir = os.path.expandvars(experiment_params['regression_tests_dir'])
params_dir = os.path.expandvars(experiment_params['params_dir'])
dataset_dir = os.path.expandvars(experiment_params['dataset_dir'])
executable_path = os.path.expandvars(experiment_params['executable_path'])

datasets_to_run = experiment_params['datasets_to_run']
regression_params = experiment_params['regression_parameters']

# Build dictionary from parameter name to list of parameter values
param_name_to_values = {
    regression_param['name']: regression_param['values']
    for regression_param in regression_params
}

In [12]:
# Retrieve the aggregated stats. If the cached file does not exist yet, collect
# the per-run results and cache them for future usage.
#
# Resulting layout: stats[param_name][param_value][dataset_name][pipeline]
# holds the parsed results.yaml of that run, or False when the file is missing.
#
# NOTE(review): yaml.Loader can construct arbitrary Python objects; it is used
# here because downstream cells access attributes on the loaded results, so the
# files appear to contain tagged Python objects. Only load trusted result files.
full_stats_path = os.path.join(regression_tests_dir, "all_stats.yaml")
stats = dict()
if os.path.isfile(full_stats_path):
    log.info("Found existent stats. Opening full stats from:" + full_stats_path)
    with open(full_stats_path) as stats_file:
        stats = yaml.load(stats_file, Loader=yaml.Loader)
else:
    log.info("Collecting full stats.")
    # Collect all yaml results for a given parameter name:
    for regression_param in regression_params:
        param_name = regression_param['name']
        stats[param_name] = dict()
        for param_value in regression_param['values']:
            # Results of one run live in <regression_tests_dir>/<param_name>/<param_value>/
            results_dir = os.path.join(regression_tests_dir, param_name, str(param_value))
            stats[param_name][param_value] = dict()
            for dataset in datasets_to_run:
                dataset_name = dataset['name']
                pipelines_to_run = dataset['pipelines']
                stats[param_name][param_value][dataset_name] = dict()
                for pipeline in pipelines_to_run:
                    results_file = os.path.join(results_dir, dataset_name, pipeline, "results.yaml")
                    if os.path.isfile(results_file):
                        with open(results_file, 'r') as results_stream:
                            stats[param_name][param_value][dataset_name][pipeline] = yaml.load(
                                results_stream, Loader=yaml.Loader)
                    else:
                        log.warning("Could not find results file: {}. Adding cross to boxplot...".format(results_file))
                        # False marks a missing run; consumers must skip these entries.
                        stats[param_name][param_value][dataset_name][pipeline] = False

    # Save all stats in regression tests root directory for future usage.
    # Serialize first so that a failing dump cannot leave behind a truncated
    # cache file that would be picked up on the next run.
    serialized_stats = yaml.dump(stats)
    with open(full_stats_path, 'w') as outfile:
        outfile.write(serialized_stats)

    # TODO: push the collected stats to the cloud?

In [13]:
# Debug helper: uncomment to dump the collected stats dictionary.
# pprint.pprint(stats)

In [14]:
import plotly.graph_objects as go

def plot(x_data, y_data, param_name=None, dataset_name=None):
    """Show one box trace per (name, samples) pair.

    Args:
        x_data: iterable of trace names, one per box.
        y_data: iterable of sample sequences, paired positionally with x_data.
        param_name: parameter name used in the figure title. When omitted, the
            notebook-level variable ``param_name`` is used (legacy behaviour).
        dataset_name: dataset name used in the figure title. When omitted, the
            notebook-level variable ``dataset_name`` is used (legacy behaviour).

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # working; passing the names explicitly avoids depending on leaked state.
    notebook_scope = globals()
    if param_name is None:
        param_name = notebook_scope.get('param_name', 'unknown')
    if dataset_name is None:
        dataset_name = notebook_scope.get('dataset_name', 'unknown')

    fig = go.Figure()

    for xd, yd in zip(x_data, y_data):
        fig.add_trace(go.Box(
            y=yd,
            name=xd,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker_size=2,
            line_width=1)
        )

    fig.update_layout(
        title='Parameter: {}, dataset: {}'.format(param_name, dataset_name),
        yaxis=dict(
            autorange=True,
            showgrid=True,
            zeroline=True,
            dtick=5,
            gridcolor='rgb(255, 255, 255)',
            gridwidth=1,
            zerolinecolor='rgb(255, 255, 255)',
            zerolinewidth=2,
        ),
        margin=dict(
            l=40,
            r=30,
            b=80,
            t=100,
        ),
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        showlegend=False
    )

    fig.show()
    
def plot2():
    """Show a demo grouped box plot built from hard-coded sample data.

    Three traces ('S', 'SP', 'SPR') are drawn, each with twelve samples split
    over two x groups (1 and 2). Takes no arguments and returns None; the
    figure is rendered with ``fig.show()``.
    """
    colors = ['#3D9970',  '#FF4136', '#FF851B']

    names = ['S', 'SP', 'SPR']

    # Group label of every sample; shared by all traces.
    x_data = [1, 1, 1, 1, 1, 1,
              2, 2, 2, 2, 2, 2]

    # One row of samples per trace, aligned with x_data.
    y_data = [[0.2, 0.2, 0.6, 1.0, 0.5, 0.4, 0.2, 0.7, 0.9, 0.1, 0.5, 0.3],
              [0.6, 0.7, 0.3, 0.6, 0.0, 0.5, 0.7, 0.9, 0.5, 0.8, 0.7, 0.2],
              [0.1, 0.3, 0.1, 0.9, 0.6, 0.6, 0.9, 1.0, 0.3, 0.6, 0.8, 0.5]]

    fig = go.Figure()
    # zip instead of xrange(len(...)): xrange does not exist on Python 3.
    for trace_name, trace_samples, trace_color in zip(names, y_data, colors):
        fig.add_trace(go.Box(
            y=trace_samples,
            x=x_data,
            name=trace_name,
            marker_color=trace_color
        ))

    fig.update_layout(
        yaxis_title='ATE errors',
        boxmode='group' # group together boxes of the different traces for each value of x
    )
    fig.show()

In [36]:
# Store stats in a tidy Pandas DataFrame: one row per
# (parameter, value, dataset, pipeline) run, with that run's error array in
# the 'ATE errors' column.
stats_list = []
for param_name in stats:
    for param_value in stats[param_name]:
        for dataset_name in stats[param_name][param_value]:
            for pipeline in stats[param_name][param_value][dataset_name]:
                pipeline_stats = stats[param_name][param_value][dataset_name][pipeline]
                if not pipeline_stats:
                    # Missing runs are stored as False by the collection step;
                    # indexing them would raise a TypeError, so skip them.
                    log.warning("No results for {}={} on {}/{}; skipping.".format(
                        param_name, param_value, dataset_name, pipeline))
                    continue
                result = pipeline_stats['absolute_errors'].np_arrays['error_array']
                stats_list.append([param_name, param_value, dataset_name, pipeline, result])

# Passing the column names at construction keeps this working when no run
# produced results (an empty frame with the expected columns).
df = pd.DataFrame.from_records(
    stats_list,
    columns=['Param Name', 'Param Value', 'Dataset Name', 'Pipe Type', 'ATE errors'])
df = df.set_index(['Param Name', 'Dataset Name'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Param Value,Pipe Type,ATE errors
Param Name,Dataset Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
smartNoiseSigma,V1_01_easy,2.0,S,"[0.10997670458005289, 0.10517572913663793, 0.1..."
smartNoiseSigma,V1_01_easy,2.0,SPR,"[0.13134755458566505, 0.1262207963699808, 0.12..."
smartNoiseSigma,MH_01_easy,2.0,S,"[0.2151444667711187, 0.2157468309814195, 0.216..."
smartNoiseSigma,MH_01_easy,2.0,SPR,"[0.27152230985699444, 0.2716027138288078, 0.27..."
smartNoiseSigma,V1_01_easy,3.0,S,"[0.10524378348464582, 0.10130184371967772, 0.0..."
smartNoiseSigma,V1_01_easy,3.0,SPR,"[0.142529469669964, 0.13837671696192647, 0.134..."
smartNoiseSigma,MH_01_easy,3.0,S,"[0.2621647134833297, 0.26193382036350643, 0.26..."
smartNoiseSigma,MH_01_easy,3.0,SPR,"[0.22163350964595124, 0.22152040800082398, 0.2..."
smartNoiseSigma,V1_01_easy,2.2,S,"[0.15221248678273705, 0.14793509054390908, 0.1..."
smartNoiseSigma,V1_01_easy,2.2,SPR,"[0.1256631606142888, 0.12203396205882884, 0.12..."


In [71]:
def boxplot(param_name, dataset_name, tidy):
    """Show grouped box plots of the ATE errors for one parameter and dataset.

    Args:
        param_name: name of the swept parameter; used as the x-axis column
            name and axis title.
        dataset_name: dataset name shown in the figure title.
        tidy: DataFrame with the columns 'Param Value', 'Pipe Type' and
            'ATE errors', where each 'ATE errors' cell holds a sequence of
            error samples. The frame is not modified.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Re-index a copy instead of mutating the caller's frame in place, so the
    # function can be called repeatedly on the same data.
    indexed = tidy.set_index(['Param Value', 'Pipe Type'])

    # Expand every error sequence into one row per sample (long format):
    # apply(pd.Series) spreads the samples over columns, stack() turns those
    # columns into a third index level, which is then dropped.
    tidy_2 = (
        indexed['ATE errors']
        .apply(lambda x: pd.Series(x))
        .stack()
        .reset_index(level=2, drop=True)
        .to_frame('ATE errors')
        .reset_index(level=['Pipe Type', 'Param Value'], drop=False)
        .rename(columns={'Param Value': param_name})
    )

    fig = px.box(tidy_2, x=param_name, y="ATE errors", points="all", color="Pipe Type")

    fig.update_layout(
        title_text="Dataset: " + dataset_name,
        xaxis_title=param_name,
        yaxis_title="ATE [m]",
    )

    fig.show()
# Draw one figure per (parameter, dataset) pair that actually has rows in df.
# Iterating over the product of the index levels would also visit pairs with no
# results and fail with a KeyError. Sorting keeps the plots in level order.
# .loc[[y]] (list selector) always yields a DataFrame, even for a single row.
result = [boxplot(x, y, df.loc[x].loc[[y]]) for x, y in sorted(set(df.index))]

In [67]:
# Inspect the long-format frame for a single (parameter, dataset) slice.
# `tidy` is built explicitly from the first slice of df so that this cell does
# not depend on a variable left over from an earlier kernel session.
example_param, example_dataset = df.index[0]
tidy = df.loc[example_param].loc[[example_dataset]].set_index(['Param Value', 'Pipe Type'])

# One row per error sample: spread the samples over columns, stack them into a
# third index level, then drop that level.
tidy_2 = tidy['ATE errors'].apply(lambda x: pd.Series(x)).stack().reset_index(level=2, drop=True).to_frame('ATE errors')
tidy_2 = tidy_2.reset_index(level=['Pipe Type', 'Param Value'], drop=False)
tidy_2.head(10)

Unnamed: 0,Param Value,Pipe Type,ATE errors
0,2.0,S,0.109977
1,2.0,S,0.105176
2,2.0,S,0.105346
3,2.0,S,0.104481
4,2.0,S,0.099550
5,2.0,S,0.097792
6,2.0,S,0.097396
7,2.0,S,0.097285
8,2.0,S,0.094557
9,2.0,S,0.091342


In [66]:
# Grouped box plot of the long-format errors held in tidy_2: one group per
# parameter value, one colour per pipeline type, every sample drawn as a point.
fig = px.box(
    tidy_2,
    x='Param Value',
    y="ATE errors",
    color="Pipe Type",
    points="all",
)
fig.show()