# Visualizations in Plotly for the DMP Aspect Test Experiment

In [None]:
%matplotlib notebook

In [None]:
import sys
sys.path.append("../../")

from dmp.data.logging import _get_sql_engine
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
from joblib import Memory
from ipywidgets import interact, interact_manual


In [None]:
# Database engine returned by the project's helper
# (presumably a SQLAlchemy engine -- confirm in dmp.data.logging).
db = _get_sql_engine()

# joblib cache stored under ./cache; verbose=0 keeps joblib quiet.
memory = Memory(location="./cache", verbose=0)

def query(query_string):
    """Run ``query_string`` on a fresh connection from ``db`` and return a DataFrame.

    The connection is opened for this call only and released when the
    ``with`` block exits.
    """
    with db.connect() as connection:
        return pd.read_sql(query_string, connection)

# Memoized variant of query(): wrapped by the joblib ``memory`` cache.
cached_query = memory.cache(query)

def clear_cache():
    """Clear the joblib cache held by ``memory`` (calls ``memory.clear()``)."""
    memory.clear()
    
# Choices passed to the interact decorators as keyword arguments; each list
# is a set of selectable values for the app parameter of the same name.
options = {
    "dataset": [
        "529_pollen",
        "sleep",
        "adult",
        "537_houses",
        "nursery",
        "201_pol",
        "mnist",
        "connect_4",
        "wine_quality_white",
    ],
    "agg": ["avg", "min", "max"],
    "topology": ["exponential", "rectangle", "trapezoid", "wide_first"],
    # Names of the history_* columns that the apps unnest.
    "loss": [
        "history_loss",
        "history_hinge",
        "history_accuracy",
        "history_val_loss",
        "history_val_hinge",
        "history_val_accuracy",
        "history_squared_hinge",
        "history_cosine_similarity",
        "history_val_squared_hinge",
        "history_mean_squared_error",
        "history_mean_absolute_error",
        "history_val_cosine_similarity",
        "history_val_mean_squared_error",
        "history_root_mean_squared_error",
        "history_val_mean_absolute_error",
        "history_kullback_leibler_divergence",
        "history_val_root_mean_squared_error",
        "history_mean_squared_logarithmic_error",
        "history_val_kullback_leibler_divergence",
        "history_val_mean_squared_logarithmic_error",
    ],
    "residual_mode": ["none", "full"],
}

In [None]:
# Empty the joblib cache so later cached_query() calls are recomputed.
clear_cache()

In [None]:
### Heatmap app: aggregated loss per (budget, epoch), as imshow or scatter

@interact_manual(**options, viz=["imshow", "scatter"])
def heatmap_app(
    groups="('exp00', 'exp01')",
    dataset="537_houses",
    topology="wide_first",
    loss="history_val_mean_squared_error",
    agg="avg",
    residual_mode="none",
    viz="imshow",
):
    """Plot an aggregated loss history over budget and epoch.

    The query unnests the ``loss`` history array of every matching row,
    numbering its elements as ``epoch``, and applies ``agg`` per
    (epoch, budget) pair.

    Args:
        groups: SQL tuple literal placed after ``"groupname" IN``.
        dataset, topology, residual_mode: equality filters on the
            matching ``config.*`` columns.
        loss: name of the history column to unnest.
        agg: SQL aggregate function applied to the unnested values.
        viz: ``"imshow"`` or ``"scatter"``.

    Returns:
        A plotly figure, or ``None`` when ``viz`` is neither choice.
    """
    query_string = f'''
select "config.budget", {agg}(a.val) as value, count(a.val), a.epoch
from
    materialized_experiments_0 t,
    unnest(t.{loss}) WITH ORDINALITY as a(val, epoch)
WHERE
    "groupname" IN {groups} and
    "config.dataset"='{dataset}' and
    "config.topology"='{topology}' and
    "config.residual_mode"='{residual_mode}'
GROUP BY epoch, "config.budget"
'''
    results = cached_query(query_string)

    if viz == "imshow":
        # Optional filter for sparsely populated cells: results.query("count > 389")
        # The image shows the natural log of the aggregated value.
        results["value"] = np.log(results["value"])
        grid = results.pivot_table(index="config.budget", columns="epoch", values="value")
        # String labels stop plotly from treating the budget axis as numeric.
        grid.index = grid.index.astype("str")
        return px.imshow(grid)

    if viz == "scatter":
        # Sort while the columns are still numeric, then turn them into labels.
        ordered = results.sort_values(["epoch", "config.budget"], ascending=[True, False])
        ordered["config.budget"] = ordered["config.budget"].astype("str")
        ordered["epoch"] = ordered["epoch"].astype("str")
        return px.scatter(ordered, x="epoch", y="config.budget", size="count", color="value")

In [None]:
### Heatmap app with a selectable y dimension (imshow or scatter)

axis = ["dataset", "topology", "residual_mode", "budget"]

@interact_manual(**options, viz=["imshow", "scatter"], y_dim=axis)
def heatmap_app(groups="('exp00', 'exp01')", x_dim="epoch", y_dim="budget", budget="64", dataset="537_houses", topology="wide_first", loss="history_val_mean_squared_error", agg="avg", residual_mode="none", viz="imshow"):
    """Plot an aggregated loss history with a configurable y dimension.

    One of the config columns (``y_dim``) becomes the y axis; the other
    three are used as equality filters in the WHERE clause.

    Args:
        groups: SQL tuple literal placed after ``"groupname" IN``.
        x_dim: column used for the x axis and in GROUP BY. The SELECT list
            only returns ``a.epoch``, so ``"epoch"`` is the value this
            query is written for.
        y_dim: one of ``dataset``, ``topology``, ``residual_mode``,
            ``budget``; selected as ``"config.<y_dim>"``.
        budget, dataset, topology, residual_mode: filter values; the one
            named by ``y_dim`` is ignored.
        loss: name of the history column to unnest.
        agg: SQL aggregate function applied to the unnested values.
        viz: ``"imshow"`` or ``"scatter"``.

    Returns:
        A plotly figure, or ``None`` when ``viz`` is neither choice.
    """
    # Explicit mapping instead of locals(): same filter order as the
    # module-level ``axis`` list, without depending on local variable names.
    selected = {
        "dataset": dataset,
        "topology": topology,
        "residual_mode": residual_mode,
        "budget": budget,
    }
    filter_string = " and ".join(
        f'''"config.{name}"='{value}' '''
        for name, value in selected.items()
        if name != y_dim
    )
    query_string = f'''
select "config.{y_dim}", {agg}(a.val) as value, count(a.val), a.epoch
from
    materialized_experiments_0 t,
    unnest(t.{loss}) WITH ORDINALITY as a(val, epoch)
WHERE
    "groupname" IN {groups} and
    {filter_string}
GROUP BY {x_dim}, "config.{y_dim}"
'''
    df = cached_query(query_string)
    y_col = "config." + y_dim

    if viz == "imshow":
        #df = df.query("count > 389")
        img = df.pivot_table(columns=x_dim, index=y_col, values="value")
        img.index = img.index.astype("str") #plotly interprets the y axis as numeric otherwise
        return px.imshow(img)
    elif viz == "scatter":
        # Use the selected dimensions here too: the query only returns
        # "config.<y_dim>", so a hard-coded "config.budget" raised KeyError
        # for every other y_dim.
        df = df.sort_values([x_dim, y_col], ascending=[True, False])
        df[y_col] = df[y_col].astype("str")
        df[x_dim] = df[x_dim].astype("str")
        return px.scatter(df, x=x_dim, y=y_col, size="count", color="value")

In [None]:
### Single-run app: one loss curve per run

@interact(**options)
def heatmap_app(
    dataset="537_houses",
    topology="wide_first",
    loss="history_val_mean_squared_error",
    agg="avg",
    residual_mode="none",
    budget="256",
):
    """Draw the ``loss`` history of every matching run as its own line.

    Rows come from groups exp00 and exp01, filtered by dataset, topology,
    residual mode and budget. ``agg`` is accepted but not used by the query.

    Returns:
        A plotly line figure with epoch on x, the history value on y and
        one colour per ``run_name``.
    """
    query_string = f'''
select a.val as value, a.epoch, run_name
from
    materialized_experiments_0 t,
    unnest(t.{loss}) WITH ORDINALITY as a(val, epoch)
WHERE
    "groupname" IN ('exp00', 'exp01') and
    "config.dataset"='{dataset}' and
    "config.topology"='{topology}' and
    "config.residual_mode"='{residual_mode}' and
    "config.budget"='{budget}'
'''
    run_curves = cached_query(query_string)
    return px.line(run_curves, x="epoch", y="value", color="run_name")

##

- 537 houses, (val) kl divergence, trapezoid
- sleep, trapezoid, history_val_loss, 16777216

Notes:
- Logarithmic color map
- look into why sleep runs crash for high budgets
- plateau effect
- review stopping configurations / what is a reasonable number of iterations?

# Next Steps

## Look at the "plateau" feature from Charles's notebooks
## Add dataset, dataset complexity, as axes
## Add topology as an axis
### learning rate / optimizers / other axes to add

# To re-run using fixed epoch length, how long is reasonable?

In [None]:
### Epoch-count summary app: mean history length per budget with std error bars

@interact_manual(**options, viz=["imshow", "scatter"])
def heatmap_app(
    groups="('exp00', 'exp01')",
    dataset="537_houses",
    topology="wide_first",
    residual_mode="none",
    viz="imshow",
):
    """Plot the average length of ``history_loss`` against budget.

    The query returns avg/min/max/stddev of the array length per budget
    for the selected groups, dataset, topology and residual mode. Only
    ``avg`` and ``std`` are drawn. ``viz`` is accepted but not used.

    Returns:
        A plotly line figure with a logarithmic x axis.
    """
    query_string = f'''
select
    "config.budget",
    avg(array_length(t.history_loss, 1)) as avg,
    min(array_length(t.history_loss, 1)) as min,
    max(array_length(t.history_loss, 1)) as max,
    stddev(array_length(t.history_loss, 1)) as std
from
    materialized_experiments_0 t
WHERE
    "groupname" IN {groups} and
    "config.dataset"='{dataset}' and
    "config.topology"='{topology}' and
    "config.residual_mode"='{residual_mode}'
GROUP BY "config.budget"
'''
    summary = cached_query(query_string)
    # Sort by budget so the line is drawn in x order.
    summary = summary.sort_values("config.budget")
    return px.line(summary, x="config.budget", y="avg", error_y="std", log_x=True)

In [None]:
### Epoch count app: one point per run, grouped by budget

@interact_manual(**options, viz=["imshow", "scatter"])
def epoch_count_app(
    groups="('exp00', 'exp01')",
    dataset="537_houses",
    topology="wide_first",
    residual_mode="none",
    viz="imshow",
):
    """Scatter the length of ``history_loss`` of each matching row by budget.

    ``viz`` is accepted but not used.

    Returns:
        A plotly scatter figure titled with the dataset, topology and groups.
    """
    query_string = f'''
select
    "config.budget",
    array_length(t.history_loss, 1) as num_epochs
from
    materialized_experiments_0 t
WHERE
    "groupname" IN {groups} and
    "config.dataset"='{dataset}' and
    "config.topology"='{topology}' and
    "config.residual_mode"='{residual_mode}'
'''
    # Sort numerically first; the budget is then turned into a string label.
    epochs = cached_query(query_string).sort_values("config.budget")
    epochs["config.budget"] = epochs["config.budget"].astype("str")

    plot_title = f"Number of epochs for {dataset}, {topology} in {groups}"
    return px.scatter(epochs, x="config.budget", y="num_epochs", title=plot_title)

In [None]:
### Epoch count app: violin per budget, coloured by topology

@interact_manual(**options, viz=["imshow", "scatter"])
def epoch_count_app(
    groups="('exp00', 'exp01')",
    dataset="537_houses",
    topology="wide_first",
    residual_mode="none",
    viz="imshow",
):
    """Show the distribution of ``history_loss`` lengths per budget and topology.

    The query filters on groups, dataset and residual mode only, so every
    topology is returned. ``topology`` and ``viz`` are accepted but not used.

    Returns:
        A plotly violin figure with an inner box plot.
    """
    query_string = f'''
select
    "config.budget",
    "config.topology",
    array_length(t.history_loss, 1) as num_epochs
from
    materialized_experiments_0 t
WHERE
    "groupname" IN {groups} and
    "config.dataset"='{dataset}' and
    "config.residual_mode"='{residual_mode}'
'''
    epochs = cached_query(query_string)
    # Sort numerically first; the budget is then turned into a string label.
    epochs = epochs.sort_values("config.budget")
    epochs["config.budget"] = epochs["config.budget"].astype("str")

    plot_title = f"Number of epochs for {dataset}, in {groups}"
    return px.violin(
        epochs,
        x="config.budget",
        y="num_epochs",
        color="config.topology",
        box=True,
        title=plot_title,
    )

In [None]:
### Epoch count app: violin per budget, coloured by dataset

@interact_manual(**options, viz=["imshow", "scatter"])
def epoch_count_agg_app(
    groups="('exp00', 'exp01')",
    dataset="537_houses",
    topology="wide_first",
    residual_mode="none",
    viz="imshow",
):
    """Show the distribution of ``history_loss`` lengths per budget and dataset.

    The query filters on groups, topology and residual mode only, so every
    dataset is returned. ``dataset`` and ``viz`` are accepted but not used.

    Returns:
        A plotly violin figure with an inner box plot.
    """
    query_string = f'''
select
    "config.budget",
    "config.dataset",
    array_length(t.history_loss, 1) as num_epochs
from
    materialized_experiments_0 t
WHERE
    "groupname" IN {groups} and
    "config.topology"='{topology}' and
    "config.residual_mode"='{residual_mode}'
'''
    epochs = cached_query(query_string)
    # Sort numerically first; the budget is then turned into a string label.
    epochs = epochs.sort_values("config.budget")
    epochs["config.budget"] = epochs["config.budget"].astype("str")

    plot_title = f"Number of epochs for {topology}, in {groups}"
    return px.violin(
        epochs,
        x="config.budget",
        y="num_epochs",
        color="config.dataset",
        box=True,
        title=plot_title,
    )

In [None]:
### 90th-percentile history length for each (topology, residual_mode, dataset, budget)
# Plain string literal: the query has no placeholders to interpolate.
query_string = '''
select
    (percentile_disc(0.9) within group (order by array_length(t.history_loss, 1))) as epoch_90_pctile,
    "config.topology",
    "config.residual_mode",
    "config.dataset",
    "config.budget"
FROM
    materialized_experiments_0 t
WHERE
    "groupname" IN ('exp00','exp01')
GROUP BY
    "config.topology",
    "config.residual_mode",
    "config.dataset",
    "config.budget"
'''

df = cached_query(query_string)

# Last expression of the cell: shows the frame in the notebook.
df

In [None]:
### Max and 90th-percentile history length for each (topology, residual_mode, dataset, budget)
# Plain string literal: the query has no placeholders to interpolate.
query_string = '''
    select
        max(array_length(t.history_loss, 1)) as epoch_max,
        (percentile_disc(0.90) within group (order by array_length(t.history_loss, 1))) as epoch_90_pctile,
        "config.topology",
        "config.residual_mode",
        "config.dataset",
        "config.budget"
    FROM
        materialized_experiments_0 t
    WHERE
        "groupname" IN ('exp00','exp01')
    GROUP BY
        "config.topology",
        "config.residual_mode",
        "config.dataset",
        "config.budget"
'''

df = cached_query(query_string)

# Last expression of the cell: shows the frame in the notebook.
df

In [None]:
# Collapse the budget dimension: for each (topology, residual_mode, dataset)
# keep the column-wise maximum over all budgets, then drop the budget column.
df2 = (
    df.groupby(["config.topology", "config.residual_mode", "config.dataset"])
    .max()
    .drop(columns="config.budget")
)

df2.to_csv("epoch_90th_pctile.csv")
df2

In [None]:
# Write the current ``df`` (not the collapsed ``df2``) to CSV.
df.to_csv("epoch_90th_pctile_each_budget.csv")

In [None]:
## Now, we want to add dataset (or some statistic derived from the dataset's complexity) and topology as axes.

In [None]:
# Peek at ten rows from group exp04 and list the available columns.
# Note: this rebinds ``df``.
df = cached_query('''select * from materialized_experiments_0 WHERE "groupname" IN ('exp04') limit 10''')
display(df.columns)
df

In [None]:
# Data for a heatmap of validation loss for one topology and one dataset, over
# epoch and budget: average validation and training mean squared error per
# (epoch, budget) for 537_houses / wide_first / residual_mode 'none' in groups
# exp00 and exp01. The two history arrays are unnested side by side, so each
# row pairs the validation and training value of the same epoch.

q = '''
select "config.budget", avg(a.val) as val_loss, avg(a.loss) as train_loss, count(a.loss), a.epoch
from
    materialized_experiments_0 t,
    unnest(t.history_val_mean_squared_error, t.history_mean_squared_error) WITH ORDINALITY as a(val, loss, epoch)
WHERE
    "groupname" IN ('exp00', 'exp01') and
    "config.dataset"='537_houses' and
    "config.topology"='wide_first' and
    "config.residual_mode"='none'
GROUP BY epoch, "config.budget"
'''

# Alternative debugging query: distinct run names for a single budget.
# q = '''
# select distinct("name")
# from
#     materialized_experiments_0
# WHERE
#     "groupname" IN ('exp00', 'exp01') and
#     "config.dataset"='537_houses' and
#     "config.topology"='wide_first' and
#     "config.residual_mode"='none' and
#     "config.budget"=262144
# limit 10
# '''

# The memoized helper is named cached_query; the previous name cache_query
# was undefined and raised NameError.
df = cached_query(q)
display(df.columns)
df