In [324]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [325]:
ALE_COLOR = 'rgba(30,144,255,1)'
PD_COLOR = 'rgba(220,20,60,1)'
ALE_COLOR_LIGHT = 'rgba(30,144,255,0.2)'
PD_COLOR_LIGHT = 'rgba(220,20,60,0.2)'

In [326]:
CSV_PATH = "results_final/all_rows.csv"
METRIC_NAMES = {
    "rho": "Spearman rho",
    "l1": "L1",
    "l2": "L2",
    "max_diff": "maximum distance (Linf)"    
}

In [327]:
df = pd.read_csv(CSV_PATH).drop(columns=['Unnamed: 0']).drop_duplicates()
df.head()

Unnamed: 0,name,path,variable,size,seed,lr,iter,constrain,dist_weight,ale_l2,ale_l1,ale_max_diff,ale_rho,pd_l2,pd_l1,pd_max_diff,pd_rho
0,heart,results_final/heart/age_512_0_gradient_0.1_50_...,age,512,0,0.1,50,False,0.01,0.0256,0.0256,0.067292,-0.859558,0.038559,0.038559,0.078188,-0.994805
1,heart,results_final/heart/age_256_0_gradient_0.1_50_...,age,256,0,0.1,50,False,0.01,0.020159,0.020159,0.043379,-0.842653,0.041103,0.041103,0.104379,-0.885714
2,heart,results_final/heart/age_128_0_gradient_0.1_50_...,age,128,0,0.1,50,False,0.01,0.018468,0.018468,0.044442,-0.43433,0.025585,0.025585,0.052349,-0.984416
3,heart,results_final/heart/age_64_0_gradient_0.1_50_0...,age,64,0,0.1,50,False,0.01,0.011359,0.011359,0.023311,-0.304291,0.011301,0.011301,0.028792,-0.976623
5,heart,results_final/heart/age_32_0_gradient_0.1_50_0...,age,32,0,0.1,50,False,0.01,0.024567,0.024567,0.087358,-0.890767,0.033288,0.033288,0.084878,-0.942857


In [328]:
df = df[df["size"].isin([32, 64, 128])]
df = df[df.dist_weight.isin([0, 0.0001, 0.001, 0.01, 0.1, 1])]

In [329]:
def split_rows(df):    
    orig_len = len(df)
    for metric in ("l1", "l2", "max_diff", "rho"):
        df[metric] = df[[f"ale_{metric}", f"pd_{metric}"]].values.tolist()
        df = df.explode(metric)

    cond_ale = (df.l1 == df.ale_l1) & (df.l2 == df.ale_l2) & (df.ale_max_diff == df.max_diff) & (df.ale_rho == df.rho) 
    cond_pd = (df.l1 == df.pd_l1) & (df.l2 == df.pd_l2) & (df.pd_max_diff == df.max_diff) & (df.pd_rho == df.rho) 

    df = df[cond_ale | cond_pd]
    # assert len(df) == 2 * orig_len, f"{len(df)}, {2 *orig_len}"

    cond_ale = (df.l1 == df.ale_l1) & (df.l2 == df.ale_l2) & (df.ale_max_diff == df.max_diff) & (df.ale_rho == df.rho) 
    cond_pd = (df.l1 == df.pd_l1) & (df.l2 == df.pd_l2) & (df.pd_max_diff == df.max_diff) & (df.pd_rho == df.rho)
    df.loc[cond_ale, "explanation"] = "ale"
    df.loc[cond_pd, "explanation"] = "pd"

    return df

In [330]:
df = split_rows(df)

In [331]:
def boxplots(variable, metric="rho", df=df):
    df["split variable"] = f"{variable} " + df[variable].astype(str)
    df = df.sort_values(by=["name", "variable", variable])
    groups = df.groupby(["name", "variable"])
    for (n, v), group in groups:
        group.variable = group.variable.astype(str)
        fig = px.box(group, x="split variable", y=metric, color="explanation", title=f"{n}, variable {v}, split by {variable}", width=800)
        fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
        fig.update_layout(yaxis_title=METRIC_NAMES[metric])
        fig.show()

In [332]:
boxplots("size")

In [333]:
boxplots("size", df=df[df.dist_weight == 0])

In [334]:
boxplots("size", df=df[df.dist_weight == 0], metric="l2")

In [335]:
boxplots("dist_weight")

In [336]:
boxplots("dist_weight", "l2")

In [337]:
boxplots("size", "l2")

In [338]:
def boxplots_all_datasets(metric="rho", df=df):
    df["dataset"] = df["name"] + ", variable " + df["variable"]
    df = df.sort_values(by=["name", "variable"])
    fig = px.box(df, x="dataset", y=metric, color="explanation", title=f"{METRIC_NAMES[metric]} for various datasets", hover_data=["path"], width=800)
    fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
    fig.update_layout(yaxis_title=METRIC_NAMES[metric])
    fig.show()

In [339]:
boxplots_all_datasets(df = df[(df.dist_weight == 0) & (df["size"] == 64) & (df.iter == 50)], metric="l2")

In [340]:
df.columns

Index(['name', 'path', 'variable', 'size', 'seed', 'lr', 'iter', 'constrain',
       'dist_weight', 'ale_l2', 'ale_l1', 'ale_max_diff', 'ale_rho', 'pd_l2',
       'pd_l1', 'pd_max_diff', 'pd_rho', 'l1', 'l2', 'max_diff', 'rho',
       'explanation', 'split variable'],
      dtype='object')

In [341]:
df.dist_weight.describe()

count    1858.00000
mean        0.22603
std         0.39430
min         0.00000
25%         0.00100
50%         0.01000
75%         0.10000
max         1.00000
Name: dist_weight, dtype: float64

In [347]:
def boxplots_by_weight(metric="rho", df=df):
    df["weight"] = "dist weight " + df.dist_weight.astype(str)
    df = df.sort_values(by=["dist_weight", "name", "variable"])
    fig = px.box(df, x="weight", y=metric, color="explanation", title=f"{METRIC_NAMES[metric]} for various distribution loss weights", width=800)
    fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
    fig.update_layout(yaxis_title=METRIC_NAMES[metric])
    fig.show()

In [348]:
boxplots_by_weight(df=df)

In [367]:
gr = df[(df["variable"] != "Age") & (df.dist_weight != 0.0001)].groupby(["dist_weight", "name", "variable"]).mean().reset_index()
gr = gr.sort_values(by="dist_weight")
gr["weight"] = gr.dist_weight.astype(str)
fig = px.line(gr, x="weight", y="pd_rho",  color="name", title="Average Spearman rho for PD per dataset", width=800)
fig.update_layout(yaxis_title="Spearman rho", xaxis_title="Distribution distance loss weight")

fig.show()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [366]:
gr = df[(df["variable"] != "Age") & (df.dist_weight != 0.0001)].groupby(["dist_weight", "name", "variable"]).mean().reset_index()
gr = gr.sort_values(by="dist_weight")
gr["weight"] = gr.dist_weight.astype(str)
fig = px.line(gr, x="weight", y="ale_rho",  color="name", title="Average Spearman rho per dataset", width=800)
fig.update_layout(yaxis_title="Spearman rho", xaxis_title="Distribution distance loss weight")

fig.show()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

