In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as pg
import os
import matplotlib.pyplot as plt
import plotly.subplots as ps

In [None]:
# Root directory of the bottleneck experiment data. It defaults to the original
# location; set the BOTTLENECK_DATA_DIR environment variable to run the
# notebook against a copy stored elsewhere.
DATA_ROOT = Path(
    os.environ.get(
        "BOTTLENECK_DATA_DIR",
        "/data/toulouse/bicycle/notebooks/experiments/bottleneck/data",
    )
)
MODELS_PATH = DATA_ROOT / "models"
PLOTS_PATH = DATA_ROOT / "plots"
ANALYSIS_PATH = DATA_ROOT / "analysis"
# Entries of ANALYSIS_PATH that the profile-loading loops skip.
exclude = ["test_run_00013", "test_run_00014", "figures"]

## Summary profiles

In [None]:
# Load the per-run parameter table, ordered by run id with a fresh 0..n-1 index.
parameters = (
    pd.read_csv(ANALYSIS_PATH / "parameters.csv", index_col=0)
    .sort_values("run_id", ignore_index=True)
)

# Only the first 12 runs and the first 10 columns are used below.
ngenes_parameters = parameters.iloc[:12, :10]

# key_array holds the run ids in rows of three consecutive runs each
# (presumably the replicates of one configuration -- confirm against parameters.csv).
run_ids = ngenes_parameters["run_id"].to_numpy()
key_array = run_ids.reshape((len(run_ids) // 3, 3))

# From here on the table is looked up by run id.
ngenes_parameters = ngenes_parameters.set_index("run_id", drop=True)

print(key_array)

In [None]:

# Load the summarized training profile of every run listed in key_array.
# The loop variable is not called `dir`, which would shadow the builtin for
# the rest of the notebook session.
training_profiles = {}
for run_dir in ANALYSIS_PATH.iterdir():
    if run_dir.name in exclude or not run_dir.is_dir():
        continue
    # `in` on the numpy key_array tests membership across all of its entries.
    if run_dir.name in key_array:
        training_profiles[run_dir.name] = pd.read_csv(run_dir / "training_profile.csv")

In [None]:
# Plot the training time for the baseline graph: one bar per gene count,
# averaged over all runs that share that gene count.
# The aggregator below is not applied: restricting the aggregation to
# training_time would drop the n_epochs / batch_size columns used for colour
# and hover.
aggregator = {"training_time": "mean"}
data = ngenes_parameters.groupby("data_n_genes").agg("mean").reset_index()
# NOTE(review): the y-axis label says "per epoch" and the title "per batch",
# but the normalisations below are commented out, so the bars show the mean
# total training_time -- confirm which one is intended.
#data["training_time"]/=60
#data["training_time"] /=data["n_epochs"]
fig = px.bar(
    data,
    x="data_n_genes",
    y="training_time",
    labels={
        "training_time": "Mean time per epoch [s]",
        "data_n_genes": "Number of genes",
    },
    title="Training time per batch with different data sizes",
    color="n_epochs",
    hover_data=["data_n_genes", "n_epochs", "batch_size"],
    color_continuous_scale="reds",
)

# Make sure the export directory exists so the cell also works on a fresh
# analysis folder.
figures_dir = ANALYSIS_PATH / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)
fig.write_image(str(figures_dir / "Baseline_training_time.pdf"), scale=6)
fig.show()


In [None]:
# Combine the runs in each row of key_array into per-function mean and
# standard deviation of the summarized profile metrics.
metrics = ["Call_num", "Primitive_Call_num", "Time"]
processed_profiles = dict()
for keys in key_array:
    # The metrics are stacked row by row, so every run of the group must list
    # the same Class_Function entries in the same order. (The previous
    # `assert [...]` on a non-empty list was always true and checked nothing.)
    reference = training_profiles[keys[0]]["Class_Function"]
    misaligned = [
        key for key in keys
        if not reference.equals(training_profiles[key]["Class_Function"])
    ]
    assert not misaligned, f"Dataframes not aligned: {misaligned}"

    # Shape (functions, metrics, runs); reduce over the run axis.
    array = np.stack([training_profiles[key][metrics].to_numpy() for key in keys], axis=2)
    mean = np.mean(array, axis=2)
    std = np.std(array, axis=2)
    results = pd.DataFrame(
        np.concatenate([mean, std], axis=1),
        columns=[f"{r}_{m}" for r in ["mean", "std"] for m in metrics],
    )
    # Keep the descriptive columns of the first run and append the statistics;
    # the group is stored under the first run id.
    df = pd.concat([training_profiles[keys[0]].drop(columns=metrics), results], axis=1)
    processed_profiles[keys[0]] = df

ngenes_parameters

In [None]:
# For every run group and metric, export a bar chart of the 10 functions with
# the highest mean value and record their names.
condition = "not is_callback"

# Create the export directory once, up front; parents=True also covers a
# missing "figures" directory, on which a bare mkdir() would fail.
summarized_dir = ANALYSIS_PATH.joinpath("figures", "summarized")
summarized_dir.mkdir(parents=True, exist_ok=True)

# One row per (run group, metric); columns 0..9 are the rank within the top 10.
top_functions = pd.DataFrame(columns=np.arange(10))
top_classes = pd.DataFrame(columns=np.arange(10))
for n, df in processed_profiles.items():
    for metric in metrics:
        title = f"Summarized_mean_{metric}_{str(ngenes_parameters.loc[n, 'data_n_genes'])}genes_{condition}"
        data = df.query(condition).sort_values("mean_"+metric, ignore_index=True, ascending = False).iloc[:10]
        top_functions.loc[len(top_functions)] = data["Function"]
        # Index by the table's own length (it previously used
        # len(top_functions), which shifted the row labels by one).
        top_classes.loc[len(top_classes)] = data["Class_Function"]
        fig = px.bar(
            data,
            x="Function",
            y="mean_"+metric,
            error_y="std_"+metric,
            title=title,
            text="Class_Function",
        )
        fig.write_image(summarized_dir / f"{title}.pdf", scale=6)

In [None]:
# Collapse the top-10 tables into the sets of distinct names they contain.
# Both names are rebound from DataFrames to sets here, so this cell can only
# run once after the cell that builds the tables.
top_functions = set(top_functions.to_numpy().ravel())
top_classes = set(top_classes.to_numpy().ravel())
print(top_functions, "\n", top_classes)

In [None]:
# Display the parameter table of the selected runs (indexed by run_id).
ngenes_parameters

## Full profiles

In [None]:
def _first(values):
    """Return the first element (by position) of a grouped column."""
    return values.iloc[0]


# Aggregation spec for collapsing duplicate rows of a full profile:
# descriptive columns keep their first value, counts and times are summed,
# per-call values are averaged.
aggregator = {
    **dict.fromkeys(
        [
            "Class",
            "Function",
            "Class_Function_etc",
            "Class_Function",
            "Summary_index",
            "filename_lineno(function)",
            "is_callback",
        ],
        _first,
    ),
    **dict.fromkeys(
        ["Call_num", "Primitive_Call_num", "Time", "ncalls", "tot_time"],
        "sum",
    ),
    "tot_percall": "mean",
    "cum_time": "sum",
    "cum_percall": "mean",
}

In [None]:
# Load the full training profile of every run directory, collapse duplicate
# Class_Function_etc rows and add boolean helper columns.
# NOTE: `aggregator` must be the dict defined in the cell directly above;
# later cells rebind the same name, so re-running this cell out of order would
# aggregate with the wrong spec.
full_profiles = {}
for run_dir in ANALYSIS_PATH.iterdir():  # not named `dir`: avoids shadowing the builtin
    if run_dir.name in exclude or not run_dir.is_dir():
        continue
    profile = pd.read_csv(run_dir / "full_training_profile.csv").drop(columns=["Rank"])
    profile = (
        profile.groupby(["Class_Function_etc"], as_index=False)
        .agg(aggregator)
        .reset_index(drop=True)
    )
    # top_functions is expected to be the set of names built earlier.
    profile["is_target"] = profile["Function"].apply(lambda x: x in top_functions)
    # Case-insensitive substring flags on the Class_Function_etc entry.
    location = profile["Class_Function_etc"].astype(str).str.casefold()
    profile["in_model"] = location.str.contains("model.py", regex=False)
    profile["in_bicycle"] = location.str.contains("bicycle", regex=False)
    # Sort by source location and renumber rows 0..n-1.
    profile = profile.sort_values("filename_lineno(function)").reset_index(drop=True)
    full_profiles[run_dir.name] = profile

In [None]:
# Restrict every run's full profile to the functions that also occur in the
# other two runs of its group, so the three frames of a group can be compared
# row by row.
filtered_full_profiles = {}
for keys in key_array:
    for key in keys:
        other = [k for k in keys if k != key]
        first = full_profiles[other[0]]["Class_Function_etc"]
        second = full_profiles[other[1]]["Class_Function_etc"]
        profile = full_profiles[key]

        # assign() returns a new frame, so the frames stored in full_profiles
        # are no longer modified in place. The "first"/"second" flag columns
        # are still part of the filtered result.
        flagged = profile.assign(
            first=profile["Class_Function_etc"].isin(first),
            second=profile["Class_Function_etc"].isin(second),
        )
        shared = flagged[flagged["first"] & flagged["second"]]

        filtered_full_profiles[key] = shared.reset_index(drop=True)

In [None]:
# Verify that the filtered profiles of each group list the same
# Class_Function_etc entries in the same order.
for keys in key_array:
    print(keys, "\n", [filtered_full_profiles[key].index for key in keys])
    # The previous `assert [...]` on a non-empty list was always true; compare
    # the key column of every run against the first run of the group instead.
    reference = filtered_full_profiles[keys[0]]["Class_Function_etc"]
    misaligned = [
        n for n in keys
        if not reference.equals(filtered_full_profiles[n]["Class_Function_etc"])
    ]
    assert not misaligned, f"Dataframes not aligned: {misaligned}"

In [None]:
# Combine the filtered full profiles of each run group into per-function mean
# and standard deviation of the profile metrics.
metrics = ["ncalls","tot_time","tot_percall","cum_time","cum_percall"]
processed_full_profiles = dict()
for keys in key_array:
    print(keys)
    # Rows are combined by position, so the source locations must match row
    # for row. (The previous `assert [...]` on a non-empty list never failed.)
    reference = filtered_full_profiles[keys[0]]["filename_lineno(function)"]
    misaligned = [
        key for key in keys
        if not reference.equals(filtered_full_profiles[key]["filename_lineno(function)"])
    ]
    assert not misaligned, f"Dataframes not aligned: {misaligned}"

    # Shape (functions, metrics, runs); reduce over the run axis.
    array = np.stack([filtered_full_profiles[key][metrics].to_numpy() for key in keys], axis=2)
    mean = np.mean(array, axis=2)
    std = np.std(array, axis=2)
    results = pd.DataFrame(
        np.concatenate([mean, std], axis=1),
        columns=[f"{r}_{m}" for r in ["mean", "std"] for m in metrics],
    )
    # Descriptive columns come from the first run; the group is stored under
    # the first run id.
    df = pd.concat([filtered_full_profiles[keys[0]].drop(columns=metrics), results], axis=1)
    processed_full_profiles[keys[0]] = df
 

In [None]:
# Display the complete parameter table (all runs, sorted by run_id).
parameters

### Figures:

In [None]:
# Named colours available for the figures, kept in the original order.
colors = [
    "aliceblue", "antiquewhite", "aqua", "aquamarine", "azure",
    "beige", "bisque", "black", "blanchedalmond", "blue",
    "blueviolet", "brown", "burlywood", "cadetblue",
    "chartreuse", "chocolate", "coral", "cornflowerblue",
    "cornsilk", "crimson", "cyan", "darkblue", "darkcyan",
    "darkgoldenrod", "darkgray", "darkgrey", "darkgreen",
    "darkkhaki", "darkmagenta", "darkolivegreen", "darkorange",
    "darkorchid", "darkred", "darksalmon", "darkseagreen",
    "darkslateblue", "darkslategray", "darkslategrey", "darkturquoise",
    "darkviolet", "deeppink", "deepskyblue", "dimgray", "dimgrey",
    "dodgerblue", "firebrick", "floralwhite", "forestgreen",
    "fuchsia", "gainsboro", "ghostwhite", "gold", "goldenrod",
    "gray", "grey", "green", "greenyellow", "honeydew", "hotpink",
    "indianred", "indigo", "ivory", "khaki", "lavender", "lavenderblush",
    "lawngreen", "lemonchiffon", "lightblue", "lightcoral", "lightcyan",
    "lightgoldenrodyellow", "lightgray", "lightgrey", "lightgreen",
    "lightpink", "lightsalmon", "lightseagreen", "lightskyblue",
    "lightslategray", "lightslategrey", "lightsteelblue", "lightyellow",
    "lime", "limegreen", "linen", "magenta", "maroon", "mediumaquamarine",
    "mediumblue", "mediumorchid", "mediumpurple", "mediumseagreen",
    "mediumslateblue", "mediumspringgreen", "mediumturquoise", "mediumvioletred",
    "midnightblue", "mintcream", "mistyrose", "moccasin", "navajowhite",
    "navy", "oldlace", "olive", "olivedrab", "orange", "orangered", "orchid",
    "palegoldenrod", "palegreen", "paleturquoise", "palevioletred", "papayawhip",
    "peachpuff", "peru", "pink", "plum", "powderblue", "purple", "red",
    "rosybrown", "royalblue", "rebeccapurple", "saddlebrown", "salmon",
    "sandybrown", "seagreen", "seashell", "sienna", "silver",
    "skyblue", "slateblue", "slategray", "slategrey", "snow",
    "springgreen", "steelblue", "tan", "teal", "thistle",
    "tomato", "turquoise", "violet", "wheat", "white",
    "whitesmoke", "yellow", "yellowgreen",
]

In [None]:
# For every run group, export a panel figure with one subplot per metric that
# shows the 10 in_model functions with the highest mean value.
figures_dir = ANALYSIS_PATH / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)  # export target must exist

for key, profile in processed_full_profiles.items():
    n_genes = int(ngenes_parameters.loc[key].data_n_genes)
    model_profile = profile.set_index("filename_lineno(function)").query("in_model")

    # One row per metric. The previous layout requested len(metrics)+1 rows
    # and left the last subplot empty.
    fig = ps.make_subplots(rows=len(metrics), subplot_titles=metrics)

    for row, metric in enumerate(metrics, start=1):
        data = model_profile.sort_values(f"mean_{metric}", ascending=False).iloc[:10]
        fig.add_trace(
            pg.Bar(
                x=data.index,
                y=data[f"mean_{metric}"],
                error_y=pg.bar.ErrorY(array=data[f"std_{metric}"].to_numpy()),
                orientation="v",
                name=metric,
                text=data["Function"],
                textangle=0,
                insidetextanchor="start",
            ),
            row=row,
            col=1,
        )
    fig.update_layout(height=1000, width=2000,title_text = f"Top 10 functions in different profile metrics with {n_genes} genes")
    fig.write_image(figures_dir / f"Top10_panel_{n_genes}genes.pdf", scale=10)


In [None]:
# Display the parameter table of the selected runs (indexed by run_id).
ngenes_parameters

In [None]:
# concatenate all processed/meaned profiles into one df
# Concatenate the top 10 functions (by mean_tot_time) of every processed
# profile into one frame, with the parameters of the corresponding run
# attached to each row.
#
# The previous version concatenated the 10-row slice column-wise with a
# parameter frame indexed 0..len(df)-1; pandas aligned the two on the index,
# which produced len(df) rows that were mostly NaN. It also sorted ascending,
# i.e. picked the 10 smallest entries, unlike every other top-10 selection in
# this notebook.
full_df = pd.concat(
    [
        df.sort_values("mean_tot_time", ascending=False)
        .head(10)
        # Broadcast the run's parameters as constant columns.
        .assign(**ngenes_parameters.loc[key].to_dict())
        for key, df in processed_full_profiles.items()
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first rows of the concatenated frame.
full_df.head()

In [None]:
print(full_df.columns)


def _first_value(column):
    """Return the first element (by position) of a grouped column."""
    return column.iloc[0]


# Aggregation spec covering every column of full_df: descriptive and flag
# columns keep their first value, all numeric columns are averaged.
aggregator = {
    **dict.fromkeys(
        [
            "Class",
            "Function",
            "Class_Function_etc",
            "Class_Function",
            "Summary_index",
            "filename_lineno(function)",
            "is_callback",
        ],
        _first_value,
    ),
    **dict.fromkeys(["Call_num", "Primitive_Call_num", "Time"], "mean"),
    **dict.fromkeys(
        ["is_target", "in_model", "in_bicycle", "first", "second"],
        _first_value,
    ),
    **dict.fromkeys(
        [
            "mean_ncalls",
            "mean_tot_time",
            "mean_tot_percall",
            "mean_cum_time",
            "mean_cum_percall",
            "std_ncalls",
            "std_tot_time",
            "std_tot_percall",
            "std_cum_time",
            "std_cum_percall",
            "data_n_genes",
            "data_n_samples_control",
            "data_n_samples_per_perturbation",
            "batch_size",
            "n_epochs",
            "n_epochs_pretrain_latents",
            "scale_factor",
            "training_time",
            "pretraining_time",
        ],
        "mean",
    ),
}

In [None]:
# Deprecated: builds one grouped bar chart per metric from full_df, faceted by
# gene count. The figures are created but neither shown nor saved.
for metric in metrics:
    mean_column = "mean_" + metric
    std_column = "std_" + metric

    data = full_df.sort_values(mean_column, ignore_index=True, ascending=False)
    fig = px.bar(
        data,
        x="filename_lineno(function)",
        y=mean_column,
        error_y=std_column,
        facet_row="data_n_genes",
        color="Function",
        barmode="group",
        height=1000,
    )

In [None]:
def mean_error_prop(stds:pd.Series):
    """Combine several standard deviations into the error of their mean.

    Computes ``sqrt(sum(stds**2)) / len(stds)``, the usual propagation formula
    for the mean of quantities whose errors are treated as independent.

    Parameters
    ----------
    stds : pd.Series
        Standard deviations of the values being averaged.

    Returns
    -------
    The propagated standard deviation as a scalar.
    """
    sum_of_squares = (stds ** 2).sum()
    return np.sqrt(sum_of_squares) / len(stds)
def _leading(values):
    """Return the first element (by position) of a grouped column."""
    return values.iloc[0]


def _distinct(values):
    """Return the distinct values of a grouped column."""
    return values.unique()


# Aggregation spec used when the collected top-10 rows are grouped per
# function: descriptive columns keep their first value, Summary_index keeps
# all distinct values, and every std_* column is combined with mean_error_prop.
# NOTE(review): mean_error_prop divides by the group size, which matches the
# "mean" aggregations; for the columns aggregated with "sum" confirm that this
# is the intended error estimate.
aggregator = dict.fromkeys(
    ["Class", "Function", "Class_Function_etc", "Class_Function"],
    _leading,
)
aggregator["Summary_index"] = _distinct
aggregator.update(
    {
        "filename_lineno(function)": _leading,
        "is_callback": _leading,
        "Call_num": "sum",
        "Primitive_Call_num": "sum",
        "Time": "sum",
        "mean_ncalls": "sum",
        "std_ncalls": mean_error_prop,
        "mean_tot_time": "sum",
        "std_tot_time": mean_error_prop,
        "mean_tot_percall": "mean",
        "std_tot_percall": mean_error_prop,
        "mean_cum_time": "sum",
        "std_cum_time": mean_error_prop,
        "mean_cum_percall": "mean",
        "std_cum_percall": mean_error_prop,
    }
)

In [None]:
# For every metric and run group: export a bar chart of the 10 functions with
# the highest mean value among rows matching `condition`, collect those rows
# together with the run parameters, then aggregate the collection per function
# and write it to CSV.
condition = "in_model"

output_dir = ANALYSIS_PATH / "figures"
full_profile_dir = output_dir / "full_profile"
# Creates output_dir as well; done once instead of on every loop iteration.
os.makedirs(full_profile_dir, exist_ok=True)

# Collect the per-(metric, run) top-10 frames and concatenate once. The
# previous version sized an empty frame from a `df` variable leaked by an
# earlier cell (which, on a fresh run, lacked the "filename_lineno(function)"
# column) and then grew that frame row by row.
top_frames = []
for metric in metrics:
    for n, df in processed_full_profiles.items():
        data = df.query(condition).sort_values("mean_"+metric, ignore_index=True, ascending = False).iloc[:10]
        # Attach the run's parameters as constant columns after the profile columns.
        top_frames.append(data.assign(**ngenes_parameters.loc[n].to_dict()))

        title = f"Mean_{metric}_{str(ngenes_parameters.loc[n, 'data_n_genes'])}genes_{condition}"
        fig = px.bar(
            data,
            x="filename_lineno(function)",
            y="mean_"+metric,
            error_y="std_"+metric,
            color="Class",
            text="Function",
            title=f"mean {metric} with {ngenes_parameters.loc[n]['data_n_genes']}",
            log_y=True,
        )
        fig.write_image(full_profile_dir / f"{title}.pdf", scale=6)

# Object dtype matches the frame the row-by-row version produced; it is needed
# because the aggregator's "Summary_index" entry returns an array per group,
# which pandas only accepts for object columns.
top_df = pd.concat(top_frames, ignore_index=True).astype(object)
top_df = (
    top_df.groupby(by="filename_lineno(function)", as_index=False)
    .agg(aggregator)
    .sort_values("mean_tot_time", ascending=False)
)
top_df.to_csv(ANALYSIS_PATH.joinpath(f"top_functions_{condition}.csv"))
top_df

In [None]:
# Reload the aggregated top functions from CSV and export/show one bar chart
# per metric.
metrics = ["ncalls","tot_time","tot_percall","cum_time","cum_percall"]
top_df = pd.read_csv(ANALYSIS_PATH.joinpath(f"top_functions_{condition}.csv"))

# Create the export directory once; parents=True also covers a missing
# "figures" directory, on which a bare mkdir() would fail.
top_df_dir = ANALYSIS_PATH.joinpath("figures", "top_df")
top_df_dir.mkdir(parents=True, exist_ok=True)

for metric in metrics:
    title = f"Mean {metric} of functions with {condition}"
    fig = px.bar(
        top_df.sort_values("mean_"+metric),
        x="filename_lineno(function)",
        y="mean_" + metric,
        error_y="std_"+metric,
        color="Function",
        log_y=True,
        text="Class",
        title=title,
        width=1500,
        height=500,
    )
    fig.write_image(top_df_dir / f"{title}.pdf", scale=6)
    fig.show()

In [None]:
# tot_time of the split_samples entry in every run's full profile, plotted
# against the run's gene count.
run_keys = key_array.flatten()
split_samples_query = "Class_Function_etc == '[Strategy]SingleDeviceStrategy_training_step:model.py:828(split_samples)'"

time = []
for key in run_keys:
    matching_rows = full_profiles[key].query(split_samples_query)
    # The first matching row is used; this raises if the entry is missing.
    time.append(float(matching_rows["tot_time"].iloc[0]))

n_genes = [ngenes_parameters.loc[key, "data_n_genes"] for key in run_keys]

px.bar(
    pd.DataFrame(np.array([n_genes, time]).T, columns=["n_genes", "time",]),
    x="n_genes",
    y="time",
)

In [None]:
# Training time per gene count; with barmode="relative" the bars of runs that
# share a gene count are stacked.
px.bar(
    data_frame=ngenes_parameters.reset_index(),
    x="data_n_genes",
    y="training_time",
    barmode="relative",
)

In [None]:
# Display the parameter table of the selected runs (indexed by run_id).
ngenes_parameters

In [None]:
# Is there a gap between the recorded training time and the time accounted for
# by the summarized profile? One bar per run, positioned at its gene count.
run_keys = key_array.flatten()
time_gap = [
    ngenes_parameters.loc[key]["training_time"] - training_profiles[key]["Time"].sum()
    for key in run_keys
]
gene_counts = [ngenes_parameters.loc[key, "data_n_genes"] for key in run_keys]
plt.bar(x=gene_counts, height=time_gap)