In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.express as px
import os
import matplotlib.pyplot as plt

In [None]:
# The three locations are sibling folders of a single data root.
_BOTTLENECK_DATA_ROOT = Path("/data/toulouse/bicycle/notebooks/experiments/bottleneck/data")

MODELS_PATH = _BOTTLENECK_DATA_ROOT / "models"
PLOTS_PATH = _BOTTLENECK_DATA_ROOT / "plots"
ANALYSIS_PATH = _BOTTLENECK_DATA_ROOT / "analysis"

# Directory names under ANALYSIS_PATH that the profile-loading loops skip.
exclude = [
    "test_run_00013",
    "test_run_00014",
    "figures",
]

In [None]:
# Column layout of manual_params.csv as noted by the original author
# (not verified against the file):
# run_id | profile | scale | compile | full | dynamic | mode | worker_num | name
manual_params = pd.read_csv(ANALYSIS_PATH / "manual_params.csv")
parameters = manual_params.set_index("run_id", drop=True)

# Run ids test_run_00051 .. test_run_00056, in ascending order.
worker_keys = pd.Series([f"test_run_{run_number:05d}" for run_number in range(51, 57)])

# Keep only those runs, in the order given by worker_keys.
parameters = parameters.loc[worker_keys]

# Second parameter table, keyed and sorted by run_id.
gene_number_paras = (
    pd.read_csv(ANALYSIS_PATH / "parameters.csv")
    .set_index("run_id", drop=True)
    .sort_index()
)
# Write NaN into every cell that pandas reports as missing.
gene_number_paras[gene_number_paras.isna()] = np.nan

# Every run in this table gets the same fixed value for these four columns.
for column, constant in (
    ("loader_workers", 63),
    ("name", "63_workers"),
    ("compile", False),
    ("profile", True),
):
    gene_number_paras[column] = constant

In [None]:
# Display the full parameter table (rich notebook output) for a visual check.
gene_number_paras

In [None]:
# Runs taken from `gene_number_paras` that are compared alongside the runs
# already listed in `worker_keys`.
extra_run_ids = ["test_run_00027", "test_run_00028", "test_run_00029"]

# Always start from a run_id-indexed frame so that this cell can be re-run
# after `parameters` has been flattened by the reset_index() below.
if "run_id" in parameters.columns:
    parameters = parameters.set_index("run_id")

# Only add runs that are not present yet; re-running must not duplicate rows.
new_run_ids = [run_id for run_id in extra_run_ids if run_id not in parameters.index]

if new_run_ids:
    # Append all rows in one label-based concat instead of inserting them one
    # at a time under integer labels into a string-labelled index.
    parameters = pd.concat(
        [parameters, gene_number_paras.loc[new_run_ids, parameters.columns]]
    )
    worker_keys = pd.concat(
        [worker_keys, pd.Series(new_run_ids)], ignore_index=True
    )

# Downstream cells expect a positional index with `run_id` as a regular column.
parameters = parameters.rename_axis("run_id").reset_index()


## Summary profiles

In [None]:
# Map run id -> summary profile table. A directory is loaded only when its
# name is not excluded, it is a directory, and it is listed in worker_keys.
wanted_runs = worker_keys.to_list()
training_profiles = {
    run_dir.name: pd.read_csv(run_dir / "training_profile.csv")
    for run_dir in ANALYSIS_PATH.iterdir()
    if run_dir.name not in exclude and run_dir.is_dir() and run_dir.name in wanted_runs
}


In [None]:
def _summed_profile_time(run_id):
    """Print the run id, then return the sum of the "Time" column of its summary profile."""
    print(run_id)
    return training_profiles[run_id]["Time"].sum()


# One total per run, in worker_keys order, stored next to the run parameters.
times = [_summed_profile_time(run_id) for run_id in worker_keys]
parameters["Profile_Time"] = times

In [None]:
# Bars ordered by ascending profiled time; each bar carries the run's name.
runs_by_profile_time = parameters.sort_values("Profile_Time").reset_index()
fig = px.bar(
    runs_by_profile_time,
    x="run_id",
    y="Profile_Time",
    text="name",
    title="Comparison of profiled time with different cpu worker numbers",
)
fig.show()
# Optional export (disabled):
#fig.write_image(ANALYSIS_PATH/"figures"/"Worker_profiled_runtime.pdf", scale=10)

In [None]:
# Bars ordered by ascending total training time; each bar carries the run's name.
runs_by_training_time = parameters.sort_values("training_time").reset_index()
fig = px.bar(
    runs_by_training_time,
    x="run_id",
    y="training_time",
    text="name",
    title="Comparison of total training time with different cpu worker numbers",
)
fig.show()
# Optional export (disabled):
#fig.write_image(ANALYSIS_PATH/"figures"/"Worker_training_runtime.pdf", scale=10)

In [None]:
def _first_value(values):
    """Return the first element of a group's values."""
    return values.iloc[0]


# Columns for which only the first value of each group is kept.
_FIRST_VALUE_COLUMNS = (
    "Class",
    "Function",
    "Class_Function_etc",
    "Class_Function",
    "Summary_index",
    "filename_lineno(function)",
    "is_callback",
)

# Column -> aggregation used when collapsing profile rows per group.
# Key order is kept as-is because it determines the output column order.
aggregator = dict.fromkeys(_FIRST_VALUE_COLUMNS, _first_value)
aggregator.update(
    {
        # Counts and times are summed; the per-call columns are averaged.
        "Call_num": "sum",
        "Primitive_Call_num": "sum",
        "Time": "sum",
        "ncalls": "sum",
        "tot_time": "sum",
        "tot_percall": "mean",
        "cum_time": "sum",
        "cum_percall": "mean",
    }
)

In [None]:
# Map run id -> aggregated full profile for every requested run directory.
# Note: `df` (the last profile built) remains bound at notebook scope after
# the loop finishes.
requested_runs = worker_keys.to_list()
full_profiles = dict()
for run_dir in ANALYSIS_PATH.iterdir():
    skip_entry = (
        run_dir.name in exclude
        or not run_dir.is_dir()
        or run_dir.name not in requested_runs
    )
    if skip_entry:
        continue

    raw_profile = pd.read_csv(run_dir / "full_training_profile.csv")
    df = raw_profile.drop(columns=["Rank"])

    # Collapse rows that share the same "Class_Function_etc" value.
    df = df.groupby(["Class_Function_etc"], as_index=False).agg(aggregator).reset_index()

    # Boolean flags: does the casefolded entry mention the given substring?
    for flag_column, needle in (("in_model", "model.py"), ("in_bicycle", "bicycle")):
        df[flag_column] = df["Class_Function_etc"].apply(
            lambda entry, needle=needle: needle in str(entry).casefold()
        )

    # Order by source location and renumber the rows 0..len-1.
    df = df.sort_values("filename_lineno(function)")
    df = df.set_index(pd.Index(np.arange(len(df))), drop=True)

    # (The author sketched a per-column normalisation of the numeric columns
    # at this point; it is disabled.)
    full_profiles[run_dir.name] = df

In [None]:
# Show the run ids for which a full profile was loaded.
print(full_profiles.keys())

In [None]:
# Profile metrics that are plotted per function.
metrics = ["ncalls", "tot_time", "tot_percall", "cum_time", "cum_percall"]
# Boolean profile column used (via DataFrame.query) to select the rows to keep.
condition = "in_model"

# Guard keeps the cell re-runnable: only re-index while run_id is still a column.
if parameters.index.name != "run_id":
    parameters = parameters.set_index("run_id")

# Take the profile column layout from the loaded profiles themselves rather
# than from a loop variable left over from an earlier cell.
if full_profiles:
    profile_columns = next(iter(full_profiles.values())).columns
else:
    profile_columns = pd.Index([])

# Collect one record per selected profile row (profile values followed by the
# parameters of the run it belongs to) and build the frame once, instead of
# growing it one row at a time.
records = []
for run_id, profile in full_profiles.items():
    run_parameters = tuple(parameters.loc[run_id])
    for profile_row in profile.query(condition).itertuples(index=False, name=None):
        records.append(profile_row + run_parameters)

top_df = pd.DataFrame(records, columns=profile_columns.append(parameters.columns))

In [None]:
# Work on a local copy: adding "name" to the shared `aggregator` would also
# change what the profile-loading cell aggregates if that cell is run again.
plot_aggregator = {**aggregator, "name": lambda x: x.iloc[0]}

# The grouped table does not depend on the metric, so compute it once.
data = top_df.groupby(["loader_workers", "Class_Function_etc"]).agg(plot_aggregator)

for metric in metrics:
    title = f"Loader workers: {metric} of functions with {condition}"

    # One multi-key sort (worker count first, then the metric). Two chained
    # single-key sorts do not guarantee the metric order within a worker
    # group, because the default sort algorithm is not stable.
    fig = px.bar(
        data.sort_values(["loader_workers", metric]),
        x="filename_lineno(function)",
        y=metric,
        color="name",
        # log_y=True,
        text="Function",
        title=title,
        barmode="group",
    )
    fig.show()
    #fig.write_image(ANALYSIS_PATH/"figures"/f"{title}.pdf")

In [None]:
# Show every collected row whose source location is model.py:828(split_samples).
is_split_samples = top_df["filename_lineno(function)"] == "model.py:828(split_samples)"
top_df.loc[is_split_samples]