In [130]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.colors as mcolors

In [131]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics

class wandb_results:
    """Download per-run results of Weights & Biases sweeps as pandas DataFrames.

    Each sweep is flattened into one row per run and, optionally, cached as a
    CSV file under ``sweeps_csvs/`` so later calls can skip the per-run download.
    """

    # Timeout (seconds) passed to every `wandb.Api` client this class creates,
    # so that a client rebuilt by `reset_api` behaves like the initial one.
    API_TIMEOUT = 60

    def __init__(self, project_id, wandb_username="itaybachartechnion"):
        """Create the API client and remember which project/user to query.

        Args:
            project_id: W&B project name used to build the sweep path.
            wandb_username: W&B entity (user) that owns the project.
        """
        self.api = wandb.Api(timeout=self.API_TIMEOUT)
        self.project_id = project_id
        self.wandb_username = wandb_username

    def get_sweep_results(self, sweep_id, metric="accuracy_all", best_epoch=False, get_servers=False, reset_api=False, read_csv_if_exist=True, save_to_csv=True):
        """Return one row per run of a sweep, with flattened config and summary.

        Args:
            sweep_id: Identifier of the sweep inside the project.
            metric, best_epoch, get_servers: Accepted for call compatibility;
                this method does not read them.
            reset_api: When true, rebuild the API client before querying.
            read_csv_if_exist: When true and a cached CSV for this sweep (with
                the same number of runs) exists, return it instead of
                downloading every run.
            save_to_csv: When true, write the downloaded table to the cache path.

        Returns:
            DataFrame with a ``name`` column, one ``config_<key>`` column per
            config entry whose key does not start with ``_``, and one column per
            summary entry.
        """
        if reset_api:
            self.reset_api()

        print(f"Download {sweep_id=} data...")
        runs = self.api.sweep(f"{self.wandb_username}/{self.project_id}/{sweep_id}").runs
        n_runs = len(runs)
        # The run count is part of the file name, so a sweep that gained runs
        # since the last download misses the cache and is fetched again.
        path = f"sweeps_csvs/{sweep_id}_{n_runs}.csv"
        if read_csv_if_exist and os.path.exists(path):
            return pd.read_csv(path, index_col=0)

        summary_list, config_list, name_list = [], [], []
        for run in tqdm(runs):
            summary_list.append(run.summary._json_dict)
            # Keys starting with "_" are skipped.
            config_list.append(
                {k: v for k, v in run.config.items()
                 if not k.startswith('_')})
            name_list.append(run.name)

        config_cols = pd.json_normalize(config_list)
        config_cols.columns = [f"config_{c}" for c in config_cols.columns]
        summary_cols = pd.json_normalize(summary_list)
        runs_df = pd.concat(
            [pd.DataFrame({"name": name_list}), config_cols, summary_cols], axis=1)

        if save_to_csv:
            # Create the cache directory on demand; without it `to_csv` raises
            # when the caller has not prepared `sweeps_csvs/` beforehand.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            runs_df.to_csv(path)
        return runs_df

    def get_sweeps_results(self, sweeps, metric="accuracy_all", best_epoch=False, get_servers=False,  read_csv_if_exist=True, save_to_csv=True):
        """Fetch several sweeps and stack their runs into a single DataFrame.

        Args:
            sweeps: Sequence of sweep ids; every other argument is forwarded
                unchanged to `get_sweep_results` for each sweep.

        Returns:
            The concatenation of the per-sweep tables with a fresh 0..n-1 index.
        """
        print("Total number of sweeps:", len(sweeps))
        per_sweep = [
            self.get_sweep_results(
                sweep,
                metric=metric,
                best_epoch=best_epoch,
                get_servers=get_servers,
                save_to_csv=save_to_csv,
                read_csv_if_exist=read_csv_if_exist,
            )
            for sweep in sweeps
        ]
        return pd.concat(per_sweep).reset_index(drop=True)

    def reset_api(self):
        """Replace the API client with a new one using the same timeout."""
        self.api = wandb.Api(timeout=self.API_TIMEOUT)

In [132]:
# Metric column that `result_metric` analyses when no other metric is requested.
BASE_METRIC = "accuracy_per_mean_user_and_bot"

# Shared results client for the project. `wandb_results` is defined in an earlier
# cell (an importable copy presumably lives in `read_wandb` - TODO confirm).
api = wandb_results(
    project_id="NLP2024_PROJECT_204117071_206948218",
    wandb_username="itaybachartechnion",
)


In [133]:
def result_metric(sweeps, group_name, drop_list=(0,), drop_HPT=False, metric=BASE_METRIC, epoch="best"):
    """Summarise a metric over seeds for every hyper-parameter configuration.

    Runs of the given sweeps are grouped by every varying ``config_*`` column
    except ``config_seed``; each group is reported with the mean, standard
    deviation, raw per-run values and a bootstrap confidence interval of the
    selected metric column.

    Args:
        sweeps: Sweep ids forwarded to ``api.get_sweeps_results``.
        group_name: Name given to the result index when it has a single level.
        drop_list: Index labels to remove from the summary when present.
        drop_HPT: When true, drop every varying config column except
            ``config_LLM_SIM_SIZE`` and ``config_seed`` and group by
            ``config_LLM_SIM_SIZE`` only.
        metric: Base name of the metric columns to analyse.
        epoch: ``"best"`` picks, per group, the metric column with the highest
            group mean and reports its epoch; any other value selects the
            columns containing ``f"{metric}_epoch{epoch}"``.

    Returns:
        DataFrame indexed by the grouping keys with columns ``mean``, ``std``,
        ``values``, ``CI`` and, for ``epoch="best"``, ``epoch``.
    """
    df = api.get_sweeps_results(sweeps, metric=metric)

    config_cols = [c for c in df.columns if
                   "config_" in c and c != "config_wandb_run_id" and c != "config_online_simulation_size"]
    # A config column counts as a tuned hyper-parameter when it takes more than one value.
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1]
    print(HPT_cols)
    if drop_HPT:
        kept_HPT = ["config_LLM_SIM_SIZE", "config_seed"]
        df = df.drop([c for c in HPT_cols if c not in kept_HPT], axis=1)
        HPT_cols = kept_HPT

    def _is_number_column(col):
        # Explicit dtype test. The previous `dtype not in [np.number]` check relied
        # on NumPy's deprecated conversion of `np.number` to float64, so only
        # float64 columns were treated as numeric and integer columns were dropped.
        # Booleans stay excluded, matching the intent of `np.number`.
        dtype = df[col].dtype
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    # dropping all non-numeric columns that are not in the HPT_cols list
    drop_names = [c for c in df.columns if
                  (c not in HPT_cols) and (c not in [metric, "epoch"]) and not _is_number_column(c)]
    df = df.drop(drop_names, axis=1)

    if len(HPT_cols) == 1:  # has only one column of config_seed
        # add a dummy column to group by
        df["dummy"] = 'dummy'
        HPT_cols.append("dummy")

    grouped = df.groupby([c for c in HPT_cols if c != "config_seed"], dropna=False)
    mean_df = grouped.mean()
    if epoch == "best":
        # Candidates are columns that contain the metric name and whose part
        # before "_epoch" ends with the same four characters as the metric.
        candidates = [c for c in mean_df.columns
                      if (metric in c and metric[-4:] == c.split("_epoch")[0][-4:])]
    else:
        candidates = [c for c in mean_df.columns if f"{metric}_epoch{epoch}" in c]
    # For each group: the candidate column with the highest mean across its runs.
    best_col = mean_df[candidates].idxmax(axis=1)

    result = grouped.apply(lambda x: x[best_col.loc[x.name]].values)
    means = grouped.apply(lambda x: x[best_col.loc[x.name]].mean())
    stds = grouped.apply(lambda x: x[best_col.loc[x.name]].std())

    df_cols = {'mean': means, 'std': stds, 'values': result.values}
    if epoch == "best":
        # Columns without an "epoch" suffix are reported as "last".
        df_cols['epoch'] = best_col.apply(
            lambda x: int(x.split("epoch")[1]) if "epoch" in x else "last")

    df_cols['CI'] = result.apply(lambda x: bootstrap_ci(x))

    summary_df = pd.DataFrame(df_cols, index=best_col.index)
    for d in drop_list:
        if d in summary_df.index:
            summary_df = summary_df.drop(d)
    if len(summary_df.index.names) == 1:
        return summary_df.rename_axis(group_name)
    else:
        return summary_df


def bootstrap_ci(data, n_bootstrap=1000, ci=0.95, seed=None):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: One-dimensional, non-empty sequence or array of numbers.
        n_bootstrap: Number of resamples (each the same size as ``data``).
        ci: Confidence level, e.g. ``0.95`` for a 95% interval.
        seed: Optional seed for a private random generator, making the interval
            reproducible. With ``None`` (default) samples are drawn from NumPy's
            global random state, as before this parameter existed.

    Returns:
        ``(lower_bound, upper_bound)`` percentiles of the resampled means.
    """
    # The `np.random` module and a `Generator` both provide `.choice`, so the
    # resampling loop is the same whichever source is used.
    sampler = np.random if seed is None else np.random.default_rng(seed)
    n_obs = len(data)
    bootstrapped_means = [
        np.mean(sampler.choice(data, size=n_obs, replace=True))
        for _ in range(n_bootstrap)
    ]
    lower_bound = np.percentile(bootstrapped_means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_means, (1 + ci) / 2 * 100)
    return lower_bound, upper_bound

# Hyperparameter tuning

For every configuration tested in the sweep, the table reports the mean, the standard deviation, all the values obtained for the different seeds, and a 95% bootstrap confidence interval for the mean.

When `epoch="best"` is passed, the table also shows the epoch at which the best mean result is obtained. When a specific epoch is passed, e.g. `epoch=5`, the table shows the result obtained at that epoch (epoch 5 in this example).

You can analyse multiple sweeps at once by listing their IDs in the first argument (`sweeps`) of `result_metric`.

In [134]:
# Directory where downloaded sweeps are cached as CSV files.
# `exist_ok=True` replaces the separate existence check, so there is no gap
# between checking for the directory and creating it.
directory = 'sweeps_csvs'
os.makedirs(directory, exist_ok=True)

# Summarise both sweeps together, taking each configuration at its best epoch.
sweep_results = result_metric(["btmsogld", "2qmjkpp3"], "final", drop_HPT=False, epoch="best")
sweep_results

Total number of sweeps: 2
Download sweep_id='btmsogld' data...
Download sweep_id='2qmjkpp3' data...
['config_seed', 'config_basic_nature']


  (c not in HPT_cols) and (c not in [metric, "epoch"]) and (df[c].dtype not in [np.number])]


Unnamed: 0_level_0,mean,std,values,epoch,CI
final,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18,0.835498,0.002733,"[0.8399023951337118, 0.8357191856846014, 0.834...",14,"(0.8337674765960472, 0.8378245864716133)"
22,0.833533,0.002812,"[0.8371260525799309, 0.8315670372578251, 0.835...",19,"(0.8313666216442528, 0.8357361842964869)"


# Result for a specific epoch

In [135]:
# Same summary for a single sweep, pinned to epoch 10 instead of the best epoch.
sweep_results = result_metric(
    sweeps=["2qmjkpp3"],
    group_name="LLMs",
    drop_HPT=False,
    epoch=10,
)
sweep_results

Total number of sweeps: 1
Download sweep_id='2qmjkpp3' data...
['config_seed']


  (c not in HPT_cols) and (c not in [metric, "epoch"]) and (df[c].dtype not in [np.number])]


Unnamed: 0_level_0,mean,std,values,CI
LLMs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dummy,0.834185,0.002353,"[0.8370430510024045, 0.833089257416279, 0.8361...","(0.8324349104956876, 0.8360833698694361)"
