In [1]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.colors as mcolors

In [2]:
# Metric name used as the default `metric` argument of result_metric.
BASE_METRIC = "accuracy_per_mean_user_and_bot"
# Passed as `wandb_username` when the wandb_results client is constructed.
WANDB_OR_USERNAME = "guylororg"

In [3]:
def result_metric(sweeps, group_name, drop_list=None, drop_HPT=False, metric=BASE_METRIC, epoch="best"):
    """Summarise `metric` per configuration, aggregating over seeds.

    The runs of `sweeps` are fetched through the module-level `api` object
    (`api.get_sweeps_results`), grouped by every varying ``config_*`` column
    except ``config_seed``, and reduced to one row per group.

    Parameters
    ----------
    sweeps : list
        Sweep identifiers forwarded to ``api.get_sweeps_results``.
    group_name : str
        Currently unused; kept so existing calls keep working.
    drop_list : list, optional
        Index labels to remove from the summary when present. ``None``
        (the default) removes nothing.
    drop_HPT : bool
        When true, drop every varying config column except
        ``config_LLM_SIM_SIZE`` and ``config_seed`` and group by
        ``config_LLM_SIM_SIZE`` only.
    metric : str
        Metric name used to select the candidate result columns.
    epoch : "best" or int
        ``"best"`` picks, per group, the metric column with the highest
        mean over the group's rows. Any other value restricts the
        candidates to columns containing ``f"{metric}_epoch{epoch}"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the grouping columns, with columns ``mean``, ``std``,
        ``values`` (the per-row values of the selected column), ``epoch``
        (only when ``epoch == "best"``) and ``CI`` (from `bootstrap_ci`).

    Raises
    ------
    ValueError
        If there is no column to group by, or no column matches the
        requested metric/epoch.
    """
    if drop_list is None:
        drop_list = []

    df = api.get_sweeps_results(sweeps, metric=metric)

    ignored_config_cols = ("config_wandb_run_id", "config_online_simulation_size")
    config_cols = [c for c in df.columns if "config_" in c and c not in ignored_config_cols]
    # Only config columns that actually vary between runs define a configuration.
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1]
    if drop_HPT:
        kept_cols = ["config_LLM_SIM_SIZE", "config_seed"]
        df = df.drop([c for c in HPT_cols if c not in kept_cols], axis=1)
        HPT_cols = kept_cols

    group_cols = [c for c in HPT_cols if c != "config_seed"]
    if not group_cols:
        raise ValueError("No varying configuration column (other than 'config_seed') to group by.")

    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    grouped = df.groupby(group_cols)
    mean_df = grouped[numeric_cols].mean()

    if epoch == "best":
        # The part of the column name before "_epoch" must end with the full
        # metric name; comparing only a 4-character suffix fails for short
        # metric names and can accept unrelated columns.
        candidates = [c for c in mean_df.columns if c.split("_epoch")[0].endswith(metric)]
    else:
        # A plain substring test would let epoch=1 also match epoch10..epoch19,
        # so require that the epoch number is not followed by another digit.
        token = f"{metric}_epoch{epoch}"
        candidates = [
            c for c in mean_df.columns
            if any(not tail[:1].isdigit() for tail in c.split(token)[1:])
        ]
    if not candidates:
        raise ValueError(f"No result column matches metric={metric!r} with epoch={epoch!r}.")

    # Per group: name of the candidate column with the highest mean.
    best_col = mean_df[candidates].idxmax(axis=1)

    def _best_series(group):
        # `group.name` is the group's key, which is also its label in best_col.
        return group[best_col.loc[group.name]]

    result = grouped.apply(lambda g: _best_series(g).values)
    means = grouped.apply(lambda g: _best_series(g).mean())
    stds = grouped.apply(lambda g: _best_series(g).std())

    df_cols = {'mean': means, 'std': stds, 'values': result.values}
    if epoch == "best":
        # Columns without an "epoch" part are reported as "last".
        df_cols['epoch'] = best_col.apply(lambda x: int(x.split("epoch")[1]) if "epoch" in x else "last")
    df_cols['CI'] = result.apply(bootstrap_ci)

    summary_df = pd.DataFrame(df_cols, index=best_col.index)
    for label in drop_list:
        if label in summary_df.index:
            summary_df = summary_df.drop(label)

    return summary_df

def bootstrap_ci(data, n_bootstrap=1000, ci=0.95, seed=None):
    """Percentile bootstrap confidence interval for the mean of `data`.

    Parameters
    ----------
    data : sequence of numbers
        One-dimensional sample to resample from.
    n_bootstrap : int
        Number of bootstrap resamples, each of size ``len(data)`` and drawn
        with replacement.
    ci : float
        Confidence level, between 0 and 1.
    seed : int, optional
        When given, resampling uses ``np.random.default_rng(seed)`` so the
        interval is reproducible. When ``None`` the global ``np.random``
        state is used, as before.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` percentiles of the resampled means.
        Both are NaN when `data` is empty.

    Raises
    ------
    ValueError
        If `ci` is outside ``[0, 1]``.
    """
    if not 0 <= ci <= 1:
        raise ValueError(f"ci must be between 0 and 1, got {ci!r}")

    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval explicitly instead
        # of depending on how NumPy treats empty inputs.
        return np.float64(np.nan), np.float64(np.nan)

    rng = np.random if seed is None else np.random.default_rng(seed)
    # Draw all resamples at once: one row per bootstrap replicate.
    resamples = rng.choice(data, size=(n_bootstrap, n_obs), replace=True)
    bootstrapped_means = resamples.mean(axis=1)

    lower_bound, upper_bound = np.percentile(
        bootstrapped_means, [(1 - ci) / 2 * 100, (1 + ci) / 2 * 100]
    )
    return lower_bound, upper_bound


# Results of Hyperparameter Tuning

For every configuration tested in the sweep, the table reports the mean, the standard deviation, the individual values obtained for the different seeds, and a 95% bootstrap confidence interval for the mean.

When `epoch="best"` is passed, the table also shows the epoch at which the best (highest mean) result was obtained. When a specific epoch is passed, e.g. `epoch=5`, the table reports the result obtained at that epoch instead.

You can analyse several sweeps at once by adding their IDs to the list passed as the first argument of `result_metric`.

In [4]:
from read_wandb import wandb_results

# Client that result_metric reads through the global name `api`.
api = wandb_results(
    "NLP2024_PROJECT_Guy-lor",
    wandb_username=WANDB_OR_USERNAME,
)

# Sweeps analysed in the hyperparameter-tuning section (HPT==True).
sweep_ids_list = ["muwcnrjn", "8nixkte7"]

In [5]:
# Summarise the tuning sweeps, taking the best epoch of every configuration.
sweep_results = result_metric(
    sweep_ids_list,
    "config_online_simulation_factor",
    drop_list=[],
    drop_HPT=False,
    epoch="best",
)

Total number of sweeps: 2
Download sweep_id='muwcnrjn' data...
Download sweep_id='8nixkte7' data...


In [6]:
# Readable labels for the `config_basic_nature` codes 17..26, listed in code order.
mapping_dict = dict(enumerate([
    'ORIGINAL PAPER STRATEGIES (OP)',
    'OP + SENTIMENT RATIO',
    'OP + LENGTH RATIO',
    'OP + USER PREFERENCE VECTORS',
    'OP + SENTIMENT RATIO + LENGTH RATIO',
    'OP + SENTIMENT RATIO + LENGTH RATIO + USER PREFERENCE VECTORS',
    'OP + USER PREFERENCE VECTORS (Double Weight)',
    'OP + SENTIMENT RATIO + LENGTH RATIO (Double Weight)',
    'OP + SENTIMENT RATIO + LENGTH RATIO + USER PREFERENCE VECTORS (Double Weight)',
    'ONLY USER PREFERENCE VECTORS',
], start=17))

# Swap the numeric codes in the `config_basic_nature` index level for their labels.
df = sweep_results.rename(index=mapping_dict, level='config_basic_nature')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,values,epoch,CI
config_basic_nature,config_simulation_user_improve,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ORIGINAL PAPER STRATEGIES (OP),0.01,0.79783,0.007891,"[0.787738360428849, 0.8048380266722495, 0.8033...",7,"(0.7915834810644624, 0.8040770675689211)"
ORIGINAL PAPER STRATEGIES (OP),0.05,0.797222,0.008552,"[0.7935647194453304, 0.8046062482917739, 0.786...",14,"(0.790211285893607, 0.8042332410528391)"
OP + SENTIMENT RATIO,0.01,0.794078,0.011826,"[0.7871046590816577, 0.8081927103389087, 0.799...",6,"(0.7845309625659587, 0.8036252475635322)"
OP + SENTIMENT RATIO,0.05,0.796739,0.008835,"[0.7914221248496729, 0.8076199807129346, 0.799...",7,"(0.7896887050433703, 0.8037897838251682)"
OP + LENGTH RATIO,0.01,0.796034,0.007499,"[0.7890858799080173, 0.801573813112203, 0.8033...",15,"(0.7895819611508536, 0.8024865517995263)"
OP + LENGTH RATIO,0.05,0.795034,0.010897,"[0.7851168728448563, 0.805184215264057, 0.8037...",13,"(0.7856184450249026, 0.8044504378740028)"
OP + USER PREFERENCE VECTORS,0.01,0.794751,0.008892,"[0.7891944410779456, 0.8062827287507508, 0.797...",20,"(0.7878448898600064, 0.8020106568325495)"
OP + USER PREFERENCE VECTORS,0.05,0.796727,0.007821,"[0.7915920519265607, 0.8071226516097457, 0.798...",5,"(0.7907452223409499, 0.8032818885771055)"
OP + SENTIMENT RATIO + LENGTH RATIO,0.01,0.797027,0.00663,"[0.7988912359204683, 0.8020964892993244, 0.799...",7,"(0.790424099515907, 0.8015301047786629)"
OP + SENTIMENT RATIO + LENGTH RATIO,0.05,0.793067,0.012446,"[0.7891488991017048, 0.8109521073351021, 0.790...",1,"(0.7840843822418442, 0.8055013052767528)"


In [7]:
# Compact view: only the mean and its confidence interval.
df.loc[:, ['mean', 'CI']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,CI
config_basic_nature,config_simulation_user_improve,Unnamed: 2_level_1,Unnamed: 3_level_1
ORIGINAL PAPER STRATEGIES (OP),0.01,0.79783,"(0.7915834810644624, 0.8040770675689211)"
ORIGINAL PAPER STRATEGIES (OP),0.05,0.797222,"(0.790211285893607, 0.8042332410528391)"
OP + SENTIMENT RATIO,0.01,0.794078,"(0.7845309625659587, 0.8036252475635322)"
OP + SENTIMENT RATIO,0.05,0.796739,"(0.7896887050433703, 0.8037897838251682)"
OP + LENGTH RATIO,0.01,0.796034,"(0.7895819611508536, 0.8024865517995263)"
OP + LENGTH RATIO,0.05,0.795034,"(0.7856184450249026, 0.8044504378740028)"
OP + USER PREFERENCE VECTORS,0.01,0.794751,"(0.7878448898600064, 0.8020106568325495)"
OP + USER PREFERENCE VECTORS,0.05,0.796727,"(0.7907452223409499, 0.8032818885771055)"
OP + SENTIMENT RATIO + LENGTH RATIO,0.01,0.797027,"(0.790424099515907, 0.8015301047786629)"
OP + SENTIMENT RATIO + LENGTH RATIO,0.05,0.793067,"(0.7840843822418442, 0.8055013052767528)"


# Results on the Test Set

In [8]:
# Sweep analysed in the test section (HPT==False).
sweep_ids_list = ['2gmspxn7']
sweep_results = result_metric(
    sweep_ids_list,
    "config_online_simulation_factor",
    drop_list=[],
    drop_HPT=False,
    epoch="best",
)

Total number of sweeps: 1
Download sweep_id='2gmspxn7' data...


In [9]:
# Readable labels for the `config_basic_nature` codes 17..26, listed in code order.
mapping_dict = dict(enumerate([
    'ORIGINAL PAPER STRATEGIES (OP)',
    'OP + SENTIMENT RATIO',
    'OP + LENGTH RATIO',
    'OP + USER PREFERENCE VECTORS',
    'OP + SENTIMENT RATIO + LENGTH RATIO',
    'OP + SENTIMENT RATIO + LENGTH RATIO + USER PREFERENCE VECTORS',
    'OP + USER PREFERENCE VECTORS (Double Weight)',
    'OP + SENTIMENT RATIO + LENGTH RATIO (Double Weight)',
    'OP + SENTIMENT RATIO + LENGTH RATIO + USER PREFERENCE VECTORS (Double Weight)',
    'ONLY USER PREFERENCE VECTORS',
], start=17))

# Swap the numeric codes in the `config_basic_nature` index for their labels.
df = sweep_results.rename(index=mapping_dict, level='config_basic_nature')
df

Unnamed: 0_level_0,mean,std,values,epoch,CI
config_basic_nature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ORIGINAL PAPER STRATEGIES (OP),0.835264,0.002924,"[0.8394127949732522, 0.8347798132165378, 0.831...",11,"(0.8331097398219548, 0.8378110031930424)"
OP + USER PREFERENCE VECTORS,0.835367,0.003055,"[0.8375905369015115, 0.83879266154687, 0.83258...",8,"(0.8329749392018932, 0.8377596822028203)"
ONLY USER PREFERENCE VECTORS,0.829595,0.003508,"[0.8288568696208233, 0.8255839366017164, 0.835...",11,"(0.8270860657730786, 0.8325948419073333)"


In [10]:
# Compact view: only the mean and its confidence interval.
df.loc[:, ['mean', 'CI']]

Unnamed: 0_level_0,mean,CI
config_basic_nature,Unnamed: 1_level_1,Unnamed: 2_level_1
ORIGINAL PAPER STRATEGIES (OP),0.835264,"(0.8331097398219548, 0.8378110031930424)"
OP + USER PREFERENCE VECTORS,0.835367,"(0.8329749392018932, 0.8377596822028203)"
ONLY USER PREFERENCE VECTORS,0.829595,"(0.8270860657730786, 0.8325948419073333)"
