In [2]:
import numpy as np
import pandas as pd

# Let pandas render up to 100 rows and 100 columns before truncating output.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

In [3]:
# Build the per-run results table from the raw W&B export.
#
# Produces `results` (one row per finished run), prints two sanity checks
# (duplicate Model/PEFT/Dataset rows and Model/PEFT pairs lacking a score for
# some dataset), and writes the table to experiment_table_Nov1.csv.
raw_runs = pd.read_csv("../experimental_data/wandb_export_2023-11-01T12_13_53.919-04_00.csv")

# The table is built as a single chain in which every step returns a new frame.
# This avoids assigning columns into a filtered slice (which can raise
# SettingWithCopyWarning) and keeps the cell safe to re-run.
results = (
    raw_runs.loc[raw_runs["State"] == "finished"]
    .assign(
        # prefer the config name; fall back to the dataset name when it is NaN
        Dataset=lambda df: df["dataset_config_name"].fillna(df["dataset_name"]),
        # prefer F1; fall back to accuracy when F1 is NaN
        Score=lambda df: df["eval/f1"].fillna(df["eval/accuracy"]),
    )
    # sort on the raw model names, i.e. before the LLaMA name is shortened below
    .sort_values(by=["model_name_or_path", "adapter_config_string", "Dataset"])
    # rename columns
    .rename(columns={"model_name_or_path": "Model", "adapter_config_string": "PEFT", "Runtime": "runtime"})
    # replace value "meta-llama/Llama-2-7b-hf" in Model with just LLaMA-2 7B;
    # regex=False makes this a literal replacement on every pandas version
    .assign(Model=lambda df: df["Model"].str.replace("meta-llama/Llama-2-7b-hf", "LLaMA-2 7B", regex=False))
    # delete t5-small
    .loc[lambda df: df["Model"] != "t5-small"]
    # drop NaN in Model or PEFT or Dataset
    .dropna(subset=["Model", "PEFT", "Dataset"])
    # keep only the reported columns, in display order
    .loc[:, ["Name", "Created", "Model", "PEFT", "Dataset", "Score",
             "trainable_parameters",
             "runtime", "update_step", "peak_memory_usage",
             "throughput_examples"]]
)

# print duplicates of Model, PEFT, Dataset
combo_columns = ["Model", "PEFT", "Dataset"]
print("Duplicates:")
print(results[results.duplicated(subset=combo_columns, keep=False)].sort_values(by=combo_columns))
print()

# check that every model-peft combination has scores for all of the datasets.
# Rows whose Score is NaN (neither eval/f1 nor eval/accuracy was logged) do not
# count as a score, so such runs are reported as missing as well.
all_datasets = results["Dataset"].unique()
scored_runs = results.dropna(subset=["Score"])
print("All datasets: ", all_datasets)
print("Missing scores:")
for model in results["Model"].unique():
    for peft in results["PEFT"].unique():
        is_combo = (scored_runs["Model"] == model) & (scored_runs["PEFT"] == peft)
        missing = np.setdiff1d(all_datasets, scored_runs.loc[is_combo, "Dataset"].unique())
        if len(missing) > 0:
            print(model, peft, missing)

results.to_csv("../experimental_data/experiment_table_Nov1.csv", index=False)
results

Duplicates:
Empty DataFrame
Columns: [Name, Created, Model, PEFT, Dataset, Score, trainable_parameters, runtime, update_step, peak_memory_usage, throughput_examples]
Index: []

All datasets:  ['boolq' 'cb' 'copa' 'rte']
Missing scores:
LLaMA-2 7B full_tuning ['boolq' 'cb' 'copa' 'rte']
t5-11b ia3 ['boolq' 'cb' 'copa' 'rte']
t5-11b mam ['boolq' 'cb' 'copa' 'rte']
t5-11b prefix_tuning ['boolq' 'cb' 'copa' 'rte']
t5-11b prefix_tuning_flat ['boolq' 'cb' 'copa' 'rte']
t5-11b unipelt ['boolq' 'cb' 'copa' 'rte']
t5-11b full_tuning ['boolq' 'cb' 'copa' 'rte']
t5-3b mam ['boolq' 'cb' 'copa' 'rte']
t5-3b prefix_tuning ['boolq' 'cb' 'copa' 'rte']
t5-3b prefix_tuning_flat ['boolq' 'cb' 'copa' 'rte']
t5-3b unipelt ['boolq' 'cb' 'copa' 'rte']


Unnamed: 0,Name,Created,Model,PEFT,Dataset,Score,trainable_parameters,runtime,update_step,peak_memory_usage,throughput_examples
67,light-hill-225,2023-10-27T19:56:28.000Z,LLaMA-2 7B,compacter,boolq,96.5443,,8188,884.0,,
64,lively-night-228,2023-10-27T22:13:07.000Z,LLaMA-2 7B,compacter,cb,40.5104,,1186,23.0,,
63,proud-water-229,2023-10-27T22:33:04.000Z,LLaMA-2 7B,compacter,copa,1.0000,,2141,38.0,,
58,wobbly-music-234,2023-10-27T23:08:55.000Z,LLaMA-2 7B,compacter,rte,97.8339,,3315,233.0,,
54,stilted-butterfly-238,2023-10-28T00:04:20.000Z,LLaMA-2 7B,compacter++,boolq,81.2844,,8482,884.0,,
...,...,...,...,...,...,...,...,...,...,...,...
168,absurd-wildflower-26,2023-09-21T06:40:58.000Z,t5-large,scaled_parallel,rte,87.0036,,877,233.0,,
148,northern-dawn-46,2023-09-21T09:43:26.000Z,t5-large,unipelt,boolq,62.1713,,3306,884.0,,
147,glorious-grass-47,2023-09-21T10:38:40.000Z,t5-large,unipelt,cb,22.2222,,197,23.0,,
146,wobbly-mountain-48,2023-09-21T10:42:06.000Z,t5-large,unipelt,copa,49.0000,,385,38.0,,


In [4]:
# Mean Score over all rows of each (PEFT, Model) pair, laid out as a table
# with one row per PEFT method and one column per Model.
mean_score_by_combo = results.groupby(["PEFT", "Model"])["Score"].mean()
avg_scores = mean_score_by_combo.unstack("Model")

avg_scores.to_csv("../experimental_data/experiment_table_Nov1_avg_scores.csv")
avg_scores

Model,LLaMA-2 7B,t5-11b,t5-3b,t5-large
PEFT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
compacter,58.97215,63.244625,53.587225,64.876275
compacter++,45.661425,63.314925,55.67035,66.096725
full_tuning,,,79.772425,81.190525
houlsby,78.3396,82.333425,79.671875,73.7468
ia3,0.045875,,54.40595,65.22945
ln_tuning,24.655975,66.62685,56.471775,66.329275
lora,41.862375,81.429025,81.5475,72.227475
mam,64.457175,,,39.673225
pfeiffer,53.180425,29.13615,62.3069,69.061925
prefix_tuning,69.977925,,,44.489025
