# Comparing model performance across LoRA configurations

In [1]:
import pickle
import numpy as np
import pandas as pd

METRICS = ['precision', 'recall', 'f1']


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Results of the full fine-tuning runs
results_list_full_fine_tune = _read_pickle('../Experiments/results_list_full_fine_tune_all_light.pkl')

# Results of the LoRA rank (r) experiment
results_list_lora_r_experiment = _read_pickle('../Experiments/results_list_lora_r_experiment_all_light.pkl')

# Results of the LoRA ablation study
results_list_lora_ablation = _read_pickle('../Experiments/results_list_lora_ablation_all_light.pkl')

**NB:** Guide to the structure of the loaded lists of dictionaries:

- Full fine-tuning: list of 'baseline' dictionaries, each of which holds the metric keys 'precision', 'recall' and 'f1'.
- LoRA r experiment: list of dictionaries. Each dictionary's keys are the ranks r used in the experiment; for a given r, dictionary[r] holds a baseline dictionary.
- LoRA ablation study: list of dictionaries. Each dictionary's keys identify the experiment as a tuple (modules LoRA was applied to, r); for a given key, dictionary[key] holds a baseline dictionary.

### Aggregating the results

In [2]:
def _append_metrics(target, baseline):
    """Append every tracked metric found in `baseline` to the matching list in `target`.

    `baseline` is a dictionary holding (at least) some of the METRICS keys;
    `target` maps metric name -> list of values and is updated in place.
    """
    for key, value in baseline.items():
        if key in METRICS:
            target.setdefault(key, []).append(value)
    return target


def _group_metrics_by_experiment(runs):
    """Aggregate a list of {experiment_key: baseline dict} runs.

    Returns {experiment_key: {metric: [value of run 1, value of run 2, ...]}}.
    Entries are created on demand, so an empty list of runs yields an empty
    dictionary and an experiment key that is missing from the first run (but
    present in a later one) is still collected instead of raising a KeyError.
    """
    grouped = {}
    for run in runs:
        for experiment_key, baseline in run.items():
            _append_metrics(grouped.setdefault(experiment_key, {}), baseline)
    return grouped


# Full fine-tuning: every list element is directly a baseline dictionary
result_dict_full_fine_metrics = {}
for baseline in results_list_full_fine_tune:
    _append_metrics(result_dict_full_fine_metrics, baseline)

# LoRA rank experiment: one baseline dictionary per rank r in every run
result_dict_r_experiment_metrics = _group_metrics_by_experiment(results_list_lora_r_experiment)

# LoRA ablation study: one baseline dictionary per experiment key in every run
result_dict_ablation_experiment_metrics = _group_metrics_by_experiment(results_list_lora_ablation)

### Bootstrapping to obtain better mean estimates and confidence intervals

In [3]:
def boostrat_metrics(aggregated_dict, num_samples = 10_000, seed=None, metrics=None):
    """
    Bootstrap the mean of every metric and format it as 'mean ± std'.

    Parameters
    ----------
    aggregated_dict : dict
        {metric 1: list of values, metric 2: list of values, ...}
    num_samples : int
        Number of bootstrap resamples drawn (with replacement) per metric.
    seed : int, optional
        Seed for the random generator. Pass a fixed value to make the
        resampling, and therefore the reported numbers, reproducible.
        The default (None) draws fresh entropy on every call.
    metrics : iterable of str, optional
        Metric names to process. Defaults to the module-level METRICS.

    Returns
    -------
    dict
        {metric 1: 'mean ± std', metric 2: 'mean ± std', ...} where 'mean' is
        the mean of the bootstrap means and 'std' is their standard deviation
        (i.e. the bootstrap standard error of the mean), both printed with
        exactly three decimals.

    Raises
    ------
    KeyError
        If a requested metric is missing from `aggregated_dict`.
    ValueError
        If the values of a metric are empty or not one-dimensional.
    """
    metric_names = METRICS if metrics is None else metrics
    rng = np.random.default_rng(seed)

    bootstraped_dict = {}

    for m in metric_names:
        data = np.asarray(aggregated_dict[m], dtype=float)
        if data.ndim != 1 or data.size == 0:
            raise ValueError(f"metric '{m}' needs a non-empty 1-D list of values to bootstrap")

        # Draw every resample at once: each row holds the indices of one
        # bootstrap sample of the same size as the data (with replacement).
        sample_indices = rng.integers(0, data.size, size=(num_samples, data.size))
        bootstrap_stats = data[sample_indices].mean(axis=1)

        bootstraped_mean = np.mean(bootstrap_stats)
        std_error_bootstrap = np.std(bootstrap_stats)
        # Fixed three-decimal formatting keeps every cell the same width
        # (e.g. '0.680 ± 0.010' rather than '0.68 ± 0.01').
        bootstraped_dict[m] = f'{bootstraped_mean:.3f} \u00B1 {std_error_bootstrap:.3f}'

    return bootstraped_dict

In [4]:
# Each summary row holds the bootstrapped 'mean ± std' string of every metric
# plus the number of runs that went into it (length of the 'f1' list).
boostratped_dict_full_fine_tune = {
    'Full Fine Tuning': boostrat_metrics(result_dict_full_fine_metrics)
    | {'# experiments': len(result_dict_full_fine_metrics['f1'])}
}

# One row per LoRA rank
boostratped_dict_lora_r_experiments = {
    f'Lora with r={rank}': boostrat_metrics(metric_lists) | {'# experiments': len(metric_lists['f1'])}
    for rank, metric_lists in result_dict_r_experiment_metrics.items()
}

# One row per ablation experiment key
boostratped_dict_lora_ablation_experiments = {
    experiment_key: boostrat_metrics(metric_lists) | {'# experiments': len(metric_lists['f1'])}
    for experiment_key, metric_lists in result_dict_ablation_experiment_metrics.items()
}

In [5]:
def make_max_bold(col):
	"""
	Column-wise styling function for `DataFrame.style.apply`.

	Returns one CSS string per cell of `col`: the cell(s) with the highest
	mean get a bold font on a purple background, every other cell gets ''.
	Columns whose name is not in METRICS are left unstyled.

	The cells are expected to hold 'mean ± std' strings (plain numbers work
	too). The comparison is done on the numeric mean rather than on the raw
	string, so the result does not depend on how the text was rounded and
	rows that tie on the mean are all highlighted.
	"""
	if col.name not in METRICS:
		return ['' for _ in range(len(col))]

	# Keep the text before '±' and convert it to a number; cells that cannot
	# be parsed become NaN and are never highlighted.
	means = pd.to_numeric(col.astype(str).str.split('\u00B1').str[0].str.strip(), errors='coerce')
	is_max = means == means.max()
	return ['font-weight: bold; background-color: purple' if cell else '' for cell in is_max]

def make_min_bold(col):
	"""
	Column-wise styling function for `DataFrame.style.apply`.

	Returns one CSS string per cell of `col`: the cell(s) with the lowest
	mean get a bold font on a purple background, every other cell gets ''.
	Columns whose name is not in METRICS are left unstyled.

	The cells are expected to hold 'mean ± std' strings (plain numbers work
	too). The comparison is done on the numeric mean rather than on the raw
	string, so the result does not depend on how the text was rounded and
	rows that tie on the mean are all highlighted.
	"""
	if col.name not in METRICS:
		return ['' for _ in range(len(col))]

	# Keep the text before '±' and convert it to a number; cells that cannot
	# be parsed become NaN and are never highlighted.
	means = pd.to_numeric(col.astype(str).str.split('\u00B1').str[0].str.strip(), errors='coerce')
	is_min = means == means.min()
	return ['font-weight: bold; background-color: purple' if cell else '' for cell in is_min]

In [6]:
# Rows: the full fine-tuning baseline followed by one row per LoRA rank
r_experiment_rows = {**boostratped_dict_full_fine_tune, **boostratped_dict_lora_r_experiments}
df_r_experiment_recap = pd.DataFrame(r_experiment_rows).transpose()
# Highlight the best value of every metric column
df_r_experiment_recap.style.apply(make_max_bold, axis=0)

Unnamed: 0,precision,recall,f1,# experiments
Full Fine Tuning,0.656 ± 0.008,0.699 ± 0.012,0.674 ± 0.006,25
Lora with r=4,0.654 ± 0.004,0.693 ± 0.01,0.673 ± 0.006,13
Lora with r=8,0.649 ± 0.008,0.668 ± 0.013,0.657 ± 0.006,13
Lora with r=16,0.657 ± 0.007,0.708 ± 0.011,0.68 ± 0.006,13
Lora with r=32,0.656 ± 0.006,0.68 ± 0.017,0.666 ± 0.01,13
Lora with r=64,0.542 ± 0.026,0.673 ± 0.011,0.597 ± 0.019,13


In [7]:
# Rows: the full fine-tuning baseline followed by one row per ablation experiment
lora_ablation_rows = {**boostratped_dict_full_fine_tune, **boostratped_dict_lora_ablation_experiments}
df_lora_ablation_recap = pd.DataFrame(lora_ablation_rows).transpose()
# Highlight the best value of every metric column
df_lora_ablation_recap.style.apply(make_max_bold, axis=0)

Unnamed: 0,precision,recall,f1,# experiments
Full Fine Tuning,0.656 ± 0.008,0.699 ± 0.012,0.674 ± 0.006,25
"('query', 1)",0.616 ± 0.01,0.652 ± 0.007,0.634 ± 0.007,5
"('query', 2)",0.617 ± 0.003,0.637 ± 0.024,0.626 ± 0.01,5
"('query', 4)",0.631 ± 0.016,0.685 ± 0.02,0.656 ± 0.015,5
"('query', 8)",0.643 ± 0.009,0.643 ± 0.018,0.642 ± 0.01,5
"('query', 16)",0.653 ± 0.007,0.686 ± 0.012,0.669 ± 0.009,5
"('query', 32)",0.655 ± 0.013,0.644 ± 0.026,0.649 ± 0.019,5
"('query', 64)",0.653 ± 0.01,0.672 ± 0.016,0.662 ± 0.009,5
"('key', 1)",0.595 ± 0.009,0.643 ± 0.014,0.617 ± 0.002,5
"('key', 2)",0.628 ± 0.005,0.641 ± 0.015,0.634 ± 0.008,5
