In [63]:
# Run parameters for this notebook.
# NOTE(review): the `parameters` tag mentioned below suggests these values are meant to be
# overridden by a notebook runner (e.g. papermill) — confirm.
# NOTE(review): `n_jobs` is not referenced by any cell visible in this part of the notebook.
n_jobs = 1   # Remember to set `parameters` tag!
# String flag read by the environment-setup cell: a falsy value or the exact string "false"
# selects the Google Colab setup; any other value (such as "true") selects the DTU HPC paths.
dtu_hpc = "true"

In [65]:
if (not dtu_hpc) or (dtu_hpc == "false"):
  from google.colab import drive, userdata
  import os
  print("Running on Google Colab")
  drive.mount('/content/drive')
  drive_dir = '/content/drive/My Drive/'
  data_dir = os.path.join(drive_dir, 'ITI-datasets')
  cache_dir = os.path.join(drive_dir, 'model_cache')
  !pip install -q seaborn
  disable_pbar = False

else:
  import os
  print("Running at DTU HPC")
  drive_dir = '/work3/s184399/msc'
  data_dir = os.path.join(drive_dir, 'ITI-datasets')
  cache_dir = os.path.join(drive_dir, "cache_dir", "huggingface")
  disable_pbar = True

Running at DTU HPC


In [66]:
# TODO (unit test): check the bias of the bootstrap CI on a deliberately skewed distribution, e.g. samples drawn from a Dirichlet
import numpy as np
import pandas as pd

def bootstrap_CI(p, alpha=0.05, k=2000, rng=None):
  """
    Computes the confidence interval of the mean using bootstrapping.

    The mean of `p` is re-estimated on `k` resamples (drawn with replacement, each of size len(p)).
    The interval is the 100*(1-alpha) central CI of those `k` means, from percentile 100*(alpha/2)
    to 100*(1-alpha/2), rounded to the broadest interval when picking the indices
    (floor for the lower index, ceil for the upper index).
    Line Clemmensen suggests picking k (number of repeats) to 1000 or 2000 for this task, so I do this.

    Args:
      p:     1-D numpy array of observations.
      alpha: Significance level in [0, 1]; the CI covers 100*(1-alpha) percent.
      k:     Number of bootstrap resamples (>= 1).
      rng:   Optional `np.random.Generator` used for resampling. When None (default) the global
             `np.random` state is used, so `np.random.seed(...)` keeps working as before.

    Returns:
      (CI, N): CI is the string "[lower%, upper%]" (bounds multiplied by 100, two decimals),
               N is the support, i.e. len(p).

    Raises:
      AssertionError: if `p` is not a 1-D numpy array.
      ValueError:     if `p` is empty, `k` < 1 or `alpha` is outside [0, 1].
  """
  assert isinstance(p, np.ndarray)
  assert p.ndim == 1
  N = len(p)
  if N == 0:
    raise ValueError("bootstrap_CI needs at least one observation, got an empty array")
  if k < 1:
    raise ValueError(f"k must be a positive number of resamples, got {k}")
  if not 0. <= alpha <= 1.:
    raise ValueError(f"alpha must lie in [0, 1], got {alpha}")

  sampler = np.random if rng is None else rng
  bootstraps = sampler.choice(p, (k, N), replace=True)
  ci_lower = alpha/2.
  ci_upper = 1.-(alpha/2.)
  idxs = [
    int(np.floor(k*ci_lower)),
    # ceil(k*ci_upper) can equal k (small k, or alpha == 0), which is one past the last
    # sorted mean; clamp to the largest valid index instead of raising IndexError.
    min(int(np.ceil(k*ci_upper)), k - 1)
  ]
  CI = np.sort(np.mean(bootstraps, axis=-1))[idxs]     # Sorts lowest to highest
  assert CI[0] <= CI[1]  # To be on the safe side...
  CI = f"[{(CI[0]*100):.2f}%, {(CI[1]*100):.2f}%]"
  return CI, N    # Returns CI and support (N)

In [67]:
# Naming patterns (titles in data_dir)
# f"is_correct_{model_name_str}_ITI_truthful_qa_par3.npz"
# f"is_correct_{model_name_str}_ITI_truthful_qa_{ood_test}.npz"
# f"is_correct_{model_name_str}_Base_truthful_qa_par3.npz"
# f"is_correct_{model_name_str}_Base_truthful_qa_{ood_test}.npz"
# Keep only the result archives (is_correct*.npz) found in data_dir.
files = [
    entry for entry in os.listdir(data_dir)
    if entry.startswith('is_correct') and entry.endswith('.npz')
]

dataset_names = ['par3', 'common_claim_true_false', 'counterfact_true_false', 'cities', 'neg_cities', 'politicians']
#model_names = ['Llama-2-7b-chat-hf', 'Llama-2-7b-hf', 'Meta-Llama-3-8B', 'Meta-Llama-3-8B-Instruct', 'Mistral-7B-Instruct-v0.2', 'Mistral-7B-Instruct-v0.3', 'Mistral-7B-v0.3', 'Mixtral-8x7B-v0.1', 'Mixtral-8x7B-Instruct-v0.1', 'opt-2.7b', 'opt-125m', 'opt-350m', 'Phi-3-mini-4k-instruct']
model_names = ['Llama-2-7b-hf', 'Meta-Llama-3-8B', 'Meta-Llama-3-8B-Instruct', 'Mistral-7B-Instruct-v0.2', 'Mistral-7B-Instruct-v0.3', 'Mistral-7B-v0.3', 'Mixtral-8x7B-v0.1', 'opt-2.7b', 'opt-125m', 'opt-350m', 'Phi-3-mini-4k-instruct']


# Every (dataset, model) pair must have both an ITI and a Base result file; collect the absent ones.
missing_files = []
for d_name in dataset_names:
    for m_name in model_names:
        expected_pair = (
            f"is_correct_{m_name}_ITI_truthful_qa_{d_name}.npz",
            f"is_correct_{m_name}_Base_truthful_qa_{d_name}.npz",
        )
        missing_files.extend(name for name in expected_pair if name not in files)
assert len(missing_files) == 0, f"Missing files: {missing_files}"

# Individual performance CIs

In [68]:
# One row per (model, version), one column per dataset; each cell is the bootstrap CI of the
# mean of that run's `is_correct` array.
# Sorry, but no baseline computation in this one (for now at least)...
performance_labels = []
performance_records = []
for m_name in model_names:
    for version in ['ITI', 'Base']:
        record = {}
        for d_name in dataset_names:
            npz_path = os.path.join(data_dir, f"is_correct_{m_name}_{version}_truthful_qa_{d_name}.npz")
            # bootstrap_CI returns (CI, N); only the formatted CI goes in the table.
            record[d_name] = bootstrap_CI(np.load(npz_path)['is_correct'])[0]
        performance_labels.append(f"{m_name} {version}")
        performance_records.append(record)

# Build the frame in one go instead of enlarging it row by row.
mc_performance_df = pd.DataFrame(performance_records, index=performance_labels, columns=dataset_names)

print(mc_performance_df.to_latex())


\begin{tabular}{lllllll}
\toprule
 & par3 & common_claim_true_false & counterfact_true_false & cities & neg_cities & politicians \\
\midrule
Llama-2-7b-hf ITI & [46.15%, 55.95%] & [53.69%, 55.72%] & [85.50%, 88.54%] & [97.26%, 98.65%] & [2.26%, 4.35%] & [60.79%, 62.71%] \\
Llama-2-7b-hf Base & [31.50%, 40.85%] & [57.21%, 59.46%] & [84.84%, 87.87%] & [97.96%, 99.13%] & [2.87%, 5.00%] & [62.00%, 63.92%] \\
Meta-Llama-3-8B ITI & [48.43%, 59.77%] & [49.02%, 50.10%] & [50.53%, 55.80%] & [54.67%, 60.80%] & [39.37%, 45.94%] & [44.58%, 46.73%] \\
Meta-Llama-3-8B Base & [32.59%, 42.15%] & [47.98%, 53.85%] & [76.33%, 80.62%] & [92.99%, 95.49%] & [8.64%, 12.12%] & [58.45%, 60.36%] \\
Meta-Llama-3-8B-Instruct ITI & [52.83%, 63.57%] & [47.10%, 53.30%] & [53.76%, 58.98%] & [57.38%, 63.38%] & [35.52%, 41.38%] & [47.14%, 49.31%] \\
Meta-Llama-3-8B-Instruct Base & [44.61%, 54.88%] & [53.67%, 59.20%] & [77.66%, 81.76%] & [83.90%, 87.98%] & [9.46%, 13.18%] & [56.93%, 58.87%] \\
Mistral-7B-Instruct-v0.2 I

In [69]:
# Bare last expression: renders the per-model CI table as the cell output.
mc_performance_df

Unnamed: 0,par3,common_claim_true_false,counterfact_true_false,cities,neg_cities,politicians
Llama-2-7b-hf ITI,"[46.15%, 55.95%]","[53.69%, 55.72%]","[85.50%, 88.54%]","[97.26%, 98.65%]","[2.26%, 4.35%]","[60.79%, 62.71%]"
Llama-2-7b-hf Base,"[31.50%, 40.85%]","[57.21%, 59.46%]","[84.84%, 87.87%]","[97.96%, 99.13%]","[2.87%, 5.00%]","[62.00%, 63.92%]"
Meta-Llama-3-8B ITI,"[48.43%, 59.77%]","[49.02%, 50.10%]","[50.53%, 55.80%]","[54.67%, 60.80%]","[39.37%, 45.94%]","[44.58%, 46.73%]"
Meta-Llama-3-8B Base,"[32.59%, 42.15%]","[47.98%, 53.85%]","[76.33%, 80.62%]","[92.99%, 95.49%]","[8.64%, 12.12%]","[58.45%, 60.36%]"
Meta-Llama-3-8B-Instruct ITI,"[52.83%, 63.57%]","[47.10%, 53.30%]","[53.76%, 58.98%]","[57.38%, 63.38%]","[35.52%, 41.38%]","[47.14%, 49.31%]"
Meta-Llama-3-8B-Instruct Base,"[44.61%, 54.88%]","[53.67%, 59.20%]","[77.66%, 81.76%]","[83.90%, 87.98%]","[9.46%, 13.18%]","[56.93%, 58.87%]"
Mistral-7B-Instruct-v0.2 ITI,"[55.62%, 66.20%]","[78.16%, 82.63%]","[84.29%, 87.75%]","[99.45%, 99.79%]","[8.58%, 12.09%]","[58.76%, 60.73%]"
Mistral-7B-Instruct-v0.2 Base,"[58.48%, 69.19%]","[78.53%, 83.17%]","[84.46%, 87.99%]","[99.51%, 99.81%]","[7.30%, 10.47%]","[58.29%, 60.32%]"
Mistral-7B-Instruct-v0.3 ITI,"[51.07%, 61.59%]","[65.93%, 71.14%]","[88.40%, 91.47%]","[99.76%, 99.89%]","[1.74%, 3.21%]","[62.16%, 64.10%]"
Mistral-7B-Instruct-v0.3 Base,"[53.96%, 64.51%]","[75.28%, 79.73%]","[88.29%, 91.25%]","[99.77%, 99.90%]","[1.18%, 2.40%]","[61.85%, 63.76%]"


# Difference on each dataset between ITI and Base

In [70]:
# One row per model, one column per dataset; each cell is the bootstrap CI of the mean of the
# element-wise difference (ITI - Base) between the two `is_correct` arrays.
difference_records = []
for m_name in model_names:
    record = {}
    for d_name in dataset_names:
        iti_path = os.path.join(data_dir, f"is_correct_{m_name}_ITI_truthful_qa_{d_name}.npz")
        base_path = os.path.join(data_dir, f"is_correct_{m_name}_Base_truthful_qa_{d_name}.npz")
        paired_diff = np.load(iti_path)['is_correct'] - np.load(base_path)['is_correct']
        record[d_name] = bootstrap_CI(paired_diff)[0]
    difference_records.append(record)

# Row labels are the plain model names, in the order of model_names.
mc_difference_df = pd.DataFrame(
    difference_records,
    index=[f"{m_name}" for m_name in model_names],
    columns=dataset_names,
)

print(mc_difference_df.to_latex())

\begin{tabular}{lllllll}
\toprule
 & par3 & common_claim_true_false & counterfact_true_false & cities & neg_cities & politicians \\
\midrule
Llama-2-7b-hf & [11.57%, 18.67%] & [-5.23%, -2.05%] & [-0.23%, 1.53%] & [-0.94%, -0.22%] & [-0.97%, -0.33%] & [-1.60%, -0.83%] \\
Meta-Llama-3-8B & [10.19%, 22.75%] & [-3.87%, 1.30%] & [-28.06%, -22.99%] & [-39.46%, -33.63%] & [29.52%, 34.91%] & [-14.89%, -12.69%] \\
Meta-Llama-3-8B-Instruct & [2.65%, 14.51%] & [-12.37%, -0.61%] & [-25.65%, -21.00%] & [-27.61%, -23.17%] & [24.81%, 29.64%] & [-10.76%, -8.67%] \\
Mistral-7B-Instruct-v0.2 & [-4.52%, -1.76%] & [-1.27%, 0.35%] & [-0.36%, 0.08%] & [-0.09%, 0.02%] & [1.11%, 1.69%] & [0.29%, 0.64%] \\
Mistral-7B-Instruct-v0.3 & [-4.62%, -1.68%] & [-10.83%, -7.24%] & [-0.02%, 0.37%] & [-0.03%, 0.03%] & [0.52%, 0.89%] & [0.22%, 0.41%] \\
Mistral-7B-v0.3 & [9.34%, 21.87%] & [-14.75%, -6.46%] & [-39.08%, -33.77%] & [-47.05%, -41.69%] & [39.92%, 45.34%] & [-22.10%, -19.63%] \\
Mixtral-8x7B-v0.1 & [4.63%, 17.28

In [71]:
# Bare last expression: renders the per-model ITI - Base difference table as the cell output.
mc_difference_df

Unnamed: 0,par3,common_claim_true_false,counterfact_true_false,cities,neg_cities,politicians
Llama-2-7b-hf,"[11.57%, 18.67%]","[-5.23%, -2.05%]","[-0.23%, 1.53%]","[-0.94%, -0.22%]","[-0.97%, -0.33%]","[-1.60%, -0.83%]"
Meta-Llama-3-8B,"[10.19%, 22.75%]","[-3.87%, 1.30%]","[-28.06%, -22.99%]","[-39.46%, -33.63%]","[29.52%, 34.91%]","[-14.89%, -12.69%]"
Meta-Llama-3-8B-Instruct,"[2.65%, 14.51%]","[-12.37%, -0.61%]","[-25.65%, -21.00%]","[-27.61%, -23.17%]","[24.81%, 29.64%]","[-10.76%, -8.67%]"
Mistral-7B-Instruct-v0.2,"[-4.52%, -1.76%]","[-1.27%, 0.35%]","[-0.36%, 0.08%]","[-0.09%, 0.02%]","[1.11%, 1.69%]","[0.29%, 0.64%]"
Mistral-7B-Instruct-v0.3,"[-4.62%, -1.68%]","[-10.83%, -7.24%]","[-0.02%, 0.37%]","[-0.03%, 0.03%]","[0.52%, 0.89%]","[0.22%, 0.41%]"
Mistral-7B-v0.3,"[9.34%, 21.87%]","[-14.75%, -6.46%]","[-39.08%, -33.77%]","[-47.05%, -41.69%]","[39.92%, 45.34%]","[-22.10%, -19.63%]"
Mixtral-8x7B-v0.1,"[4.63%, 17.28%]","[-17.25%, -7.98%]","[-39.01%, -34.19%]","[-45.31%, -40.01%]","[35.65%, 40.97%]","[-20.81%, -18.52%]"
opt-2.7b,"[6.66%, 18.25%]","[-4.58%, 3.85%]","[-20.40%, -15.78%]","[-36.58%, -31.26%]","[29.01%, 34.32%]","[-12.74%, -10.69%]"
opt-125m,"[2.77%, 13.77%]","[-3.45%, 3.85%]","[-13.06%, -8.36%]","[-25.24%, -20.15%]","[18.78%, 23.73%]","[-10.49%, -8.67%]"
opt-350m,"[2.40%, 12.33%]","[-2.85%, 2.17%]","[-16.34%, -11.21%]","[-29.64%, -23.38%]","[22.97%, 29.10%]","[-14.01%, -11.95%]"


# CI for ITI vs Base, aggregated over all models (one CI per dataset)

In [72]:
# Per dataset: stack the (ITI - Base) differences of all models into one vector and bootstrap
# the CI of its mean; also record how many observations went into each CI.
row, ns = {}, {}
for d_name in dataset_names:
    per_model_diffs = []
    for m_name in model_names:
        iti_path = os.path.join(data_dir, f"is_correct_{m_name}_ITI_truthful_qa_{d_name}.npz")
        base_path = os.path.join(data_dir, f"is_correct_{m_name}_Base_truthful_qa_{d_name}.npz")
        per_model_diffs.append(np.load(iti_path)['is_correct'] - np.load(base_path)['is_correct'])
    p_diff = np.hstack(per_model_diffs)
    assert p_diff.ndim == 1
    row[d_name], ns[d_name] = bootstrap_CI(p_diff)

mc_aggregated_difference_df = pd.DataFrame(columns=dataset_names)
mc_aggregated_difference_df.loc["Aggregated over models"] = row
mc_aggregated_difference_df.loc["Number of observations"] = ns

print(mc_aggregated_difference_df.to_latex())

\begin{tabular}{lllllll}
\toprule
 & par3 & common_claim_true_false & counterfact_true_false & cities & neg_cities & politicians \\
\midrule
Aggregated over models & [6.20%, 9.35%] & [-7.42%, -5.27%] & [-17.36%, -16.04%] & [-24.28%, -22.75%] & [21.29%, 22.76%] & [-10.05%, -9.46%] \\
Number of observations & 3017 & 10934 & 10934 & 8162 & 8162 & 74316 \\
\bottomrule
\end{tabular}



In [73]:
# Bare last expression: renders the aggregated difference table (CI row plus observation counts).
mc_aggregated_difference_df

Unnamed: 0,par3,common_claim_true_false,counterfact_true_false,cities,neg_cities,politicians
Aggregated over models,"[6.20%, 9.35%]","[-7.42%, -5.27%]","[-17.36%, -16.04%]","[-24.28%, -22.75%]","[21.29%, 22.76%]","[-10.05%, -9.46%]"
Number of observations,3017,10934,10934,8162,8162,74316


# CI for ITI vs Base on the OOD datasets (every dataset except par3), aggregated over models and datasets

In [75]:
# Every dataset whose name does not contain 'par3' is treated as out-of-distribution here.
ood_dataset_names = [d_name for d_name in dataset_names if 'par3' not in d_name]

# Pool the (ITI - Base) differences over all OOD datasets and all models into one vector.
p_diff = np.hstack([
    np.load(os.path.join(data_dir, f"is_correct_{m_name}_ITI_truthful_qa_{d_name}.npz"))['is_correct']
    - np.load(os.path.join(data_dir, f"is_correct_{m_name}_Base_truthful_qa_{d_name}.npz"))['is_correct']
    for d_name in ood_dataset_names
    for m_name in model_names
])
assert p_diff.ndim == 1

# Single bootstrap CI for the pooled mean difference, plus the number of pooled observations.
CI_ood_aggregated, N = bootstrap_CI(p_diff)

print(CI_ood_aggregated)
print(N)

[-9.04%, -8.55%]
112508
