In [None]:
from IPython.core.debugger import set_trace

In [None]:
import os

# Working directory of the notebook kernel. Obtained in pure Python rather than
# through the IPython-only `!pwd` shell capture, so the cell does not depend on
# IPython syntax or on a `pwd` binary being available.
pwd = os.getcwd()

# Absolute path to default Hydra config for normalization script.
# Dropping the last path component of `pwd` gives its parent directory, under
# which the `examples/` tree is looked up.
config_path = '/'.join(pwd.split('/')[:-1]) + '/examples/configs/normalization/fit/default.yaml'

# Dataset display names. The lower-cased name is the dataset tag that appears
# in the corresponding manager file names.
_MAN_DATASETS = ('CoQA', 'GSM8K', 'MMLU', 'TriviaQA', 'WMT14', 'WMT19', 'XSum')
_MAN_PREFIX = pwd + '/polygraph_tacl_stablelm12b_'

# Absolute paths to saved eval managers, keyed by dataset name
EVAL_MAN_PATHS = {name: f'{_MAN_PREFIX}{name.lower()}.man' for name in _MAN_DATASETS}

# Absolute paths to saved train managers, keyed by dataset name
TRAIN_MAN_PATHS = {name: f'{_MAN_PREFIX}{name.lower()}_train.man' for name in _MAN_DATASETS}

# Dataset names in the order they are iterated throughout the notebook
DATASET_NAMES = list(TRAIN_MAN_PATHS)

# Uncertainty estimation methods under evaluation, mapped to the short labels
# used as tick labels on the MSE plots. Insertion order is the order in which
# methods are processed and plotted.
UE_METHOD_NAMES_ABBR = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'MeanPointwiseMutualInformation': 'MPMI',
    'RenyiNeg': 'RenyiNeg',
    'FisherRao': 'FisherRao',
    'TokenSAR': 'TokenSAR',
    'CCP': 'CCP',
    'SemanticEntropy': 'SE',
    'SentenceSAR': 'SentenceSAR',
    'SAR': 'SAR',
    'PTrue': 'PTrue',
    'NumSemSets': 'NumSemSets',
    'EigValLaplacian_NLI_score_entail': 'EVL_entail',
    'EigValLaplacian_NLI_score_contra': 'EVL_contra',
    'EigValLaplacian_Jaccard_score': 'EVL_Jaccard',
    'DegMat_NLI_score_entail': 'DegMat_entail',
    'DegMat_NLI_score_contra': 'DegMat_contra',
    'DegMat_Jaccard_score': 'DegMat_Jaccard',
    'Eccentricity_NLI_score_entail': 'Eccentricity_entail',
    'Eccentricity_NLI_score_contra': 'Eccentricity_contra',
    'Eccentricity_Jaccard_score': 'Eccentricity_Jaccard',
    'LexicalSimilarity_rouge1': 'LS_rouge1',
    'LexicalSimilarity_rouge2': 'LS_rouge2',
    'LexicalSimilarity_rougeL': 'LS_rougeL',
}

# Full method names, in the same order as the abbreviation table
UE_METHOD_NAMES = list(UE_METHOD_NAMES_ABBR)

# Generation quality metrics that the normalized confidences are compared against
GEN_METRIC_NAMES = ['AlignScore']

In [None]:
# Download all managers to current directory
#!wget -r --cut-dirs=2 -nH --no-parent -A '*man' http://209.38.249.180:8000/polygraph_data/mans/

In [None]:
def get_man_paths_list(man_paths):
    """Format manager paths as a single shell-safe Hydra list override value.

    Args:
        man_paths: iterable of manager file paths.

    Returns:
        A string such as ``'["/a.man","/b.man"]'`` (the surrounding single
        quotes are part of the value), ready to be interpolated after
        ``man_paths=`` in a shell command line.
    """
    # Local import keeps this cell self-contained
    import shlex

    hydra_list = '[' + ','.join(f'"{path}"' for path in man_paths) + ']'

    # shlex.quote always wraps the value in single quotes here (the brackets
    # are shell-special) and additionally escapes any single quote inside a
    # path, which plain string concatenation would leave unescaped.
    return shlex.quote(hydra_list)

In [None]:
import os

# Run polygraph_normalize to fit all normalizers using all train datasets
train_man_paths = get_man_paths_list(list(TRAIN_MAN_PATHS.values()))
fit_status = os.system(f'HYDRA_CONFIG={config_path} polygraph_normalize save_path="./" man_paths={train_man_paths}')

# os.system reports failure only through its return value. Stop here instead of
# letting later cells load a missing or stale normalizer file.
if fit_status != 0:
    raise RuntimeError(f'polygraph_normalize failed with status {fit_status}')

In [None]:
import pickle

# Load saved fitted normalizers.
# The file is opened in binary mode and decoded with pickle, so despite its
# `.json` suffix it is read as a pickle file, not as JSON.
# NOTE(review): presumably this is the file written by the polygraph_normalize
# run with save_path="./" — confirm. pickle.load can execute arbitrary code,
# so only load files produced by a trusted run.
with open('fitted_normalizers.json', 'rb') as f:
    fitted_normalizers = pickle.load(f)

In [None]:
import numpy as np
from collections import defaultdict
from lm_polygraph.normalizers.minmax import MinMaxNormalizer
from lm_polygraph.normalizers.quantile import QuantileNormalizer
from lm_polygraph.normalizers.binned_pcc import BinnedPCCNormalizer
from lm_polygraph.normalizers.isotonic_pcc import IsotonicPCCNormalizer

# Maps each normalizer-type key to its normalizer class.
# get_confidences looks a class up by key and calls its `loads` on the encoded
# normalizer stored under (metric_name, method_name, key).
NORMALIZERS = {
    'min_max': MinMaxNormalizer,
    'quantile': QuantileNormalizer,
    'binned_pcc': BinnedPCCNormalizer,
    'isotonic_pcc': IsotonicPCCNormalizer
}

def get_confidences(normalizers, ues):
    """Turn raw uncertainty values into confidences with every fitted normalizer.

    Args:
        normalizers: mapping from (metric_name, method_name, normalizer_key)
            to the encoded normalizer accepted by the class's `loads`.
        ues: mapping from UE method name to its uncertainty values.

    Returns:
        Nested mapping confidences[normalizer_key][metric_name][method_name]
        holding the output of the normalizer's `transform`.
    """
    normalizer_kinds = ('min_max', 'quantile', 'binned_pcc', 'isotonic_pcc')
    confidences = {kind: defaultdict(dict) for kind in normalizer_kinds}

    for kind, per_metric in confidences.items():
        normalizer_cls = NORMALIZERS[kind]
        for method_name in UE_METHOD_NAMES:
            for metric_name in GEN_METRIC_NAMES:
                # Each (metric, method, kind) triple has its own fitted normalizer
                encoded = normalizers[(metric_name, method_name, kind)]
                fitted = normalizer_cls.loads(encoded)
                per_metric[metric_name][method_name] = fitted.transform(ues[method_name])

    return confidences

def calculate_mses(confidences, gen_metrics, ues):
    """Compute the MSE between normalized confidences and generation metrics.

    Args:
        confidences: nested mapping
            confidences[normalizer_key][metric_name][method_name] of confidence
            arrays, as returned by get_confidences.
        gen_metrics: mapping from metric name to the metric values.
        ues: unused; kept so existing calls keep working.

    Returns:
        Nested mapping mses[normalizer_key][metric_name] holding a list of MSE
        values, one per entry of UE_METHOD_NAMES and in that order.

    Raises:
        KeyError: if a metric or method is missing from the inputs. Errors are
            propagated rather than dropped into a debugger, which previously
            let a stale or undefined `mse` value be appended.
    """
    mses = {'min_max': defaultdict(dict),
            'quantile': defaultdict(dict),
            'binned_pcc': defaultdict(dict),
            'isotonic_pcc': defaultdict(dict)}

    for key, per_metric_confidences in confidences.items():
        for metric_name in GEN_METRIC_NAMES:
            gt_metric = gen_metrics[metric_name]
            method_confidences = per_metric_confidences[metric_name]
            mses[key][metric_name] = [
                ((method_confidences[method_name] - gt_metric) ** 2).mean()
                for method_name in UE_METHOD_NAMES
            ]

    return mses

### All datasets

In [None]:
from lm_polygraph.utils.normalize import get_mans_ues_metrics

# Load and concatenate all UE values and metrics for all test datasets.
# The paths are materialized as a list, matching every other call to
# get_mans_ues_metrics in this notebook, instead of passing a dict view.
ues, gen_metrics = get_mans_ues_metrics(list(EVAL_MAN_PATHS.values()), UE_METHOD_NAMES, GEN_METRIC_NAMES)

In [None]:
# Transform the evaluation uncertainties with every fitted normalizer, then
# compute the MSE of each (normalizer, metric, method) combination against the
# generation metric values.
confidences = get_confidences(fitted_normalizers, ues)
mses = calculate_mses(confidences, gen_metrics, ues)

In [None]:
def plot_mses(ax, mses, title, metric_name='AlignScore'):
    """Draw grouped MSE bars, one group per UE method and one bar per normalizer.

    Args:
        ax: matplotlib Axes to draw on.
        mses: nested mapping mses[normalizer_key][metric_name] holding one MSE
            value per entry of UE_METHOD_NAMES, as returned by calculate_mses.
        title: title of the panel.
        metric_name: generation metric whose MSEs are plotted.
    """
    # Bar positions are computed here rather than read from a notebook-level
    # `x`, so the function works regardless of which cells ran before it.
    positions = np.arange(len(UE_METHOD_NAMES))

    # (normalizer key, horizontal offset within the group, colour, legend label)
    bar_specs = [
        ('min_max', -0.3, 'g', 'Linear'),
        ('quantile', -0.1, 'b', 'Quantile'),
        ('binned_pcc', 0.1, 'tab:olive', 'Binned'),
        ('isotonic_pcc', 0.3, 'r', 'Isotonic'),
    ]
    for normalizer_key, offset, color, label in bar_specs:
        ax.bar(positions + offset, mses[normalizer_key][metric_name],
               width=0.2, color=color, align='center', label=label)

    # Look labels up by method name so they cannot drift from the bar order
    tick_labels = [UE_METHOD_NAMES_ABBR[name] for name in UE_METHOD_NAMES]
    ax.set_xticks(positions, tick_labels, rotation=90, fontsize=14)

    ax.set_title(title, fontsize=20)
    ax.set_ylabel('MSE', fontsize=18)
    ax.legend()

In [None]:
import matplotlib.pyplot as plt

# Integer bar-group positions, one per UE method
x = np.array(list(range(len(UE_METHOD_NAMES))))

# Single-panel figure for the MSEs computed over all evaluation datasets
f, ax = plt.subplots(1, 1, figsize=(9, 7))

plot_mses(ax, mses, 'MSE between AlignScore and confidence')

# handles, labels = ax.get_legend_handles_labels()
# f.legend(handles, labels, bbox_to_anchor=(1.15, 0.96), fontsize=12)

plt.tight_layout()
# Change this to plt.show() to display inline
plt.savefig(f'normalization_mse_total.pdf', bbox_inches='tight')
# plt.show()
# Clear the current figure once it has been written to disk
plt.clf()

### OOD Datasets

In [None]:
# Per-dataset results of the leave-one-dataset-out (OOD) evaluation
ood_confidences = {}
ood_mses = {}

for dataset_name in DATASET_NAMES:
    # Fit normalizers excluding current OOD dataset from train set
    ood_train_paths = [value for key, value in TRAIN_MAN_PATHS.items() if key != dataset_name]
    train_man_paths_wo_dataset = get_man_paths_list(ood_train_paths)
    ood_fit_status = os.system(f'HYDRA_CONFIG={config_path} polygraph_normalize save_path="./ood_{dataset_name}" man_paths={train_man_paths_wo_dataset}')

    # os.system reports failure only through its return value; without this
    # check a failed fit would be followed by loading a missing or stale file.
    if ood_fit_status != 0:
        raise RuntimeError(f'polygraph_normalize failed for OOD dataset {dataset_name} with status {ood_fit_status}')

    # Get UE and metric values for OOD dataset
    ood_ues, ood_gen_metrics = get_mans_ues_metrics([EVAL_MAN_PATHS[dataset_name]], UE_METHOD_NAMES, GEN_METRIC_NAMES)

    # Loaded under its own name so the normalizers fitted on all train datasets
    # (`fitted_normalizers`) are not overwritten by the last OOD fit.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(f'./ood_{dataset_name}/fitted_normalizers.json', 'rb') as f:
        ood_fitted_normalizers = pickle.load(f)

    ood_confidences[dataset_name] = get_confidences(ood_fitted_normalizers, ood_ues)
    ood_mses[dataset_name] = calculate_mses(ood_confidences[dataset_name], ood_gen_metrics, ood_ues)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# Datasets are plotted two per figure. With an odd number of datasets the last
# figure holds a single panel and is half as wide. The file name is built from
# the dataset names in the figure, so it is the same as before in both cases.
PANELS_PER_FIGURE = 2
dataset_groups = [
    DATASET_NAMES[start:start + PANELS_PER_FIGURE]
    for start in range(0, len(DATASET_NAMES), PANELS_PER_FIGURE)
]

# Integer bar-group positions, one per UE method
x = np.array(list(range(len(UE_METHOD_NAMES))))

for dataset_group in dataset_groups:
    # 9 inches of width per panel: (18, 5) for a pair, (9, 5) for a single panel
    fig = plt.figure(figsize=(9 * len(dataset_group), 5))
    gs = GridSpec(1, len(dataset_group), figure=fig)

    for i, dataset_name in enumerate(dataset_group):
        ax = fig.add_subplot(gs[0, i])
        plot_mses(ax, ood_mses[dataset_name], f'MSE between true AlignScore and confidence: {dataset_name}')

    fig.tight_layout()
    # Change this to plt.show() to display inline
    fig.savefig(f'normalization_mse_ood_{"_".join(dataset_group).lower()}.pdf')
    # plt.show()
    # Close (not just clear) the figure so figures do not pile up across iterations
    plt.close(fig)

### PRR change relative to raw uncertainty

In [None]:
from lm_polygraph.utils.normalize import filter_nans
from lm_polygraph.ue_metrics.pred_rej_area import PredictionRejectionArea
from lm_polygraph.ue_metrics.ue_metric import (
    get_random_scores,
    normalize_metric,
)
import pandas as pd

ue_metric = PredictionRejectionArea()

cols = ['MinMax', 'Quantile', 'Binned PCC', 'Isotonic PCC']
# Normalizer keys in the same order as the table columns in `cols`
ood_normalizer_keys = ['min_max', 'quantile', 'binned_pcc', 'isotonic_pcc']

# For each of the dataset we take all confidences calculated in OOD setting
# and compare PRR of this to raw unnormalized UE.
# Loop-local names are used for the per-dataset values so the notebook-level
# `ues` / `gen_metrics` loaded for all datasets are not overwritten.
for dataset_name, path in EVAL_MAN_PATHS.items():
    res = {}
    dataset_ues, dataset_gen_metrics = get_mans_ues_metrics([path], UE_METHOD_NAMES, GEN_METRIC_NAMES)

    for metric_name in GEN_METRIC_NAMES:
        for ue_method_name in UE_METHOD_NAMES:
            filtered_metric, filtered_ues = filter_nans(
                dataset_gen_metrics[metric_name], dataset_ues[ue_method_name]
            )

            # Reference points used by normalize_metric for every score below
            oracle_score = ue_metric(-filtered_metric, filtered_metric)
            random_score = get_random_scores(ue_metric, filtered_metric)

            raw_ue_metric_val = ue_metric(filtered_ues, filtered_metric)
            raw_score = normalize_metric(raw_ue_metric_val, oracle_score, random_score)

            diffs = []
            for normalizer_key in ood_normalizer_keys:
                # -np.array() because we need to use UE, not confidence to calculate PRR
                # NOTE(review): these confidences were computed on the unfiltered
                # UE values, while `filtered_metric` went through filter_nans;
                # confirm the two stay aligned when filter_nans drops entries.
                normalized_ues = -np.array(ood_confidences[dataset_name][normalizer_key][metric_name][ue_method_name])
                normalized_ue_metric_val = ue_metric(normalized_ues, filtered_metric)
                normalized_score = normalize_metric(normalized_ue_metric_val, oracle_score, random_score)
                diffs.append(raw_score - normalized_score)

            res[ue_method_name] = diffs

    # Show table for each datasets that contains difference between raw UE PRR and PRR based on normalized confidence
    # Lower is better, negative is best (means normalized confidence improves upon raw PRR)
    df = pd.DataFrame.from_dict(res, orient='index', columns=cols)
    display(df)

In [None]:
all_ues, all_gen_metrics = get_mans_ues_metrics(list(EVAL_MAN_PATHS.values()), UE_METHOD_NAMES, GEN_METRIC_NAMES)

# Same for all datasets concatenated.
# Start from an empty table instead of writing into whatever `res` was left
# behind by the last iteration of the per-dataset loop.
res = {}
# Normalizer keys in the same order as the table columns in `cols`
total_normalizer_keys = ['min_max', 'quantile', 'binned_pcc', 'isotonic_pcc']

for metric_name in GEN_METRIC_NAMES:
    for ue_method_name in UE_METHOD_NAMES:
        # Local names: the notebook-level `ues` / `gen_metrics` are left untouched
        filtered_metric, filtered_ues = filter_nans(
            all_gen_metrics[metric_name], all_ues[ue_method_name]
        )

        # Reference points used by normalize_metric for every score below
        oracle_score = ue_metric(-filtered_metric, filtered_metric)
        random_score = get_random_scores(ue_metric, filtered_metric)

        raw_ue_metric_val = ue_metric(filtered_ues, filtered_metric)
        raw_score = normalize_metric(raw_ue_metric_val, oracle_score, random_score)

        diffs = []
        for normalizer_key in total_normalizer_keys:
            # Negated because PRR is computed from uncertainty, not confidence.
            # NOTE(review): these confidences were computed on the unfiltered UE
            # values, while `filtered_metric` went through filter_nans; confirm
            # the two stay aligned when filter_nans drops entries.
            normalized_ues = -np.array(confidences[normalizer_key][metric_name][ue_method_name])
            normalized_ue_metric_val = ue_metric(normalized_ues, filtered_metric)
            normalized_score = normalize_metric(normalized_ue_metric_val, oracle_score, random_score)
            diffs.append(raw_score - normalized_score)

        res[ue_method_name] = diffs

# Difference between raw-UE PRR and normalized-confidence PRR, one row per method
df = pd.DataFrame.from_dict(res, orient='index', columns=cols)
display(df)

### Table coloring and formatting

In [None]:
import matplotlib
from matplotlib import colors

# Base colormap, looked up through the colormap registry. This replaces
# `matplotlib.cm.get_cmap`, which was deprecated in Matplotlib 3.7 and removed
# in 3.9.
cmap = matplotlib.colormaps['Greens']

# Copy of the colormap's colour table with the last (alpha) column set to 0.5,
# wrapped back into a colormap used to shade LaTeX table cells.
my_cmap = cmap(np.arange(cmap.N))
my_cmap[:,-1] = 0.5
my_cmap = colors.ListedColormap(my_cmap)

def b_g(values, cmap, low=0, high=0):
    """Build per-cell CSS giving a DataFrame a background colour gradient.

    Args:
        values: DataFrame of numeric values to colour.
        cmap: colormap name or Colormap instance.
        low: fraction of the value range by which the colour scale is extended
            below the minimum.
        high: fraction of the value range by which the colour scale is extended
            above the maximum.

    Returns:
        Array of CSS strings ('color: ...; background-color: ...') with one
        entry per cell of `values`.
    """
    lowest = values.min().min()
    highest = values.max().max()
    rng = highest - lowest
    norm = colors.Normalize(lowest - (rng * low), highest + (rng * high))
    normed = norm(values.values)

    # plt.get_cmap accepts both a name and a Colormap instance. It replaces
    # `plt.cm.get_cmap`, which was removed in Matplotlib 3.9.
    back_colors = [[colors.rgb2hex(val) for val in x] for x in plt.get_cmap(cmap)(normed)]
    # White text once the normalized value is above 0.3, black text otherwise
    text_colors = [["white" if val>0.3 else "black" for val in x] for x in normed]

    return np.array([
        [f'color: {text_color}; background-color: {color}'
         for text_color, color in zip(row_text_colors, row_colors)]
        for row_text_colors, row_colors in zip(text_colors, back_colors)
    ])

In [None]:
def rgba2rgb(rgba, background=(1,1,1)):
    """Blend RGBA channels over an opaque background colour.

    Args:
        rgba: array whose first axis holds the channels (3 for RGB, 4 for RGBA).
        background: (R, G, B) colour placed behind the input.

    Returns:
        `rgba` itself when it already has 3 channels, otherwise a list
        `[r, g, b]` of the alpha-blended channels.
    """
    channels = rgba.shape[0]
    if channels == 3:
        return rgba

    assert channels == 4, 'RGBA image has 4 channels.'

    bg_r, bg_g, bg_b = background
    alpha = np.asarray(rgba[3], dtype='float32')

    # Each output channel is channel * alpha + (1 - alpha) * background channel
    return [
        channel * alpha + (1.0 - alpha) * backdrop
        for channel, backdrop in zip(rgba[:3], (bg_r, bg_g, bg_b))
    ]


def to_color(text, vals):
    """Wrap `text` in a LaTeX \\cellcolor command using the colour in `vals`.

    Args:
        text: cell content placed after the colour command.
        vals: colour channels; converted with rgba2rgb before formatting.

    Returns:
        The string '\\cellcolor[rgb]{r,g,b} text'.
    """
    rgb = rgba2rgb(np.array(vals))
    return '\\cellcolor[rgb]{{{},{},{}}} {}'.format(rgb[0], rgb[1], rgb[2], text)

def bold_best(df, columns):
    """Replace numeric cells with LaTeX cells shaded by their relative value.

    Each value is scaled to [0, 1] using the minimum and maximum of the whole
    frame, mapped through `my_cmap` and rendered with to_color. The frame is
    modified in place, column by column.

    Args:
        df: DataFrame of values; '-' marks a missing cell.
        columns: columns of `df` to rewrite.

    Returns:
        The same `df` object with the given columns replaced by strings.
    """
    total_min = df.values.min().min()
    total_max = df.values.max().max()

    def render(raw, scaled):
        # Finite floats become coloured cells; anything else is kept if it is
        # already a string and shown as '-' otherwise.
        if isinstance(scaled, float) and not np.isnan(scaled):
            return to_color("{:.3f}".format(raw), my_cmap(float(scaled)))
        return scaled if isinstance(scaled, str) else '-'

    for col in columns:
        column = df[col]
        scaled_cells = [cell if cell != '-' else np.nan for cell in column]
        present = np.array([cell for cell in column if cell != '-'])

        # Columns whose present values are all equal are left unscaled
        if present.min() != present.max():
            scaled_cells = np.array([
                (cell - total_min) / (total_max - total_min) if not np.isnan(cell) else cell
                for cell in scaled_cells
            ])

        df[col] = [render(raw, scaled) for raw, scaled in zip(column, scaled_cells)]

    return df

# Rebuild the PRR difference table from `res` and export it as a coloured LaTeX table
df = pd.DataFrame.from_dict(res, orient='index', columns=cols)
# NOTE(review): the Styler returned by this call is neither stored nor
# displayed, so the line appears to have no effect on the exported table — confirm.
df.style.apply(b_g, cmap=cmap, axis=None)
# bold_best rewrites the columns of `df` in place and returns the same frame
df_colored = bold_best(df, df.columns)
with open('total_prr_table.tex', 'w') as f:
    # Raise the column width limit while converting, presumably so that the
    # long \cellcolor cell strings are not truncated
    with pd.option_context("max_colwidth", 1000):
        table = df_colored.to_latex()
        # Show negative zero without its sign
        table = table.replace('-0.000', '0.000')
        # Turn escaped backslashes and braces back into raw LaTeX so that the
        # \cellcolor commands embedded in the cells are emitted as commands
        table = table.replace('\\textbackslash ', '\\')
        table = table.replace('\\{', '{')
        table = table.replace('\\}', '}')
        f.write(table)

### Normalized confidence vs raw uncertainty plots

In [None]:
# For every UE method, plot the generation metric and each normalized
# confidence against the raw uncertainty, with samples sorted by raw uncertainty.
for method in UE_METHOD_NAMES:
    metric = 'AlignScore'
    cur_ues = all_ues[method]
    order = np.argsort(cur_ues)
    sor_ues = cur_ues[order]
    # The metric values must be reordered with the same permutation as the
    # uncertainties; otherwise each y value is drawn against the wrong x value.
    sor_metrics = np.asarray(all_gen_metrics[metric])[order]
    plt.plot(sor_ues, sor_metrics, label=metric)
    plt.plot(sor_ues, confidences['min_max'][metric][method][order], label='MinMax')
    plt.plot(sor_ues, confidences['quantile'][metric][method][order], label='Quantile')
    plt.plot(sor_ues, confidences['binned_pcc'][metric][method][order], label='Binned PCC')
    plt.plot(sor_ues, confidences['isotonic_pcc'][metric][method][order], label='Isotonic PCC')
    plt.title(method)
    plt.legend()
    plt.tight_layout()
    plt.show()
    plt.clf()