In [None]:
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib 
from matplotlib import colors
import matplotlib.pyplot as plt 

from lm_polygraph.utils.manager import UEManager, _recombine_data, _delete_nans
from lm_polygraph.ue_metrics import PredictionRejectionArea, KendallTauCorrelation, SpearmanRankCorrelation

In [None]:
# Display names for the uncertainty-estimation methods. Keys are the method
# identifiers looked up via `names_dict.get(...)` when the result tables are
# built; identifiers without an entry are shown unchanged.
names_dict = {
    # Probability / entropy / mutual-information identifiers
    "MaximumSequenceProbability": "Maximum Sequence Probability",
    "Perplexity": "Perplexity",
    "MeanTokenEntropy": "Mean Token Entropy",
    "MutualInformation": "Pointwise Mutual Information",
    "MeanPointwiseMutualInformation": "Pointwise Mutual Information",
    "MeanConditionalPointwiseMutualInformation": "Conditional Pointwise Mutual Information",
    # P(True) identifiers
    "PTrueSampling": "P(True) Sampling",
    "PTrue": "P(True)",
    # Entropy identifiers
    "SemanticEntropy": "Semantic Entropy",
    "MonteCarloSequenceEntropy": "Monte Carlo Sequence Entropy",
    "MonteCarloNormalizedSequenceEntropy": "Monte Carlo Normalized Sequence Entropy",
    # Lexical-similarity identifiers
    "LexicalSimilarity_rouge1": "Lexical Similarity Rouge-1",
    "LexicalSimilarity_rouge2": "Lexical Similarity Rouge-2",
    "LexicalSimilarity_rougeL": "Lexical Similarity Rouge-L",
    "LexicalSimilarity_BLEU": "Lexical Similarity BLEU",
    # *_Jaccard_score identifiers
    "EigValLaplacian_Jaccard_score": "EigValLaplacian Jaccard Score",
    "DegMat_Jaccard_score": "DegMat Jaccard Score",
    "Eccentricity_Jaccard_score": "Eccentricity Jaccard Score",
    # *_NLI_score_contra identifiers
    "EigValLaplacian_NLI_score_contra": "EigValLaplacian NLI Score contra.",
    "DegMat_NLI_score_contra": "DegMat NLI Score Contra.",
    "Eccentricity_NLI_score_contra": "Eccentricity NLI Score contra.",
    # *_NLI_score_entail identifiers
    "EigValLaplacian_NLI_score_entail": "EigValLaplacian NLI Score entail.",
    "DegMat_NLI_score_entail": "DegMat NLI Score entail.",
    "Eccentricity_NLI_score_entail": "Eccentricity NLI Score entail.",
    # *_decoder identifiers
    "MahalanobisDistanceSeq_decoder": "Mahalanobis Distance - Decoder",
    "RelativeMahalanobisDistanceSeq_decoder": "Relative Mahalanobis Distance - Decoder",
    "RDESeq_decoder": "RDE - Decoder",
    "PPLMDSeq_decoder": "HUQ-MD - Decoder",
    "PPLRMDSeq_decoder": "HUQ-RMD - Decoder",
}

In [None]:
def bootstraping_for_std(ue, metric, ue_metric, num_runs: int = 1000, return_string: bool = False):
    """Estimate the spread of ``ue_metric`` via bootstrap resampling.

    Draws ``num_runs`` resamples (with replacement, each of size ``len(ue)``)
    from NumPy's global random state, evaluates ``ue_metric`` on every
    resample, sorts the scores and returns the standard deviation of the
    slice between the 5% and 95% positions.

    Args:
        ue: array indexable by an integer index array (uncertainty scores).
        metric: array indexable the same way (quality scores aligned with ``ue``).
        ue_metric: callable invoked as ``ue_metric(ue[sample], metric[sample])``.
        num_runs: number of bootstrap resamples.
        return_string: accepted but not used by this function.

    Returns:
        Standard deviation of the trimmed, sorted bootstrap scores.
    """
    n_items = len(ue)
    positions = np.arange(0, n_items)
    # One flat draw from the global RNG, cut into `num_runs` equal resamples.
    drawn = np.random.choice(positions, num_runs * n_items, True)
    resamples = np.array(np.array_split(drawn, num_runs))

    scores = np.array([ue_metric(ue[rows], metric[rows]) for rows in resamples])
    ordered = np.array(sorted(scores))

    lower = int(0.05 * num_runs)
    upper = int(0.95 * num_runs)
    return ordered[lower:upper].std()

def get_random_scores(function, metrics, num_iter=1000, seed=42, is_bartscore=False):
    """Average value of ``function`` when the uncertainty ranking is random.

    A vector ``0..len(metrics)-1`` is shuffled ``num_iter`` times and
    ``function(shuffled, metrics)`` is evaluated after every shuffle.

    Side effect: reseeds NumPy's *global* random state with ``seed`` on every
    call, so any later ``np.random`` draw in the notebook is affected by it.

    Args:
        function: callable invoked as ``function(random_scores, metrics)``.
        metrics: sequence of quality scores; only its length and the value
            passed through to ``function`` are used here.
        num_iter: number of random permutations to average over.
        seed: seed for NumPy's global random state.
        is_bartscore: accepted but not used by this function.

    Returns:
        ``np.mean`` of the ``num_iter`` values returned by ``function``.
    """
    np.random.seed(seed)
    rand_scores = np.arange(len(metrics))

    values = []
    for _ in range(num_iter):
        # Shuffles in place, so each permutation starts from the previous one.
        np.random.shuffle(rand_scores)
        values.append(function(rand_scores, metrics))
    return np.mean(values)

In [None]:
def read_mans(paths, model, dataset):
    """Load every manager file stored directly in ``<path>/<model>/<dataset>``.

    Args:
        paths: iterable of root directories.
        model: model sub-directory name.
        dataset: dataset sub-directory name.

    Returns:
        List of objects returned by ``UEManager.load``, in ``paths`` order and,
        within one directory, in sorted file-name order. Missing directories
        and sub-directories are ignored. Files that fail to load are reported
        on stderr and skipped.
    """
    mans = []
    for path in paths:
        man_path = f"{path}/{model}/{dataset}"
        # os.walk yields the top directory first and yields nothing when the
        # directory does not exist, so the first item is exactly the list of
        # files placed directly in `man_path`.
        _, _, man_files = next(os.walk(man_path), (man_path, [], []))
        # Sorted so the merge order (and therefore which manager acts as the
        # base) does not depend on the filesystem's listing order.
        for man_file in sorted(man_files):
            file_path = os.path.join(man_path, man_file)
            try:
                man = UEManager.load(file_path)
            except Exception as err:
                # Best-effort loading: skip unreadable files, but say so, and
                # let KeyboardInterrupt / SystemExit propagate.
                print(f"Skipping {file_path}: {err!r}", file=sys.stderr)
                continue
            mans.append(man)
    return mans

def update_mans(mans):
    """Merge several managers into the first one and return it.

    The ``estimations``, ``metrics`` and ``gen_metrics`` dictionaries of every
    later manager are merged (``dict.update``) into those of ``mans[0]``, so
    later managers win on duplicate keys. ``mans[0]`` is modified in place;
    all other attributes are left as they are on ``mans[0]``.
    """
    base = mans[0]
    base_attrs = vars(base)
    for other in mans[1:]:
        other_attrs = vars(other)
        for field in ("estimations", "metrics", "gen_metrics"):
            base_attrs[field].update(other_attrs[field])
    return base

def get_tables(paths, models, datasets, gen_metrics, ue_metrics, recompute_metrics=True):
    """Build per-model tables of normalised UE-metric scores.

    For every model, UE metric, dataset, generation metric and UE method the
    score is computed as ``(value - random) / (oracle - random)`` where
    ``oracle`` is the UE metric evaluated with the negated quality scores as
    uncertainty and ``random`` comes from ``get_random_scores``. Each cell is
    formatted as ``"<score>±<std>"`` with the std from ``bootstraping_for_std``.

    Args:
        paths: root directories passed to ``read_mans``.
        models: model names (sub-directories of each root).
        datasets: dataset names (sub-directories of each model directory).
        gen_metrics: generation-metric names; looked up in the manager as
            ``("sequence", <name>)``.
        ue_metrics: mapping ``name -> callable(ue, metric_values)``.
        recompute_metrics: when False, a value already stored in the manager
            under ``("sequence", method, gen_metric, ue_metric_name)`` is used
            instead of calling the UE metric.

    Returns:
        ``(dfs, quality_dfs)`` where ``dfs[model][ue_metric_name]`` is the
        score table (empty when no manager was found for the model) and
        ``quality_dfs[model]`` holds the mean non-NaN generation metric per
        ``(dataset, gen_metric)`` column.
    """
    dfs = {}
    quality_dfs = {}
    for model in models:
        dfs[model] = {}
        quality_dfs[model] = pd.DataFrame({})
        # The merged manager does not depend on the UE metric, so every
        # dataset is loaded once per model rather than once per UE metric.
        merged_mans = {}
        for ue_metric_name, ue_metric in ue_metrics.items():
            result = pd.DataFrame({})
            for ds in datasets:
                if ds not in merged_mans:
                    mans = read_mans(paths, model, ds)
                    merged_mans[ds] = update_mans(mans) if len(mans) else None
                final_man = merged_mans[ds]
                if final_man is None:
                    continue

                # Unique UE method names, kept in first-seen order.
                ue_methods = np.array([k[1] for k in final_man.metrics.keys()])
                _, idx = np.unique(ue_methods, return_index=True)
                ue_methods = ue_methods[np.sort(idx)]
                result[('','UE Method')] = [names_dict.get(m, m) for m in ue_methods]

                for gen_metric in gen_metrics:
                    score_vals = []
                    metrics_val = np.array(final_man.gen_metrics[("sequence", gen_metric)])
                    if (ds, gen_metric) not in quality_dfs[model].columns:
                        quality_dfs[model][(ds, gen_metric)] = [metrics_val[~np.isnan(metrics_val)].mean()]
                    for ue_method in tqdm(ue_methods):
                        ue = np.array(final_man.estimations[("sequence", ue_method)])
                        ue_, metrics_val_, selected_ids = _delete_nans(ue, metrics_val)

                        # Check the NaN-filtered values: a method whose scores
                        # are all NaN has nothing left to evaluate.
                        if len(ue_):
                            inputs_no_nans = np.array(final_man.stats['input_texts'])[selected_ids]
                            ue_ = np.array(ue_)
                            metrics_val_ = np.array(metrics_val_)
                            rec_ue, rec_metrics_val = _recombine_data(ue_, metrics_val_, inputs_no_nans)
                            rec_metrics_val = np.array(rec_metrics_val)
                            rec_ue = np.array(rec_ue)

                            dict_key = ('sequence', ue_method, gen_metric, ue_metric_name)
                            if (dict_key in final_man.metrics.keys()) and not recompute_metrics:
                                mean_val = np.array(final_man.metrics[dict_key])
                            else:
                                mean_val = ue_metric(rec_ue, rec_metrics_val)

                            oracle = ue_metric(-rec_metrics_val, rec_metrics_val)
                            random_baseline = get_random_scores(ue_metric, rec_metrics_val)
                            span = oracle - random_baseline
                            if span == 0:
                                # Oracle and random coincide: the normalised
                                # score is undefined.
                                final_score = float("nan")
                            else:
                                final_score = (mean_val - random_baseline) / span
                            std = bootstraping_for_std(rec_ue, rec_metrics_val, ue_metric)
                        else:
                            std = 0
                            final_score = 0
                        score_vals.append(f"{final_score:.2f}±{std:.2f}")
                    result[(ds, gen_metric)] = score_vals
            # MultiIndex.from_tuples raises on an empty sequence, which happens
            # when no manager was found for any dataset of this model.
            if len(quality_dfs[model].columns):
                quality_dfs[model].columns = pd.MultiIndex.from_tuples(quality_dfs[model].columns)
            if len(result.columns):
                result.columns = pd.MultiIndex.from_tuples(result.columns)
            dfs[model][ue_metric_name] = result
    return dfs, quality_dfs

In [None]:
# Experiment layout: every root in `paths` is read as <root>/<model>/<dataset>.
paths = [
    "../workdir/camera_ready_exps/v1",
    "../workdir/camera_ready_exps/bertscore",
]
models = ["vicuna", "llama"]
datasets = ["aeslc", "xsum", "coqa", "babiqa", "wmt14_deen", "wmt14_fren"]
gen_metrics = ["Rouge_rougeL", "Bert"]

# Table key -> metric object used to score each UE method.
ue_metrics = {
    "prr": PredictionRejectionArea(),
    "kendalltau": KendallTauCorrelation(),
    "spearmanr": SpearmanRankCorrelation(),
}

dfs, quality_dfs = get_tables(paths, models, datasets, gen_metrics, ue_metrics)

In [None]:
# Semi-transparent variant of the 'Greens' colormap used to shade table cells.
# `plt.get_cmap` is used instead of `matplotlib.cm.get_cmap`, which was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Greens')
my_cmap = cmap(np.arange(cmap.N))  # RGBA rows, one per colormap entry
my_cmap[:,-1] = 0.5                # set every alpha value to 0.5
my_cmap = colors.ListedColormap(my_cmap)

def b_g(s, cmap, low=0, high=0):
    """Column-wise CSS for ``Styler.apply``: background gradient by score.

    Cells of the form ``"<score>±<std>"`` are coloured according to
    ``<score>``, normalised over the column. A column whose cells contain no
    ``±`` (plain strings) gets no styling.

    Args:
        s: one column of the table as a Series of strings.
        cmap: colormap (name or object) passed to ``plt.get_cmap``.
        low: fraction of the value range added below the minimum when normalising.
        high: fraction of the value range added above the maximum when normalising.

    Returns:
        List of CSS strings, one per cell of ``s``.
    """
    def _score(cell):
        # "0.42±0.01" -> 0.42; anything without "±" is passed through as is.
        parts = cell.split("±")
        return float(parts[0]) if len(parts) > 1 else cell

    values = s.apply(_score)
    if isinstance(values.max(), str):
        return ['' for _ in values]

    rng = values.max() - values.min()
    norm = colors.Normalize(values.min() - (rng * low), values.max() + (rng * high))
    normed = norm(values.values)
    # plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
    back_colors = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    # White text once the normalised score exceeds 0.3, black otherwise.
    text_colors = ["white" if x > 0.3 else "black" for x in normed]
    return [
        f'color: {text_color}; background-color: {color}'
        for text_color, color in zip(text_colors, back_colors)
    ]

In [None]:
# Positional row order applied to every displayed table via `.iloc[order]`.
order = [*range(9), 23, 9, *range(11, 23), *range(24, 29)]

# CSS applied to the table caption.
table_style = dict(
    selector='caption',
    props=[
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
    ],
)

(
    dfs["vicuna"]["prr"]
    .iloc[order]
    .style.apply(b_g, cmap=my_cmap)
    .set_caption('Vicuna, PRR')
    .set_table_styles([table_style])
)

In [None]:
# Kendall tau table for Vicuna, rows reordered with `order` and shaded by `b_g`.
(
    dfs["vicuna"]["kendalltau"]
    .iloc[order]
    .style.apply(b_g, cmap=my_cmap)
    .set_caption('Vicuna, Kendall $\\tau$')
    .set_table_styles([table_style])
)

In [None]:
# Kendall tau table for Llama, rows reordered with `order` and shaded by `b_g`.
(
    dfs["llama"]["kendalltau"]
    .iloc[order]
    .style.apply(b_g, cmap=my_cmap)
    .set_caption('Llama, Kendall $\\tau$')
    .set_table_styles([table_style])
)

In [None]:
# Spearman rho table for Vicuna, rows reordered with `order` and shaded by `b_g`.
(
    dfs["vicuna"]["spearmanr"]
    .iloc[order]
    .style.apply(b_g, cmap=my_cmap)
    .set_caption('Vicuna, Spearman $\\rho$')
    .set_table_styles([table_style])
)

In [None]:
# Spearman rho table for Llama, rows reordered with `order` and shaded by `b_g`.
(
    dfs["llama"]["spearmanr"]
    .iloc[order]
    .style.apply(b_g, cmap=my_cmap)
    .set_caption('Llama, Spearman $\\rho$')
    .set_table_styles([table_style])
)

In [None]:
# Mean (NaN-excluded) generation-metric values for Vicuna, one column per
# (dataset, generation metric) pair as filled in by get_tables.
quality_dfs["vicuna"]

In [None]:
def rgba2rgb(rgba, background=(1,1,1)):
    """Blend an RGBA colour onto an opaque background.

    Args:
        rgba: array whose first axis holds the channels (3 or 4 of them).
        background: ``(R, G, B)`` of the backdrop; white by default.

    Returns:
        ``rgba`` itself when it has 3 channels, otherwise the list
        ``[r, g, b]`` with each channel computed as
        ``channel * alpha + (1 - alpha) * background_channel``.
    """
    n_channels = rgba.shape[0]
    if n_channels == 3:
        return rgba

    assert n_channels == 4, 'RGBA image has 4 channels.'

    red, green, blue, alpha = rgba
    alpha = np.asarray(alpha, dtype='float32')
    bg_red, bg_green, bg_blue = background

    return [
        channel * alpha + (1.0 - alpha) * backdrop
        for channel, backdrop in ((red, bg_red), (green, bg_green), (blue, bg_blue))
    ]


def to_color(text, vals):
    """Prefix ``text`` with a LaTeX ``\\cellcolor[rgb]{r,g,b}`` command.

    ``vals`` is converted to an array and passed through ``rgba2rgb``; the
    first three resulting values are used as the colour components.
    """
    rgb = rgba2rgb(np.array(vals))
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return f'\\cellcolor[rgb]{{{red},{green},{blue}}} {text}'

def bold_best(df, columns):
    """Colour the cells of ``columns`` by their score (modifies ``df`` in place).

    Despite the name, nothing is made bold: every cell of the form
    ``"<score>±<std>"`` is wrapped with ``to_color`` using
    ``my_cmap(<normalised score>)``, where scores are min-max normalised per
    column. Cells equal to ``'-'`` (or with a NaN score) become ``'-'``.
    When all scores in a column are equal, the raw score is passed to
    ``my_cmap`` unnormalised. A column with no scored cell at all is left
    as ``'-'`` everywhere.

    Args:
        df: DataFrame whose listed columns hold strings.
        columns: column labels to process.

    Returns:
        The same ``df`` object.
    """
    for col in columns:
        cells = list(df[col])
        scores = [float(cell.split('±')[0]) if cell != '-' else np.nan for cell in cells]
        scored = np.array([score for cell, score in zip(cells, scores) if cell != '-'])

        # Nothing to normalise when the column has no scored cell; min()/max()
        # would raise on the empty array.
        if scored.size:
            lowest, highest = scored.min(), scored.max()
            if lowest != highest:
                scores = [
                    (score - lowest) / (highest - lowest) if not np.isnan(score) else score
                    for score in scores
                ]

        df[col] = [
            to_color(cell, my_cmap(float(score))) if not np.isnan(score) else '-'
            for cell, score in zip(cells, scores)
        ]
    return df

def prepare_latex(
    df1,
    caption='PRR$\\uparrow$ for Llama v2 model for various tasks for the considered sequence-level methods. Darker color indicates better results.',
    label='tab:llama_results',
):
    """Render a score table as a colour-coded LaTeX ``table*`` environment.

    All columns except the first are coloured with ``bold_best`` (which
    modifies ``df1`` in place), the frame is converted with ``to_latex`` and
    the result is post-processed: ``±`` becomes ``$\\pm$``, escaping added by
    ``to_latex`` for backslashes and braces is undone, the all-``l`` column
    specs for 13 and 7 columns are replaced, and the line at index 3 of the
    generated LaTeX is dropped.

    Args:
        df1: DataFrame of ``"<score>±<std>"`` strings; first column is the label.
        caption: caption text. The default mentions Llama v2 and PRR, so pass
            a different caption for other models or metrics.
        label: LaTeX label placed inside the caption.

    Returns:
        The complete LaTeX snippet as a single string.
    """
    start_tex = '\\begin{table*}[!ht] \\resizebox{\\textwidth}{!}{'
    end_tex = '}\\caption{\\label{' + label + '} ' + caption + '}\\end{table*}'
    df1 = bold_best(df1, df1.columns[1:])
    latex_table = df1.to_latex(bold_rows=False, index=False).replace('±', '$\\pm$')

    # Undo the escaping of backslashes so the \cellcolor commands survive.
    latex_table = latex_table.replace('\\textbackslash ', '\\')
    latex_table = latex_table.replace('{lllllllllllll}', '{l|cc|cc|cc|cc|cc|cc}')
    latex_table = latex_table.replace('{lllllll}', '{l|c|c|c|c|c|c}')

    latex_table = latex_table.replace('\\{', '{')
    latex_table = latex_table.replace('\\}', '}')
    str_list = latex_table.split('\n')
    str_list.pop(3)
    latex_table = '\n'.join(str_list)
    return start_tex + latex_table + end_tex

In [None]:
import copy

# Print the Vicuna PRR table as LaTeX with a hand-written two-row header.
# NOTE(review): prepare_latex's default caption and label mention Llama;
# adjust them in the printed LaTeX for this Vicuna table.
with pd.option_context("max_colwidth", 1000):
    # Raw strings keep the LaTeX backslashes literal (the previous literal
    # relied on invalid escape sequences such as "\m").
    header_row = (
        r"\multirow{2}{*}{\textbf{UE Method}}"
        r" & \multicolumn{2}{c|}{\textbf{AESLC}}"
        r" & \multicolumn{2}{c|}{\textbf{XSUM}}"
        r" & \multicolumn{2}{c|}{\textbf{CoQA}}"
        r" & \multicolumn{2}{c|}{\textbf{bAbiQA}}"
        r" & \multicolumn{2}{c|}{\textbf{WMT14 De-En}}"
        r" & \multicolumn{2}{c}{\textbf{WMT14 Fr-En}}"
        r" \\ \cline{2-13}"
        "\n"
        r"    & \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"&  \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r" \\\midrule"
    )
    res_str = prepare_latex(copy.deepcopy(dfs["vicuna"]["prr"]).round(2)).split('\n')
    # Line 2 is the header row produced by pandas; swap in the custom header.
    res_str[2] = header_row
    print('\n'.join(res_str))

In [None]:
# Print the Llama PRR table as LaTeX with a hand-written two-row header.
with pd.option_context("max_colwidth", 1000):
    # Raw strings keep the LaTeX backslashes literal (the previous literal
    # relied on invalid escape sequences such as "\m").
    header_row = (
        r"\multirow{2}{*}{\textbf{UE Method}}"
        r" & \multicolumn{2}{c|}{\textbf{AESLC}}"
        r" & \multicolumn{2}{c|}{\textbf{XSUM}}"
        r" & \multicolumn{2}{c|}{\textbf{CoQA}}"
        r" & \multicolumn{2}{c|}{\textbf{bAbiQA}}"
        r" & \multicolumn{2}{c|}{\textbf{WMT14 De-En}}"
        r" & \multicolumn{2}{c}{\textbf{WMT14 Fr-En}}"
        r" \\ \cline{2-13}"
        "\n"
        r"    & \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"&  \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r" \\\midrule"
    )
    res_str = prepare_latex(copy.deepcopy(dfs["llama"]["prr"]).round(2)).split('\n')
    # Line 2 is the header row produced by pandas; swap in the custom header.
    res_str[2] = header_row
    print('\n'.join(res_str))

In [None]:
def prepare_latex(
    df1,
    caption='PRR$\\uparrow$ for Llama v2 model for various tasks for the considered sequence-level methods. Darker color indicates better results.',
    label='tab:llama_results',
):
    """Render a table as a LaTeX ``table*`` environment, without cell colouring.

    This definition replaces any ``prepare_latex`` defined earlier in the
    notebook. The frame is converted with ``to_latex`` and post-processed:
    ``±`` becomes ``$\\pm$``, escaping added by ``to_latex`` for backslashes
    and braces is undone, the all-``l`` column specs for 13 and 7 columns are
    replaced, and the line at index 3 of the generated LaTeX is dropped.

    Args:
        df1: DataFrame to render.
        caption: caption text. The default mentions Llama v2 and PRR, so pass
            a different caption for other models or metrics.
        label: LaTeX label placed inside the caption.

    Returns:
        The complete LaTeX snippet as a single string.
    """
    start_tex = '\\begin{table*}[!ht] \\resizebox{\\textwidth}{!}{'
    end_tex = '}\\caption{\\label{' + label + '} ' + caption + '}\\end{table*}'
    latex_table = df1.to_latex(bold_rows=False, index=False).replace('±', '$\\pm$')

    latex_table = latex_table.replace('\\textbackslash ', '\\')
    latex_table = latex_table.replace('{lllllllllllll}', '{l|cc|cc|cc|cc|cc|cc}')
    latex_table = latex_table.replace('{lllllll}', '{l|c|c|c|c|c|c}')

    latex_table = latex_table.replace('\\{', '{')
    latex_table = latex_table.replace('\\}', '}')
    str_list = latex_table.split('\n')
    str_list.pop(3)
    latex_table = '\n'.join(str_list)
    return start_tex + latex_table + end_tex

In [None]:
# Print the Vicuna generation-quality table as LaTeX with a custom header.
# NOTE(review): prepare_latex's default caption and label describe PRR for
# Llama; adjust them in the printed LaTeX for this quality table.
with pd.option_context("max_colwidth", 1000):
    # Raw strings keep the LaTeX backslashes literal (the previous literal
    # relied on invalid escape sequences such as "\m").
    header_row = (
        r"\multicolumn{2}{c|}{\textbf{AESLC}}"
        r" & \multicolumn{2}{c|}{\textbf{XSUM}}"
        r" & \multicolumn{2}{c|}{\textbf{CoQA}}"
        r" & \multicolumn{2}{c|}{\textbf{bAbiQA}}"
        r" & \multicolumn{2}{c|}{\textbf{WMT14 De-En}}"
        r" & \multicolumn{2}{c}{\textbf{WMT14 Fr-En}}"
        r" \\ \midrule"
        "\n"
        r"    \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"&  \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r" \\\midrule"
    )
    res_str = prepare_latex(copy.deepcopy(quality_dfs['vicuna']).round(2)).split('\n')
    # Line 2 is the header row produced by pandas; swap in the custom header.
    res_str[2] = header_row
    print('\n'.join(res_str))

In [None]:
# Print the Llama generation-quality table as LaTeX with a custom header.
# NOTE(review): prepare_latex's default caption describes PRR results;
# adjust it in the printed LaTeX for this quality table.
with pd.option_context("max_colwidth", 1000):
    # Raw strings keep the LaTeX backslashes literal (the previous literal
    # relied on invalid escape sequences such as "\m").
    header_row = (
        r"\multicolumn{2}{c|}{\textbf{AESLC}}"
        r" & \multicolumn{2}{c|}{\textbf{XSUM}}"
        r" & \multicolumn{2}{c|}{\textbf{CoQA}}"
        r" & \multicolumn{2}{c|}{\textbf{bAbiQA}}"
        r" & \multicolumn{2}{c|}{\textbf{WMT14 De-En}}"
        r" & \multicolumn{2}{c}{\textbf{WMT14 Fr-En}}"
        r" \\ \midrule"
        "\n"
        r"    \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r"&  \textbf{Rouge-L} & \textbf{BERTScore}"
        r"& \textbf{Rouge-L} & \textbf{BERTScore}"
        r" \\\midrule"
    )
    res_str = prepare_latex(copy.deepcopy(quality_dfs['llama']).round(2)).split('\n')
    # Line 2 is the header row produced by pandas; swap in the custom header.
    res_str[2] = header_row
    print('\n'.join(res_str))