In [1]:
import os

# Absolute path of the directory the notebook is run from.
# os.getcwd() is used instead of the IPython-only `!pwd` capture so this cell is
# plain Python and does not depend on a POSIX shell being available.
pwd = os.getcwd()

# Absolute path to default Hydra config for normalization script
# (resolved relative to the parent of the working directory)
config_path = os.path.join(
    os.path.dirname(pwd), 'examples', 'configs', 'normalization', 'fit', 'default.yaml'
)

# Datasets covered by the notebook. Manager file names embed the lower-cased
# dataset name, so both path dictionaries below are derived from this one list.
DATASET_NAMES = ['CoQA', 'GSM8K', 'MMLU', 'TriviaQA', 'WMT14', 'WMT19', 'XSum']

# Absolute path to saved train and eval managers
EVAL_MAN_PATHS = {
    name: os.path.join(pwd, f'polygraph_tacl_stablelm12b_{name.lower()}.man')
    for name in DATASET_NAMES
}

TRAIN_MAN_PATHS = {
    name: os.path.join(pwd, f'polygraph_tacl_stablelm12b_{name.lower()}_train.man')
    for name in DATASET_NAMES
}

# Names of the uncertainty-estimation (UE) methods evaluated in this notebook.
# They are used as lookup keys for per-method UE values and fitted normalizers,
# and their order fixes the order of the bars in the MSE plots.
UE_METHOD_NAMES = [
    'MaximumSequenceProbability',
    'Perplexity',
    'MeanTokenEntropy',
    'MeanPointwiseMutualInformation',
    'MeanConditionalPointwiseMutualInformation',
    'PTrue',
    'PTrueSampling',
    'MonteCarloSequenceEntropy',
    'MonteCarloNormalizedSequenceEntropy',
    'LexicalSimilarity_rouge1',
    'LexicalSimilarity_rouge2',
    'LexicalSimilarity_rougeL',
    'LexicalSimilarity_BLEU',
    'NumSemSets',
    'EigValLaplacian_NLI_score_entail',
    'EigValLaplacian_NLI_score_contra',
    'EigValLaplacian_Jaccard_score',
    'DegMat_NLI_score_entail',
    'DegMat_NLI_score_contra',
    'DegMat_Jaccard_score',
    'Eccentricity_NLI_score_entail',
    'Eccentricity_NLI_score_contra',
    'Eccentricity_Jaccard_score',
    'SemanticEntropy',
]

# Generation-quality metrics that the normalized confidences are compared against
GEN_METRIC_NAMES = ['AlignScore']

In [2]:
# Download all managers to current directory
# !wget -r --cut-dirs=2 -nH --no-parent -A '*man' http://209.38.249.180:8000/polygraph_data/mans/

In [3]:
def get_man_paths_list(man_paths):
    """Render manager paths as a single Hydra command-line list override.

    Each path is wrapped in double quotes, the quoted paths are joined with
    commas inside square brackets, and the whole list is wrapped in single
    quotes, e.g. ``'["/a.man","/b.man"]'``.

    Args:
        man_paths: iterable of path strings.

    Returns:
        The formatted string, ready to be appended after ``man_paths=`` in a
        shell command.
    """
    quoted_paths = ','.join('"' + man_path + '"' for man_path in man_paths)
    return "'[" + quoted_paths + "]'"

In [4]:
import os
import shlex

# Run polygraph_normalize to fit all normalizers using all train datasets
train_man_paths = get_man_paths_list(list(TRAIN_MAN_PATHS.values()))

# The config path is shell-quoted so that a working directory containing spaces
# or other shell metacharacters does not split the command line.
exit_status = os.system(
    f'HYDRA_CONFIG={shlex.quote(config_path)} polygraph_normalize '
    f'save_path="./" man_paths={train_man_paths}'
)

# Fail loudly: otherwise the following cell would load a stale (or missing)
# fitted_normalizers.json left over from an earlier run.
if exit_status != 0:
    raise RuntimeError(f'polygraph_normalize failed with exit status {exit_status}')

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f0862c0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a245bf10>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f0386d0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2997b23b0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f0e8460>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f0870a0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2aabfbf70>]


0

In [5]:
import pickle

# Load saved fitted normalizers
# Despite the .json extension, the file is opened in binary mode and read with pickle.
# NOTE(review): pickle.load can execute arbitrary code while loading - only load a file
# you produced yourself (presumably the one written by the polygraph_normalize run above).
with open('fitted_normalizers.json', 'rb') as f:
    fitted_normalizers = pickle.load(f)

In [6]:
import numpy as np
from collections import defaultdict
from lm_polygraph.normalizers.minmax import MinMaxNormalizer
from lm_polygraph.normalizers.quantile import QuantileNormalizer
from lm_polygraph.normalizers.binned_pcc import BinnedPCCNormalizer
from lm_polygraph.normalizers.isotonic_pcc import IsotonicPCCNormalizer

# Maps each normalizer-type key to the normalizer class whose `loads` method is
# used in get_confidences to restore a fitted normalizer of that type.
# The keys are also the last element of the (metric, method, normalizer) tuples
# used to look up fitted normalizers.
NORMALIZERS = {
    'min_max': MinMaxNormalizer,
    'quantile': QuantileNormalizer,
    'binned_pcc': BinnedPCCNormalizer,
    'isotonic_pcc': IsotonicPCCNormalizer
}

def get_confidences(normalizers, ues):
    """Turn raw UE values into confidences with every fitted normalizer.

    For each normalizer type, UE method (``UE_METHOD_NAMES``) and generation
    metric (``GEN_METRIC_NAMES``), the encoded normalizer stored under the key
    ``(metric_name, method_name, normalizer_type)`` is restored with the
    matching class from ``NORMALIZERS`` and applied to ``ues[method_name]``.

    Args:
        normalizers: mapping from ``(metric_name, method_name, normalizer_type)``
            to an encoded fitted normalizer.
        ues: mapping from UE method name to that method's UE values.

    Returns:
        Nested mapping ``confidences[normalizer_type][metric_name][method_name]``
        holding the transformed values.
    """
    normalizer_types = ('min_max', 'quantile', 'binned_pcc', 'isotonic_pcc')
    confidences = {
        normalizer_type: defaultdict(dict) for normalizer_type in normalizer_types
    }

    for normalizer_type, per_metric in confidences.items():
        for method_name in UE_METHOD_NAMES:
            for metric_name in GEN_METRIC_NAMES:
                encoded = normalizers[(metric_name, method_name, normalizer_type)]
                fitted = NORMALIZERS[normalizer_type].loads(encoded)
                per_metric[metric_name][method_name] = fitted.transform(ues[method_name])

    return confidences

def calculate_mses(confidences, gen_metrics):
    """Compute the MSE between confidences and generation-metric values.

    Args:
        confidences: nested mapping
            ``confidences[normalizer_type][metric_name][method_name]``, as
            returned by ``get_confidences``.
        gen_metrics: mapping from generation metric name to its values.

    Returns:
        Mapping ``mses[normalizer_type][metric_name]`` to a list of MSE values,
        one per UE method, in the order of ``UE_METHOD_NAMES``.
    """
    normalizer_types = ('min_max', 'quantile', 'binned_pcc', 'isotonic_pcc')
    mses = {normalizer_type: defaultdict(dict) for normalizer_type in normalizer_types}

    for normalizer_type, per_metric_confidences in confidences.items():
        for metric_name in GEN_METRIC_NAMES:
            per_method_mses = []
            for method_name in UE_METHOD_NAMES:
                target = gen_metrics[metric_name]
                residual = per_metric_confidences[metric_name][method_name] - target
                per_method_mses.append((residual ** 2).mean())
            mses[normalizer_type][metric_name] = per_method_mses

    return mses

  from .autonotebook import tqdm as notebook_tqdm


### All datasets

In [7]:
from lm_polygraph.utils.normalize import get_mans_ues_metrics

# Load and concatenate all UE values and metrics for all test datasets
# `ues` is later indexed by UE method name and `gen_metrics` by generation metric name
# (see get_confidences / calculate_mses).
ues, gen_metrics = get_mans_ues_metrics(EVAL_MAN_PATHS.values(), UE_METHOD_NAMES, GEN_METRIC_NAMES)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29e8a4220>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f093df0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a43d3df0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a392bee0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29e8a4100>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a8c23eb0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29e8a7ac0>]


In [8]:
# Normalize the raw UE values with every fitted normalizer, then measure how far
# each resulting confidence is from the generation metric (MSE per UE method).
confidences = get_confidences(fitted_normalizers, ues)
mses = calculate_mses(confidences, gen_metrics)

In [9]:
def plot_mses(ax, mses, title, metric_name='AlignScore'):
    """Draw a grouped bar chart of per-method MSEs, one bar per normalizer type.

    Args:
        ax: matplotlib axes to draw on.
        mses: mapping ``mses[normalizer_type][metric_name]`` to a sequence of
            MSE values, one per entry of ``UE_METHOD_NAMES`` (as produced by
            ``calculate_mses``).
        title: axes title.
        metric_name: generation metric whose MSEs are plotted; defaults to
            ``'AlignScore'``, the metric previously hard-coded here.
    """
    # Bar positions are computed here rather than read from a notebook-level `x`,
    # so the function no longer depends on a variable defined in a later cell.
    positions = np.arange(len(UE_METHOD_NAMES))

    # (normalizer key, horizontal offset within the group, bar color, legend label)
    series = (
        ('min_max', -0.3, 'g', 'MinMax'),
        ('quantile', -0.1, 'b', 'Quantile'),
        ('binned_pcc', 0.1, 'tab:olive', 'Binned'),
        ('isotonic_pcc', 0.3, 'r', 'Isotonic'),
    )
    for normalizer_key, offset, color, label in series:
        ax.bar(positions + offset, mses[normalizer_key][metric_name],
               width=0.2, color=color, align='center', label=label)

    ax.set_xticks(positions, UE_METHOD_NAMES, rotation=90)

    ax.set_title(title)
    ax.set_ylabel('MSE')

In [10]:
import matplotlib.pyplot as plt

# Bar-group positions, one per UE method
x = np.array(list(range(len(UE_METHOD_NAMES))))

fig, ax = plt.subplots(1, 1, figsize=(9, 9))

plot_mses(ax, mses, 'MSE between true AlignScore and confidence')

handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(1.13, 0.97))

fig.tight_layout()
# Change this to plt.show() to display inline
# bbox_inches='tight' enlarges the saved canvas to include the legend, which is
# anchored outside the figure area and would otherwise be cropped in the PDF.
fig.savefig('normalization_mse_total.pdf', bbox_inches='tight')
# Close (rather than clear) the figure so no empty figure stays open in the kernel
plt.close(fig)

<Figure size 900x900 with 0 Axes>

### OOD Datasets

In [11]:
import shlex

# Per held-out dataset: confidences and MSEs obtained with normalizers fitted
# on every other dataset's train manager.
ood_confidences = {}
ood_mses = {}

# NOTE: this loop rebinds `ues`, `gen_metrics` and `fitted_normalizers`, which the
# all-datasets section also uses; re-run that section's loading cells before
# re-running its analysis cells.
for dataset_name in DATASET_NAMES:
    # Fit normalizers excluding current OOD dataset from train set
    train_man_paths_wo_dataset = [value for key, value in TRAIN_MAN_PATHS.items() if key != dataset_name]
    train_man_paths_wo_dataset = get_man_paths_list(train_man_paths_wo_dataset)
    # The config path is shell-quoted so unusual characters in it cannot split the command
    exit_status = os.system(
        f'HYDRA_CONFIG={shlex.quote(config_path)} polygraph_normalize '
        f'save_path="./ood_{dataset_name}" man_paths={train_man_paths_wo_dataset}'
    )
    # Stop here instead of silently evaluating a stale or missing normalizer file
    if exit_status != 0:
        raise RuntimeError(
            f'polygraph_normalize failed for OOD dataset {dataset_name} with exit status {exit_status}'
        )

    # Get UE and metric values for OOD dataset
    ues, gen_metrics = get_mans_ues_metrics([EVAL_MAN_PATHS[dataset_name]], UE_METHOD_NAMES, GEN_METRIC_NAMES)

    # The file is read with pickle despite its .json extension
    with open(f'./ood_{dataset_name}/fitted_normalizers.json', 'rb') as f:
        fitted_normalizers = pickle.load(f)

    ood_confidences[dataset_name] = get_confidences(fitted_normalizers, ues)
    ood_mses[dataset_name] = calculate_mses(ood_confidences[dataset_name], gen_metrics)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290a82dd0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2927dbee0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290aa0610>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290ab4760>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290ab41c0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290a82920>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2af5b8910>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x299d83a30>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x1486fbee0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x299da06a0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x299db46a0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x299db4160>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x299d82770>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a95abeb0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x295483a30>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x297e5bee0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29543b340>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x295439390>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x297e6fd90>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2954805b0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f13fdf0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f183880>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a1c5bf70>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f137dc0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f135a80>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a1c6be80>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29f180790>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a9527fa0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x296f83b50>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29a45bee0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x296f37d30>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x283c640a0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29a46be20>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x296f00ca0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2b06fc130>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290081690>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29335bf10>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290037ac0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290035720>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29336bdf0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x290000ca0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x103625c30>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificatio

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x298c83a30>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29be5bee0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x28b5923e0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x298c35570>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29be6bdc0>]
Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x298c007f0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2b05fbf40>]


In [12]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# Datasets are plotted two per figure; if their number is odd, the last one
# gets a figure of its own.
is_even = (len(DATASET_NAMES) % 2 == 0)
even_datasets_subset = DATASET_NAMES if is_even else DATASET_NAMES[:-1]
dataset_pairs = [even_datasets_subset[i:i + 2] for i in range(0, len(even_datasets_subset), 2)]

# Positions 0..len(UE_METHOD_NAMES)-1. Not referenced in this cell;
# NOTE(review): presumably read as a global by plot_mses - confirm before removing.
x = np.arange(len(UE_METHOD_NAMES))


def _save_ood_mse_figure(dataset_group, figsize, legend_anchor):
    """Draw one MSE panel per dataset in a single row and save the figure as a PDF.

    Args:
        dataset_group: non-empty sequence of dataset names (keys of ``ood_mses``),
            one panel is drawn per name, left to right.
        figsize: ``(width, height)`` passed to ``plt.figure``.
        legend_anchor: ``bbox_to_anchor`` of the figure-level legend.

    The output file is named after the last dataset of the group:
    ``normalization_mse_ood_<name>.pdf``.
    """
    fig = plt.figure(figsize=figsize)
    gs = GridSpec(1, len(dataset_group), figure=fig)

    for col, dataset_name in enumerate(dataset_group):
        ax = fig.add_subplot(gs[0, col])
        plot_mses(ax, ood_mses[dataset_name], f'MSE between true AlignScore and confidence: {dataset_name}')
        # The figure-level legend is built from the handles of the last panel drawn.
        handles, labels = ax.get_legend_handles_labels()

    fig.legend(handles, labels, bbox_to_anchor=legend_anchor)
    fig.tight_layout()

    # The legend is anchored outside the figure area, so the bounding box has to
    # be expanded on save, otherwise the legend is cut off in the PDF.
    # To display inline instead, replace the savefig call with plt.show().
    fig.savefig(f'normalization_mse_ood_{dataset_group[-1]}.pdf', bbox_inches='tight')

    # Close (rather than clear) the figure: a cleared figure stays open, is
    # echoed by the notebook as an empty "<Figure ... with 0 Axes>" and keeps
    # its memory allocated.
    plt.close(fig)


for dataset_pair in dataset_pairs:
    _save_ood_mse_figure(dataset_pair, figsize=(12, 6), legend_anchor=(1.1, 0.95))

if not is_even:
    _save_ood_mse_figure(DATASET_NAMES[-1:], figsize=(6, 6), legend_anchor=(1.2, 0.95))

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

### PRR change relative to raw uncertainty

In [13]:
from lm_polygraph.utils.normalize import filter_nans
from lm_polygraph.ue_metrics.pred_rej_area import PredictionRejectionArea
from lm_polygraph.ue_metrics.ue_metric import (
    get_random_scores,
    normalize_metric,
)
import pandas as pd
from IPython.core.debugger import set_trace

ue_metric = PredictionRejectionArea()

# Normalizer key in `ood_confidences` -> column title in the result tables.
# Insertion order defines the column order.
NORMALIZER_COLUMNS = {
    'min_max': 'MinMax',
    'quantile': 'Quantile',
    'binned_pcc': 'Binned PCC',
    'isotonic_pcc': 'Isotonic PCC',
}
cols = list(NORMALIZER_COLUMNS.values())

# For each dataset take all confidences calculated in the OOD setting and
# compare the PRR they yield against the PRR of the raw, unnormalized UE.
for dataset_name, path in EVAL_MAN_PATHS.items():
    res = {}
    all_ues, all_gen_metrics = get_mans_ues_metrics([path], UE_METHOD_NAMES, GEN_METRIC_NAMES)

    for metric_name in GEN_METRIC_NAMES:
        gen_metrics = all_gen_metrics[metric_name]
        for ue_method_name in UE_METHOD_NAMES:
            ues = all_ues[ue_method_name]

            filtered_metric, filtered_ues = filter_nans(gen_metrics, ues)

            # Reference points used to normalize every score of this method:
            # the oracle ranks samples by the (negated) quality metric itself.
            oracle_score = ue_metric(-filtered_metric, filtered_metric)
            random_score = get_random_scores(ue_metric, filtered_metric)

            raw_ue_metric_val = ue_metric(filtered_ues, filtered_metric)
            raw_score = normalize_metric(raw_ue_metric_val, oracle_score, random_score)

            diffs = []
            for normalizer_key in NORMALIZER_COLUMNS:
                # Negated because PRR is computed from uncertainty, not confidence
                normalized_ues = -np.array(
                    ood_confidences[dataset_name][normalizer_key][metric_name][ue_method_name]
                )

                # Confidences must line up one-to-one with the NaN-filtered samples
                assert len(filtered_ues) == len(normalized_ues), (
                    f'{dataset_name}/{ue_method_name}/{normalizer_key}: '
                    f'{len(normalized_ues)} confidences for {len(filtered_ues)} samples'
                )

                normalized_ue_metric_val = ue_metric(normalized_ues, filtered_metric)
                normalized_score = normalize_metric(normalized_ue_metric_val, oracle_score, random_score)
                diffs.append(raw_score - normalized_score)

            # With several generation metrics the entry of a later metric
            # overwrites the earlier one, since rows are keyed by UE method only.
            res[ue_method_name] = diffs

    # One table per dataset: difference between raw UE PRR and the PRR based on
    # normalized confidence. Lower is better; a negative value means the
    # normalized confidence improves upon the raw PRR.
    df = pd.DataFrame.from_dict(res, orient='index', columns=cols)
    display(df)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29df27430>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,0.0,3.027419e-05,0.045624,0.0
Perplexity,5.4e-05,6.973301e-05,0.01712,0.0001025691
MeanTokenEntropy,0.0,8.562246e-05,0.071875,0.0001270313
MeanPointwiseMutualInformation,0.0,-1.521409e-05,0.023563,1.318007e-06
MeanConditionalPointwiseMutualInformation,5.4e-05,6.973301e-05,0.01712,0.0001025691
PTrue,0.0,5.897441e-05,-0.040076,-0.01698985
PTrueSampling,0.0,0.0004133538,-0.005526,0.09455222
MonteCarloSequenceEntropy,0.0,6.969507e-05,0.072111,0.0003595616
MonteCarloNormalizedSequenceEntropy,4.3e-05,7.251505e-06,0.026097,0.0002124271
LexicalSimilarity_rouge1,0.0,6.378414e-05,0.240101,0.0004681824


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a0ec02b0>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,0.0,2.2e-05,-0.01449,0.0
Perplexity,0.0,3.5e-05,0.077725,0.0
MeanTokenEntropy,0.0,8.8e-05,0.051275,0.0
MeanPointwiseMutualInformation,0.0,0.000218,0.372224,0.0
MeanConditionalPointwiseMutualInformation,0.0,3.5e-05,0.077725,0.0
PTrue,0.0,-9.7e-05,0.014158,0.0
PTrueSampling,0.0,0.001666,0.059528,0.0
MonteCarloSequenceEntropy,0.000199,0.000205,0.100969,0.005841522
MonteCarloNormalizedSequenceEntropy,0.0,-0.00013,0.026921,0.0
LexicalSimilarity_rouge1,0.0,5e-05,0.017079,0.0


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29df24dc0>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,0.0,2.871689e-05,0.034351,0.0
Perplexity,0.0,-7.75797e-06,0.076598,0.0
MeanTokenEntropy,0.0,4.218778e-05,-0.078761,0.0
MeanPointwiseMutualInformation,0.0,2.981897e-05,0.171447,0.0
MeanConditionalPointwiseMutualInformation,0.0,-7.75797e-06,0.076598,0.0
PTrue,-0.010234,-0.01012989,-0.7553,-0.5903252
PTrueSampling,-0.002085,-0.001342966,-0.099219,-0.002084942
MonteCarloSequenceEntropy,0.0,1.287856e-05,0.018792,0.0
MonteCarloNormalizedSequenceEntropy,0.0,6.304778e-07,0.071458,0.0
LexicalSimilarity_rouge1,0.0,-0.001144289,0.16548,-0.0003125111


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a9e91d20>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,-1.436972e-05,0.000125,0.076625,-1.4e-05
Perplexity,0.0,-2.1e-05,0.032847,0.0
MeanTokenEntropy,0.0,-2.5e-05,0.210053,0.0
MeanPointwiseMutualInformation,0.0,4.6e-05,-0.102983,0.0
MeanConditionalPointwiseMutualInformation,0.0,-2.1e-05,0.032847,0.0
PTrue,0.0,-0.000144,0.44975,0.086119
PTrueSampling,0.0,-0.000199,-0.003844,-0.006621
MonteCarloSequenceEntropy,-1.436972e-05,-0.000104,0.068503,-1.4e-05
MonteCarloNormalizedSequenceEntropy,0.0,-6.7e-05,0.040447,-1.4e-05
LexicalSimilarity_rouge1,0.0,6.9e-05,0.340906,5.7e-05


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a74d3460>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,0.0,-1e-05,0.146019,0.0
Perplexity,0.0,-2.4e-05,0.023779,0.0
MeanTokenEntropy,0.0,8.1e-05,0.040141,0.0
MeanPointwiseMutualInformation,0.0,1.4e-05,0.015287,0.0
MeanConditionalPointwiseMutualInformation,0.0,-2.4e-05,0.023779,0.0
PTrue,0.0,-1.4e-05,0.124073,0.03356028
PTrueSampling,0.0,-0.000205,0.135484,0.01691108
MonteCarloSequenceEntropy,0.0,-2.8e-05,0.202491,0.0
MonteCarloNormalizedSequenceEntropy,0.0,-0.000291,-0.02853,0.0
LexicalSimilarity_rouge1,0.0,-5.8e-05,0.054083,0.0009814974


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a8b7bfd0>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,0.0,-0.000154,0.093635,0.0
Perplexity,0.0,0.000233,0.004554,0.0
MeanTokenEntropy,0.0,-0.000198,0.016244,0.0
MeanPointwiseMutualInformation,0.0,0.00014,-0.067971,0.0
MeanConditionalPointwiseMutualInformation,0.0,0.000233,0.004554,0.0
PTrue,0.0,-1.7e-05,0.270316,0.0004404489
PTrueSampling,0.0,-0.000116,0.05436,0.0002749514
MonteCarloSequenceEntropy,0.0,0.000113,0.233209,0.0
MonteCarloNormalizedSequenceEntropy,0.0,0.000165,-0.015257,0.0
LexicalSimilarity_rouge1,0.0,-0.000724,0.04252,3.040329e-05


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x29df9fc40>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,0.0,0.000124,-0.02184,0.0
Perplexity,0.0,-1e-06,0.011166,0.0
MeanTokenEntropy,0.0,8.1e-05,0.0196,0.0
MeanPointwiseMutualInformation,0.0,0.000111,0.045119,0.0
MeanConditionalPointwiseMutualInformation,0.0,-1e-06,0.011166,0.0
PTrue,0.0,-9.9e-05,-0.050726,0.0003519649
PTrueSampling,0.0,1.3e-05,-0.017532,0.001190431
MonteCarloSequenceEntropy,0.0,5.5e-05,-0.014168,0.0
MonteCarloNormalizedSequenceEntropy,0.0,-1.2e-05,-0.000438,0.0
LexicalSimilarity_rouge1,0.0,4e-06,0.083852,-5.900041e-05


In [14]:
# Same comparison as for the individual datasets, but on all evaluation
# datasets concatenated and with the jointly computed `confidences`.
all_ues, all_gen_metrics = get_mans_ues_metrics(list(EVAL_MAN_PATHS.values()), UE_METHOD_NAMES, GEN_METRIC_NAMES)

# Keys of `confidences`, listed in the same order as the table columns in `cols`
normalizer_keys = ('min_max', 'quantile', 'binned_pcc', 'isotonic_pcc')

# Start from an empty result so that this cell does not reuse (and mutate)
# whatever dict a previously executed cell left in `res`.
res = {}

for metric_name in GEN_METRIC_NAMES:
    gen_metrics = all_gen_metrics[metric_name]
    for ue_method_name in UE_METHOD_NAMES:
        ues = all_ues[ue_method_name]

        filtered_metric, filtered_ues = filter_nans(gen_metrics, ues)

        # Reference points used to normalize every score of this method:
        # the oracle ranks samples by the (negated) quality metric itself.
        oracle_score = ue_metric(-filtered_metric, filtered_metric)
        random_score = get_random_scores(ue_metric, filtered_metric)

        raw_ue_metric_val = ue_metric(filtered_ues, filtered_metric)
        raw_score = normalize_metric(raw_ue_metric_val, oracle_score, random_score)

        diffs = []
        for normalizer_key in normalizer_keys:
            # Negated because PRR is computed from uncertainty, not confidence
            normalized_ues = -np.array(confidences[normalizer_key][metric_name][ue_method_name])

            # Confidences must line up one-to-one with the NaN-filtered samples
            assert len(filtered_ues) == len(normalized_ues), (
                f'{ue_method_name}/{normalizer_key}: '
                f'{len(normalized_ues)} confidences for {len(filtered_ues)} samples'
            )

            normalized_ue_metric_val = ue_metric(normalized_ues, filtered_metric)
            normalized_score = normalize_metric(normalized_ue_metric_val, oracle_score, random_score)
            diffs.append(raw_score - normalized_score)

        # With several generation metrics the entry of a later metric
        # overwrites the earlier one, since rows are keyed by UE method only.
        res[ue_method_name] = diffs

# Difference between raw UE PRR and the PRR based on normalized confidence.
# Lower is better; a negative value means normalization improves upon raw PRR.
df = pd.DataFrame.from_dict(res, orient='index', columns=cols)
display(df)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a202f100>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a8b79bd0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a9e62140>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a9e454b0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a95053f0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2a9e84af0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x2af5bb9a0>]


Unnamed: 0,MinMax,Quantile,Binned PCC,Isotonic PCC
MaximumSequenceProbability,-5.721821e-06,-7e-06,-0.050397,-5.721821e-06
Perplexity,-8.007145e-06,8e-06,0.018182,-7.831774e-06
MeanTokenEntropy,3.145503e-10,2e-06,-0.008805,3.145503e-10
MeanPointwiseMutualInformation,0.0,6.9e-05,-0.313686,2.883105e-08
MeanConditionalPointwiseMutualInformation,-8.007145e-06,8e-06,0.018182,-7.831774e-06
PTrue,-3.504455e-05,5.1e-05,-0.759635,-0.3782925
PTrueSampling,0.0,2e-06,-0.37516,-0.05205038
MonteCarloSequenceEntropy,-5.044093e-06,-3e-06,-0.068054,-5.044093e-06
MonteCarloNormalizedSequenceEntropy,3.145507e-10,-5e-06,0.009059,3.145507e-10
LexicalSimilarity_rouge1,0.0,-2.3e-05,0.062659,-1.029108e-05
