## Importing libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import os
from tqdm.notebook import tqdm
from pywaffle import Waffle
import itertools

from dtw import dtw
from numpy.linalg import norm
from scipy.stats import entropy
import math

import sys
sys.path.append(os.path.abspath("../src/"))  # or any other path to where `ourlibs.py` is
from ourlib import *

## Loading general and metrics functions

In [2]:
def has_any_false(values):
    return not any(values)

def all_equal(values):
    previous_v = values[0]
    for v in values[1:]:
        if v == 'None of the above':
            continue
        if v != previous_v:
            return False
        previous_v = v
    return True

def get_answer(c, choices):
    if c in choices:
        return choices[c]
    return None

is_correct = lambda totals: True if totals['correct'] > totals['incorrect'] and totals['correct'] > totals['nota'] else False
is_not_correct = lambda totals: True if totals['incorrect'] > totals['correct'] and totals['incorrect'] > totals['nota'] else False

# for chance model
def get_random_key(my_dict):
  if not my_dict:
    return None  # Return None if the dictionary is empty
  return random.choice(list(my_dict.keys()))

def get_most_voted_response(votes):
    max_votes = -1
    max_voted = None
    for vote, value in votes.items():
        if value > max_votes:
            max_votes = value
            max_voted = vote
    return max_voted

def compute_auc_dtw(data: dict):
    x = np.array(sorted(data.keys()))
    y = np.array([data[xi] for xi in x])
    area = np.trapezoid(y, x)
    # # Common x
    # x_ref = np.linspace(0.0, 1.0, num=len(x))
    y_perfect = np.ones_like(x)
    # Worst-case curve: y=1 at x=0, y=0 elsewhere
    y_worst = np.zeros_like(x)
    y_worst[0] = 1.0
    # Compute normalization factor
    dtw_max, _, _, _ = dtw(y_perfect.reshape(-1, 1), y_worst.reshape(-1, 1), dist=lambda a, b: norm(a - b, ord=1))
    # Compute DTW to flat line
    y_ref = np.ones_like(y)
    dtw_val, _, _, _ = dtw(y.reshape(-1, 1), y_ref.reshape(-1, 1), dist=lambda a, b: norm(a - b, ord=1))
    norm_dtw = 1.0 - (dtw_val / dtw_max if dtw_max > 0 else 0)
    return area, norm_dtw

def compute_consistency_index(au_value, au_ref):
    divisor = 1.0
    if au_value > au_ref:
        divisor = 1.0 - au_ref
    elif au_value < au_ref:
        divisor = au_ref
    return (au_value - au_ref)/(divisor)

## Setting directory, benchmark, evaluation type, model, temperature and consistency levels

In [3]:
# change directory and benchmark here
DIR='../data/MedQA'
BENCH='MedQA'

# choose type of evaluation here
evaluation_types = [
    # 'standard_MC',
    'standard_MC_shuffled',
    # 'standard_MC_wNOTA',
    # 'standard_MC_wNOTA_shuffled',
    # 'decoupled_MC',
    # 'decoupled_MC_wNOTA',
    # 'decoupled_MC_shuffled',
    # 'decoupled_MC_wNOTA_shuffled'
]

NOTA_STRING = "None of the others"

# default values to compute consistency levels
consistency_TH = 0.9
consistency_TH_incorrect = -0.9

# choose model here
models = [
    #"Bio-Medical-Llama-3-8B",
    #"Llama-3-8B-Instruct",
    #"medllama3-v20",
    #"llama-3.1-8b",
    #"BioMistral-7B",
    "Mistral-7B-Instruct",
    "medalpaca-7b",
    #"llama1-7b",
]

# comment next 3 lines below in case you won't calculate chance 
total_chance_models = 5
for i in range(total_chance_models):
    models.append( f"chance_m{i:03d}" )

# uncomment next 2 lines if there is a temperature in the file name
# temperature_values = ['0']#, '5']
# models = [ f"{model}_{fs}" for model in models for fs in temperature_values]

## Reading files and creating chance fields (if chance)

In [4]:
for model in models:
    print(f"Current model: {model}")
    
    if model.find('chance') < 0:
        is_chance_model = False
        # change file name here
        input_file = os.path.join(DIR, f"{BENCH}_wOutputs_{model}_0_merged.xlsx")
    else:
        is_chance_model = True
        # just get the first model to keep track of the number of alternatives for each question
        # change file name here
        input_file = os.path.join(DIR, f"{BENCH}_wOutputs_{models[0]}_0_merged.xlsx")
    
    input_data = pd.read_excel(input_file)

Current model: Mistral-7B-Instruct
Current model: medalpaca-7b
Current model: chance_m000
Current model: chance_m001
Current model: chance_m002
Current model: chance_m003
Current model: chance_m004


## Calculating results

In [5]:
totals = {}

for model in models:
    print(f"Current model: {model}") 

    totals[model] = { 
        'count': 0,
        'correct_original': 0,
        'correct_mv': 0, 
        'correct_consistency_adjusted': 0,
        'correct_car': { c/10 : 0 for c in range(11)  }, # for CAR curve
        'consistency': {    
            'correct-high': 0,
            'correct-low': 0,
            'incorrect-low': 0,
            'incorrect-high': 0,
        },
        'positive_consistency': 0,
        'negative_consistency': 0,
        'individual_evals': {},
        'individual_evals_hits': {},
        'consistency_distribution' : { c/10 : 0 for c in range(11)  } | { -c/10 : 0 for c in range(11)  },
        'entropy' : 0,
        'positive_entropy': 0,
        'negative_entropy': 0
    }
    
    for idx, row in input_data.iterrows():
        try:
            evaluation_data = json.loads(row['expanded_evaluation'])
        except:
            continue

        correct_choice = evaluation_data['standard_MC'][0]['correct_choice']
        choices = evaluation_data['standard_MC'][0]['choices']
        #model_answer = get_answer(model_choice, choices)

        # compute standard accuracy
        if not is_chance_model:
            if 'model_choice' in evaluation_data['standard_MC'][0]:
                model_choice = evaluation_data['standard_MC'][0]['model_choice']
            else:
                continue
        else:
            model_choice = get_random_key(choices)

        votes = {}
        local_total = {'correct': 0, 'incorrect': 0, 'count': 0}
        for evaluation_type in evaluation_types:
            for ridx, decoupled_response in enumerate(evaluation_data[evaluation_type]):
                evaluation_signature = f"{evaluation_type}_{ridx}"
                if evaluation_signature not in totals[model]['individual_evals']: # for individual evaluation
                    totals[model]['individual_evals'][evaluation_signature] = 0
                    totals[model]['individual_evals_hits'][evaluation_signature] = []
                if 'model_choice' not in decoupled_response:
                    print(decoupled_response)
                    continue
                if not is_chance_model:
                    alt_model_choice = decoupled_response['model_choice']
                else:
                    alt_model_choice = get_random_key(decoupled_response['choices'])
                if alt_model_choice in decoupled_response['choices']:
                    alt_model_answer = decoupled_response['choices'][alt_model_choice]
                else:
                    alt_model_answer = None # provided choice is not available
                    continue # just skpe
                alt_correct_choice = decoupled_response['correct_choice']
                if alt_model_choice == alt_correct_choice:
                    local_total['correct'] += 1
                    totals[model]['individual_evals'][evaluation_signature] += 1
                    totals[model]['individual_evals_hits'][evaluation_signature].append(1)
                else:
                    local_total['incorrect'] += 1
                    totals[model]['individual_evals_hits'][evaluation_signature].append(0)
                local_total['count'] += 1
                choice_vote = find_key_by_value(choices, alt_model_answer)
                if choice_vote not in votes:
                    votes[ choice_vote ] = 0
                votes[ choice_vote ] += 1
        totals[model]['count'] += 1

        if local_total['count'] == 0: # garbage outputs, just skip
            continue

        if model_choice == correct_choice: # MCQA
            totals[model]['correct_original'] += 1
        max_voted = get_most_voted_response( votes )

        # compute entropy
        vote_values = list(votes.values())
        H_max = math.log2(len(vote_values))
        p = np.array(vote_values) / np.sum(vote_values)
        H = entropy(p, base=2)
        normalized_H = H/H_max if H_max > 0 else 0
        totals[model]['entropy'] += normalized_H

        if max_voted == correct_choice:
            consistency_level = votes[max_voted]/local_total['count']
            totals[model]['correct_mv'] += 1
            if consistency_level > consistency_TH:
                totals[model]['consistency']['correct-high'] += 1
            else:
                totals[model]['consistency']['correct-low'] += 1 
            totals[model]['positive_consistency'] += consistency_level
            for _c in range(10):
                c = _c/10
                c_next = (_c+1)/10
                if consistency_level > c and consistency_level <= c_next:
                    totals[model]['consistency_distribution'][c] += 1
            totals[model]['positive_entropy'] += normalized_H
        else:
            consistency_level = votes[max_voted]/local_total['count']
            if  consistency_level > consistency_TH:
                totals[model]['consistency']['incorrect-high'] += 1
            else:
                totals[model]['consistency']['incorrect-low'] += 1
            totals[model]['negative_consistency'] += consistency_level
            consistency_level = -consistency_level

            for _c in range(10):
                c = -_c/10
                c_next = -(_c+1)/10
                if consistency_level < c and consistency_level >= c_next:
                    totals[model]['consistency_distribution'][c] += 1
            totals[model]['negative_entropy'] += normalized_H
        totals[model]['correct_consistency_adjusted'] += consistency_level

        # compute totals for CAR curve
        positive_consistency = local_total['correct']/local_total['count']
        for c_TH in totals[model]['correct_car']:
            if positive_consistency >= c_TH:
                totals[model]['correct_car'][c_TH] += 1

Current model: Mistral-7B-Instruct
Current model: medalpaca-7b
Current model: chance_m000
Current model: chance_m001
Current model: chance_m002
Current model: chance_m003
Current model: chance_m004


## Creating dicts for results

In [6]:
data_dict = {
    'model': [],
    'MCQA': [],
    'MV': [],
    'mean_H': [],
    'mean_+H': [],
    'mean_-H': []}

data_dict = data_dict | {f'MCA({c/10})': [] for c in range(11)}

data_dict = data_dict | {
    'AUCAR': [],
    'DTW': [],
    'CORE': [],
    'CI': [],
    'CoRA': [],
    'CAR': [],
    'MC': [],
    'MPC': [], # mean positive consistency
    'MNC': [], # mean negative consistency
    }

## Adding results to dicts

In [7]:
for model, results in totals.items():
    print(model)
    data_dict['model'].append(model)

    MCQA= results['correct_original']/results['count']
    print(f"  MCQA accuracy: {MCQA:.4F}")
    data_dict['MCQA'].append(MCQA)

    MV = results['correct_mv']/results['count']
    print(f"  MV accuracy: {MV:.4F}")
    data_dict['MV'].append(MV)

    mean_entropy = results['entropy']/results['count']
    print(f"  Mean entropy: {mean_entropy}")
    data_dict['mean_H'].append(mean_entropy)

    mean_entropy = results['positive_entropy']/results['correct_original']
    print(f"  Mean positive entropy: {mean_entropy}")
    data_dict['mean_+H'].append(mean_entropy)

    mean_entropy = results['negative_entropy']/(results['count'] - results['correct_original'])
    print(f"  Mean negative entropy: {mean_entropy}")
    data_dict['mean_-H'].append(mean_entropy)

    CAR = {}
    data_for_auc = {}
    for c_TH in results['correct_car']:
        CAR[c_TH] = results['correct_car'][c_TH]/results['count']
        data_dict[f'MCA({c_TH})'].append(CAR[c_TH])
        
        data_for_auc[c_TH] = results['correct_car'][c_TH]/results['count']
        
    # Ensure data is sorted by x (in case it's not)
    auc, dtw_1 = compute_auc_dtw(data_for_auc)
    data_dict['AUCAR'].append(auc)
    data_dict['DTW'].append(dtw_1)
    data_dict['CORE'].append(auc * dtw_1)
    
    CI = 1.0 - (MCQA-CAR[c_TH])
    data_dict['CI'].append(CI)
  
    CoRA = MCQA * CI
    print(f"  CoRA accuracy: {CoRA:.4F}")
    data_dict['CoRA'].append(CoRA)

    CAR = results['correct_consistency_adjusted']/results['count']
    data_dict['CAR'].append(CAR)

    if results['correct_original'] > 0:
        data_dict['MC'].append( results['correct_consistency_adjusted']/results['correct_original'])
    else: 
         data_dict['MC'].append(0.0)
    if results['correct_original'] > 0:
        data_dict['MPC'].append(results['positive_consistency']/results['correct_original'])
    else: 
         data_dict['MPC'].append(0.0)
    if results['correct_original'] > 0:
        data_dict['MNC'].append(results['negative_consistency']/abs(results['count']-results['correct_original']))
    else: 
         data_dict['MNC'].append(0.0)

    print(f"  {results['consistency']}")
    for evaluation_signature, individual_total in results['individual_evals'].items():
        if evaluation_signature not in data_dict:
            data_dict[evaluation_signature] = []
        print(f"    {evaluation_signature}: {individual_total/results['count']:.4F}")
        data_dict[evaluation_signature].append( individual_total/results['count'])

Mistral-7B-Instruct
  MCQA accuracy: 0.2113
  MV accuracy: 0.1972
  Mean entropy: 0.92491542829513
  Mean positive entropy: 0.8627528760132556
  Mean negative entropy: 0.9415705344343976
  CoRA accuracy: 0.1667
  {'correct-high': 0, 'correct-low': 251, 'incorrect-low': 1022, 'incorrect-high': 0}
    standard_MC_shuffled_0: 0.1940
    standard_MC_shuffled_1: 0.1893
    standard_MC_shuffled_2: 0.1932
    standard_MC_shuffled_3: 0.2066
    standard_MC_shuffled_4: 0.1956
    standard_MC_shuffled_5: 0.2121
    standard_MC_shuffled_6: 0.2082
    standard_MC_shuffled_7: 0.1838
    standard_MC_shuffled_8: 0.1956
    standard_MC_shuffled_9: 0.1885
    standard_MC_shuffled_10: 0.1870
medalpaca-7b
  MCQA accuracy: 0.1736
  MV accuracy: 0.1822
  Mean entropy: 0.9268269875826175
  Mean positive entropy: 0.9747115323329495
  Mean negative entropy: 0.9167675917748017
  CoRA accuracy: 0.1435
  {'correct-high': 0, 'correct-low': 232, 'incorrect-low': 1041, 'incorrect-high': 0}
    standard_MC_shuffled_

## Saving results to a xlsx file

In [8]:
df_results = pd.DataFrame(data_dict, index=models)
df_results.to_excel('../data/MedQA/medqa_results.xlsx')

## Visualizing results

In [9]:
df_results.head(10)

Unnamed: 0,model,MCQA,MV,mean_H,mean_+H,mean_-H,MCA(0.0),MCA(0.1),MCA(0.2),MCA(0.3),...,standard_MC_shuffled_1,standard_MC_shuffled_2,standard_MC_shuffled_3,standard_MC_shuffled_4,standard_MC_shuffled_5,standard_MC_shuffled_6,standard_MC_shuffled_7,standard_MC_shuffled_8,standard_MC_shuffled_9,standard_MC_shuffled_10
Mistral-7B-Instruct,Mistral-7B-Instruct,0.211312,0.197172,0.924915,0.862753,0.941571,1.0,0.651218,0.369992,0.150825,...,0.189317,0.193244,0.206599,0.195601,0.212097,0.20817,0.183818,0.195601,0.188531,0.18696
medalpaca-7b,medalpaca-7b,0.173606,0.182247,0.926827,0.974712,0.916768,1.0,0.673998,0.369207,0.148468,...,0.180676,0.205813,0.19403,0.189317,0.19403,0.184603,0.212097,0.201885,0.20817,0.190888
chance_m000,chance_m000,0.203456,0.196386,0.927081,0.896253,0.934955,1.0,0.666929,0.367636,0.141398,...,0.191673,0.194815,0.190102,0.196386,0.179104,0.20817,0.176748,0.20817,0.2011,0.190102
chance_m001,chance_m001,0.197172,0.206599,0.924718,0.969884,0.913626,1.0,0.676355,0.371563,0.156324,...,0.193244,0.216811,0.186174,0.205813,0.223881,0.208955,0.179104,0.188531,0.194815,0.202671
chance_m002,chance_m002,0.202671,0.205813,0.925295,0.938988,0.921814,1.0,0.670856,0.389631,0.176748,...,0.2011,0.186174,0.2011,0.189317,0.202671,0.204242,0.195601,0.219953,0.198743,0.199529
chance_m003,chance_m003,0.206599,0.201885,0.92678,0.90404,0.932701,1.0,0.695208,0.377062,0.160251,...,0.197958,0.202671,0.191673,0.197172,0.200314,0.206599,0.207384,0.200314,0.195601,0.228594
chance_m004,chance_m004,0.197958,0.211312,0.925338,0.983132,0.911074,1.0,0.692852,0.391202,0.18696,...,0.200314,0.195601,0.213668,0.205027,0.220738,0.198743,0.203456,0.212883,0.216811,0.211312


## Consolidating mean chance results [if chance was calculated]

In [10]:
chance_rows = df_results[df_results['model'].str.match(r'^chance_m\d+$')]
mean_row = chance_rows.drop(columns=['model']).mean()
mean_row['model'] = 'chance_model'
df_results = pd.concat([df_results[~df_results['model'].str.match(r'^chance_m\d+$')], pd.DataFrame([mean_row])], ignore_index=True)

In [11]:
df_results.head()

Unnamed: 0,model,MCQA,MV,mean_H,mean_+H,mean_-H,MCA(0.0),MCA(0.1),MCA(0.2),MCA(0.3),...,standard_MC_shuffled_1,standard_MC_shuffled_2,standard_MC_shuffled_3,standard_MC_shuffled_4,standard_MC_shuffled_5,standard_MC_shuffled_6,standard_MC_shuffled_7,standard_MC_shuffled_8,standard_MC_shuffled_9,standard_MC_shuffled_10
0,Mistral-7B-Instruct,0.211312,0.197172,0.924915,0.862753,0.941571,1.0,0.651218,0.369992,0.150825,...,0.189317,0.193244,0.206599,0.195601,0.212097,0.20817,0.183818,0.195601,0.188531,0.18696
1,medalpaca-7b,0.173606,0.182247,0.926827,0.974712,0.916768,1.0,0.673998,0.369207,0.148468,...,0.180676,0.205813,0.19403,0.189317,0.19403,0.184603,0.212097,0.201885,0.20817,0.190888
2,chance_model,0.201571,0.204399,0.925843,0.93846,0.922834,1.0,0.68044,0.379419,0.164336,...,0.196858,0.199214,0.196544,0.198743,0.205342,0.205342,0.192459,0.20597,0.201414,0.206441


## Adding individual standard deviation and mean to results

In [12]:
eval_cols = [c for c in df_results.columns if c.find('standard') == 0]
df_results = df_results.copy().assign(
    individuals_mean=df_results[eval_cols].mean(axis=1),
    individuals_std=df_results[eval_cols].std(axis=1)
)
df_results = df_results.rename(columns={'individuals_mean': 'MCQA+'})

In [13]:
df_results.head()

Unnamed: 0,model,MCQA,MV,mean_H,mean_+H,mean_-H,MCA(0.0),MCA(0.1),MCA(0.2),MCA(0.3),...,standard_MC_shuffled_3,standard_MC_shuffled_4,standard_MC_shuffled_5,standard_MC_shuffled_6,standard_MC_shuffled_7,standard_MC_shuffled_8,standard_MC_shuffled_9,standard_MC_shuffled_10,MCQA+,individuals_std
0,Mistral-7B-Instruct,0.211312,0.197172,0.924915,0.862753,0.941571,1.0,0.651218,0.369992,0.150825,...,0.206599,0.195601,0.212097,0.20817,0.183818,0.195601,0.188531,0.18696,0.195815,0.009282
1,medalpaca-7b,0.173606,0.182247,0.926827,0.974712,0.916768,1.0,0.673998,0.369207,0.148468,...,0.19403,0.189317,0.19403,0.184603,0.212097,0.201885,0.20817,0.190888,0.196529,0.009985
2,chance_model,0.201571,0.204399,0.925843,0.93846,0.922834,1.0,0.68044,0.379419,0.164336,...,0.196544,0.198743,0.205342,0.205342,0.192459,0.20597,0.201414,0.206441,0.200471,0.004738


## Computing consistency index

In [14]:
for metric in ['MCQA', 'MCQA+', 'MV', 'MCA(1.0)', 'AUCAR', 'DTW', 'CORE']:
    au_ref = df_results.loc[df_results['model'] == 'chance_model', metric].values[0]
    df_results[f'CD_{metric}'] = df_results[metric].apply(lambda x: compute_consistency_index(x, au_ref))

## Visualizing table of results by model

In [15]:
results_by_model_df = df_results.set_index('model')[['mean_H', 'mean_+H', 'mean_-H', 'MC', 'MPC', 'MNC', 
                                                     'MCQA', 'MCQA+', 'MV', 'MCA(0.5)', 'MCA(0.6)', 'MCA(0.7)', 
                                                     'MCA(0.8)', 'MCA(0.9)', 'MCA(1.0)', 'AUCAR', 'DTW', 'CORE']]
results_by_model_df.head()

Unnamed: 0_level_0,mean_H,mean_+H,mean_-H,MC,MPC,MNC,MCQA,MCQA+,MV,MCA(0.5),MCA(0.6),MCA(0.7),MCA(0.8),MCA(0.9),MCA(1.0),AUCAR,DTW,CORE
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Mistral-7B-Instruct,0.924915,0.862753,0.941571,-1.044948,0.343021,0.371876,0.211312,0.195815,0.197172,0.012569,0.002357,0.000786,0.0,0.0,0.0,0.173881,0.123881,0.02154
medalpaca-7b,0.926827,0.974712,0.916768,-1.335664,0.38009,0.360439,0.173606,0.196529,0.182247,0.006284,0.001571,0.000786,0.0,0.0,0.0,0.174352,0.124352,0.021681
chance_model,0.925843,0.93846,0.922834,-1.074361,0.372212,0.365091,0.201571,0.200471,0.204399,0.014454,0.0022,0.000628,0.0,0.0,0.0,0.179081,0.129081,0.023132


## Visualizing correlation table

In [16]:
correlation_df = df_results.set_index('model')[['mean_H', 'mean_+H', 'mean_-H', 'MC', 'MPC', 'MNC', 'MCQA', 'MCQA+', 
                                                'MV', 'MCA(0.5)', 'MCA(0.6)', 'MCA(0.7)', 'MCA(0.8)', 'MCA(0.9)', 
                                                'MCA(1.0)', 'AUCAR', 'DTW', 'CORE']].corr()
correlation_df.head()

Unnamed: 0,mean_H,mean_+H,mean_-H,MC,MPC,MNC,MCQA,MCQA+,MV,MCA(0.5),MCA(0.6),MCA(0.7),MCA(0.8),MCA(0.9),MCA(1.0),AUCAR,DTW,CORE
mean_H,1.0,0.976327,-0.954119,-0.91541,0.943492,-0.992257,-0.967706,0.125247,-0.673483,-0.746182,-0.950433,0.0173,,,,0.064687,0.064687,0.062514
mean_+H,0.976327,1.0,-0.996298,-0.806675,0.992837,-0.995632,-0.890274,0.336878,-0.497651,-0.584519,-0.860679,-0.199376,,,,0.279001,0.279001,0.27691
mean_-H,-0.954119,-0.996298,1.0,0.752884,-0.999433,0.98392,0.847827,-0.41657,0.421245,0.512605,0.813724,0.282876,,,,-0.36052,-0.36052,-0.358488
MC,-0.91541,-0.806675,0.752884,1.0,-0.730288,0.85833,0.987316,0.2847,0.914058,0.951039,0.995192,-0.418298,,,,0.342464,0.342464,0.344509
MPC,0.943492,0.992837,-0.999433,-0.730288,1.0,-0.977346,-0.829485,0.446956,-0.390457,-0.483393,-0.793684,-0.315024,,,,0.391733,0.391733,0.389729



## Visualizing table of main metrics

In [17]:
df_results.set_index('model')[['MCQA', 'MCQA+', 'MV', 'MCA(1.0)', 'CORE']]

Unnamed: 0_level_0,MCQA,MCQA+,MV,MCA(1.0),CORE
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mistral-7B-Instruct,0.211312,0.195815,0.197172,0.0,0.02154
medalpaca-7b,0.173606,0.196529,0.182247,0.0,0.021681
chance_model,0.201571,0.200471,0.204399,0.0,0.023132


## Visualizing consistency index

In [18]:
for idx, row in df_results.iterrows():
    print(f"{row['model']}")
    print(f" consistency index: {compute_consistency_index(row['AUCAR'], au_ref):.3F} | MCQA: {row['MCQA']:.3F} | MV: {row['MV']:.3F}")

Mistral-7B-Instruct
 consistency index: 0.154 | MCQA: 0.211 | MV: 0.197
medalpaca-7b
 consistency index: 0.155 | MCQA: 0.174 | MV: 0.182
chance_model
 consistency index: 0.160 | MCQA: 0.202 | MV: 0.204
