# Evaluation

In [59]:
# CSV files with model answers to evaluate. The first path is always loaded; the second
# and third are only loaded/evaluated when non-empty (see the `if MODEL_TO_RUN_PATH2:` /
# `if MODEL_TO_RUN_PATH3:` guards further down).
MODEL_TO_RUN_PATH = 'anli_falcon_inst_zero_shot_diff2.csv'
MODEL_TO_RUN_PATH2 = 'anli_gpt2_CoT1.csv'
# for consistency voting
MODEL_TO_RUN_PATH3 = 'anli_gpt2_CoT3.csv'

# presumably the experiment/prompt variants that exist as CSVs — TODO confirm:
# consistency_zero, zero_shot, few_shot, new_prompt, CoT_zero, CoT_few


### Setting up

In [60]:
# libraries
import pandas as pd
import numpy as np
import json
import os
#import re
import random
import scipy.stats as stats

from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util

In [61]:
def load_jsonl(path):
    """Read a JSON Lines file into a DataFrame, one row per JSON object.

    The file is decoded as UTF-8 and iterated line by line, so records are only
    split on real newlines. (``str.splitlines`` also splits on characters such
    as U+2028, which may appear unescaped inside a JSON string and would cut a
    record in half.) Blank or whitespace-only lines are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed JSON object, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding='utf-8') as file:
        records = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame(records)

In [62]:
# Directory that holds the aNLI files (test.jsonl, test-labels.lst, ...).
# An empty string means "the current working directory".
aNLIPath = ''

# os.path.join keeps an empty or relative base relative; plain concatenation with
# '/test.jsonl' would point at the filesystem root when aNLIPath is empty.
aNLI_test = load_jsonl(os.path.join(aNLIPath, 'test.jsonl'))
label_test = pd.read_csv(os.path.join(aNLIPath, 'test-labels.lst'), header=None, names=['label'])
# The join below matches rows by index position, so both files must have the same length;
# otherwise stories would silently end up with missing labels.
assert len(aNLI_test) == len(label_test), 'test.jsonl and test-labels.lst differ in length'
aNLI_test = aNLI_test.join(label_test)
print('test data length:', len(aNLI_test))

pd.set_option('display.max_colwidth', None)
aNLI_test.head(2)

test data length: 3059


Unnamed: 0,story_id,obs1,obs2,hyp1,hyp2,label
0,87aa0983-9b84-48b1-86ff-160b1567487c-1,Jane was a professor teaching piano to students.,Jane spent the morning sipping coffee and reading a book.,Two of Jane's students were early for their lessons.,None of Jane's students had a lesson that day.,2
1,dfc8584e-13fe-4e26-bdf6-2485e90ef29d-1,Nate had the summer off before college.,Nate's last summer before college was a total blast!,Nate spent the summer traveling and partying.,Nate decided to spend the entire summer working in the Mines.,1


In [63]:
# Training split, loaded the same way as the test split: stories from the .jsonl file,
# gold labels from the .lst file, combined by row position.
# os.path.join keeps an empty or relative aNLIPath relative instead of producing '/train.jsonl'.
aNLI_train = load_jsonl(os.path.join(aNLIPath, 'train.jsonl'))
label_train = pd.read_csv(os.path.join(aNLIPath, 'train-labels.lst'), header=None, names=['label'])
# positional join: both files must line up row for row
assert len(aNLI_train) == len(label_train), 'train.jsonl and train-labels.lst differ in length'
aNLI_train = aNLI_train.join(label_train)

In [64]:
def make_dataset(path, test_set):
    """Load model answers from a CSV and attach the matching test-set rows.

    The CSV at ``path`` is merged with ``test_set`` on ``story_id``. Every value
    of the ``answer`` column is converted to ``str`` (a missing answer becomes
    the text ``'nan'``) and the list wrapping ``["..."]`` / ``['...']`` is
    removed wherever those bracket-quote tokens occur.

    Returns the merged DataFrame.
    """
    def _unwrap(raw_answer):
        text = str(raw_answer)
        # double-quoted wrapping first, then single-quoted
        for token in ('["', '"]', "['", "']"):
            text = text.replace(token, '')
        return text

    merged = pd.read_csv(path).merge(test_set, on='story_id')
    merged['answer'] = merged['answer'].apply(_unwrap)
    return merged
    

In [65]:
# The first run is mandatory; runs two and three are optional and fall back to an
# empty frame with the test-set columns when their path is empty.
answers = make_dataset(MODEL_TO_RUN_PATH, aNLI_test)

answers2 = (
    make_dataset(MODEL_TO_RUN_PATH2, aNLI_test)
    if MODEL_TO_RUN_PATH2
    else pd.DataFrame({column: [] for column in ('story_id', 'obs1', 'obs2', 'hyp1', 'hyp2', 'label')})
)

answers3 = (
    make_dataset(MODEL_TO_RUN_PATH3, aNLI_test)
    if MODEL_TO_RUN_PATH3
    else pd.DataFrame({column: [] for column in ('story_id', 'obs1', 'obs2', 'hyp1', 'hyp2', 'label')})
)

## Categorizing predictions

In [66]:
# Sentence-embedding model. evaluate_it() reads it as the global `model` and uses it as the
# last resort when an answer cannot be mapped to a hypothesis by string matching.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [67]:
# NOTE(review): this similarity-only baseline is disabled by wrapping it in a string literal,
# so `similar_answers` is never defined when the notebook runs top to bottom. The later cell
# that zips over `similar_answers` needs this code re-enabled first.
# Also note: on an exact score tie nothing is appended, which would shift all later entries.
"""
similar_answers = []
cor = 0

for i, row in aNLI_test.iterrows():
    context = model.encode(row.obs1 + ' ' + row.obs2)
    passage_embedding = model.encode([row.hyp1, row.hyp2])
    score = util.dot_score(context, passage_embedding)[0]
    if score[0] > score[1]:
        similar_answers.append(1)
        if row.label == 1:
            cor += 1
    elif score[0] < score[1]:
        similar_answers.append(2)
        if row.label == 2:
            cor +=1
        
    else:  
        print(score)

print(cor)
print(round(cor/len(aNLI_test),3))

pd.DataFrame(similar_answers).to_csv('similarity_answers')
"""
# result noted by the author: 0.541 with only similarity

"\nsimilar_answers = []\ncor = 0\n\nfor i, row in aNLI_test.iterrows():\n    context = model.encode(row.obs1 + ' ' + row.obs2)\n    passage_embedding = model.encode([row.hyp1, row.hyp2])\n    score = util.dot_score(context, passage_embedding)[0]\n    if score[0] > score[1]:\n        similar_answers.append(1)\n        if row.label == 1:\n            cor += 1\n    elif score[0] < score[1]:\n        similar_answers.append(2)\n        if row.label == 2:\n            cor +=1\n        \n    else:  \n        print(score)\n\nprint(cor)\nprint(round(cor/len(aNLI_test),3))\n\npd.DataFrame(similar_answers).to_csv('similarity_answers')\n"

In [68]:
def remove_substrings(string):
    """Delete boilerplate lead-ins such as "the explanation is:" from an answer.

    Each phrase is removed wherever it occurs, in the order listed, and the
    result is returned with leading/trailing whitespace stripped.
    """
    simple_variants = ('the simple explanation is:', 'the simple explanation is ')
    plain_variants = ('the explanation is:', 'the explanation is ')

    cleaned = string
    # The plain variants run twice; the second pass catches a phrase that only
    # forms once an inner occurrence has been removed.
    for phrase in simple_variants + plain_variants + plain_variants:
        # str.replace leaves the text unchanged when the phrase is absent
        cleaned = cleaned.replace(phrase, '')

    return cleaned.strip()

In [69]:
def _match_formulation(answ, formulation_pairs):
    """Return 1 or 2 when exactly one side of some (A, B) phrase pair occurs in ``answ``, else 0."""
    for phrase_a, phrase_b in formulation_pairs:
        has_a = phrase_a in answ
        has_b = phrase_b in answ
        # a pair only decides when the answer does not mention both sides
        if has_a and not has_b:
            return 1
        if has_b and not has_a:
            return 2
    return 0


def _classify_answer(row, formulation_pairs):
    """Map one row's free-text answer to 1 (hypothesis A), 2 (hypothesis B) or 0 (undecided)."""
    # just don't want to deal with casing
    answ, hyp1, hyp2 = str(row.answer).lower(), row.hyp1.lower(), row.hyp2.lower()
    # drop echoed observations and anything after a restated context
    answ = answ.replace(row.obs1.lower(), '')
    answ = answ.replace(row.obs2.lower(), '')
    answ = answ.split('the context is:', 1)[0]

    # the model told just the letter/number of the hypothesis (T5)
    if answ in ('a', '1', 'a.', '\na'):
        return 1
    if answ in ('b', '2', 'b.', '\nb'):
        return 2
    if answ == '':
        return 0

    label = _match_formulation(answ, formulation_pairs)
    if label:
        return label

    if answ.startswith('\nthe correct scenario is a'):
        return 1
    if answ.startswith('\nthe correct scenario is b'):
        return 2

    # keep only what follows the first lead-in that is present
    for lead_in in ('the more likely explanation is that ', "it's more likely that "):
        if lead_in in answ:
            answ = answ[answ.find(lead_in) + len(lead_in):]
            break

    # the model repeated one hypothesis (compared without its last character)
    hyp1_core, hyp2_core = hyp1[:-1], hyp2[:-1]
    if ((answ in hyp1_core) and (answ not in hyp2_core)) or ((hyp1_core in answ) and (hyp2_core not in answ)):
        return 1
    if ((answ in hyp2_core) and (answ not in hyp1_core)) or ((hyp2_core in answ) and (hyp1_core not in answ)):
        return 2

    answ = remove_substrings(answ)
    # share of the answer covered by its longest common substring with each hypothesis
    if answ:
        match1_ratio = SequenceMatcher(None, answ, hyp1).find_longest_match()[2] / len(answ)
        match2_ratio = SequenceMatcher(None, answ, hyp2).find_longest_match()[2] / len(answ)
    else:
        match1_ratio, match2_ratio = 0, 0
    if match1_ratio > 0.8 and match2_ratio < 0.6:
        return 1
    if match2_ratio > 0.8 and match1_ratio < 0.6:
        return 2

    # last resort: embedding similarity between the answer and both hypotheses
    query_embedding = model.encode(answ)
    passage_embedding = model.encode([hyp1, hyp2])
    score = util.dot_score(query_embedding, passage_embedding)[0]
    score1, score2 = float(score[0]), float(score[1])

    # a very good match for one hypothesis and a clearly worse one for the other
    if (score1 > 0.95) and (score2 < 0.8):
        return 1
    if (score2 > 0.95) and (score1 < 0.8):  # the other score must be the low one
        return 2
    if score1 > 0.70 and (score1 - score2) > 0.2:
        return 1
    if score2 > 0.70 and (score2 - score1) > 0.2:
        return 2

    # those are mostly just weird and not correct
    return 0


def evaluate_it(data):
    """Label every model answer as hypothesis 1 (A), 2 (B) or 0 (not assessed) and score the run.

    Each answer goes through a cascade and stops at the first rule that decides:

    1. the answer is only a letter/number ('a', '1', 'b', '2', ...),
    2. exactly one side of a paired A/B formulation occurs in the answer,
    3. the answer starts with '\\nthe correct scenario is a' / '... b',
    4. the answer repeats one hypothesis (substring test in either direction),
    5. the longest common substring covers > 0.8 of the answer for one
       hypothesis and < 0.6 for the other,
    6. embedding similarity, computed with the module-level ``model`` and
       ``util.dot_score``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``answer``, ``obs1``, ``obs2``, ``hyp1``, ``hyp2`` and
        ``label``. It is modified in place: the column ``answr_lbl`` is added
        (later cells rely on that).

    Returns
    -------
    tuple
        ``(data, accuracy, not_assessed)``: the same DataFrame, the fraction of
        rows whose ``answr_lbl`` equals ``label`` (0.0 for an empty frame), and
        the number of rows labelled 0. Rows labelled 0 count as wrong in the
        accuracy.
    """
    # Entry i of the A list is the mirror image of entry i of the B list.
    formulations_A_check = ['the correct answer is a', 'the correct choice is: a', 'the more likely explanation is a',
                            'correct scenario is a', 'a is more likely', 'scenario a is more likely', 'a. is more likely',
                            '<strong>scenario a</strong>', 'correct scenario is a', 'a is the more likely', 'a) is the more likely',
                            'the more likely explanation is choice a', 'the more likely one is hypothesis a', 'a is more likely',
                            'more likely explanation is hypothesis a', 'more likely explanation is that a', 'a seems more likely',
                            'a) is more likely', 'the answer is: a', 'the correct hypothesis is a', 'more similar to the context is text a',
                            'the context is better explained by hypothesis a.', 'the answer is a', 'scenario a', 'the answer: a', 'answer: (a)',
                            'a.', '(a)', "'a'", '(1)', '1.', 'h1', 'hypothesis a', 'text a', 'a. answers the question', 'a. relates better',
                            'a is more similar', 'case a', 'option a', 'first one is more similar', 'a is more consistent', 'a. is correct',
                            'yes, hypothesis b']  # contradicts context
    formulations_B_check = ['the correct answer is b', 'the correct choice is: b', 'the more likely explanation is b',
                            'correct scenario is b', 'b is more likely', 'scenario b is more likely', 'b. is more likely',
                            '<strong>scenario b</strong>', 'correct scenario is b', 'b is the more likely', 'b) is the more likely',
                            'the more likely explanation is choice b', 'the more likely one is hypothesis b', 'b is more likely',
                            'more likely explanation is hypothesis b', 'more likely explanation is that b', 'b seems more likely',
                            'b) is more likely', 'the answer is: b', 'the correct hypothesis is b', 'more similar to the context is text b',
                            'the context is better explained by hypothesis b.', 'the answer is b', 'scenario b', 'the answer: b', 'answer: (b)',
                            'b.', '(b)', "'b'", '(2)', '2.', 'h2', 'hypothesis b', 'text b', 'b. answers the question', 'b. relates better',
                            'b is more similar', 'case b', 'option b', 'second one is more similar', 'b is more consistent', 'b. is correct',
                            'yes, hypothesis a']
    formulation_pairs = list(zip(formulations_A_check, formulations_B_check))

    answers_label = [_classify_answer(row, formulation_pairs) for _, row in data.iterrows()]

    data['answr_lbl'] = answers_label
    answers_correct = len(data.loc[(data['label'] == data['answr_lbl'])])
    not_assessed = len(data.loc[(data['answr_lbl'] == 0)])

    # an empty frame has no accuracy to speak of; report 0.0 instead of dividing by zero
    accuracy = answers_correct / len(data) if len(data) else 0.0

    return data, accuracy, not_assessed

In [70]:
# Label and score each run. evaluate_it() adds the `answr_lbl` column to the frame it is
# given and returns that same frame, so `answers*` carry the labels as well as `predictions*`.
predictions, correct_num, not_assessed = evaluate_it(answers)

# runs two and three are optional
if MODEL_TO_RUN_PATH2:
    predictions2, correct_num2, not_assessed2 = evaluate_it(answers2)
if MODEL_TO_RUN_PATH3:
    predictions3, correct_num3, not_assessed3 = evaluate_it(answers3)

## Looking at how it gets stuff right/wrong
Could the model simply be choosing the answer that is more similar to the context?  
**T5 large zero-shot**:  
* didn't get: it is only around 2 percent, in the error  
    - maybe I could be more lenient with the similarity? or handle hadn't/didn't, or maybe tokenize it?
    - it also sometimes just adds some explanation...  
    - a lot of the errors are that it doesn't get that it is searching for an explanation and just gives the resulting context 
    - and a lot of the errors are just a combination of the two hypotheses 
* wrong: 
    - maybe some knowledge about world missing... |
    - no reasoning 7
    - not easy for me also ||
    
**T5 large CoT zero-shot**: 
* right: some of them are wrong as I'm not generating the whole answer! - SOLVED
* didn't get it: 16% quite a lot!
    - combines observations |
    - just weird ||
    - giving both hypothesis 6
    - truncated |
* wrong: 
    - no reasoning 6
    - both hypothesis 4

**T5 large few-shot** (3): 
* right: Mostly just the letter for answer; could be quite easily explained with higher similarity score
* didn't get it: nothing
* wrong:
    - same words (maybe negation there...): |
    - no reasoning: 8
    - i dont get which one: |

## What is the error for the model?

In [71]:
def find_variation_by_split(dataset, folds, random_state=None):
    """Estimate the spread of the accuracy by scoring shuffled, near-equal folds.

    The rows are shuffled and cut into ``folds`` parts with ``np.array_split``,
    so every row lands in exactly one fold and fold sizes differ by at most
    one. Folds can only be empty when there are fewer rows than folds; such a
    fold is reported with 'ERROR' and scored 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns ``label`` (gold) and ``answr_lbl`` (prediction).
        Every row counts, including rows with ``answr_lbl == 0``.
    folds : int
        Number of folds, at least 1.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for a reproducible shuffle. The
        default ``None`` gives a different shuffle on every call.

    Returns
    -------
    tuple
        ``(correct_frac, stderr)``: the list of per-fold accuracies (length
        ``folds``) and ``1.96 * std(correct_frac) / sqrt(folds)``, i.e. the
        half-width of an approximate 95% interval (population std, ddof=0).

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError('folds must be a positive integer')

    shuffled = dataset.sample(frac=1, random_state=random_state)
    gold = shuffled['label'].to_numpy()
    pred = shuffled['answr_lbl'].to_numpy()

    correct_frac = []
    for fold_positions in np.array_split(np.arange(len(shuffled)), folds):
        if len(fold_positions):
            ok = int(np.sum(pred[fold_positions] == gold[fold_positions]))
            correct_frac.append(ok / len(fold_positions))
        else:
            print('ERROR', 0)
            correct_frac.append(0)

    stderr = np.std(correct_frac) / np.sqrt(folds) * 1.96

    return correct_frac, stderr
    

In [72]:
# NOTE(review): find_variation_by_split compares answr_lbl with label on every row, so rows
# that could not be assessed (answr_lbl == 0) count as wrong here; they are not excluded.
# `answers` already has the answr_lbl column because evaluate_it() added it in place.
correct_list, stderr = find_variation_by_split(answers, 30)

#print('!!!!!!!!!!!!!!!!!!!!!!!')
#print('accuracy:', round(correct_num*100), '+-', round(stderr*100))
#print('not assessed: ', round(not_assessed/len(answers),3))
#print(len(answers))

# same fold analysis for the optional runs
if MODEL_TO_RUN_PATH2:
    correct_list2, stderr2 = find_variation_by_split(answers2, 30)
if MODEL_TO_RUN_PATH3:
    correct_list3, stderr3 = find_variation_by_split(answers3, 30)

In [73]:
# number of rows of the first run that received a label (answr_lbl != 0)
len(answers[answers.answr_lbl!=0])

3058

In [74]:
#answers_sim['answ_lbl'] = similar_answers

### Two models for comparison
Null hypothesis: There is no significant difference between the accuracy of the two models.   
Alternative hypothesis: There is a significant difference between the accuracy of the two models.   
If the p-value is small (<= 0.05), reject the null hypothesis.

In [None]:
# presumably reference accuracies of other systems, in percent (0-100) — TODO confirm
# and record where the numbers come from
davinci = 74.0
ChatGPT = 80.9
Bard = 75.0
BERT = 68.9
GPT = 63.1

# NOTE(review): `test` is not used by any cell visible in this notebook
test = 62

In [None]:
# one sample T-test: do the per-fold accuracies differ from the ChatGPT reference?
# correct_list holds fractions in [0, 1] while the reference constants are percentages,
# so the reference is rescaled to a fraction before testing.
t_stat, p_value = stats.ttest_1samp(correct_list, popmean=ChatGPT / 100)
#print('ChatGPT vs model p-value:', p_value)
if p_value >= 0.05:
    print('no difference')

In [None]:
# paired Two-Sample T-test between the per-fold accuracies of the three runs
# NOTE(review): ttest_rel treats element i of both lists as a matched pair. Each list comes
# from its own shuffle in find_variation_by_split, so fold i of one run and fold i of another
# do not hold the same stories — confirm that a paired test is intended here (an unpaired
# stats.ttest_ind may be the appropriate test).
# Note that this overwrites t_stat / p_value from the one-sample test.
t_stat, p_value = stats.ttest_rel(correct_list, correct_list2)
t_stat2, p_value2 = stats.ttest_rel(correct_list, correct_list3)
t_stat3, p_value3 = stats.ttest_rel(correct_list2, correct_list3)

In [None]:
# paired Two-Sample T-test, run 2 vs run 3
# NOTE(review): this repeats the last line of the previous cell and recomputes the same
# t_stat3 / p_value3.
t_stat3, p_value3 = stats.ttest_rel(correct_list2, correct_list3)
#print(f'p-value btw models: {p_value3:.6f}')
#if p_value3 >= 0.05:
#    print('no difference')

## Consistency voting

In [None]:
# Predicted labels of the three runs and the gold labels of the first run, as plain lists.
# NOTE(review): the voting loop combines these lists by position, which is only valid when
# all three prediction frames hold the same stories in the same order — verify (the printed
# results report 3059 rows for the first dataset and 600 for the other two).
one = predictions.answr_lbl.tolist()
two = predictions2.answr_lbl.tolist()
three = predictions3.answr_lbl.tolist()
gold = predictions.label.tolist()

In [None]:
# NOTE(review): the majority-vote loop is disabled by wrapping it in a string literal, so
# `correct` and `R` are never defined; the Results cell prints both and will raise NameError
# unless this code is re-enabled.
# Logic of the disabled loop: two or more equal votes win; if two runs are unassessed (0)
# the remaining vote is used; otherwise a random label is drawn and counted in R.
"""
correct = 0
R = 0
for i in range(len(gold)):
    answ_list = [one[i], two[i], three[i]]
    if answ_list.count(1) >= 2:
        if gold[i] == 1:
            correct += 1
    elif answ_list.count(2) >= 2:
        if gold[i] == 2:
            correct += 1
    elif answ_list.count(0) >=2:
        answ_list.remove(0)
        answ_list.remove(0)
        if gold[i] == answ_list[0]:
            correct += 1
    else:
        rnd = random.choice([1,2])
        R += 1
        if gold[i] == rnd:
            correct += 1
"""

# Results

In [75]:
def _print_dataset_report(title, answers_df, predictions_df, accuracy, ci_halfwidth, n_not_assessed):
    """Print size, accuracy, N/A share and the A/B answer balance of one evaluated run.

    ``accuracy`` and ``ci_halfwidth`` are fractions in [0, 1] and are printed as
    rounded percentages; ``n_not_assessed`` is the number of rows labelled 0.
    """
    n_answer_a = len(predictions_df[predictions_df.answr_lbl==1])
    n_answer_b = len(predictions_df[predictions_df.answr_lbl==2])

    print(title)
    print('data lenght:', len(answers_df), 'correct:', len(answers_df[answers_df.label == answers_df.answr_lbl]))
    print('ACCURACY:', round(accuracy*100), '+-', round(ci_halfwidth*100))
    print('N/A:', n_not_assessed, 'Percentage N/A:', round(n_not_assessed/len(answers_df),3) )
    print('answer A:', n_answer_a, 'answer B:', n_answer_b)
    # a run that never answers B would otherwise raise ZeroDivisionError
    print('answer ratio:', n_answer_a/n_answer_b if n_answer_b else float('nan'))
    print()


_print_dataset_report('FIRST DATASET', answers, predictions, correct_num, stderr, not_assessed)

# runs two and three only exist when their path is set
if MODEL_TO_RUN_PATH2:
    _print_dataset_report('SECOND DATASET', answers2, predictions2, correct_num2, stderr2, not_assessed2)
if MODEL_TO_RUN_PATH3:
    _print_dataset_report('THIRD DATASET', answers3, predictions3, correct_num3, stderr3, not_assessed3)

print('p-values')
# the pairwise tests need all three runs
if MODEL_TO_RUN_PATH2 and MODEL_TO_RUN_PATH3:
    for pair_title, pair_p_value in (('model 1 vs model 2:', p_value),
                                     ('model 1 vs model 3:', p_value2),
                                     ('model 2 vs model 3:', p_value3)):
        print(pair_title, pair_p_value)
        if pair_p_value >= 0.05:
            print('no difference')
print()

print('CONSISTENCY VOTING')
# `correct` and `R` come from the consistency-voting loop; when that loop has not been run
# (it is currently wrapped in a string literal) say so instead of raising NameError.
if 'correct' in globals() and 'R' in globals():
    print(round(correct/len(gold),3))
    print('random',  round(R/len(gold),3))
else:
    print('not computed')

FIRST DATASET
data lenght: 3059 correct: 1567
ACCURACY: 51 +- 2
N/A: 1 Percentage N/A: 0.0
answer A: 367 answer B: 2691
answer ratio: 0.13638052768487552

SECOND DATASET
data lenght: 600 correct: 210
ACCURACY: 35 +- 4
N/A: 190 Percentage N/A: 0.317
answer A: 149 answer B: 261
answer ratio: 0.5708812260536399

THIRD DATASET
data lenght: 600 correct: 188
ACCURACY: 31 +- 4
N/A: 229 Percentage N/A: 0.382
answer A: 154 answer B: 217
answer ratio: 0.7096774193548387

p-values


In [None]:
# Runs inspected by the cells below: `model_to_look_at` is the main one,
# `model_to_look_at2` is the run it gets compared with.
model_to_look_at = predictions
model_to_look_at2 = predictions3

In [None]:
# similarity of two models together -> Falcon and GPT not much 53\%, GPT and LLAMA nope, GPT and T5? base 39\%, large 43\%, (x)xl 50:50
#chooses the same as similarity???
# The two runs are aligned on story_id instead of row position, so runs that cover different
# subsets (or orders) of the test set are still compared story by story.
# assumes story_id is unique within each run — TODO confirm
label_pairs = model_to_look_at2[['story_id', 'answr_lbl']].merge(
    model_to_look_at[['story_id', 'answr_lbl']], on='story_id', suffixes=('_a', '_b'))

# only stories that both runs managed to classify
both_classified = label_pairs[(label_pairs.answr_lbl_a != 0) & (label_pairs.answr_lbl_b != 0)]
length_classified = len(both_classified)
same = int((both_classified.answr_lbl_a == both_classified.answr_lbl_b).sum())

# NaN instead of ZeroDivisionError when no story was classified by both runs
same/length_classified if length_classified else float('nan')


In [None]:
# Does the model choose the same hypothesis as the similarity-only baseline?
# NOTE(review): `similar_answers` is only created by the similarity baseline, which is
# currently disabled (wrapped in a string), so this cell raises NameError on a fresh run.
# The comparison is by position: `similar_answers` follows the row order of aNLI_test,
# which must match the row order of `model_to_look_at` — verify.

same = 0
for a, b in zip(similar_answers,model_to_look_at.answr_lbl.tolist()):       
    if a==b:
        same += 1

# agreement is reported relative to the rows the model was assessed on (answr_lbl != 0)
length_classified = len(model_to_look_at[model_to_look_at.answr_lbl!=0])
print(length_classified)
same/length_classified


In [None]:
# sometimes both hypothesis contradict; No hypothesis contradicts the context. 
#CoT1 #relates better

#correct hypothesis

In [None]:
# Rows whose answer contains a given phrase (case-insensitive), to inspect how one
# formulation ends up being labelled; the commented lines are alternative selections.
suit = model_to_look_at[model_to_look_at.answer.str.contains('relates better', case=False)]
#suit = model_to_look_at[model_to_look_at.answr_lbl ==1]
#suit = model_to_look_at[model_to_look_at.answr_lbl != model_to_look_at.label]
# sample(3) needs at least three selected rows
suit.sample(3)

In [None]:
# how many of the selected rows were labelled as hypothesis A (answr_lbl == 1)
len(suit[suit.answr_lbl==1])

In [None]:
# share of all rows that were selected into `suit`, and their absolute number
'following prompt', len(suit)/len(model_to_look_at),len(suit)

In [None]:
# accuracy within the selected rows; raises ZeroDivisionError when `suit` is empty
'suit accuracy', len(suit[suit.answr_lbl==suit.label])/len(suit)

In [None]:
#model_to_look_at[model_to_look_at.hyp1.str.contains('The correct choice is:', case=False)]

In [None]:
# rows where the predicted label equals the gold label
right = model_to_look_at.loc[model_to_look_at['label'] == model_to_look_at['answr_lbl']]
print(len(right))
right.sample(5)
#right[right.label==1]

In [None]:
# rows that received a label (answr_lbl != 0) which differs from the gold label
not_NA = model_to_look_at.loc[model_to_look_at['answr_lbl'] != 0]
wrong = not_NA.loc[not_NA['label'] != not_NA['answr_lbl']]
print(len(wrong))
wrong.sample(5)
#wrong[wrong.answr_lbl==2]

In [None]:
# answers that could not be mapped to either hypothesis (answr_lbl == 0)
na = model_to_look_at.loc[(model_to_look_at['answr_lbl'] == 0)]
print(len(na))
# sample() raises ValueError when asked for more rows than exist, and a run can easily
# have fewer than five unassessed answers, so cap the sample size.
na.sample(min(5, len(na)))

#na[na.label!=2]

In [None]:
# share of rows whose answer is the literal text 'nan' (what str() produces for a missing
# value when make_dataset stringifies the answer column)
len(model_to_look_at[model_to_look_at.answer == 'nan'])/len(model_to_look_at)

In [None]:
pd.set_option("display.max_columns", None)
# To see what changed between the models.
# Inner merge: only stories present in both runs can be compared. With an outer merge,
# stories missing from one run get NaN labels and are all counted as "different".
merged = model_to_look_at.merge(model_to_look_at2, how='inner', on='story_id')
different = merged.loc[(merged['answr_lbl_x'] != merged['answr_lbl_y'])]
# stories the second run got right although the runs disagree
print(len(different[(different['answr_lbl_y']==different['label_y'])]))
# stories the first run got right although the runs disagree
first_right = different[(different['answr_lbl_x']==different['label_x'])]
# cap the sample size: sample() raises when asked for more rows than exist
first_right.sample(min(5, len(first_right)))

### Pure A, B and random results

In [None]:
# Constant-answer baselines: the share of gold label 1 is the accuracy of always answering A,
# the share of gold label 2 the accuracy of always answering B.
pureA = len(aNLI_test[aNLI_test.label==1])/len(aNLI_test)
pureB = len(aNLI_test[aNLI_test.label==2])/len(aNLI_test)

print('pure A:', round(pureA, 2), 'pure B:', round(pureB,2))
# A:55, B 45 for first 600

In [None]:
# Random baseline: accuracy range over 30 shuffles of a balanced list of A/B guesses.
random_deviation = []
gold_labels = aNLI_test.label.tolist()

# Split the guesses as evenly as possible for whatever size the test set has
# (1530 x A and 1529 x B for the 3059-row test set); a list of a different length
# would be silently truncated by zip.
n_guess_a = (len(gold_labels) + 1) // 2
n_guess_b = len(gold_labels) // 2

for i in range(30):
    random_list = [1]*n_guess_a + [2]*n_guess_b
    random.shuffle(random_list)

    random_correct = sum(guess == gold_label for guess, gold_label in zip(random_list, gold_labels))

    random_deviation.append(round(random_correct/len(gold_labels),2))

print(min(random_deviation), max(random_deviation))

In [None]:
pureA, pureB, random = 0,0,0
for label in aNLI_test.label.tolist():
    if la
    print(label)