# Chain evaluation

In [186]:
# libraries
import pandas as pd
import numpy as np
import json
import os
#import re
import random
import scipy.stats as stats

from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util

In [187]:
def load_jsonl(path):
    lines = []
    with open(path) as file:
        lines = file.read().splitlines()
    jsons = [json.loads(line) for line in lines]
    df = pd.DataFrame(jsons)

    return df

In [188]:
aNLIPath = ''
aNLI_test = load_jsonl(aNLIPath+'test.jsonl')
label_test = pd.read_csv(aNLIPath+'test-labels.lst', header=None, names=['label'])
aNLI_test = aNLI_test.join(label_test)
print('test data length:', len(aNLI_test))

FileNotFoundError: [Errno 2] No such file or directory: 'test.jsonl'

## Final multiprompt

In [189]:
# for T5!
chain = pd.read_csv('t5_multiprompt_final.csv')
chain = chain.fillna('')
chain = chain.merge(aNLI_test, on='story_id')

for i, row in chain.iterrows():
    if row.hyp1 in row.end:
        chain.at[i,'end'] = 1
    elif row.hyp2 in row.end:
        chain.at[i,'end'] = 2
    else:
        chain.at[i,'end'] = 0

chain.loc[chain['consist']=='A', 'consist'] = 2
chain.loc[chain['consist']=='B', 'consist'] = 1

chain.loc[chain['likEx']=='Explanation A', 'likEx'] = 1
chain.loc[chain['likEx']=='Explanation B', 'likEx'] = 2

print(len(chain))

3059


In [202]:
# For Falcon!
chain = pd.read_csv('falcon_multiprompt.csv')
chain = chain.fillna('')
chain = chain.merge(aNLI_test, on='story_id')

for i, row in chain.iterrows():
    if row.hyp1 in row.end:
        chain.at[i,'end'] = 1
    elif row.hyp2 in row.end:
        chain.at[i,'end'] = 2
    elif 'B explains' in row.end:
        chain.at[i,'end'] = 2
    elif 'A explains' in row.end:
        chain.at[i,'end'] = 1
    else:
        chain.at[i,'end'] = 0

    if 'A is logically consistent' in row.consist or 'B is logically inconsistent' in row.consist or \
            'B is not logically consistent' in row.consist:
        chain.at[i,'consist'] = 1
    elif 'B is logically consistent' in row.consist or 'A is logically inconsistent' in row.consist or \
            'A is not logically consistent' in row.consist:
        chain.at[i,'consist'] = 2

    if 'A is more likely' in row.likEx or 'A is more correct' in row.likEx:
        chain.at[i,'likEx'] = 1
    elif 'B is more likely' in row.likEx or 'B is more correct' in row.likEx:
        chain.at[i,'likEx'] = 2


chain.loc[chain['cor1'].str.contains('Incoherent', case=False), 'cor1'] = 'No'
chain.loc[chain['cor2'].str.contains('Incoherent', case=False), 'cor2'] = 'No'
chain.loc[chain['cor1'].str.contains('Inconsistent', case=False), 'cor1'] = 'No'
chain.loc[chain['cor2'].str.contains('Inconsistent', case=False), 'cor2'] = 'No'
chain.loc[chain['cor1'].str.contains('Correct', case=False), 'cor1'] = 'Yes'
chain.loc[chain['cor2'].str.contains('Correct', case=False), 'cor2'] = 'Yes'
chain.loc[chain['cor1'].str.contains('Coherent', case=False), 'cor1'] = 'Yes'
chain.loc[chain['cor2'].str.contains('Coherent', case=False), 'cor2'] = 'Yes'
    
chain.loc[chain['consist']=='A', 'consist'] = 2
chain.loc[chain['consist']=='B', 'consist'] = 1

#loc[chain['likEx']=='Explanation A', 'likEx'] = 1
#chain.loc[chain['likEx']=='Explanation B', 'likEx'] = 2

#chain.loc[chain['likEx'].str.contains('A is more likely', case=False), 'likEx'] = 1
#chain.loc[chain['likEx'].str.contains('B is more likely', case=False), 'likEx'] = 2




In [203]:
chain.sample(3)

Unnamed: 0,story_id,end,end_cons,consist,inconsH1,inconsH2,cor1,cor2,likEx,obs1,obs2,hyp1,hyp2,label
1535,d0957cb6-3087-4e21-94f9-9af6b88f89cf-1,2,Yes,2,The inconsistency between the sentences is tha...,The inconsistency between the sentences is tha...,No,No,1,I was feeling frisky at work today.,That is until my boss joined me in skipping ab...,I was worried when my boss saw me running.,So I was kipping back and forth.,2
1216,a1d1655c-66c2-4aed-9d35-318715ac83f6-1,2,Yes,2,The inconsistency in the text is that Paul wan...,The inconsistency in the text is that Paul cam...,No,No,1,"Paul was an egotistical man, who would do anyt...","Paul backed down, and he moved to France.",Paul wanted to move to France.,Paul campaigned unsuccessfully to become compa...,2
176,943978bb-5cd2-4040-a2f3-ad38a8477e0b-1,0,"Yes, the sentences are consistent. The text is...",1,The inconsistency in the text is that the firs...,The inconsistency in the text is that the firs...,No,No,1,My girlfriend loves showing me internet videos.,"I learned two things, she is twisted and I can...",My girlfriend showed me a video about making p...,My girlfriend showed me a book of a couple hav...,1


In [204]:
# end + consistency with O1
print("end A:", len(chain[chain.end==1]) ,"end B:", len(chain[chain.end==2]), "NA:", len(chain[(chain.end!=1) & (chain.end!=2)]))
print("end accuracy:", round(len(chain[chain.end==chain.label])/len(chain),3) )
print("consistency end yes:", len(chain[chain.end_cons.str.contains('yes',case=False)]),
      "no:", len(chain[chain.end_cons.str.contains('no',case=False)]),
     "NA:", len(chain[~(chain.end_cons.str.contains('yes',case=False))& ~chain.end_cons.str.contains('no',case=False)]))

print()

#consist_changed = chain.copy()
chain.loc[((chain.end_cons.str.contains('no',case=False)) & (chain['end']==1)), 'S1'] = 2
chain.loc[((chain.end_cons.str.contains('no',case=False)) & (chain['end']==2)), 'S1'] = 1
chain.loc[((chain.end_cons.str.contains('yes',case=False)) & (chain['end']==1)), 'S1'] = 1
chain.loc[((chain.end_cons.str.contains('yes',case=False)) & (chain['end']==2)), 'S1'] = 2
print('End+O1 accuracy', round(len(chain[chain.S1==chain.label])/len(chain),3))

# okay this just gets worst for T5: 75->67
# Falcon 39->52 (the opposite yelds quite good results? :D
# print("end accuracy:", round(len(chain[chain.end!=chain.label])/len(chain),3) ) - wow it does the opposite and then has 61% yes!!!

end A: 1701 end B: 1204 NA: 154
end accuracy: 0.48
consistency end yes: 2673 no: 351 NA: 95

End+O1 accuracy 0.471


In [205]:
(len(chain[(chain.end==1) & (chain.label==2)]) + len(chain[(chain.end==1) & (chain.label==2)]) ) / len(chain)

0.5374305328538738

In [206]:
simpler_questions = chain[chain.end == chain.consist]
print('percentage of simpler questions', len(simpler_questions)/len(chain))
print('accuracy on those', len(simpler_questions[simpler_questions.consist==simpler_questions.label])/len(simpler_questions))

harder_questions = chain[chain.end != chain.consist]
print('percentage of harder questions', len(harder_questions)/len(chain))
print('accuracy on those', len(harder_questions[harder_questions.consist==harder_questions.label])/len(harder_questions))

percentage of simpler questions 0.4658385093167702
accuracy on those 0.5298245614035088
percentage of harder questions 0.5341614906832298
accuracy on those 0.49265605875153


In [207]:
harder_questions.sample(2)

Unnamed: 0,story_id,end,end_cons,consist,inconsH1,inconsH2,cor1,cor2,likEx,obs1,obs2,hyp1,hyp2,label,S1
2806,1ac1783d-b6da-4e81-bf50-aeda867cbd92-1,1,Yes,2,The inconsistency in the text is that the girl...,The inconsistency in the text is that the firs...,No,No,1,I went to the gym with my girlfriend today.,We were sore the next day.,My girlfriend and I exercised really hard.,We hadn't non-wroughted out for months.,1,1.0
2642,b851a644-14a4-4983-be8b-597468855c57-1,2,Yes,1,The inconsistency between the sentences is tha...,The inconsistency between the sentences is tha...,No,No,1,Ann has to help with a surgery.,Ann was very relieved.,Ann was well prepared.,the ear piercing went well.,1,2.0


In [208]:
# consistency only; T5: 78; Falcon nothing
print("consist A:", len(chain[chain.consist==1]) ,"consist B:", len(chain[chain.consist==2]),
      "NA:", len(chain[(chain.consist!=1) & (chain.consist!=2)]))
print("consist accuracy:", round(len(chain[chain.consist==chain.label])/len(chain),3) )

consist A: 1304 consist B: 1673 NA: 82
consist accuracy: 0.51


In [209]:
# generating explanation and deciding...
# likely T5 worst, 67%; like accuracy 49 :(
print("likEx A:", len(chain[chain.likEx==1]) ,"likEx B:", len(chain[chain.likEx==2]), "NA:", len(chain[(chain.likEx!=1) & (chain.likEx!=2)]))
print("likEx accuracy:", round(len(chain[chain.likEx==chain.label])/len(chain),3) )

print("cor1 yes:", len(chain[chain.cor1=='Yes']) ,"cor1 No:", len(chain[chain.cor1=='No']), 
      "NA:", len(chain[(chain.cor1!='Yes') & (chain.cor1!='No')]))
print("cor2 yes:", len(chain[chain.cor2=='Yes']) ,"cor2 No:", len(chain[chain.cor2=='No']), 
      "NA:", len(chain[(chain.cor2!='Yes') & (chain.cor2!='No')]))
print()

# T5: ~44% identified by this; A 72%, B 70%; Falcon bad, allmost all incorrect
A_chosen = chain[(chain.cor1=='Yes') & (chain.cor2=='No')]
B_chosen = chain[(chain.cor1=='No') & (chain.cor2=='Yes')]
print("yes and no -> A", len(A_chosen))
print("A correct", round(len(chain[(chain.cor1=='Yes') & (chain.cor2=='No') & (chain.label==1)])/len(A_chosen),3))
print("no and yes -> B", len(chain[(chain.cor1=='No') & (chain.cor2=='Yes')]))
print("B correct", round(len(chain[(chain.cor1=='No') & (chain.cor2=='Yes') & (chain.label==2)])/len(B_chosen),3))
print("both yes", len(chain[(chain.cor1=='Yes') & (chain.cor2=='Yes')]))
print("both no", len(chain[(chain.cor1=='No') & (chain.cor2=='No')]))

likEx A: 2762 likEx B: 247 NA: 50
likEx accuracy: 0.507
cor1 yes: 14 cor1 No: 3043 NA: 2
cor2 yes: 16 cor2 No: 3040 NA: 3

yes and no -> A 13
A correct 0.538
no and yes -> B 15
B correct 0.4
both yes 1
both no 3025


In [211]:

for i, row in chain.iterrows():
    if ('yes' in row.cor1 or 'Yes'in row.cor1) and ('no' in row.cor2 or 'No'in row.cor2):
        chain.at[i,'S2'] = 1
    elif ('yes' in row.cor2 or 'Yes'in row.cor2) and ('no' in row.cor1 or 'No'in row.cor1):
        chain.at[i,'S2'] = 2
    else:
        chain.at[i,'S2'] = row.likEx

print(len(chain[chain.S2==chain.label])/len(chain))

0.5070284406668846


In [212]:
chain.sample(1)

Unnamed: 0,story_id,end,end_cons,consist,inconsH1,inconsH2,cor1,cor2,likEx,obs1,obs2,hyp1,hyp2,label,S1,S2
1370,677776b6-e170-449f-b774-23b9a297a339-1,2,Yes,2,The inconsistency in the text lies in the phra...,The inconsistency in the text is that Sue visi...,No,No,1,Sue talked her husband into getting goats.,"As she did, the other goat butted her, knockin...",Sue didn't realize Billies loved to butt. When...,Sue visited Due's goats.,1,2.0,1


In [216]:
Z0 = pd.read_csv('zero_shot/anli_falcon_inst_Z0.csv')
Z0 = Z0.answer.tolist()
chain['Z0'] = Z0
for i, row in chain.iterrows():
    if 'A'in row.Z0:
        chain.at[i,'Z0'] = 1
    elif 'B'in row.Z0:
        chain.at[i,'Z0'] = 2
#chain.loc[chain['Z0'].str.contains('A', case=False), 'Z0'] = 1
#chain.loc[chain['Z0'].str.contains('B', case=False), 'Z0'] = 2
chain.sample(1)

Unnamed: 0,story_id,end,end_cons,consist,inconsH1,inconsH2,cor1,cor2,likEx,obs1,obs2,hyp1,hyp2,label,S1,S2,Z0
615,edb32dd1-d4ce-407c-830a-a9523ccb0d54-1,2,Yes,1,The inconsistency in the text is that Larry's ...,The inconsistency in the text is that it first...,No,No,1,Larry wanted to buy a car but his credit ratin...,After 2 Years his credit had improved enough t...,Larry paid all his bills on time for two years.,Larry stopped to pay down all his bills.,1,2.0,1,1


In [219]:
# voting amongst it (and maybe also with some other result?
correct = 0
for i, row in chain.iterrows():
    if isinstance(row.S2, str):
        s2=0
    else:
        s2=row.S2
    answers = [row.consist, row.end, int(s2), row.Z0]
    if len(set(answers)) == 1:
        if answers[0] == row.label:
            correct +=1
    elif answers.count(2) > answers.count(1):
        if row.label == 2:
            correct +=1
    elif answers.count(2) < answers.count(1):
        if row.label == 1:
            correct +=1 
    else:
        if row.label == row.Z0:
            correct += 1

print(correct/len(chain)) # 84\% for all of them, not improvement; 83 without S2, without end, 82 without consist.

0.5142203334423014


## Chain likely

In [350]:
chain = pd.read_csv('falcon_chain2_pokus_likely_3.csv')
chain = chain.fillna('')
chain = chain.merge(aNLI_test, on='story_id')

In [351]:
chain.loc[chain['likely1'].str.contains('A explains the Conclusion better'), 'likely1A'] = 1 # almostonly B
chain.loc[chain['likely1'].str.contains('B explains the Conclusion better'), 'likely1A'] = 2
chain.loc[chain['likely2'].str.contains('A is more coherent'), 'likely2A'] = 1  # better A/B ratio ~ 2xB - only this makes sense
chain.loc[chain['likely2'].str.contains('B is more coherent'), 'likely2A'] = 2
chain.loc[chain['likely3'].str.contains('A is the worst'), 'likely3A'] = 1 # mostly B
chain.loc[chain['likely3'].str.contains('B is the worst'), 'likely3A'] = 2

In [369]:
bla = chain[chain.likely3A==1]
len(bla[bla.likely3A==bla.label])/len(bla)

0.36

In [352]:
chain.sample(3)

Unnamed: 0,story_id,likely1,likely2,likely3,obs1,obs2,hyp1,hyp2,label,likely1A,likely2A,likely3A
2971,d013d64f-7baa-490d-8147-4c7ca5aca354-1,Explanation B explains the Conclusion better because Lou's brother always showed up late to Easter dinner. Correct explanation is B.,"\nStory B is more coherent. In Story A, there is a discrepancy between Lou's favorite part of Easter (getting chocolate bunnies) and his brother's behavior (showing up late and getting Lou's leftover bunnies). In Story B, there is no inconsistency, as Lou'","\nText B is the worst explanation for the conclusion because it doesn't provide any new information or insights. It simply restates the fact that Lou didn't enjoy chocolate bunnies, without providing any context or elaboration. In contrast, Text A offers some background information about Lou's favorite part of Easter,",Lou's favorite part of Easter was getting chocolate bunnies.,He had always gotten the Lou's leftover bunnies to eat.,Lou's brother always showed up late to Easter dinner.,Lou didn't enjoy them much.,1,2.0,2.0,2.0
2802,77dde58c-cf20-4f12-b160-2122ac11ca52-1,Explanation B explains the Conclusion better because it was just crickets. Correct explanation is B.,"\nStory B is more coherent, as it provides a specific and consistent explanation for Joe's experiences (smelling sounds), while Story A does not.","Text B is the worst explanation for Conclusion because it is not specific and does not provide enough information to reach a conclusion. Text A, on the other hand, provides more information and is more specific, making it a better explanation for the conclusion.",Joe thought he was going crazy.,It turns out it was just crickets.,Joe heard a non-stop ringing in his ears.,Joe kept smelling sounds and couldn't place them.,1,2.0,2.0,2.0
385,2449b454-5e8e-4011-806d-14b751ff008d-1,Explanation B explains the Conclusion better because John got a job at Sears. Correct explanation is B.,"\nStory B is more coherent, as it provides a reason for John to get an employee discount on coffee (a job at Sears) and explains why he still drinks coffee every morning. Story A, on the other hand, does not provide a clear reason for John's morning coffee ritual.","\nText B is the worst explanation for the conclusion because it is a more specific and direct explanation compared to Text A. Text A could mean that John loves coffee in general, whereas Text B implies that he loves coffee specifically at Sears.",John loves to drink coffee every morning.,He also now gets an employee discount on coffee!,He goes to the same store everyday.,John got a job at Sears.,1,2.0,2.0,2.0


In [332]:
for i, row in chain.iterrows():
    if row.hyp1 in row.end:
        chain.at[i,'end'] = 1
    elif row.hyp2 in row.end:
        chain.at[i,'end'] = 2
    else:
        chain.at[i,'end'] = 0

Unnamed: 0,story_id,likely1,likely2,likely3,obs1,obs2,hyp1,hyp2,label,topic
0,87aa0983-9b84-48b1-86ff-160b1567487c-1,Explanation B explains the Conclusion better because the students were early and none of them had a lesson that day. Correct explanation is B.,"\nStory A is more coherent, as it follows a logical sequence of events (students being early for their lessons) and does not contain any contradictory information (none of Jane's students having a lesson that day). Story B, on the other hand, has a logical inconsistency (students being early for","\nText B is the worst explanation for the conclusion because it is more specific and informative than Text A, which is vague and does not provide enough information to reach a proper conclusion.",Jane was a professor teaching piano to students.,Jane spent the morning sipping coffee and reading a book.,Two of Jane's students were early for their lessons.,None of Jane's students had a lesson that day.,2,2.0
1,dfc8584e-13fe-4e26-bdf6-2485e90ef29d-1,Explanation B explains the Conclusion better because the last summer before college was a blast! Correct explanation is B.,"\nStory A is more coherent, as it mentions both traveling and partying, while Story B only focuses on working.","Text B is the worst explanation for the conclusion because it is the most specific and detailed explanation, while Text A is the most general and vague explanation.",Nate had the summer off before college.,Nate's last summer before college was a total blast!,Nate spent the summer traveling and partying.,Nate decided to spend the entire summer working in the Mines.,1,2.0
2,bac4a9c0-be9c-41d3-bb5c-474a1aa7a78b-1,Explanation B explains the Conclusion better because Anne did not miss carbs at all. Correct explanation is B.,"\nStory A is more coherent. In Story A, Anne's weight loss is directly related to her dietary change, whereas in Story B her weight loss is not directly related to her dietary change.","\nText B is the worst explanation for the conclusion because it is not specific and does not provide any information about the reason for Anne's weight loss. Text A, on the other hand, provides a specific reason (cutting out carbs) which makes it a better explanation for the conclusion.",Anne decided to cut out carbs from her diet.,After a few weeks she did not even miss carbs at all!,Anne started losing weight and soon wasn't very hungry.,She wanted to bulk up.,1,2.0
3,e8a69a76-5ab1-4b00-b442-80d33c6e12cd-1,<noinput>\nThe correct explanation is B. Explanation A explains the Conclusion better because Dan's pet pig is a part of his family. Correct explanation is B.,"\nStory A is more coherent, as it does not contain any contradictory information or actions.","\nText B is the worst explanation for the conclusion because it doesn't provide any information about Dan's relationship with his pet pig. Text A, on the other hand, at least hints that Dan is glad to have his pet pig be a part of his family.",Dan's neighbors know him for having a strange pet.,Dan is glad to have his pet pig be a part of his family.,Dan adopted a pig.,Dan doesn't like people knowing about his pet pig.,1,1.0
4,78dd77c9-99e3-4977-89d7-bfd8f8de1504-1,Explanation B is better because it explains the Conclusion better. Correct explanation is B.,"\nStory B is more coherent, as it provides a clear reason for Jason to take classes (after reading many books) to learn hands-on tactics, while Story A does not provide a clear reason for him to do so.",\nText B is the worst explanation for the conclusion because it is more specific and provides more information on the practical application of kung fu. Text A is too general and does not provide enough detail on the specific techniques and forms of kung fu.,Jason wants to learn kung fu.,Jason realizes that reading a book is not a good way to learn kung-fu.,Jason found a cartoon book at the library.,"After reading many books, he needed to take classes to learn hands on tactics.",2,
...,...,...,...,...,...,...,...,...,...,...
3054,eeab15ae-c394-4c5b-9327-af2d1928895e-1,Explanation B explains the Conclusion better because it was a daily exercise class. Correct explanation is B.,"\nStory B is more coherent. In Story A, the main reason for Jane getting a new bikini is because she felt uncomfortable in it at first. However, in Story B, the reason for her getting a new bikini is because she wanted to look good in it after taking an exercise class every day. This is a","\nText B is the worst explanation for the conclusion because it provides more information about Jane's exercise routine, which is not relevant to the topic of her getting ready for a swimsuit.",Jane decided to get swimsuit ready.,She was ready to rock her new bikini.,It took her awhile to feel comfortable but before it was on she thought it looked good.,Jane took an exercise class every day.,2,2.0
3055,45b09097-c288-4fa6-b04c-50519f1b8f24-1,Explanation B explains the Conclusion better because Mike decided to attend an internship for electricians. Correct explanation is B.,"\nStory B is more coherent, as it directly addresses Mike's realization of the need for a reliable skill and his decision to attend an internship to acquire that skill. Story A, on the other hand, jumps around in its narrative, starting with Mike's decision to attend school and then shifting to his realization of","Text B is the worst explanation for the conclusion because it does not provide a clear reason for Mike dropping out of school. Text A, on the other hand, provides a specific reason for his decision to attend an electrician internship.",Mike realized that he needed to have a reliable skill.,"Unfortunately, Mike did not enjoy school and decided to drop out.",Mike decided to sign up for school and like the classes.,Mike decided to attend an internship for electricians.,2,2.0
3056,498efa85-9db4-432e-af4e-e559767d59ef-1,Explanation B is better because it explains the Conclusion better. Correct explanation is B.,"\nStory A is more coherent as it is a direct continuation of the set-up for the interview, while Story B introduces an unexpected twist that doesn't fit the initial premise.","\nText B is the worst explanation for the conclusion because it is more believable and straightforward. In Text A, the reason for the interview is not given, and it is assumed that Chef Bill is being interviewed for a job at a 5 star restaurant. In Text B, the reason for the interview is given, making",Chef Bill is set up for a job interview at a 5 star restaurant.,When they visit the restaurant unannounced Bill is humiliated,He's one of the limited chefs in town.,Chef Bill is visited by his interviewer at his current job.,2,
3057,b46dd7e3-e1d6-4bd6-9a0d-95f750aea580-1,Explanation B is better because it explains the Conclusion better. Correct explanation is B.,"\nStory B is more coherent, as it provides a clear chain of events leading to the resolution of the conflict (fights breaking out, the girls grabbing their bags and fleeing), while Story A jumps directly to the argument about the shorts without explaining the cause of the conflict.","Text B is the worst explanation for the conclusion because it provides no information about the reason for the fight, whereas Text A offers some context about the argument.",Amy was at the mall with a friend.,"Amy didn't care about the yelling, but was hurt to lose the shorts.",Amy got into an argument with another customer about the last pair of shorts for sale.,"Amy bought a pair of shorts, but a fight broke out, and the girls grabbed their bags and fled.",1,


In [None]:
chain = pd.read_csv('t5_chain_1.csv')
chain = chain.fillna('')
chain = chain.merge(aNLI_test, on='story_id')

chain.loc[chain['topic'].str.startswith('Yes'), 'topic'] = 'Yes'
chain.loc[chain['topic'].str.startswith('No'), 'topic'] = 'No'

chain.loc[chain['end_cons'].str.contains('inconsistent'), 'end_cons'] = 'No'
chain.loc[chain['end_cons'].str.contains('not consistent'), 'end_cons'] = 'No'
chain.loc[chain['end_cons'].str.contains('consistent'), 'end_cons'] = 'Yes'

for i, row in chain.iterrows():
    if row.hyp1 in row.end:
        chain.at[i,'end'] = 1
    elif row.hyp2 in row.end:
        chain.at[i,'end'] = 2
    else:
        chain.at[i,'end'] = 0
        
    if row.hyp1 in row.start:
        chain.at[i,'start'] = 1
    elif row.hyp2 in row.start:
        chain.at[i,'start'] = 2
    else:
        chain.at[i,'start'] = 0

    if 'B is not consistent' in row.consist or 'B' in row.consist:
        chain.at[i,'consist'] = 1
    elif 'A is not consistent' in row.consist or 'A' in row.consist:
        chain.at[i,'consist'] = 2

#chain.loc[chain['consist'].str.contains('A is not consistent'), 'consist'] = 2
#chain.loc[chain['consist'].str.contains('B is not consistent'), 'consist'] = 1

## Evaluation chain that adds sentences

In [513]:
chain = pd.read_csv('falcon_chain_pre_final.csv')
chain = chain.fillna('')
chain = chain.merge(aNLI_test, on='story_id')

chain.loc[chain['end_cons'].str.contains('inconsistent'), 'end_cons'] = 'No'
chain.loc[chain['end_cons'].str.contains('not consistent'), 'end_cons'] = 'No'
chain.loc[chain['end_cons'].str.contains('There is a logical contradiction'), 'end_cons'] = "No"
chain.loc[chain['end_cons'].str.contains('Yes'), 'end_cons'] = "No"
chain.loc[chain['end_cons'].str.contains('consistent'), 'end_cons'] = 'Yes'
chain.loc[chain['end_cons'].str.contains('no logical or commonsense contradiction'), 'end_cons'] = 'Yes'

chain.loc[chain['consistH1'].str.contains('inconsistent'), 'consistH1'] = "No"
chain.loc[chain['consistH1'].str.contains('not consistent'), 'consistH1'] = "No"
chain.loc[chain['consistH1'].str.contains('There is a logical contradiction'), 'consistH1'] = "No"
chain.loc[chain['consistH1'].str.contains('Yes'), 'consistH1'] = "No"
chain.loc[chain['consistH1'].str.contains('consistent'), 'consistH1'] = "Yes"
chain.loc[chain['consistH1'].str.contains('no logical or commonsense contradiction'), 'consistH1'] = 'Yes'

chain.loc[chain['consistH2'].str.contains('inconsistent'), 'consistH2'] = "No"
chain.loc[chain['consistH2'].str.contains('There is a logical contradiction'), 'consistH2'] = "No"
chain.loc[chain['consistH2'].str.contains('not consistent'), 'consistH2'] = "No"
chain.loc[chain['consistH2'].str.contains('Yes'), 'consistH2'] = "No"
chain.loc[chain['consistH2'].str.contains('consistent'), 'consistH2'] = "Yes"
chain.loc[chain['consistH2'].str.contains('no logical or commonsense contradiction'), 'consistH2'] = 'Yes'

consistency_listB = ['A is logically consistent', 'A is not logically inconsistent', 'Text B contains contradictions', 'B is not logically consistent',
                    'B is not consistent', 'Text B is logically inconsistent']
consistency_listA = ['B is logically consistent', 'B is not logically inconsistent', 'Text A contains contradictions', 'A is not logically consistent',
                    'A is not consistent', 'Text A is logically inconsistent']

for i, row in chain.iterrows():
    if row.hyp1 in row.end:
        chain.at[i,'end'] = 1
    elif row.hyp2 in row.end:
        chain.at[i,'end'] = 2
    else:
        chain.at[i,'end'] = 0
        
    if 'A is more coherent' in row.likeliness:
        chain.at[i,'likeliness'] = 1
    elif 'B is more coherent' in row.likeliness:
        chain.at[i,'likeliness'] = 2
    else:
        chain.at[i,'likeliness'] = 0


    found = False
    #print('next text')
    for l in range(len(consistency_listB)):
        if found == True:
            continue
        if consistency_listB[l] in row.consist:
            chain.at[i,'consist'] = 1
            found = True
        elif consistency_listA[l] in row.consist:
            chain.at[i,'consist'] = 2
            found = True
        else:
            chain.at[i,'consist'] = 0
            


In [514]:
print("end A:", len(chain[chain.end==1]) ,"end B:", len(chain[chain.end==2]) ) # this quite ok! 3:1
print("likeliness A:", len(chain[chain.likeliness==1]) ,"likeliness B:", len(chain[chain.likeliness==2]) ) # 4:1, -> 3:1
print("consist A:", len(chain[chain.consist==1]) ,"consist B:", len(chain[chain.consist==2]) ) # okay ratio

print("end_cons Yes:", len(chain[chain.end_cons=='Yes']) ,"consist No:", len(chain[chain.end_cons=='No']) ) # > 2:1
print("consistH1 Yes:", len(chain[chain.consistH1=='Yes']) ,"consist No:", len(chain[chain.consistH1=='No']) ) # 1:4 for B now
print("consistH2 Yes:", len(chain[chain.consistH2=='Yes']) ,"consist No:", len(chain[chain.consistH2=='No']) ) # 1:4 for B

end A: 192 end B: 71
likeliness A: 231 likeliness B: 99
consist A: 154 consist B: 158
end_cons Yes: 187 consist No: 144
consistH1 Yes: 78 consist No: 278
consistH2 Yes: 66 consist No: 290


In [515]:
chain.sample(3)


Unnamed: 0,story_id,end,likeliness,end_cons,consist,consistH1,consistH2,obs1,obs2,hyp1,hyp2,label
18,eb7720a7-c234-4cb6-ae50-55768d70d5aa-1,1,0,Yes,2,Yes,No,My race car sounded like it needed a new engine.,My engine ran great again and I save money.,"I determined the bad sound came from worn out spark plugs, which I then replaced.",My mechanic checked out my body.,1
232,8bd50b4c-581e-4d2c-b991-0a2a6e6520ef-1,0,0,No,1,No,No,Pat and Lindsay had been together for 10 years.,Lindsay said yes.,Pat finally proposed marriage.,lindsay proposed to pat at a restaurant.,1
130,99937112-691b-440b-8b49-0083e36796d8-1,0,1,Yes,2,No,No,Marvin joined the military shortly after graduating from high school.,Upon her return Marvin was elated and asked Sarah to marry him.,"Marvin's girlfriend Sarah also joined, but she was stationed elsewhere.","Sarah left to college, and Marvin waited for her to go.",1


In [519]:
print(len(chain[chain.end != 0]), len(chain))

print('End accuracy', round(len(chain[chain.end==chain.label])/len(chain[chain.end != 0]),3))
print('likely accuracy', round(len(chain[chain.likeliness==chain.label])/len(chain[chain.likeliness != 0]),3))
print('consistent accuracy', round(len(chain[chain.consist==chain.label])/len(chain[chain.consist != 0]),3)) # best fo T5, 81%
print()
print('consistent end', round(len(chain[(chain.end_cons=='Yes') | (chain.end_cons=='yes')])/len(chain),3))
print('consistH1 ok', round(len(chain[(chain.consistH1=='Yes') | (chain.consistH1=='yes')])/len(chain),3))
print('consistH2 ok', round(len(chain[(chain.consistH2=='Yes') | (chain.consistH2=='yes')])/len(chain),3))


263 356
End accuracy 0.536
likely accuracy 0.567
consistent accuracy 0.538

consistent end 0.525
consistH1 ok 0.219
consistH2 ok 0.185


In [525]:
# not consistent when O1 added; T5 worst; Falcon helps 0.2\%
# consistent end not working now
consist_changed = chain.copy()
consist_changed.loc[((consist_changed['end_cons']=='No') & (consist_changed['end']==1)), 'end'] = 2
consist_changed.loc[((consist_changed['end_cons']=='No') & (consist_changed['end']==2)), 'end'] = 1
print('End accuracy', round(len(consist_changed[consist_changed.end==consist_changed.label])/len(chain[chain.end != 0]),3))

End accuracy 0.529


In [520]:
# Topic change; T5 nothing, a bit worst; Falcon helps 0.3\%
"""
topic_changed = chain.copy()
topic_changed.loc[((topic_changed['topic']=='No') & (topic_changed['end']==1)), 'end'] = 2
topic_changed.loc[((topic_changed['topic']=='No') & (topic_changed['end']==2)), 'end'] = 1
print('End accuracy', round(len(topic_changed[topic_changed.end==topic_changed.label])/len(chain[chain.end != 0]),3))
"""

"\ntopic_changed = chain.copy()\ntopic_changed.loc[((topic_changed['topic']=='No') & (topic_changed['end']==1)), 'end'] = 2\ntopic_changed.loc[((topic_changed['topic']=='No') & (topic_changed['end']==2)), 'end'] = 1\nprint('End accuracy', round(len(topic_changed[topic_changed.end==topic_changed.label])/len(chain[chain.end != 0]),3))\n"

In [532]:
# start and end same
same = chain[chain.end == chain.likeliness].copy()
print('start and end same answer:',len(same))
print('correct for those:', round(len(same[same.end==same.label])/len(same),3)) # T5 83% quite good

# not same - the difficult identified
not_same = chain[chain.end != chain.likeliness]
print('correct for not same:', round(len(not_same[not_same.end==not_same.label])/len(not_same),3)) # T5 only 61%; Falcon 35%
print('accuracy for start:', round(len(not_same[not_same.likeliness==not_same.label])/len(not_same),3)) # T5 38% for start; Falcon 33%

"""
# changing topic for those that are not right -> T5 62 % accuracy, only 1% better...; Falcon helps 0.2%
tp_changed = not_same.copy()
tp_changed.loc[((tp_changed['topic']=='No') & (tp_changed['end']==1)), 'end'] = 2
tp_changed.loc[((tp_changed['topic']=='No') & (tp_changed['end']==2)), 'end'] = 1 # seems to help Falcon?
print('Topic changed:', round(len(tp_changed[tp_changed.end==tp_changed.label])/len(tp_changed[tp_changed.end != 0]),3))
"""

# changing consistency for those -> T5 54 accuracy - making it worst...; Falcon not helping
cns_changed = not_same.copy()
cns_changed.loc[((cns_changed['end_cons']=='No') & (cns_changed['end']==1)), 'end'] = 2 
cns_changed.loc[((cns_changed['end_cons']=='No') & (cns_changed['end']==2)), 'end'] = 1 # seems to help Falcon?
print('Consistency changed:', round(len(cns_changed[cns_changed.end==cns_changed.label])/len(cns_changed[cns_changed.end != 0]),3))


start and end same answer: 149
correct for those: 0.577
correct for not same: 0.266
accuracy for start: 0.488
Consistency changed: 0.455


In [530]:
# end and consistency same answer
same = chain[chain.end == chain.consist].copy()
print('end and consistency same answer:',len(same))
print('correct for those:', round(len(same[same.end==same.label])/len(same),3)) # T5 87% accuracy for those!; Falcon 55%

# not same
not_same = chain[chain.end != chain.consist].copy()
print('correct for not same:', round(len(not_same[not_same.end==not_same.label])/len(not_same),3)) # T5 only 38%; Falcon 22%
print('accuracy for consistency:', round(len(not_same[not_same.consist==not_same.label])/len(not_same),3)) # T5 61\% - potential?, 52%

"""
# changing topic for those that are not right -> T5 1% better again...; Falcon helps 27%
tp_changed = not_same.copy()
tp_changed.loc[((tp_changed['topic']=='No') & (tp_changed['end']==1)), 'end'] = 2 # makes Falcon worst
tp_changed.loc[((tp_changed['topic']=='No') & (tp_changed['end']==2)), 'end'] = 1 # makes it worst Falcon
print('Topic changed:', round(len(tp_changed[tp_changed.end==tp_changed.label])/len(tp_changed[tp_changed.end != 0]),3))
"""

# changing consistency for those -> T5 7\% better, this works a bit! 14 when changed only from 1->2 ! what???; Falcon helps 27%
cns_changed = not_same.copy()
cns_changed.loc[((cns_changed['end_cons']=='No') & (cns_changed['end']==1)), 'end'] = 2 # makes Falcon worst les... 
cns_changed.loc[((cns_changed['end_cons']=='No') & (cns_changed['end']==2)), 'end'] = 1 # makes Falcon worst
print('Consistency changed:', round(len(cns_changed[cns_changed.end==cns_changed.label])/len(cns_changed[cns_changed.end != 0]),3))

# consistency changed for those that are more difficult -> T5 81 % :/ did not help, topic nothing
# Falcon helps to 54.4% (1.2%) -> not help to consistency - only the topic 2->1 works
final_thing = chain.copy()
final_thing.loc[((final_thing.end != final_thing.consist) & (final_thing['end_cons']=='No') & (final_thing['end']==1)), 'end'] = 2
final_thing.loc[((final_thing.end != final_thing.consist) & (final_thing['end_cons']=='No') & (final_thing['end']==2)), 'end'] = 1
print('Final thing  end:', round(len(final_thing[final_thing.end==final_thing.label])/len(final_thing[final_thing.end != 0]),3))
print('Final thing conist:', round(len(final_thing[final_thing.consist==final_thing.label])/len(final_thing[final_thing.consist != 0]),3))

""" LOOK AT THE DIFFICULT INSTANCES!!!!! (T5 not done, Falcon not done) """

end and consistency same answer: 121
correct for those: 0.512
correct for not same: 0.336
accuracy for consistency: 0.451
Consistency changed: 0.503
Final thing  end: 0.506
Final thing conist: 0.538


' LOOK AT THE DIFFICULT INSTANCES!!!!! (T5 not done, Falcon not done) '

In [325]:
# voting for end, start, consistency
answers = zip(chain.end.tolist(), chain.start.tolist(), chain.consist.tolist(), chain.label.tolist())

answer = []
correct = 0
for a, b, c, l in answers:
    abc=[a,b,c]
    abc = [t for t in abc if t!=0]
    if abc.count(1) > abc.count(2):
        answer.append(1)
        if l==1:
            correct+=1
    elif abc.count(1) < abc.count(2):
        answer.append(2)
        if l==2:
            correct+=1
    else:
        answer.append(c)
        if l==c:
            correct+=1

correct/len(answer) # T5 81% -> 

0.8071265119320039

In [298]:
topic_changed.sample(4)

Unnamed: 0,story_id,end,start,end_cons,consist,topic,obs1,obs2,hyp1,hyp2,label
870,d9fa6560-b915-4989-92f3-eb07c0c1c02c-1,1,2,Yes,1,No,Mike's morning couldn't officially begin until he finished his coffee.,"He would sip in silence, ready to tackle the day.",Mike didn't take any sips of the coffee.,It was a ritual for Mike.,2
462,2bab4a0f-835e-46a5-82d9-41be5a3b1811-1,1,0,No,1,Yes,Ron stormed into his wife's office.,They later worked out their problems.,Ron had misunderstood some information that he got.,Ron and his husband got into a big fight.,1
665,9d52c41d-df5c-407e-a4fe-e8cb145fe75c-1,0,1,No,1,Yes,Linda was a terrible cook.,"After a few months, all her friends agreed that she was a great cook!",She took a class to get better.,Linda's friends decided to take a cooking class to learn how to cook.,1
265,04dad9be-d5b7-4171-a3d6-a1942147bc47-1,1,1,Yes,1,Yes,Sue had lost her wedding ring.,He had seen it lying on the walkway outside.,Sue looked outside for her ring.,Sue told a ex boyfriend who pulled it out of his pocket.,1


### OLD

In [5]:
likeliness = pd.read_csv('Falcon_chain_1.csv')
likeliness = likeliness.fillna('')
likeliness.loc[likeliness['likely1'].str.contains('Explanation A explains'), 'answer1'] = 1
likeliness.loc[likeliness['likely1'].str.contains('Explanation B explains'), 'answer1'] = 2
likeliness.loc[likeliness['likely2'].str.startswith('\nStory A'), 'answer2'] = 1
likeliness.loc[likeliness['likely2'].str.startswith('\nStory B'), 'answer2'] = 2
likeliness.loc[likeliness['likely3'].str.contains('Text B is the worst'), 'answer3'] = 1
likeliness.loc[likeliness['likely3'].str.contains('Text A is the worst'), 'answer3'] = 2
likeliness.loc[likeliness['likely3'].str.contains('Text B is the better explanation'), 'answer3'] = 2
likeliness.loc[likeliness['likely3'].str.contains('Text A is the better explanation'), 'answer3'] = 1
likeliness = likeliness.merge(aNLI_test, on='story_id')
len(likeliness)

KeyError: 'likely1'

In [763]:
likeliness.sample(1)

Unnamed: 0,story_id,likely1,likely2,likely3,answer1,answer2,answer3,obs1,obs2,hyp1,hyp2,label
521,e151b6a6-e2a8-48d0-8230-3622aea43253-1,Explanation B explains the Conclusion better because the patient died shortly after they left. Correct explanation is B.,"\nStory B is more coherent, as it provides a direct cause for the patient's death (the family pulling the lifesupport) and a clear chain of events leading to it, while Story A only describes symptoms and a general deterioration without specifying a cause or timeline.",\nText B is the worst explanation for the conclusion because it is more specific and provides more information about the patient's condition. Text A is too vague and does not provide enough information to determine the cause of the patient's worsening condition.,2.0,2.0,1.0,The patient was getting worse.,He died shortly after they left.,They tried everything to keep the patient away.,The family pulled the lifesupport.,2


In [764]:
round(len(likeliness[likeliness.answer1==1])/len(likeliness),2), round(len(likeliness[likeliness.answer2==1])/len(likeliness),2), round(len(likeliness[likeliness.answer3==1])/len(likeliness),2)

(0.05, 0.38, 0.88)

In [765]:
round(len(likeliness[likeliness.label==likeliness.answer1])/len(likeliness),2)

0.44

In [766]:
round(len(likeliness[likeliness.label==likeliness.answer2])/len(likeliness),2)

0.53

In [767]:
round(len(likeliness[likeliness.label==likeliness.answer3])/len(likeliness),2)

0.52

In [760]:
len(likeliness[(likeliness.answer2==1) & (likeliness.answer2==likeliness.label)])/len(likeliness[(likeliness.answer2==1)])

0.6103896103896104

In [761]:
likeliness[likeliness.answer3!=likeliness.label].sample(3)

Unnamed: 0,story_id,likely1,likely2,likely3,answer1,answer2,answer3,obs1,obs2,hyp1,hyp2,label
148,961a77ca-e5a0-4726-bcde-bb1fa1a92c58-1,Explanation B explains the Conclusion better because it is more coherent and related to the task. Correct explanation is B.,"\nStory B is more coherent, as it provides a reason for Professor Smith's recent difficulty teaching and offers a plausible explanation for his apology. Story A, on the other hand, does not provide any clear reason for Professor Smith's recent decline in teaching ability and does not offer an explanation for his apology, making","\nText B is the worst explanation for the conclusion because it provides no new information and does not offer any insight into the situation. It simply states that Professor Smith was having trouble teaching his classes. In contrast, Text A provides some context about Professor Smith's recent stroke, which could potentially explain his behavior in class.",2.0,2.0,1.0,Professor Smith was a well liked teacher of black history.,"Professor Smith asked for forgiveness, he'd recently had a stroke.",A racial slur used Professor Smith in class.,Professor Smith was having trouble teaching his classes.,2
188,33633fff-980d-42fd-945f-7c557186e2f8-1,Explanation B explains the Conclusion better because it is more coherent and accurate. Correct explanation is B.,"\nStory B is more coherent, as it uses scholarly music articles as citations and relies on them for accuracy, while Story A includes incorrect information due to Mike's mistake in using unreliable sources.","\nText B is the better explanation for the conclusion as it uses scholarly music articles as citations, while Text A includes incorrect information.",2.0,2.0,2.0,Mike was writing a report about guitars.,His teacher wrote that Wikipedia is a bad source.,"Mike looked up information, not realizing some was wrong.",Mike only used scholarly music articles as citations.,1
511,3b4c3e41-0207-4fbe-831f-f778905c4e87-1,Explanation B explains the Conclusion better because the fish was in a plastic bag. Correct explanation is B.,"\nStory B is more coherent, as it follows a logical sequence of events, while Story A jumps around in time and has an unexpected outcome.","Text B is the worst explanation for the conclusion because it is a factual error. In Text A, the fish was eaten, whereas in Text B, the fish was not mentioned.",2.0,2.0,1.0,Erik wanted a pet.,"Then his parents made him flush it down the toilet, RIP.","Erik got a fish,and ate it .",Erik won a goldfish at the fair but he forgot to get it out of its plastic bag.,2


In [331]:
len(likeliness[(likeliness.answer1!=likeliness.answer2) & (likeliness.answer1==likeliness.label)])/len(likeliness)

0.20714285714285716

In [292]:
data = pd.read_csv(MODEL_TO_RUN_PATH)
data2 = pd.read_csv('falcon_chain2_pokus_2.csv')
data = pd.concat([data, data2])
data = data.merge(aNLI_test, on='story_id')
data_short = data
print(len(data))

420


In [293]:
data_short.loc[data_short['consA1'].str.startswith('\nNo'), 'consA1'] = 'no'
data_short.loc[data_short['consA1'].str.startswith('\nYes'), 'consA1'] = 'yes'
data_short.loc[data_short['consA2'].str.startswith('\nNo'), 'consA2'] = 'no'
data_short.loc[data_short['consA2'].str.startswith('\nYes'), 'consA2'] = 'yes'
data_short.loc[data_short['consA3'].str.startswith('\nNo'), 'consA3'] = 'no'
data_short.loc[data_short['consA3'].str.startswith('\nYes'), 'consA3'] = 'yes'
data_short.loc[data_short['consA4'].str.startswith('\nNo'), 'consA4'] = 'no'
data_short.loc[data_short['consA4'].str.startswith('\nYes'), 'consA4'] = 'yes'

data_short.loc[data_short['consB1'].str.startswith('\nNo'), 'consB1'] = 'no'
data_short.loc[data_short['consB1'].str.startswith('\nYes'), 'consB1'] = 'yes'
data_short.loc[data_short['consB2'].str.startswith('\nNo'), 'consB2'] = 'no'
data_short.loc[data_short['consB2'].str.startswith('\nYes'), 'consB2'] = 'yes'
data_short.loc[data_short['consB3'].str.startswith('\nNo'), 'consB3'] = 'no'
data_short.loc[data_short['consB3'].str.startswith('\nYes'), 'consB3'] = 'yes'
data_short.loc[data_short['consB4'].str.startswith('\nNo'), 'consB4'] = 'no'
data_short.loc[data_short['consB4'].str.startswith('\nYes'), 'consB4'] = 'yes'

data_short.loc[data_short['consC1'].str.startswith('\nNo'), 'consC1'] = 'no'
data_short.loc[data_short['consC1'].str.startswith('\nYes'), 'consC1'] = 'yes'
data_short.loc[data_short['consC2'].str.startswith('\nNo'), 'consC2'] = 'no'
data_short.loc[data_short['consC2'].str.startswith('\nYes'), 'consC2'] = 'yes'
data_short.loc[data_short['consC3'].str.startswith('\nNo'), 'consC3'] = 'no'
data_short.loc[data_short['consC3'].str.startswith('\nYes'), 'consC3'] = 'yes'
data_short.loc[data_short['consC4'].str.startswith('\nNo'), 'consC4'] = 'no'
data_short.loc[data_short['consC4'].str.startswith('\nYes'), 'consC4'] = 'yes'

In [294]:
label1 = data_short[data_short.label==1]
label2 = data_short[data_short.label==2]

In [295]:
# using only contradictions 55% from those that make sense

correct = 0
counted = 0

for i, row in data_short.iterrows():
    categorized = False

    right=0

    # consistency B
    if not categorized and row.consB1=='yes' and row.consB2=='no':
        right=1
        counted+=1
        categorized=True
    if not categorized and row.consB1=='no' and row.consB2=='yes':
        right=2
        counted+=1
        categorized=True

    # consistency A
    if not categorized and row.consA1=='yes' and row.consA2=='no':
        right=1
        counted+=1
        categorized=True
    if not categorized and row.consA1=='no' and row.consA2=='yes':
        right=2
        counted+=1
        categorized=True
 
    if right == row.label:
        correct += 1

round(correct/counted,2), counted, round(correct/len(data_short),2)

(0.68, 161, 0.26)

In [193]:
len(data_short[(data_short.label==1) & (data_short.consC1=='yes')])


print('CwO1 wrongly contradicts', round(len(data_short[(data_short.label==1) & (data_short.consC1=='yes')])/len(label1),2))
print('CwO1 rightly contradicts', round(len(data_short[(data_short.label==1) & (data_short.consC1=='yes')])/len(label1),2))

C1 wrongly contradicts 0.3


In [195]:
len(data_short[(data_short.label==2) & (data_short.consC2=='yes')])

36

In [197]:
data_short[(data_short.label==2) & (data_short.consC2=='yes')][:3]

Unnamed: 0,story_id,consA1,consB1,consC1,consA2,consB2,consC2,consA3,consB3,consC3,consA4,consB4,consC4,obs1,obs2,hyp1,hyp2,label
4,78dd77c9-99e3-4977-89d7-bfd8f8de1504-1,yes,yes,yes,yes,yes,yes,yes,no,yes,yes,no,yes,Jason wants to learn kung fu.,Jason realizes that reading a book is not a good way to learn kung-fu.,Jason found a cartoon book at the library.,"After reading many books, he needed to take classes to learn hands on tactics.",2
6,a16ce8ce-9f32-441a-8113-6997d8988ece-1,yes,yes,yes,yes,yes,yes,no,yes,no,yes,no,yes,Bob was playing league of legends.,Bob was banned from the game for a month afterwards.,Some of the other players insulted Bob.,He called someone online an asshole.,2
8,903210da-9c27-4efb-b546-b3af848a153d-1,no,no,yes,no,no,yes,no,no,no,no,no,no,Two girls have boyfriends in isis.,ISIS killed them.,the boyfriends were american citizens.,The girls become involved with ISIS.,2


In [198]:
data_short[(data_short.label==1) & (data_short.consC1=='yes')][:3]

Unnamed: 0,story_id,consA1,consB1,consC1,consA2,consB2,consC2,consA3,consB3,consC3,consA4,consB4,consC4,obs1,obs2,hyp1,hyp2,label
2,bac4a9c0-be9c-41d3-bb5c-474a1aa7a78b-1,yes,yes,yes,yes,no,yes,yes,no,yes,yes,yes,yes,Anne decided to cut out carbs from her diet.,After a few weeks she did not even miss carbs at all!,Anne started losing weight and soon wasn't very hungry.,She wanted to bulk up.,1
13,3a317d9a-06fd-4363-80d8-e0be38097d0d-1,no,no,yes,no,no,yes,no,no,yes,no,yes,yes,Frank was hauling bowling balls in his truck.,Frank screamed as he tumbled into the water.,One of the balls came loose and he lost control of the truck.,Frank's friend truck went off the road into the lake.,1
14,4140e766-050c-4811-887e-8b12e2be190d-1,yes,yes,yes,no,no,yes,no,no,yes,no,yes,yes,Dan was digging in his yard to put in an extension to his home.,When he sniffed the air he realized he'd struck a sewage pipe.,Dan hit a pipe.,Dan's shovel hit something soft.,1


In [196]:
len(data_short[(data_short.label==1) & (data_short.consC1=='yes')])

36

In [134]:
def evaluate_coumn_odd(column):
    no = data[data[column].str.startswith('\nNo')]
    no_right = len(no[no.label==2])
    yes = data[data[column].str.startswith('\nYes')]
    yes_right =  len(yes[yes.label==1])
    return len(yes), round(yes_right/len(yes),2), len(no), round(no_right/len(no),2), round((yes_right+no_right)/(len(yes)+len(no)),2), round(len(yes)/len(no),2)

In [135]:
def evaluate_coumn_even(column):
    no = data[data[column].str.startswith('\nNo')]
    no_right = len(no[no.label==1])
    yes = data[data[column].str.startswith('\nYes')]
    yes_right =  len(yes[yes.label==2])
    return len(yes), round(yes_right/len(yes),2), len(no), round(no_right/len(no),2), round((yes_right+no_right)/(len(yes)+len(no)),2), round(len(yes)/len(no),2)

In [173]:
print('A1:', evaluate_coumn_odd('consA1'))
print('B1:', evaluate_coumn_odd('consB1'))
print('C1:', evaluate_coumn_even('consC1')) # for C it is the other way around

print('hyp 2')
print('A2:', evaluate_coumn_even('consA2'))
print('B2:', evaluate_coumn_even('consB2'))
print('C2:', evaluate_coumn_odd('consC2'))

print('without O1')
print('A3:', evaluate_coumn_odd('consA3'))
print('B3:', evaluate_coumn_odd('consB3'))
print('C3:', evaluate_coumn_even('consC3'))

print('hyp 2')
print('A4:', evaluate_coumn_even('consA4'))
print('B4:', evaluate_coumn_even('consB4'))
print('C4:', evaluate_coumn_odd('consC4'))

A1: (73, 0.62, 117, 0.49, 0.54, 0.62)
B1: (106, 0.63, 80, 0.55, 0.6, 1.32)
C1: (60, 0.43, 130, 0.55, 0.51, 0.46)
hyp 2
A2: (68, 0.49, 122, 0.57, 0.54, 0.56)
B2: (97, 0.44, 91, 0.55, 0.49, 1.07)
C2: (72, 0.57, 118, 0.46, 0.5, 0.61)
without O1
A3: (61, 0.69, 129, 0.51, 0.57, 0.47)
B3: (111, 0.63, 77, 0.56, 0.6, 1.44)
C3: (69, 0.36, 121, 0.5, 0.45, 0.57)
hyp 2
A4: (63, 0.43, 127, 0.54, 0.51, 0.5)
B4: (98, 0.44, 91, 0.54, 0.49, 1.08)
C4: (74, 0.58, 116, 0.47, 0.51, 0.64)


In [132]:
len(data[(data['consB1'].str.startswith("\nNo")) & (data['consB2'].str.startswith("\nYes")) & (data['label']==2)])

10

In [131]:
len(data[(data['consB1'].str.startswith("\nNo")) & (data['consB2'].str.startswith("\nYes"))])

12