In [None]:
!pip install semantic-text-similarity

In [None]:
from semantic_text_similarity.models import WebBertSimilarity
from semantic_text_similarity.models import ClinicalBertSimilarity

# Web-domain semantic-text-similarity model; it is used further down to score
# (true property, retrieved property) pairs for the STS-based matching.
web_model = WebBertSimilarity(device='cpu', batch_size=10) # forced onto the CPU here; per the original note the library otherwise defaults to GPU prediction

# Clinical-domain alternative, left disabled:
# clinical_model = ClinicalBertSimilarity(device='cuda', batch_size=10) #defaults to GPU prediction

In [None]:
import numpy as np
import pdb
import json
class GFG:
  """Maximum bipartite matching by repeated augmenting-path search.

  `graph[u][v]` must be truthy when row `u` may be paired with column `v`.
  Any 2-D indexable container works (nested lists, NumPy arrays, ...).
  """

  def __init__(self, graph):
    self.graph = graph
    self.ppl = len(graph)  # number of rows (left-hand nodes)
    if self.ppl:
      self.jobs = len(graph[0])  # number of columns (right-hand nodes)
    else:
      # A graph with no rows has no graph[0]; reading it used to raise
      # IndexError. Fall back to the declared column count when the container
      # exposes one (e.g. a NumPy array of shape (0, n)), otherwise 0.
      shape = getattr(graph, "shape", None)
      self.jobs = shape[1] if shape is not None and len(shape) > 1 else 0

  def bpm(self, u, matchR, seen):
    """Try to give row `u` a column, re-homing earlier rows if necessary.

    matchR: list where matchR[v] is the row currently holding column `v`
            (-1 when free); updated in place on success.
    seen:   per-column visited flags for the current search; updated in place.
    Returns True when `u` ends up matched, False otherwise.
    """
    for v in range(self.jobs):
      if self.graph[u][v] and not seen[v]:
        seen[v] = True
        # Column v is usable if it is free, or if its current owner can be
        # moved to some other column not yet visited in this search.
        if matchR[v] == -1 or self.bpm(matchR[v], matchR, seen):
          matchR[v] = u
          return True
    return False

  def maxBPM(self):
    """Return (size of a maximum matching, matchR).

    matchR[v] is the row matched to column `v`, or -1 if the column is unused.
    """
    matchR = [-1] * self.jobs

    result = 0
    for i in range(self.ppl):
      # Visited flags are reset for every row's search.
      seen = [False] * self.jobs
      if self.bpm(i, matchR, seen):
        result += 1
    return result, matchR

def my_lcs(string, sub):
  """Length of the longest common subsequence of two sequences.

  string, sub: indexable sequences compared element-wise (token lists or
               plain strings). The order of the two arguments does not
               affect the result.
  Returns an int (0 when nothing is shared or either sequence is empty).
  """
  # Keep the longer sequence in `string`. Only this swap belongs under the
  # `if`: previously the whole computation was nested inside it, so the
  # function fell through and returned None whenever `string` was longer.
  if len(string) < len(sub):
    sub, string = string, sub

  # lengths[i][j] = LCS length of string[:i] and sub[:j]
  lengths = [[0] * (len(sub) + 1) for _ in range(len(string) + 1)]

  for j in range(1, len(sub) + 1):
    for i in range(1, len(string) + 1):
      if string[i-1] == sub[j-1]:
        lengths[i][j] = lengths[i-1][j-1] + 1
      else:
        lengths[i][j] = max(lengths[i-1][j], lengths[i][j-1])

  return lengths[len(string)][len(sub)]

class Rouge():
  """ROUGE-L style scorer built on the token-level longest common subsequence."""

  def __init__(self):
    # Weight of recall relative to precision in the F-measure below.
    self.beta = 1.2

  def calc_score(self, candidate, refs):
    """Score one candidate sentence against one or more reference sentences.

    candidate: list containing exactly one string.
    refs:      non-empty list of reference strings.
    Returns the F-measure computed from the best LCS precision and the best
    LCS recall over all references (0.0 when there is no overlap).
    """
    assert(len(candidate)==1)
    assert(len(refs)>0)
    prec = []
    rec = []

    # split into tokens
    token_c = candidate[0].split(" ")

    for reference in refs:
      # split into tokens
      token_r = reference.split(" ")
      # compute the longest common subsequence
      lcs = my_lcs(token_r, token_c)
      if lcs is None:
        prec.append(0)
        rec.append(0)
      else:
        prec.append(lcs/float(len(token_c)))
        rec.append(lcs/float(len(token_r)))

    # Aggregate only after *every* reference has been scored. This part used
    # to sit inside the loop, so the method returned after the first reference.
    prec_max = max(prec)
    rec_max = max(rec)

    if (prec_max!=0 and rec_max !=0):
      score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
    else:
      score = 0.0
    return score

  def compute_score(self, refs, test):
    """Score test[i] against refs[i] for every index of `refs`.

    An empty or single-space hypothesis scores 0 without being compared.
    Returns (mean score, NumPy array of the per-pair scores).
    """
    score = []
    for ref, hypo in zip(refs, test[:len(refs)] if len(test) >= len(refs) else [test[i] for i in range(len(refs))]):
      if (hypo == " " or hypo == ""):
        score.append(0)
      else:
        score.append(self.calc_score([hypo], [ref]))

    average_score = np.mean(np.array(score))
    return average_score, np.array(score)

  def method(self):
    """Name of the metric."""
    return "Rouge"

In [None]:
import numpy as np
import pandas as pd
import pdb
import json

Evaluation of the AIR approach on the gold corpus

In [None]:
# Inputs for the gold-corpus run: the retriever's TSV output and the test set.
# `name` is the stem used for the metric input/output files written later.
output_file = './MultiRC_BM25_vs_POCC_justification_quality_score/air_test_output.tsv'
test_file = '../data/E2_test.json'
name = 'air_test_output'
# Context manager so the file handle is closed even if parsing fails.
with open(test_file) as f:
  data = json.load(f)

In [None]:
# Layout of the header-less retriever TSV: five leading fields followed by
# up to 21 retrieved-property columns (prop0 .. prop20).
prop_cols = ["prop" + str(n) for n in range(21)]
output_cols = ["start", "index", "zero_zero", "correctness", "ques"] + prop_cols

# Missing cells become "" so that empty property slots can be filtered out later.
df = pd.read_csv(output_file, sep='\t', names=output_cols, engine='python', header=None).fillna("")

output_queries = df['ques']
output_correctness = df['correctness']
# One list of 21 property strings per row, in column order.
output_props = [[row[col] for col in prop_cols] for _, row in df.iterrows()]

# Exact-string-match precision / recall / F1 of the retrieved properties
# against the annotated ones, macro-averaged over all questions.
count = 0
recall = 0.0
precision = 0.0
f_score = 0.0
for i in range(len(data['q_text'])):
  true_props = data['property'][i]
  # Rebuild the query string the same way it appears in the retriever output.
  query = data['q_text'][i].replace('\n','')
  if data['correct'][i]:
    query += ' ' + data['option'][i]
  else:
    query += ' not ' + data['option'][i]

  # Find the output row for this query: rows from position i onwards are
  # tried first, then the rows before it. No match leaves the list empty.
  retrieved_props = []
  for cntr in [*range(i, len(output_queries)), *range(i)]:
    if query == output_queries[cntr]:
      retrieved_props = output_props[cntr]
      break
  retrieved_props = [x for x in retrieved_props if x]  # drop empty slots

  # Guard the denominators: a question with nothing retrieved (or nothing
  # annotated) contributes 0 instead of raising ZeroDivisionError.
  hits_in_retrieved = sum(1 for prop in retrieved_props if prop in true_props)
  precision0 = float(hits_in_retrieved)/len(retrieved_props) if retrieved_props else 0.0

  hits_in_true = sum(1 for prop in true_props if prop in retrieved_props)
  recall0 = float(hits_in_true)/len(true_props) if true_props else 0.0

  precision += precision0
  recall += recall0
  count += 1
  if precision0 + recall0 != 0:
    f_score += 2*precision0*recall0/(precision0 + recall0)

precision /= count
recall /= count
f_score /= count
print("Exact score============")
print('Recall: ', recall)
print('Precision: ', precision)
print('F1 Score: ', f_score)

Evaluation of the AIR approach on the silver corpus

In [None]:
# Inputs for the silver-corpus run: the retriever's TSV output and the test set.
# `name` is the stem used for the metric input/output files written later.
output_file = './MultiRC_BM25_vs_POCC_justification_quality_score/air_omcs_test_output.tsv'
test_file = '../data/E2_test.json'
name = 'air_omcs_test_output'
# Context manager so the file handle is closed even if parsing fails.
with open(test_file) as f:
  data = json.load(f)

In [None]:
# Layout of the header-less retriever TSV: five leading fields followed by
# up to 21 retrieved-property columns (prop0 .. prop20).
prop_cols = ["prop" + str(n) for n in range(21)]
output_cols = ["start", "index", "zero_zero", "correctness", "ques"] + prop_cols

# Missing cells become "" so that empty property slots can be filtered out later.
df = pd.read_csv(output_file, sep='\t', names=output_cols, engine='python', header=None).fillna("")

output_queries = df['ques']
output_correctness = df['correctness']
# One list of 21 property strings per row, in column order.
output_props = [[row[col] for col in prop_cols] for _, row in df.iterrows()]

In [None]:
# Switch to the CIDEr tool directory: the following cells write their inputs
# under ./data/ and run ./cidereval.py relative to it.
%cd ../../generation/cider

In [None]:
# Build the flat list of (true property, retrieved property) pairs that the
# external scorers consume. Every pair gets its own running id (`count`), and
# the three lists below stay index-aligned with one another.
cider_refs_thresholder = []
cider_test_thresholder = []
spice_thresholder = []
count = 0
for k in range(len(data['q_text'])):
  true_props = data['property'][k]
  # Rebuild the query string the same way it appears in the retriever output.
  joiner = ' ' if data['correct'][k] else ' not '
  query = data['q_text'][k].replace('\n','') + joiner + data['option'][k]

  # Locate the output row for this query: rows k.. first, then rows before k.
  # A match at any position other than k is reported.
  for cntr in [*range(k, len(output_queries)), *range(k)]:
    if query == output_queries[cntr]:
      if cntr != k:
        print(k, cntr)
      break
  # NOTE(review): when no row matches, `cntr` is just the last index tried, so
  # an unrelated row's properties are used. Left as is because the scoring
  # cell walks the pairs with the same rule and must stay aligned.
  retrieved_props = [prop for prop in output_props[cntr] if prop]

  for true_prop in true_props:
    for retrieved_prop in retrieved_props:
      cider_test_thresholder.append({"image_id": count, "caption": retrieved_prop})
      cider_refs_thresholder.append({"image_id": count, "caption": true_prop})
      spice_thresholder.append({"image_id": count, "test": retrieved_prop, "refs": [true_prop]})
      count += 1

# Persist the scorer inputs (CIDEr refs / candidates, SPICE pairs).
for target_path, payload in (
    ('./data/cider_' + name + '_refs.json', cider_refs_thresholder),
    ('./data/cider_' + name + '_test.json', cider_test_thresholder),
    ('../spice/spice_' + name + '.json', spice_thresholder),
):
  with open(target_path, 'w') as outfile:
    json.dump(payload, outfile)

# Settings file written next to the CIDEr script.
params = dict([
    ("pathToData", "./data/"),
    ("refName", 'cider_' + name + '_refs.json'),
    ("candName", 'cider_' + name + '_test.json'),
    ("resultFile", 'cider_' + name + '_results.json'),
    ("idf", "coco-val-df"),
])
with open('./params.json', 'w') as outfile:
  json.dump(params, outfile)

In [None]:
# Run the CIDEr evaluation script with a Python 2 interpreter.
# NOTE(review): presumably it reads ./params.json written by the previous cell
# and produces the results file loaded in the next one — confirm in cidereval.py.
!python2 ./cidereval.py

In [None]:
# Load the CIDEr results; the context manager closes the handle afterwards.
with open('./cider_' + name + '_results.json') as file2:
  cider_output = json.load(file2)

In [None]:
# Move to the METEOR directory: the METEOR input files and meteor-1.5.jar in
# the next cells are addressed relative to it.
%cd ../meteor

In [None]:
# Write the METEOR inputs: one sentence per line, references and candidates in
# the same order. Context managers guarantee the files are flushed and closed
# even if a write fails part-way.
with open(name + "_meteor_refs", "w") as write_file:
  for ref_entry in cider_refs_thresholder:
    # Embedded newlines would break the one-sentence-per-line layout.
    write_file.write(ref_entry['caption'].replace("\n", " ") + " \n")

with open(name + "_meteor_test", "w") as write_file2:
  for test_entry in cider_test_thresholder:
    caption = test_entry['caption']
    if caption == "" or caption == " ":
      # A blank candidate is replaced by a placeholder word.
      new_line = "empty \n"
    else:
      new_line = caption.replace("\n", " ") + " \n"
    write_file2.write(new_line)

In [None]:
meteor_scores = !java -Xmx2G -jar meteor-1.5.jar ./air_omcs_test_output_meteor_test ./air_omcs_test_output_meteor_refs -l en -norm -a data/paraphrase-en.gz -q
meteor_scores = [float(meteor_scores[i]) for i in range(len(meteor_scores))]
meteor_scores[-1]

In [None]:
# Move to the SPICE directory: spice-1.0.jar and the SPICE input/output JSON
# files in the next cells are addressed relative to it.
%cd ../spice

In [None]:
!java -Xmx8G -jar spice-1.0.jar spice_air_omcs_test_output.json -cache ./cache/ -out spice_air_omcs_test_output_output.json

In [None]:
# Load the SPICE results; the context manager closes the handle afterwards.
with open('./spice_' + name + '_output.json') as file2:
  spice_output = json.load(file2)
# Sanity check: number of scored pairs.
len(spice_output)

In [None]:
# ROUGE is computed in-process on the same aligned pair lists that were
# written out for the external scorers.
rouge_refs = [entry['caption'] for entry in cider_refs_thresholder]
rouge_test = [entry['caption'] for entry in cider_test_thresholder]
r = Rouge()
# (mean score, per-pair score array)
rouge_scores = r.compute_score(rouge_refs, rouge_test)
rouge_scores[0]

In [None]:
# Precision / recall / F1 per question for SPICE, CIDEr, METEOR and ROUGE.
# For each metric, a (true property, retrieved property) pair is an edge when
# its score reaches the metric's threshold; the number of properties counted
# as found is the size of a maximum bipartite matching on those edges.
spice_threshold = 0.4
cider_threshold = 3
meteor_threshold = 0.3
rouge_threshold = 0.3
thresholds = {
  "spice": spice_threshold,
  "cider": cider_threshold,
  "meteor": meteor_threshold,
  "rouge": rouge_threshold,
}

# Per metric: running sums of [recall, precision, f1] over all questions.
totals = {metric: [0.0, 0.0, 0.0] for metric in thresholds}

# Index into the flat per-pair score lists. Pairs must be visited in exactly
# the order in which they were written for the scorers.
count1 = 0
for k in range(len(data['q_text'])):
  true_props = data['property'][k]
  # Rebuild the query string the same way it appears in the retriever output.
  query = data['q_text'][k].replace('\n','')
  if data['correct'][k]:
    query += ' ' + data['option'][k]
  else:
    query += ' not ' + data['option'][k]

  # Locate the output row for this query: rows k.. first, then rows before k.
  for cntr in [*range(k, len(output_queries)), *range(k)]:
    if query == output_queries[cntr]:
      break
  # NOTE(review): when no row matches, `cntr` is just the last index tried, so
  # an unrelated row is scored. Kept deliberately: the cell that wrote the
  # pairs uses the same rule, and changing it here alone would misalign count1.
  retrieved_props = [x for x in output_props[cntr] if x]

  n_true, n_retrieved = len(true_props), len(retrieved_props)
  graphs = {metric: np.zeros((n_true, n_retrieved)) for metric in thresholds}

  for i in range(n_true):
    for j in range(n_retrieved):
      pair_scores = {
        "spice": spice_output[count1]['scores']['All']['f'],
        "cider": cider_output['CIDEr'][count1],
        "meteor": meteor_scores[count1],
        "rouge": rouge_scores[1][count1],
      }
      count1 += 1
      for metric, pair_score in pair_scores.items():
        if pair_score >= thresholds[metric]:
          graphs[metric][i][j] = 1

  for metric, graph in graphs.items():
    # With an empty side there is nothing to match; skipping the matcher also
    # avoids indexing row 0 of an empty graph.
    if n_true and n_retrieved:
      # The matching size equals the number of true properties that received
      # a partner, which is what the per-property scan used to count.
      matched, _ = GFG(graph).maxBPM()
    else:
      matched = 0
    # Guarded denominators: an empty side yields 0 instead of ZeroDivisionError.
    q_recall = matched/n_true if n_true else 0.0
    q_precision = matched/n_retrieved if n_retrieved else 0.0
    totals[metric][0] += q_recall
    totals[metric][1] += q_precision
    if (q_recall + q_precision != 0):
      totals[metric][2] += 2*q_recall*q_precision/(q_recall + q_precision)

# Keep the original result names available to later cells.
spice_recall, spice_precision, spice_fscore = totals["spice"]
cider_recall, cider_precision, cider_fscore = totals["cider"]
meteor_recall, meteor_precision, meteor_fscore = totals["meteor"]
rouge_recall, rouge_precision, rouge_fscore = totals["rouge"]

# Macro-average over all questions.
x = len(data["q_text"])
print("SPICE==============")
print('Recall: ', spice_recall/x)
print('Precision: ', spice_precision/x)
print('F1 Score: ', spice_fscore/x)
print("CIDEr==============")
print('Recall: ', cider_recall/x)
print('Precision: ', cider_precision/x)
print('F1 Score: ', cider_fscore/x)
print("METEOR==============")
print(meteor_recall/x)
print(meteor_precision/x)
print(meteor_fscore/x)
print("ROUGE==============")
print('Recall: ', rouge_recall/x)
print('Precision: ', rouge_precision/x)
print('F1 Score: ', rouge_fscore/x)

In [None]:
# Precision / recall / F1 per question using semantic textual similarity:
# a (true property, retrieved property) pair is an edge when the model's score
# reaches sts_threshold, and the number of properties counted as found is the
# size of a maximum bipartite matching on those edges.
sts_recall = 0.0
sts_precision = 0.0
sts_fscore = 0.0
sts_threshold = 3

# The question index has its own name so the pair loops below cannot clobber it.
for q_idx in range(len(data['q_text'])):
  true_props = data['property'][q_idx]
  # Rebuild the query string the same way it appears in the retriever output.
  query = data['q_text'][q_idx].replace('\n','')
  if data['correct'][q_idx]:
    query += ' ' + data['option'][q_idx]
  else:
    query += ' not ' + data['option'][q_idx]

  # Locate the output row for this query: rows q_idx.. first, then the rows
  # before it. No match leaves the list empty.
  retrieved_props = []
  for cntr in [*range(q_idx, len(output_queries)), *range(q_idx)]:
    if query == output_queries[cntr]:
      retrieved_props = output_props[cntr]
      break
  retrieved_props = [x for x in retrieved_props if x]  # drop empty slots

  n_true, n_retrieved = len(true_props), len(retrieved_props)
  bipartite_graph = np.zeros((n_true, n_retrieved))
  for i in range(n_true):
    for j in range(n_retrieved):
      # One pair per call, exactly as before, so the scores are unchanged.
      sts_score = web_model.predict([(true_props[i], retrieved_props[j])])[0]
      if sts_score >= sts_threshold:
        bipartite_graph[i][j] = 1

  # With an empty side there is nothing to match; skipping the matcher also
  # avoids indexing row 0 of an empty graph.
  if n_true and n_retrieved:
    # The matching size equals the number of true properties that received a
    # partner, which is what the per-property scan used to count.
    matched, _ = GFG(bipartite_graph).maxBPM()
  else:
    matched = 0

  # Guarded denominators: an empty side yields 0 instead of ZeroDivisionError.
  a0 = matched/n_true if n_true else 0.0
  b0 = matched/n_retrieved if n_retrieved else 0.0
  sts_recall += a0
  sts_precision += b0
  if (a0+b0 != 0):
    sts_fscore += 2*a0*b0/(a0+b0)

# Macro-average over all questions.
x = len(data["q_text"])
print("STS==============")
print('Recall: ', sts_recall/x)
print('Precision: ', sts_precision/x)
print('F1 Score: ', sts_fscore/x)

In [None]:
# Change the working directory to retrieval/AIR-retriever, two levels up from
# the current metric directory.
# NOTE(review): presumably this restores the directory the notebook started in
# so the relative paths in the earlier cells work on a re-run — confirm.
%cd ../../retrieval/AIR-retriever