In [1]:
import evaluate
import pandas as pd

In [2]:
# BLEU score
def bleu_score(s1, s2):
    bleu = evaluate.load("bleu")
    result = bleu.compute(predictions=s1, references=s2)
    return result

In [3]:
# ROUGE score
def rouge_score(s1, s2):
    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=s1,references=s2)
    return result

In [4]:
# all-MiniLM-L6-v2 + distilbert/distilbert-base-cased-distilled-squad
df1 = pd.read_csv('model_results/mini_dis.csv')

ref1 = df1['reference_answer'].tolist()
ref1_bleu = [[ra] for ra in ref1]
pre1 = df1['predict_answer_02(rag model)'].tolist()

res1_bleu = bleu_score(pre1, ref1_bleu)
res1_rouge = rouge_score(pre1, ref1)

runtime1 = df1['run_time (unit: s)'].mean()

In [5]:
res1_bleu, res1_rouge, runtime1

({'bleu': 1.9109566130130727e-06,
  'precisions': [0.8611111111111112,
   0.8076923076923077,
   0.7368421052631579,
   0.7142857142857143],
  'brevity_penalty': 2.456761185699623e-06,
  'length_ratio': 0.0718562874251497,
  'translation_length': 36,
  'reference_length': 501},
 {'rouge1': 0.1524074156955152,
  'rouge2': 0.1054058143250645,
  'rougeL': 0.15114400519297425,
  'rougeLsum': 0.1530206380061826},
 1.1816762924)

In [6]:
# all-mpnet-base-v2 + distilbert/distilbert-base-cased-distilled-squad
df2 = pd.read_csv('model_results/mp_dis.csv')

ref2 = df2['reference_answer'].tolist()
ref2_bleu = [[ra] for ra in ref2]
pre2 = df2['predict_answer_02(rag model)'].tolist()

res2_bleu = bleu_score(pre2, ref2_bleu)
res2_rouge = rouge_score(pre2, ref2)

runtime2 = df2['run_time (unit: s)'].mean()

In [7]:
res2_bleu, res2_rouge, runtime2

({'bleu': 9.858634486177437e-06,
  'precisions': [0.8536585365853658, 0.7741935483870968, 0.68, 0.65],
  'brevity_penalty': 1.3409968695045071e-05,
  'length_ratio': 0.08183632734530938,
  'translation_length': 41,
  'reference_length': 501},
 {'rouge1': 0.16224160036924967,
  'rouge2': 0.1108271306862305,
  'rougeL': 0.15758767645569066,
  'rougeLsum': 0.1588738537058787},
 1.1404136657)

In [16]:
# all-MiniLM-L6-v2 + OLMo: no context
df3 = pd.read_csv('model_results/mini_olmo.csv')
df3 = df3[:10]

ref3 = df3['reference_answer'].tolist()
ref3_bleu = [[ra] for ra in ref3]
pre3 = df3['predict_answer_01(no context model)'].tolist()

res3_bleu = bleu_score(pre3, ref3_bleu)
res3_rouge = rouge_score(pre3, ref3)

runtime3 = df3['run_time (unit: s)'].mean()

In [17]:
res3_bleu, res3_rouge, runtime3

({'bleu': 0.0,
  'precisions': [0.17849462365591398,
   0.035164835164835165,
   0.006741573033707865,
   0.0],
  'brevity_penalty': 0.9255016586176857,
  'length_ratio': 0.9281437125748503,
  'translation_length': 465,
  'reference_length': 501},
 {'rouge1': 0.15260360506989995,
  'rouge2': 0.03730780172885435,
  'rougeL': 0.11062998441462144,
  'rougeLsum': 0.11054174403391961},
 64.289510512)

In [19]:
# all-MiniLM-L6-v2 + OLMo: context
pre4 = df3['predict_answer_02(rag model)'].tolist()

res4_bleu = bleu_score(pre4, ref3_bleu)
res4_rouge = rouge_score(pre4, ref3)

runtime4 = df3['run_time (unit: s).1'].mean()

In [20]:
res4_bleu, res4_rouge, runtime4

({'bleu': 0.04475797178491108,
  'precisions': [0.1795774647887324,
   0.05595026642984014,
   0.025985663082437275,
   0.015370705244122965],
  'brevity_penalty': 1.0,
  'length_ratio': 2.2674650698602794,
  'translation_length': 1136,
  'reference_length': 501},
 {'rouge1': 0.2896253811654588,
  'rouge2': 0.11512477969115435,
  'rougeL': 0.21048541803378723,
  'rougeLsum': 0.21307941491397647},
 107.487441888)

In [30]:
# all-mpnet-base-v2 + OLMo: no context
df5 = pd.read_csv('model_results/mp_olmo.csv')
df5 = df5[:10]

ref5 = df5['reference_answer'].tolist()
ref5_bleu = [[ra] for ra in ref5]
pre5 = df5['predict_answer_01(no context model)'].tolist()

res5_bleu = bleu_score(pre5, ref5_bleu)
res5_rouge = rouge_score(pre5, ref5)

runtime5 = df5['run_time (unit: s)'].mean()

In [31]:
res5_bleu, res5_rouge, runtime5

({'bleu': 0.016485732633162022,
  'precisions': [0.286144578313253,
   0.046583850931677016,
   0.01282051282051282,
   0.0033112582781456954],
  'brevity_penalty': 0.6010746487566252,
  'length_ratio': 0.6626746506986028,
  'translation_length': 332,
  'reference_length': 501},
 {'rouge1': 0.21040996347622581,
  'rouge2': 0.044013452640874876,
  'rougeL': 0.15432464272407018,
  'rougeLsum': 0.1565172755255232},
 55.840623355000005)

In [32]:
# all-mpnet-base-v2 + OLMo: context
pre6 = df3['predict_answer_02(rag model)'].tolist()

res6_bleu = bleu_score(pre6, ref5_bleu)
res6_rouge = rouge_score(pre6, ref5)

runtime6 = df5['run_time (unit: s).1'].mean()

In [33]:
res6_bleu, res6_rouge, runtime6

({'bleu': 0.04475797178491108,
  'precisions': [0.1795774647887324,
   0.05595026642984014,
   0.025985663082437275,
   0.015370705244122965],
  'brevity_penalty': 1.0,
  'length_ratio': 2.2674650698602794,
  'translation_length': 1136,
  'reference_length': 501},
 {'rouge1': 0.2896253811654588,
  'rouge2': 0.11512477969115435,
  'rougeL': 0.21048541803378723,
  'rougeLsum': 0.21307941491397647},
 110.49867651899999)