In [8]:
import json

# Keep only those predictions whose "id" matches the "Id" of an entry in the
# expert dataset, and store them as the RAG predictions file.
# NOTE(review): the predictions input is presumably produced by generate.ipynb;
# this cell raises FileNotFoundError when that file is missing.
with open("../2_datasets/expert-dataset.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

with open("./predictions/nemo_rerank_7to3.json", 'r', encoding='utf-8') as file:
    preds = json.load(file)

# Index the predictions by id once, so each dataset entry is matched with a
# dictionary lookup instead of a scan over every prediction. Predictions that
# share an id keep their original relative order.
preds_by_id = {}
for prediction in preds:
    preds_by_id.setdefault(prediction["id"], []).append(prediction)

# Walk the dataset in order so the output follows the dataset's ordering.
filtered = []
for element in data:
    filtered.extend(preds_by_id.get(element["Id"], []))

with open("./predictions/RAG_predictions.json", 'w', encoding='utf-8') as file:
    json.dump(filtered, file, ensure_ascii=False, indent=4)



FileNotFoundError: [Errno 2] No such file or directory: './predictions/nemo_rerank_7to3.json'

#### Evaluation of our Generation Results
We define a set of metrics (BERTScore, ROUGE and BLEURT) and evaluate our RAG system against two baselines: a fixed default answer and a hallucination baseline.

The input data was generated in *generate.ipynb*; its results are stored in *./predictions*.

In [1]:
from bleurt import score
import numpy as np
from bert_score import BERTScorer
from rouge_score import rouge_scorer, scoring

# BERTScore and ROUGE
def evaluateBERT_ROUGE(rag_model, predictions, references):
    """Compute and print BERTScore and ROUGE for predictions against references.

    Args:
        rag_model: Not used by this function.
        predictions: Sequence of generated answers.
        references: Sequence of reference answers, aligned with ``predictions``.

    Returns:
        A 6-tuple ``(P, R, F1, rouge1, rouge2, rougeL)``. ``P``, ``R`` and ``F1``
        are the values returned by ``BERTScorer.score``. The three ROUGE entries
        are the aggregated results from ``BootstrapAggregator.aggregate()``
        (the objects whose ``.mid.fmeasure`` is printed), not per-sample scores.
    """
    bert_scorer = BERTScorer(model_type='bert-base-multilingual-cased')
    P, R, F1 = bert_scorer.score(predictions, references)
    print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bootstrap = scoring.BootstrapAggregator()

    # Feed every (reference, prediction) pair into the aggregator.
    for reference, prediction in zip(references, predictions):
        bootstrap.add_scores(rouge.score(reference, prediction))

    # Report the mid F-measure of each aggregated ROUGE variant.
    result = bootstrap.aggregate()
    for label, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
        print(f"{label}: {result[key].mid.fmeasure}")

    return P, R, F1, result['rouge1'], result['rouge2'], result['rougeL']

# BLEURT
# Path of the BLEURT checkpoint directory, relative to the notebook's working directory.
checkpoint = "./models/BLEURT-20"
# Module-level scorer built once from the checkpoint; BLEURTSCORE below reuses it
# on every call instead of constructing a new scorer.
scorer = score.BleurtScorer(checkpoint)

def BLEURTSCORE(references, generated):
    """Score generated texts against references with the module-level BLEURT scorer.

    Args:
        references: Reference texts.
        generated: Candidate texts, passed to the scorer as ``candidates``.

    Returns:
        Whatever ``scorer.score`` returns for the given inputs.
    """
    return scorer.score(references=references, candidates=generated)

# Scoring Differences based on complexity and relevance levels
def scoresForRatings(complexities, relevances, scores, name, levels=range(1, 7)):
    """Print the mean score per complexity level and per relevance level.

    Args:
        complexities: Complexity rating of each sample, aligned with ``scores``.
        relevances: Relevance rating of each sample, aligned with ``scores``.
        scores: One score per sample.
        name: Label printed in the header line.
        levels: Rating levels to report; defaults to 1..6.

    For every level, two lines are printed: the mean of the scores whose
    complexity equals that level and the mean of the scores whose relevance
    equals that level. A level without any matching sample is reported as
    ``nan``.
    """
    def _group_mean(ratings, level):
        # Ratings are compared by their string form, so both "3" and 3 match level 3.
        wanted = str(level)
        group = [value for position, value in enumerate(scores)
                 if str(ratings[position]) == wanted]
        # An empty group has no mean; return nan directly instead of letting
        # numpy emit a "Mean of empty slice" RuntimeWarning.
        return np.array(group).mean() if group else float("nan")

    print(f"Scores for {name}")
    for level in levels:
        print(f"Complexity @{level}: {_group_mean(complexities, level)}")
        print(f"Relevance @{level}: {_group_mean(relevances, level)}")


INFO:tensorflow:Reading checkpoint ./models/BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: ./models/BLEURT-20\sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [2]:
import json

# Prediction files to evaluate:
#   models[0] - predictions of the hallucination baseline (also used for the
#               default-answer baseline, where the stored predictions are replaced)
#   models[1] - predictions of the RAG system
models = ["./predictions/baseline_halluzination.json","./predictions/RAG_predictions.json"]

def evaluateRAG(model, baseline):
    """Evaluate one predictions file with BERTScore, ROUGE and BLEURT.

    Args:
        model: Path to a JSON file holding a list of items. Each item must
            provide the keys "prediction", "reference", "gen-time",
            "complexity" and "relevance".
        baseline: If true, every stored prediction is replaced by a fixed
            default answer before scoring.

    Prints the mean generation time, the overall metric values and, for each
    metric, the mean score per complexity and relevance level. Returns None.
    """
    with open(model, 'r', encoding="utf8") as file:
        data = json.load(file)

    predictions = []
    references = []
    complexity_rating = []
    relevance_rating = []

    timeOverall = 0
    for item in data:
        timeOverall += item["gen-time"]
        complexity_rating.append(item["complexity"])
        relevance_rating.append(item["relevance"])
        if baseline:
            # Default-answer baseline: ignore the stored prediction.
            predictions.append("Ich kann Ihnen auf diese Frage leider keine Antwort geben.")
        else:
            predictions.append(item["prediction"])
        references.append(item["reference"])

    # Average over the number of items actually loaded rather than a fixed count.
    mean_time = timeOverall / len(data) if data else float("nan")
    print(f"Time for Questions {model}: {mean_time}(Mean)")

    # The ROUGE values returned here are aggregates over all samples and cannot
    # be broken down per rating, so only the BERTScore values are used from it.
    P, R, F1, _, _, _ = evaluateBERT_ROUGE(model, predictions, references)
    scoresForRatings(complexity_rating, relevance_rating, P, "Precision BERT")
    scoresForRatings(complexity_rating, relevance_rating, R, "Recall BERT")
    scoresForRatings(complexity_rating, relevance_rating, F1, "F1 BERT")

    # Per-sample ROUGE F-measures, aligned with the rating lists, so that the
    # per-level breakdown averages real sample scores.
    sample_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    sample_rouge = [sample_scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    for key, label in (("rouge1", "Rouge 1"), ("rouge2", "Rouge 2"), ("rougeL", "Rouge L")):
        per_sample = [sample[key].fmeasure for sample in sample_rouge]
        scoresForRatings(complexity_rating, relevance_rating, per_sample, label)

    bleurt = BLEURTSCORE(references, predictions)
    scoresForRatings(complexity_rating, relevance_rating, bleurt, "BLEURT")
    print(f'{model}-BLEURT-Score:',np.array(bleurt).mean())
    

In [3]:
# Default-answer baseline: baseline=True makes evaluateRAG replace every stored
# prediction with a fixed default answer, so only the references, ratings and
# generation times of models[0] are taken from the file.
evaluateRAG(models[0], True)

Time for Questions ./predictions/baseline_halluzination.json: 68.19603773584906(Mean)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]



BERTScore Precision: 0.6145, Recall: 0.5648, F1: 0.5882
ROUGE-1: 0.03430439355986151
ROUGE-2: 0.0004837929366231253
ROUGE-L: 0.030039193883866562
Scores for Precision BERT
Complexity @1: 0.6042108535766602
Relevance @1: 0.6178925633430481
Complexity @2: 0.6162104606628418
Relevance @2: 0.6193664073944092
Complexity @3: 0.6109195351600647
Relevance @3: 0.6076955199241638
Complexity @4: 0.6185082793235779
Relevance @4: 0.616362452507019
Complexity @5: 0.6080071926116943
Relevance @5: nan
Complexity @6: 0.6488398313522339
Relevance @6: nan
Scores for Recall BERT
Complexity @1: 0.5744127035140991
Relevance @1: 0.5586015582084656
Complexity @2: 0.5703244805335999
Relevance @2: 0.5808422565460205
Complexity @3: 0.559414803981781
Relevance @3: 0.5554892420768738
Complexity @4: 0.5658586621284485
Relevance @4: 0.5444623827934265
Complexity @5: 0.53498774766922
Relevance @5: nan
Complexity @6: 0.6327117085456848
Relevance @6: nan
Scores for F1 BERT
Complexity @1: 0.5887242555618286
Relevance @1

  print(f"Relevance @{index}: {np.array(rel_at_idx).mean()}")
  ret = ret.dtype.type(ret / rcount)
  print(f"Complexity @{index}: {np.array(cplx_at_idx).mean()}")


Scores for BLEURT
Complexity @1: 0.28688178956508636
Relevance @1: 0.31469235196709633
Complexity @2: 0.3280910147087915
Relevance @2: 0.3359516777775504
Complexity @3: 0.3839507043361664
Relevance @3: 0.3686158925294876
Complexity @4: 0.3466654082139333
Relevance @4: 0.31724029779434204
Complexity @5: 0.2550540827214718
Relevance @5: nan
Complexity @6: 0.522446870803833
Relevance @6: nan
./predictions/baseline_halluzination.json-BLEURT-Score: 0.34420200479480456


In [5]:
# Hallucination baseline: scores the predictions stored in models[0] as they are.
evaluateRAG(models[0], False)

Time for Questions ./predictions/baseline_halluzination.json: 68.19603773584906(Mean)




BERTScore Precision: 0.6035, Recall: 0.6946, F1: 0.6447
ROUGE-1: 0.14339096023131992
ROUGE-2: 0.040590108605684794
ROUGE-L: 0.10130790402298498
Scores for Precision BERT
Complexity @1: 0.5728986263275146
Relevance @1: 0.593463659286499
Complexity @2: 0.613224446773529
Relevance @2: 0.6019572615623474
Complexity @3: 0.6044213175773621
Relevance @3: 0.5943030118942261
Complexity @4: 0.6055366396903992
Relevance @4: 0.6400230526924133
Complexity @5: 0.6159853935241699
Relevance @5: nan
Complexity @6: 0.49309083819389343
Relevance @6: nan
Scores for Recall BERT
Complexity @1: 0.7129166126251221
Relevance @1: 0.6861295104026794
Complexity @2: 0.7180389761924744
Relevance @2: 0.703456461429596
Complexity @3: 0.6740996837615967
Relevance @3: 0.6816138625144958
Complexity @4: 0.6870089769363403
Relevance @4: 0.7084568738937378
Complexity @5: 0.6944085955619812
Relevance @5: nan
Complexity @6: 0.7135599255561829
Relevance @6: nan
Scores for F1 BERT
Complexity @1: 0.6343410015106201
Relevance @1

  print(f"Relevance @{index}: {np.array(rel_at_idx).mean()}")
  print(f"Complexity @{index}: {np.array(cplx_at_idx).mean()}")


Scores for BLEURT
Complexity @1: 0.5449222326278687
Relevance @1: 0.5484912991523743
Complexity @2: 0.5843877749783652
Relevance @2: 0.5643149533055045
Complexity @3: 0.5532901763916016
Relevance @3: 0.5436044156551361
Complexity @4: 0.5427051981290182
Relevance @4: 0.583053571837289
Complexity @5: 0.5622473657131195
Relevance @5: nan
Complexity @6: 0.5123196244239807
Relevance @6: nan
./predictions/baseline_halluzination.json-BLEURT-Score: 0.557780329911214


In [4]:
# RAG: scores the predictions stored in models[1] (the RAG predictions file).
evaluateRAG(models[1], False)

Time for Questions ./predictions/RAG_predictions.json: 215.5171698113208(Mean)




BERTScore Precision: 0.6637, Recall: 0.7589, F1: 0.7066
ROUGE-1: 0.26939922587078446
ROUGE-2: 0.12153732250239746
ROUGE-L: 0.1983011911382967
Scores for Precision BERT
Complexity @1: 0.682620644569397
Relevance @1: 0.6422492265701294
Complexity @2: 0.6898080110549927
Relevance @2: 0.6802591681480408
Complexity @3: 0.6615080237388611
Relevance @3: 0.654068112373352
Complexity @4: 0.6547761559486389
Relevance @4: 0.651305079460144
Complexity @5: 0.6264488697052002
Relevance @5: nan
Complexity @6: 0.537344217300415
Relevance @6: nan
Scores for Recall BERT
Complexity @1: 0.8252018690109253
Relevance @1: 0.7839110493659973
Complexity @2: 0.7949286103248596
Relevance @2: 0.7718503475189209
Complexity @3: 0.7541089653968811
Relevance @3: 0.7462186813354492
Complexity @4: 0.7280923128128052
Relevance @4: 0.7400194406509399
Complexity @5: 0.7175034284591675
Relevance @5: nan
Complexity @6: 0.6880191564559937
Relevance @6: nan
Scores for F1 BERT
Complexity @1: 0.7442771196365356
Relevance @1: 0.

  print(f"Relevance @{index}: {np.array(rel_at_idx).mean()}")
  print(f"Complexity @{index}: {np.array(cplx_at_idx).mean()}")


Scores for BLEURT
Complexity @1: 0.6782665550708771
Relevance @1: 0.6355424523353577
Complexity @2: 0.6770975249154227
Relevance @2: 0.6619087132540616
Complexity @3: 0.6502765774726867
Relevance @3: 0.6384119480848313
Complexity @4: 0.6232050657272339
Relevance @4: 0.6165731038366046
Complexity @5: 0.5800376832485199
Relevance @5: nan
Complexity @6: 0.5736052393913269
Relevance @6: nan
./predictions/RAG_predictions.json-BLEURT-Score: 0.6450643809336536
