<a href="https://colab.research.google.com/github/Adria100/clin_IQ/blob/main/4_Evaluation_draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation

In [None]:
!pip install rouge_score
!pip install bert-score python-Levenshtein nltk
!pip install spacy transformers scikit-learn rouge-score nltk textacy
!python -m spacy download en_core_web_md

In [None]:
import json
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from bert_score import score
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import f1_score
import pandas as pd
import Levenshtein
import nltk
from nltk.corpus import wordnet

In [None]:
# Mount Google Drive at /content/drive so the Colab runtime can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Update file path


In [None]:
# spaCy pipeline whose document vectors are used by compute_cosine_similarity.
nlp = spacy.load('en_core_web_md')

# Hugging Face pipeline for entailment (textual consistency)
# NOTE(review): entailment_pipeline is not referenced anywhere else in the visible
# notebook — confirm it is still needed before paying for the model load.
entailment_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# WordNet data is used by meteor_score; the Punkt sentence models are required by
# sent_tokenize, which evaluate_reasoning_flow calls.
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")
# Newer NLTK releases look for the tokenizer data under the "punkt_tab" resource name.
nltk.download("punkt_tab")

In [None]:
def evaluate_classification(references, predictions):
    """
    Score predicted labels against reference labels.

    Parameters:
        references (list): True labels
        predictions (list): Predicted labels

    Returns:
        dict: 'accuracy', plus macro-averaged 'precision' and 'recall'.
              zero_division=0 is passed to both macro-averaged metrics.
    """
    macro_options = {'average': 'macro', 'zero_division': 0}
    accuracy = accuracy_score(references, predictions)
    precision = precision_score(references, predictions, **macro_options)
    recall = recall_score(references, predictions, **macro_options)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall}
#BERTScore
def evaluate_bertscore(references, predictions, lang='en'):
    """
    Compute corpus-averaged BERTScore precision, recall and F1.

    Parameters:
        references (list): Ground-truth texts.
        predictions (list): Predicted texts, aligned with references.
        lang (str): Language code forwarded to the BERTScore scorer.

    Returns:
        dict: Mean 'precision', 'recall' and 'f1' as plain Python numbers.
    """
    # The scorer is called with the candidates first and the references second.
    precision, recall, f1 = score(predictions, references, lang=lang)
    summary = {}
    for name, values in (('precision', precision), ('recall', recall), ('f1', f1)):
        summary[name] = values.mean().item()
    return summary
#METEOR
def evaluate_meteor(references, predictions):
    """
    Compute the average METEOR score over aligned reference/prediction pairs.

    Parameters:
        references (list): Ground-truth strings.
        predictions (list): Predicted strings, aligned with references.

    Returns:
        dict: {'average_meteor': mean score}, or 0 when there are no pairs.
    """
    # Current NLTK releases require pre-tokenized input (iterables of tokens) and
    # reject raw strings, so split on whitespace as compute_bleu does.
    scores = [
        meteor_score([ref.split()], pred.split())
        for ref, pred in zip(references, predictions)
    ]
    return {
        # Fall back to 0 for empty input instead of dividing by zero.
        'average_meteor': sum(scores) / len(scores) if scores else 0
    }
#Levenshtein
def evaluate_levenshtein(references, predictions):
    """
    Compute the average Levenshtein edit distance over aligned pairs.

    Parameters:
        references (list): Ground-truth strings.
        predictions (list): Predicted strings, aligned with references.

    Returns:
        dict: {'average_levenshtein_distance': mean distance}, or 0 when
              there are no pairs.
    """
    distances = [
        Levenshtein.distance(ref, pred)
        for ref, pred in zip(references, predictions)
    ]
    # Fall back to 0 for empty input instead of dividing by zero, matching the
    # other averages in this notebook.
    avg_distance = sum(distances) / len(distances) if distances else 0
    return {
        'average_levenshtein_distance': avg_distance
    }
#Cosine Similarity
def compute_cosine_similarity(references: str, predictions: str):
    """
    Return the cosine similarity between the spaCy document vectors of two texts.
    """
    reference_vector = nlp(references).vector
    prediction_vector = nlp(predictions).vector
    # cosine_similarity works on 2-D inputs and returns a matrix; with one row on
    # each side the single entry [0][0] is the score.
    similarity_matrix = cosine_similarity([reference_vector], [prediction_vector])
    return similarity_matrix[0][0]

#G-Eval
def evaluate_reasoning_flow(prediction):
    """
    Score how smoothly consecutive sentences of a prediction follow each other.

    The text is split into sentences and every sentence is compared with the one
    before it via compute_cosine_similarity.

    Returns:
        The mean similarity of adjacent sentence pairs, or 0 when the text has
        fewer than two sentences.
    """
    sentences = sent_tokenize(prediction)
    adjacent_pairs = zip(sentences, sentences[1:])
    flow_scores = [
        compute_cosine_similarity(previous, current)
        for previous, current in adjacent_pairs
    ]
    if not flow_scores:
        return 0
    return np.mean(flow_scores)

#BLEU
def compute_bleu(references, predictions):
    """
    Compute the sentence-level BLEU score of one prediction against one reference.

    Parameters:
        references (str): The ground truth answer.
        predictions (str): The model's predicted answer.

    Returns:
        float: BLEU score (0 to 1)
    """
    # Whitespace tokenization; sentence_bleu takes a list of tokenized references.
    tokenized_references = [references.split()]
    candidate_tokens = predictions.split()

    # method4 smoothing avoids zero scores for short predictions.
    return sentence_bleu(
        tokenized_references,
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

#ROUGE
def compute_rouge(references, predictions):
    """
    Compute ROUGE F-measures of one prediction against one reference.

    Parameters:
        references (str): The ground truth answer.
        predictions (str): The model's predicted answer.

    Returns:
        dict: F-measure for 'rouge1', 'rouge2' and 'rougeL'
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    # The scorer is called with the reference first and the prediction second.
    result = scorer.score(references, predictions)
    return {rouge_type: result[rouge_type].fmeasure for rouge_type in rouge_types}


class EvaluationSuite():
    """
    Class that contains the evaluation scripts for each question type.
    """

    @staticmethod
    def _show_confusion_matrix(ground_truth, predictions):
        """Plot a confusion matrix over every label seen in either list."""
        labels = sorted(set(ground_truth + predictions))
        cm = confusion_matrix(ground_truth, predictions, labels=labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap='Blues', xticks_rotation=45)
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.show()

    @staticmethod
    def _average_rouge(rouge_scores):
        """Average a list of per-sample ROUGE dicts key by key (0 when empty)."""
        return {
            key: np.mean([s[key] for s in rouge_scores]) if rouge_scores else 0
            for key in ('rouge1', 'rouge2', 'rougeL')
        }

    def evaluate_MC(self, predictions, ground_truth):
        """
        Evaluates the accuracy of multiple-choice predictions.

        This function compares a list of predicted answers (e.g., "A", "B", "C", etc.)
        against the corresponding ground-truth labels, shows a confusion matrix and
        computes classification metrics.

        Args:
            predictions (List[str]): A list of predicted choices (e.g., ["B", "C", "A", ...]).
            ground_truth (List[str]): A list of correct choices (same format and length as predictions).

        Returns:
            dict: A dictionary containing:
                - 'accuracy' (float): Overall classification accuracy.
                - 'precision' (float): Macro-averaged precision across all classes.
                - 'recall' (float): Macro-averaged recall across all classes.
        """
        # Ensure predictions and ground truth have the same length
        assert len(predictions) == len(ground_truth), "Predictions and ground_truth must have the same length."

        # Normalize labels (strip whitespace, convert to uppercase)
        predictions = [p.strip().upper() for p in predictions]
        ground_truth = [g.strip().upper() for g in ground_truth]

        EvaluationSuite._show_confusion_matrix(ground_truth, predictions)

        # Return classification metrics
        return evaluate_classification(ground_truth, predictions)

    def evaluate_FT(self, predictions, ground_truth):
        """
        Evaluate label predictions without normalizing the labels.

        Shows a confusion matrix and returns the same metrics as evaluate_MC.
        NOTE(review): "FT" presumably stands for the true/false question type —
        confirm against the dataset.

        Args:
            predictions (List): Predicted labels.
            ground_truth (List): Correct labels (same length as predictions).

        Returns:
            dict: 'accuracy', 'precision' and 'recall'.
        """
        assert len(predictions) == len(ground_truth)

        EvaluationSuite._show_confusion_matrix(ground_truth, predictions)

        # Compute and return classification metrics
        return evaluate_classification(ground_truth, predictions)

    def evaluate_SA(self, predictions, ground_truth):
        """
        Evaluate short-answer predictions with BLEU and ROUGE.

        Predictions equal to 'N/A' are skipped. Per-sample scores are printed.

        Args:
            predictions (List[str]): Predicted answers.
            ground_truth (List[str]): Reference answers (same length as predictions).

        Returns:
            tuple: (avg_rouge, avg_bleu) where avg_rouge is a dict with the mean
            'rouge1', 'rouge2' and 'rougeL' (all 0 when nothing was scored) and
            avg_bleu is the mean BLEU score (0 when nothing was scored).
        """
        assert len(predictions) == len(ground_truth), "Predictions and ground_truth must have the same length."

        BLEU_scores = []
        ROUGE_scores = []

        for reference, prediction in zip(ground_truth, predictions):
            if prediction == 'N/A':
                continue

            print("Ground truth: ", reference)
            print("Prediction: ", prediction)

            bleu_score = compute_bleu(reference, prediction)
            rouge_score = compute_rouge(reference, prediction)

            print("BLEU: ", bleu_score)
            print("ROUGE: ", rouge_score)

            BLEU_scores.append(bleu_score)
            ROUGE_scores.append(rouge_score)

        avg_bleu = sum(BLEU_scores) / len(BLEU_scores) if BLEU_scores else 0
        # compute_rouge returns a dict per sample, so the scores must be averaged
        # key by key; sum() over the dicts raises TypeError.
        avg_rouge = EvaluationSuite._average_rouge(ROUGE_scores)

        return avg_rouge, avg_bleu

    def evaluate_MH(self, predictions, ground_truth):
        """
        Evaluate multi-hop predictions with BLEU, ROUGE, cosine similarity and
        sentence-to-sentence coherence.

        Predictions equal to 'N/A' are skipped. Per-sample scores are printed.

        Args:
            predictions (List[str]): Predicted answers.
            ground_truth (List[str]): Reference answers (same length as predictions).

        Returns:
            dict: 'avg_bleu', 'avg_rouge' (dict of mean 'rouge1'/'rouge2'/'rougeL'),
            'avg_similarity' and 'avg_coherence'; each value is 0 when nothing
            was scored.
        """
        assert len(predictions) == len(ground_truth), "Predictions and ground_truth must have the same length."

        BLEU_scores = []
        ROUGE_scores = []
        similarity_scores = []
        coherence_scores = []

        for reference, prediction in zip(ground_truth, predictions):
            if prediction == 'N/A':
                continue

            print("Ground truth: ", reference)
            print("Prediction: ", prediction)

            bleu_score = compute_bleu(reference, prediction)
            rouge_score = compute_rouge(reference, prediction)
            similarity = compute_cosine_similarity(reference, prediction)
            coherence = evaluate_reasoning_flow(prediction)

            print("BLEU: ", bleu_score)
            print("ROUGE: ", rouge_score)
            print(f"Average Cosine Similarity: {similarity}")
            print(f"Average Coherence: {coherence}")

            BLEU_scores.append(bleu_score)
            ROUGE_scores.append(rouge_score)
            similarity_scores.append(similarity)
            coherence_scores.append(coherence)

        avg_bleu = sum(BLEU_scores) / len(BLEU_scores) if BLEU_scores else 0
        avg_rouge = EvaluationSuite._average_rouge(ROUGE_scores)
        avg_similarity = np.mean(similarity_scores) if similarity_scores else 0
        avg_coherence = np.mean(coherence_scores) if coherence_scores else 0

        return {
            "avg_bleu": avg_bleu,
            "avg_rouge": avg_rouge,
            "avg_similarity": avg_similarity,
            "avg_coherence": avg_coherence
        }


# Clean Evaluation

For short answer and multi-hop evaluation we have looked at evaluation approaches used in literature:

-BERTScore, BLEU, METEOR, and Levenshtein Ratio:
Lemesle, Q., Chevelu, J., Martin, P., Lolive, D., Delhay, A., & Barbot, N. (2025, January). Paraphrase Generation Evaluation Powered by an LLM: A Semantic Metric, Not a Lexical One. In Proceedings of the 31st International Conference on Computational Linguistics (pp. 8057-8087).

-Cosine Similarity:
Patwardhan, A., Vaidya, V., & Kundu, A. (2024, October). Automated Consistency Analysis of LLMs. In 2024 IEEE 6th International Conference on Trust, Privacy and Security in Intelligent Systems, and Applications (TPS-ISA) (pp. 118-127). IEEE.

-G-Eval (in particular, the Reasoning Flow criterion was used; see https://www.galileo.ai/blog/g-eval-metric):
Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., & Zhu, C. (2023). G-eval: NLG evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634.

-ROUGE:
Briman, M. K. H., & Yildiz, B. (2024). Beyond ROUGE: A comprehensive evaluation metric for abstractive summarization leveraging similarity, entailment, and acceptability. International Journal on Artificial Intelligence Tools. DOI: https://doi. org/10.1142 S.

-Semantic Match Score:
Ansar Aynetdinov and Alan Akbik. 2024. SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity. arXiv. doi: https://doi.org/10.48550/arxiv.2401.17072. URL: https://arxiv.org/abs/2401.17072.

We generally followed the evaluation metrics that were listed on the challenge's website (https://cliniqlink.org/):
- Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pages 311–318, Philadelphia, Pennsylvania, USA. Association for Computational Linguistics.
- Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out, pages 74–81, Barcelona, Spain. Association for Computational Linguistics.
- Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization, pages 65–72, Ann Arbor, Michigan. Association for Computational Linguistics.
- T. Zhang, V. Kishore, F. Wu, K. Q. Weinberger, and Y. Artzi, “BERTScore: Evaluating Text Generation with BERT,” arXiv.org, 2019. https://arxiv.org/abs/1904.09675.
- Ansar Aynetdinov and Alan Akbik. 2024. SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity. arXiv. doi: https://doi.org/10.48550/arxiv.2401.17072. URL: https://arxiv.org/abs/2401.17072.


In [None]:
#!pip install -r requirements.txt

In [None]:
!pip install scikit-learn
!pip install nltk
!pip install numpy
!pip install matplotlib
!pip install bert-score
!pip install rouge-score
!pip install python-Levenshtein
!pip install spacy
!pip install pandas
!pip install tqdm

In [None]:
!python -m spacy download en_core_web_md
# Import before use so this section also runs on a fresh runtime where the
# earlier import cell has not been executed.
import spacy
import nltk

nlp = spacy.load("en_core_web_md")

# Punkt models are required by sent_tokenize; newer NLTK releases look for them
# under the "punkt_tab" resource name. WordNet is used by meteor_score.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
# Required imports only
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from bert_score import score
import Levenshtein
import spacy
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the spaCy model; semantic_match_score and compute_cosine_similarity
# read their document and sentence vectors from this pipeline.
nlp = spacy.load("en_core_web_md")

# Metric functions
def mean_pooling(token_embeddings, mask):
    """
    Average token embeddings along axis 1, weighting each token by its mask value.

    mask is indexed as a 2-D array and broadcast across the last axis of
    token_embeddings; the masked sum is divided by the per-row mask total.
    NOTE(review): a row whose mask sums to 0 divides by zero — confirm callers
    never pass a fully masked row.
    """
    masked_sum = (token_embeddings * mask[:, :, None]).sum(1)
    mask_total = mask.sum(1)[:, None]
    return masked_sum / mask_total

def semantic_match_score(reference, prediction, weights=(0.3, 0.3, 0.4)):
    """
    Combine document-level and sentence-level embedding similarity into one score.

    Parameters:
        reference (str): The ground truth answer.
        prediction (str): The model's predicted answer.
        weights (tuple): (word, sentence, paragraph) weights of the final sum.

    Returns:
        dict: 'word_similarity', 'sentence_similarity', 'paragraph_similarity'
        and the weighted 'semantic_match_score'.
    """
    ref_doc = nlp(reference)
    pred_doc = nlp(prediction)

    # --- Word Level / Paragraph Level ---
    # Both levels are the cosine similarity of the two whole-document vectors,
    # so compute it once and reuse it.
    # NOTE(review): as written, the "word" and "paragraph" components are always
    # equal — confirm whether a distinct token-level measure was intended.
    document_sim = cosine_similarity([ref_doc.vector], [pred_doc.vector])[0][0]
    word_sim = document_sim
    paragraph_sim = document_sim

    # --- Sentence Level ---
    # Sentences are compared position by position; zip stops at the shorter text.
    sent_sims = [
        cosine_similarity([ref_sent.vector], [pred_sent.vector])[0][0]
        for ref_sent, pred_sent in zip(ref_doc.sents, pred_doc.sents)
    ]
    sentence_sim = np.mean(sent_sims) if sent_sims else 0

    # --- Weighted Sum ---
    w_word, w_sentence, w_paragraph = weights
    semantic_score = (
        w_word * word_sim +
        w_sentence * sentence_sim +
        w_paragraph * paragraph_sim
    )
    return {
        "word_similarity": word_sim,
        "sentence_similarity": sentence_sim,
        "paragraph_similarity": paragraph_sim,
        "semantic_match_score": semantic_score
    }




def evaluate_classification(references, predictions):
    """
    Return accuracy plus macro-averaged precision and recall for label lists.

    Parameters:
        references (list): True labels.
        predictions (list): Predicted labels.

    Returns:
        dict: 'accuracy', 'precision' and 'recall'.
    """
    metrics = {'accuracy': accuracy_score(references, predictions)}
    for name, metric_fn in (('precision', precision_score), ('recall', recall_score)):
        metrics[name] = metric_fn(references, predictions, average='macro', zero_division=0)
    return metrics

def evaluate_bertscore(references, predictions, lang='en'):
    """
    Compute corpus-averaged BERTScore precision, recall and F1.

    Background: https://milvus.io/ai-quick-reference/what-is-bertscore-or-other-embeddingbased-metrics-and-can-they-be-helpful-in-evaluating-the-similarity-between-a-generated-answer-and-a-reference-answer-or-source-text

    Parameters:
        references (list): Ground-truth texts.
        predictions (list): Predicted texts, aligned with references.
        lang (str): Language code forwarded to the BERTScore scorer.

    Returns:
        dict: Mean 'precision', 'recall' and 'f1' as plain Python numbers.
    """
    # The scorer is called with the candidates first and the references second.
    precision, recall, f1 = score(predictions, references, lang=lang)
    named_scores = zip(('precision', 'recall', 'f1'), (precision, recall, f1))
    return {name: values.mean().item() for name, values in named_scores}

def evaluate_levenshtein(references, predictions):
    """
    Compute the average Levenshtein edit distance over aligned pairs.

    Background: https://www.cuelogic.com/blog/the-levenshtein-algorithm

    Parameters:
        references (list): Ground-truth strings.
        predictions (list): Predicted strings, aligned with references.

    Returns:
        dict: {'average_levenshtein_distance': mean distance}, or 0 when
              there are no pairs.
    """
    distances = [Levenshtein.distance(ref, pred) for ref, pred in zip(references, predictions)]
    if not distances:
        # Nothing to average; return 0 like the other metrics instead of dividing by zero.
        return {'average_levenshtein_distance': 0}
    return {'average_levenshtein_distance': sum(distances) / len(distances)}

def evaluate_meteor(references, predictions):
    """
    Compute the average METEOR score over aligned reference/prediction pairs.

    Background: https://spotintelligence.com/2024/08/26/meteor-metric-in-nlp-how-it-works-how-to-tutorial-in-python/

    Parameters:
        references (list): Ground-truth strings.
        predictions (list): Predicted strings, aligned with references.

    Returns:
        dict: {'average_meteor': mean score}, or 0 when there are no pairs.
    """
    # Current NLTK releases require pre-tokenized input and reject raw strings,
    # so split on whitespace as compute_bleu does.
    scores = [
        meteor_score([ref.split()], pred.split())
        for ref, pred in zip(references, predictions)
    ]
    if not scores:
        # Nothing to average; return 0 like the other metrics instead of dividing by zero.
        return {'average_meteor': 0}
    return {'average_meteor': sum(scores) / len(scores)}

def compute_bleu(reference, prediction):
    """
    Compute the smoothed sentence-level BLEU score of a prediction.

    Background: https://codelabsacademy.com/en/blog/understanding-bleu-score-in-nlp-evaluating-translation-quality

    Parameters:
        reference (str): The ground truth answer.
        prediction (str): The model's predicted answer.

    Returns:
        The BLEU score returned by sentence_bleu with method4 smoothing.
    """
    # Whitespace tokenization; the single reference is wrapped in a list because
    # sentence_bleu takes a list of tokenized references.
    return sentence_bleu(
        [reference.split()],
        prediction.split(),
        smoothing_function=SmoothingFunction().method4,
    )

def compute_rouge(reference, prediction):
    """
    Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures of a prediction.

    Background: https://www.traceloop.com/blog/evaluating-model-performance-with-the-rouge-metric-a-comprehensive-guide

    Parameters:
        reference (str): The ground truth answer.
        prediction (str): The model's predicted answer.

    Returns:
        dict: F-measure for 'rouge1', 'rouge2' and 'rougeL'.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    result = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, prediction)
    return {rouge_type: result[rouge_type].fmeasure for rouge_type in rouge_types}

def compute_cosine_similarity(reference, prediction):
    """
    Return the cosine similarity between the spaCy document vectors of two texts.

    Background: https://www.comet.com/site/blog/bertscore-for-llm-evaluation/
    """
    reference_vector, prediction_vector = (
        nlp(text).vector for text in (reference, prediction)
    )
    # One row on each side, so the single matrix entry [0][0] is the score.
    return cosine_similarity([reference_vector], [prediction_vector])[0][0]

def evaluate_reasoning_flow(prediction):
    """
    Score how smoothly consecutive sentences of a prediction follow each other.

    Background: https://www.galileo.ai/blog/g-eval-metric

    Returns:
        The mean cosine similarity of adjacent sentence pairs, or 0 when the
        text has fewer than two sentences.
    """
    sentences = sent_tokenize(prediction)
    if len(sentences) < 2:
        # No adjacent pair to compare.
        return 0
    return np.mean([
        compute_cosine_similarity(previous, current)
        for previous, current in zip(sentences, sentences[1:])
    ])

# Evaluation suite class
class EvaluationSuite:
    """Evaluation entry points, one per question type (MC, FT, SA, MH)."""

    @staticmethod
    def _show_confusion_matrix(ground_truth, predictions):
        """Plot a confusion matrix over every label seen in either list."""
        labels = sorted(set(ground_truth + predictions))
        cm = confusion_matrix(ground_truth, predictions, labels=labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap='Blues', xticks_rotation=45)
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.show()

    @staticmethod
    def _average_semantic(semantic_scores):
        """Average semantic_match_score dicts key by key ({} when empty)."""
        if not semantic_scores:
            return {}
        return {
            key: np.mean([s[key] for s in semantic_scores])
            for key in semantic_scores[0]
        }

    def evaluate_MC(self, predictions, ground_truth):
        """
        Evaluate multiple-choice predictions.

        Labels are stripped and upper-cased, a confusion matrix is shown, and
        accuracy / macro precision / macro recall are returned as a dict.
        """
        assert len(predictions) == len(ground_truth)
        predictions = [p.strip().upper() for p in predictions]
        ground_truth = [g.strip().upper() for g in ground_truth]
        EvaluationSuite._show_confusion_matrix(ground_truth, predictions)
        return evaluate_classification(ground_truth, predictions)

    def evaluate_FT(self, predictions, ground_truth):
        """
        Evaluate label predictions without normalizing the labels.

        Shows a confusion matrix and returns the same metrics as evaluate_MC.
        NOTE(review): "FT" presumably stands for the true/false question type —
        confirm against the dataset.
        """
        assert len(predictions) == len(ground_truth)
        EvaluationSuite._show_confusion_matrix(ground_truth, predictions)
        return evaluate_classification(ground_truth, predictions)

    def evaluate_SA(self, predictions, ground_truth):
        """
        Evaluate short-answer predictions.

        Predictions equal to 'N/A' are skipped. The remaining pairs are scored
        with BLEU, METEOR, Levenshtein distance, ROUGE and semantic_match_score.

        Returns:
            dict: 'avg_bleu', 'avg_meteor', 'avg_levenshtein', 'avg_rouge'
            (dict of mean 'rouge1'/'rouge2'/'rougeL') and 'semantic_match_score'
            (dict of per-key means, {} when nothing was scored).
        """
        assert len(predictions) == len(ground_truth)

        bleu_scores, meteor_scores, lev_distances, rouge_scores = [], [], [], []
        semantic_scores = []

        for ref, pred in zip(ground_truth, predictions):
            if pred == 'N/A':
                continue
            bleu_scores.append(compute_bleu(ref, pred))
            # Current NLTK releases require pre-tokenized input for meteor_score
            # and reject raw strings; tokenize on whitespace like compute_bleu.
            meteor_scores.append(meteor_score([ref.split()], pred.split()))
            lev_distances.append(Levenshtein.distance(ref, pred))
            rouge_scores.append(compute_rouge(ref, pred))
            semantic_scores.append(semantic_match_score(ref, pred))

        avg_bleu = np.mean(bleu_scores) if bleu_scores else 0
        avg_meteor = np.mean(meteor_scores) if meteor_scores else 0
        avg_levenshtein = np.mean(lev_distances) if lev_distances else 0
        avg_rouge = {
            key: np.mean([s[key] for s in rouge_scores]) if rouge_scores else 0
            for key in ('rouge1', 'rouge2', 'rougeL')
        }
        avg_semantic = EvaluationSuite._average_semantic(semantic_scores)

        return {
            "avg_bleu": avg_bleu,
            "avg_meteor": avg_meteor,
            "avg_levenshtein": avg_levenshtein,
            "avg_rouge": avg_rouge,
            "semantic_match_score": avg_semantic
        }

    def evaluate_MH(self, predictions, ground_truth):
        """
        Evaluate multi-hop predictions.

        Predictions equal to 'N/A' are skipped for every metric, BERTScore
        included, so all averages describe the same set of answers.

        Returns:
            dict: 'bertscore' (mean 'precision'/'recall'/'f1', all 0 when nothing
            was scored), 'avg_cosine_similarity', 'avg_reasoning_coherence' and
            'semantic_match_score' (dict of per-key means, {} when nothing was
            scored).
        """
        assert len(predictions) == len(ground_truth)

        scored_pairs = [
            (ref, pred)
            for ref, pred in zip(ground_truth, predictions)
            if pred != 'N/A'
        ]

        if scored_pairs:
            kept_refs = [ref for ref, _ in scored_pairs]
            kept_preds = [pred for _, pred in scored_pairs]
            bert_scores = evaluate_bertscore(kept_refs, kept_preds)
        else:
            # Do not call BERTScore on empty lists; report zeros like the other averages.
            bert_scores = {'precision': 0, 'recall': 0, 'f1': 0}

        cosine_sims, coherence_scores, semantic_scores = [], [], []
        for ref, pred in scored_pairs:
            cosine_sims.append(compute_cosine_similarity(ref, pred))
            coherence_scores.append(evaluate_reasoning_flow(pred))
            semantic_scores.append(semantic_match_score(ref, pred))

        avg_cosine = np.mean(cosine_sims) if cosine_sims else 0
        avg_coherence = np.mean(coherence_scores) if coherence_scores else 0
        avg_semantic = EvaluationSuite._average_semantic(semantic_scores)

        return {
            "bertscore": bert_scores,
            "avg_cosine_similarity": avg_cosine,
            "avg_reasoning_coherence": avg_coherence,
            "semantic_match_score": avg_semantic
        }
