In [None]:
import os 
import json
from dotenv import load_dotenv

from deepeval import evaluate
from deepeval.models import GeminiModel
from deepeval.evaluate import AsyncConfig
from deepeval.evaluate import ErrorConfig
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from src.text.vector import VectorStore
from src.model.model import MultiModalEmbeddingModel, OllamaLanguageModel

---

# Actual Outputs Generation (with RAG)

In [None]:
# Load credentials from the local .env file into the process environment.
load_dotenv(".env")

# Fail early with an actionable message instead of the opaque TypeError that
# `os.environ[...] = None` raises when the variable is missing.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to .env or export it before running this notebook."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

# Never print the key itself: cell outputs are stored inside the notebook file
# and are easily committed or shared by accident.
print("GOOGLE_API_KEY loaded")

# Gemini model handed to the deepeval metrics below; temperature=0 is passed
# to keep the judge as deterministic as the backend allows.
model = GeminiModel(
    model_name="gemini-2.0-flash",
    api_key=google_api_key,
    temperature=0
)

In [None]:
# Cache of previously generated answers, keyed by query id. On a first run the
# file does not exist yet (it is written later by this notebook), so fall back
# to an empty cache instead of crashing with FileNotFoundError.
generated = {}
generated_path = os.path.join(os.getcwd(), "dataset", "generated.json")

if os.path.isfile(generated_path):
    with open(generated_path, "r", encoding="utf-8") as f:
        generated = json.load(f)

In [None]:
# Collect (id, query text) pairs for every dataset entry that has both a
# non-empty query and a non-empty golden answer.
queries_path = os.path.join(os.getcwd(), "dataset", "dataset.json")
queries = []

with open(queries_path, "r") as f:
    data = json.load(f)
    queries.extend(
        (entry_id, entry["query"])
        for entry_id, entry in data.items()
        if not (entry["query"] == "" or entry["golden"] == "")
    )

In [None]:
# Regenerate unless the cache covers exactly the current set of query ids.
# Comparing only the two lengths would accept a stale cache whenever the
# dataset changed but kept the same number of entries, which later surfaces as
# a KeyError (or silently mismatched answers) when the cache is read back.
query_ids = {query_id for query_id, _ in queries}
generate = query_ids != set(generated)

if not generate:
    print("All queries have been generated")

In [None]:
if generate:
    # Retrieval side: embedding model built from the two nomic checkpoints
    # (text and vision, going by their names) backing a store persisted
    # under "eval".
    embed_model = MultiModalEmbeddingModel(
        "nomic-ai/nomic-embed-text-v1.5",
        "nomic-ai/nomic-embed-vision-v1.5",
    )
    vector_store = VectorStore(embed_model=embed_model, persist_directory="eval")

    # Generation side: only the wrapped `.model` attribute is used downstream.
    ollama_wrapper = OllamaLanguageModel(model_name="gemma3:27b-it-qat", temperature=0.0)
    local_model = ollama_wrapper.model

In [None]:
# Read every text file of the corpus into memory.
texts = []
root_folder = os.path.join(os.getcwd(), "dataset", "text")

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# ingestion order (and therefore the run) differ between machines.
for file in sorted(os.listdir(root_folder)):
    file_path = os.path.join(root_folder, file)
    # Skip sub-directories and other non-regular entries; open() would raise on them.
    if not os.path.isfile(file_path):
        continue
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        texts.append(f.read())

# The vector store only exists when answers have to be (re)generated.
if generate:
    vector_store.add(texts)
    print(f"{len(vector_store.vector_store.get()['ids'])} documents loaded to vector store")

In [None]:
def inference(query, llm, vector_store):
    """Answer `query` with a retrieve-then-generate pipeline.

    Documents are fetched through ``vector_store.get_retriever`` (called with
    ``filter={"type": "text"}``), their contents are concatenated into a single
    context string, and that context plus the query are sent to `llm` through
    a system/user chat prompt whose result is parsed to a plain string.

    Args:
        query: The user question; used for retrieval and inserted in the prompt.
        llm: Runnable placed between the chat prompt and `StrOutputParser`.
        vector_store: Object exposing ``get_retriever(filter=...)``.

    Returns:
        A tuple ``(output, docs)`` where `output` is the generated answer and
        `docs` is the list of ``page_content`` values of the retrieved documents.
    """
    system_prompt = '''
        # Role
        
        You are an expert AI professor capable of summarizing classroom materials to make understand better the concepts. In particular you will be answering user queries based on retrieved information from a database of classroom materials.
        
        ## Input:

            - Context: Revelevant chunks retrieved from the database of classroom materials.
            - User Query: User question.

        ## Instruction:

            1. Answer only using the provided context. If the context contains sufficient information to answer the query, provide a precise, well-structured response, without referring to general knowledge or external sources.
            2. If you don't know what to say, just say that you don't know.
            3. Answer in English.
        '''

    user_prompt = '''
        # Inputs
        
        Context:
        {context}
        
        User Query:
        {query}
        '''

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("user", user_prompt),
    ])

    # Retrieve first, then keep only the raw text of each hit.
    retrieved_docs = vector_store.get_retriever(filter={"type": "text"}).invoke(query)
    contents = [doc.page_content for doc in retrieved_docs]

    # Every chunk is followed by a blank line, including the last one.
    retrieved_context = "".join(f"{content}\n\n" for content in contents)

    rag_chain = chat_prompt | llm | StrOutputParser()
    output = rag_chain.invoke({"context": retrieved_context, "query": query})
    return output, contents

In [None]:
# Parallel lists, aligned index-by-index with `queries`.
outputs = []
retrieved_contexts = []

if generate:
    # Fresh run: answer every query with the local RAG pipeline.
    for _, query_text in queries:
        answer, context_chunks = inference(query_text, local_model, vector_store)
        outputs.append(answer)
        retrieved_contexts.append(context_chunks)
else:
    # Cached run: read answers and contexts back by query id.
    for query_id, _ in queries:
        cached = generated[query_id]
        outputs.append(cached["output"])
        retrieved_contexts.append(cached["retrieved_context"])

In [None]:
if generate:
    # Rebuild the cache from this run's results, keyed by query id.
    generated = {
        query_id: {
            "query": query_text,
            "output": answer,
            "retrieved_context": context_chunks,
        }
        for (query_id, query_text), answer, context_chunks in zip(queries, outputs, retrieved_contexts)
    }

    # Persist it so later runs can skip generation.
    generated_file = os.path.join(os.getcwd(), "dataset", "generated.json")
    with open(generated_file, "w") as f:
        json.dump(generated, f, indent=4)

---
# RAG Triad Evaluation


In [None]:
# One deepeval test case per query: the question, the generated answer, and
# the chunks that were retrieved for it.
# The per-item names are deliberately distinct from the `retrieved_contexts`
# list: reusing that name as the loop variable would rebind the global to the
# last item's chunks and break any re-run of this cell.
test_cases = [
    LLMTestCase(
        input=query_text,
        actual_output=answer,
        retrieval_context=context_chunks,
    )
    for (_, query_text), answer, context_chunks in zip(queries, outputs, retrieved_contexts)
]

In [None]:
# The three metrics share the same judge model and the same async_mode=False setting.
metric_options = {"model": model, "async_mode": False}

answer_relevancy = AnswerRelevancyMetric(**metric_options)
faithfulness = FaithfulnessMetric(**metric_options)
contextual_relevancy = ContextualRelevancyMetric(**metric_options)

In [None]:
# Evaluation is configured to run without async execution, one test at a time.
async_config = AsyncConfig(run_async=False, max_concurrent=1, throttle_value=1)

# NOTE(review): ignore_errors=True presumably lets the run continue past
# failing metrics — confirm how failed metrics are reported in the results.
error_config = ErrorConfig(ignore_errors=True)

# The order of this list is relied upon when scores are read back by position.
rag_triad_metrics = [answer_relevancy, faithfulness, contextual_relevancy]

results = evaluate(
    test_cases=test_cases,
    metrics=rag_triad_metrics,
    async_config=async_config,
    error_config=error_config,
)

In [None]:
# Per-metric score lists. Scores are read by position, mirroring the order of
# the metrics list passed to evaluate(): answer relevancy, faithfulness,
# contextual relevancy.
relevancies = []
faithfulnesses = []
contextual_relevancies = []
score_lists = (relevancies, faithfulnesses, contextual_relevancies)

for test in results.test_results:
    # zip() stops after the three tracked metrics, like the original index checks.
    for scores, metric in zip(score_lists, test.metrics_data or []):
        # The evaluation runs with ignore_errors=True, so a metric that failed
        # may carry no score; skip it rather than crash in sum() below.
        if metric.score is not None:
            scores.append(metric.score)

# Averages use their own names: rebinding `faithfulness` / `contextual_relevancy`
# would overwrite the metric objects and break a re-run of the evaluation cell.
# NaN is reported when a metric produced no usable score at all.
relevance = sum(relevancies) / len(relevancies) if relevancies else float("nan")
mean_faithfulness = sum(faithfulnesses) / len(faithfulnesses) if faithfulnesses else float("nan")
mean_contextual_relevancy = (
    sum(contextual_relevancies) / len(contextual_relevancies)
    if contextual_relevancies
    else float("nan")
)

# ".2f" = two decimals (the former ":2f" only set a minimum field width).
print(f"Relevancy: {relevance:.2f}")
print(f"Faithfulness: {mean_faithfulness:.2f}")
print(f"Contextual Relevancy: {mean_contextual_relevancy:.2f}")

----

# Standard Evaluation: BERT Score - BLEU - ROUGE

In [None]:
# Map each query id to its golden (reference) answer, keeping only entries
# that have both a non-empty query and a non-empty golden answer.
dataset_path = os.path.join(os.getcwd(), "dataset", "dataset.json")
goldens = {}

with open(dataset_path, "r") as f:
    data = json.load(f)
    goldens.update(
        (entry_id, entry["golden"])
        for entry_id, entry in data.items()
        if not (entry["query"] == "" or entry["golden"] == "")
    )

In [None]:
# Pair every generated answer with the golden answer that shares its id.
final_data = [
    {"golden": goldens[query_id], "output": record["output"]}
    for query_id, record in generated.items()
]

In [None]:
# BLEU code taken from: https://github.com/confident-ai/deepeval/blob/main/deepeval/scorer/scorer.py#L52

def sentence_bleu_score(references, prediction, bleu_type = "bleu1"):
    """Calculates the BLEU (Bilingual Evaluation Understudy) score of a prediction against one or more reference sentences.

    BLEU compares the n-grams of a generated text with those of the reference
    text(s). Each `bleu_type` puts the whole weight on a single n-gram order
    (e.g. 'bleu4' scores 4-grams only), as defined by the weight map below.

    Args:
        references (Union[str, List[str]]): A reference sentence or a list of reference sentences.
        prediction (str): The generated text or sentence to be evaluated.
        bleu_type (Optional[str]): The BLEU score type (Options: 'bleu1', 'bleu2', 'bleu3', 'bleu4'). Default is 'bleu1'.

    Returns:
        float: The BLEU score for the given prediction and references.

    Raises:
        AssertionError: If `bleu_type` is not one of the supported options.
        ModuleNotFoundError: If nltk is not installed.
    """
    bleu_weight_map = {
        "bleu1": (1, 0, 0, 0),
        "bleu2": (0, 1, 0, 0),
        "bleu3": (0, 0, 1, 0),
        "bleu4": (0, 0, 0, 1),
    }
    # Validate the arguments before touching any optional dependency.
    assert bleu_type in [
        "bleu1",
        "bleu2",
        "bleu3",
        "bleu4",
    ], "Invalid bleu_type. Options: 'bleu1', 'bleu2', 'bleu3', 'bleu4'"

    try:
        from nltk.tokenize import word_tokenize
        from nltk.translate.bleu_score import sentence_bleu
    except ModuleNotFoundError as e:
        # Re-raise with the install hint: printing and continuing would only
        # defer the failure to a confusing NameError a few lines below.
        raise ModuleNotFoundError(
            "Please install nltk module. Command: pip install nltk"
        ) from e

    # A single reference string is treated as a one-element list of references.
    targets = [references] if isinstance(references, str) else references
    tokenized_targets = [word_tokenize(target) for target in targets]
    tokenized_prediction = word_tokenize(prediction)

    return sentence_bleu(
        tokenized_targets,
        tokenized_prediction,
        weights=bleu_weight_map[bleu_type],
    )

In [None]:
# ROUGE code taken from: https://github.com/confident-ai/deepeval/blob/main/deepeval/scorer/scorer.py#L19

def rouge_score(target: str, prediction: str, score_type: str) -> float:
    """Calculates the Rouge score for a given target and prediction.

    Rouge (Recall-Oriented Understudy for Gisting Evaluation) is a metric used for evaluating the quality of generated text,
    especially in tasks like text summarization.

    To utilize the rouge_score scoring method, be sure to `pip install rouge-score` before calling this method.

    Args:
        target (str): The actual label or target text.
        prediction (str): The generated text from the model or LLM.
        score_type (str): The Rouge score type (Options: 'rouge1', 'rouge2', 'rougeL').

    Returns:
        float: The F-measure of the requested Rouge variant for the given target and prediction.

    Raises:
        AssertionError: If `score_type` is not one of the supported options.
        ModuleNotFoundError: If the rouge-score package is not installed.
    """
    # Validate the arguments before touching any optional dependency.
    assert score_type in [
        "rouge1",
        "rouge2",
        "rougeL",
    ], "score_type can be either rouge1, rouge2 or rougeL"

    try:
        from rouge_score import rouge_scorer
    except ModuleNotFoundError as e:
        # Surface the missing dependency explicitly; silently passing here
        # would only defer the failure to a NameError on `rouge_scorer`.
        raise ModuleNotFoundError(
            "Please install rouge_score module. Command: pip install rouge-score"
        ) from e

    scorer = rouge_scorer.RougeScorer([score_type], use_stemmer=True)
    scores = scorer.score(target, prediction)
    return scores[score_type].fmeasure

In [None]:
# BERTScore Taken from: https://github.com/confident-ai/deepeval/blob/main/deepeval/scorer/scorer.py#L129

def bert_score(references, predictions, model = "microsoft/deberta-large-mnli", lang = "en") -> dict:
    """
    Calculate BERTScore for one or more reference sentences compared to one or more prediction sentences using a specified BERT model.

    Args:
        references (Union[str, List[str]]): A single reference sentence or a list of reference sentences.
        predictions (Union[str, List[str]]): A single prediction sentence or a list of prediction sentences.
        model (Optional[str], optional): The name of the BERT model to be used for scoring. Defaults to "microsoft/deberta-large-mnli".
        lang (Optional[str], optional): The language code of the text, e.g., "en" for English. Defaults to "en".

    Returns:
        Dict[str, List[float]]: Per-prediction BERTScore values.
            - 'bert-precision' (List[float]): BERTScore precision.
            - 'bert-recall' (List[float]): BERTScore recall.
            - 'bert-f1' (List[float]): BERTScore F1 score.

    Raises:
        ValueError: If `references` or `predictions` is an empty list/tuple.
        ModuleNotFoundError: If bert_score or torch is not installed.

    Note:
        Before using this function, make sure to install the 'bert_score' module by running the following command:
        ```
        pip install bert-score
        ```
    """
    # Normalise single strings to one-element lists.
    if isinstance(predictions, str):
        predictions = [predictions]

    if isinstance(references, str):
        references = [references]

    # Fail fast on empty input, before any model is loaded; previously this
    # only surfaced as an IndexError on `references[0]` after the scorer had
    # already been constructed.
    for label, values in (("references", references), ("predictions", predictions)):
        if isinstance(values, (list, tuple)) and len(values) == 0:
            raise ValueError(f"{label} must contain at least one sentence")

    # Re-raise missing dependencies with the install hint: printing and
    # continuing would only defer the failure to a NameError below.
    try:
        from bert_score import BERTScorer
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            "Please install bert_score module. Command: pip install bert-score"
        ) from e

    try:
        import torch
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            "Please install torch module. Command: pip install torch"
        ) from e

    device = "cuda" if torch.cuda.is_available() else "cpu"
    bert_scorer = BERTScorer(
        model_type=model,
        lang=lang,
        rescale_with_baseline=True,
        device=device,
    )

    # A flat list of references whose length differs from the predictions is
    # wrapped in one more list (kept exactly as in the original logic).
    if (
        isinstance(predictions, list)
        and isinstance(references, list)
        and not isinstance(references[0], list)
    ):
        if len(predictions) != len(references):
            references = [references]

    precision, recall, f1 = bert_scorer.score(
        cands=predictions, refs=references
    )
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid should the
    # scorer ever hand back tensors that live on the GPU.
    return {
        "bert-precision": precision.detach().cpu().numpy().tolist(),
        "bert-recall": recall.detach().cpu().numpy().tolist(),
        "bert-f1": f1.detach().cpu().numpy().tolist(),
    }

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.
import nltk
# Downloads the 'punkt_tab' NLTK resource. Presumably this is the tokenizer
# data that word_tokenize (called inside sentence_bleu_score) needs — confirm
# against the installed nltk version.
nltk.download('punkt_tab')

In [None]:
# Score each (golden, output) pair with BLEU-4 and ROUGE-L, one pair at a time.
pair_scores = [
    (
        sentence_bleu_score(pair["golden"], pair["output"], bleu_type="bleu4"),
        rouge_score(pair["golden"], pair["output"], score_type="rougeL"),
    )
    for pair in final_data
]
bleus = [bleu_value for bleu_value, _ in pair_scores]
rouges = [rouge_value for _, rouge_value in pair_scores]

# BERTScore is computed once over the whole set rather than pair by pair.
references = [pair["golden"] for pair in final_data]
predictions = [pair["output"] for pair in final_data]
bert_scores = bert_score(references, predictions, lang="en")

In [None]:
# Average every metric over the evaluated pairs. NaN is reported when there is
# nothing to average, instead of failing with ZeroDivisionError.
bleu = sum(bleus) / len(bleus) if bleus else float("nan")
rouge = sum(rouges) / len(rouges) if rouges else float("nan")

precision_scores = bert_scores["bert-precision"]
recall_scores = bert_scores["bert-recall"]
f1_scores = bert_scores["bert-f1"]
bert_precision = sum(precision_scores) / len(precision_scores) if precision_scores else float("nan")
bert_recall = sum(recall_scores) / len(recall_scores) if recall_scores else float("nan")
bert_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else float("nan")

# ".5f" / ".2f" set the number of decimals; the former ":5f" / ":2f" only set
# a minimum field width and always printed six decimals.
print(f"BLEU: {bleu:.5f}")
print(f"ROUGE: {rouge:.5f}")
print(f"BERT Precision: {bert_precision:.2f}")
print(f"BERT Recall: {bert_recall:.2f}")
print(f"BERT F1: {bert_f1:.2f}")