In [None]:
# ===============================================================================================================#
# Copyright 2024 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

## Tool 04 - RAG Metrics (Interactive)
An interactive notebook for experimenting with data and RAG performance metrics.

In [None]:
import os
import json
import pandas as pd
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from ragas.metrics import (answer_correctness, faithfulness, answer_similarity,
                           context_precision, context_utilization, context_recall,
                           context_relevancy, answer_relevancy, context_entity_recall)
from ragas import evaluate
from _internal_utils.rag_evaluation_visualizer import RagEvaluationVisualizer

In [None]:
# Set Azure generic properties
# Credentials come from one of two places:
#   * IPython's %store (variables persisted from another notebook/kernel) when
#     USE_LOCAL_STORE was restored and is truthy, or
#   * the process environment otherwise.
# Either way they are copied into AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT.
%store -r USE_LOCAL_STORE
# The locals() check guards against USE_LOCAL_STORE not having been restored at all.
if 'USE_LOCAL_STORE' in locals() and USE_LOCAL_STORE:
    %store -r AZURE_OPENAI_SECRET_KEY
    os.environ["AZURE_OPENAI_API_KEY"]=AZURE_OPENAI_SECRET_KEY
    %store -r AZURE_OPENAI_SERVER_BASE_URL
    os.environ["AZURE_OPENAI_ENDPOINT"]=AZURE_OPENAI_SERVER_BASE_URL
else:
    # Raises KeyError if either source environment variable is missing.
    os.environ["AZURE_OPENAI_API_KEY"] = os.environ["AZURE_OPENAI_SECRET_KEY"]
    os.environ["AZURE_OPENAI_ENDPOINT"] = os.environ["AZURE_OPENAI_SERVER_BASE_URL"]

# Chat model handed to ragas `evaluate` as `llm` (see do_rag_eval).
llm_model = AzureChatOpenAI(
    openai_api_version="2024-02-15-preview",
    azure_deployment="gpt4"
)

# Embeddings model handed to ragas `evaluate` as `embeddings` (see do_rag_eval).
embeddings_model = AzureOpenAIEmbeddings(
    openai_api_version="2022-12-01",
    azure_deployment="text-embedding-ada-002",
)

In [None]:
def do_rag_eval(dataset, metrics=None):
    """Evaluate a RAG sample set with ragas and return the scores as a DataFrame.

    Args:
        dataset: Column-oriented dict that is converted with ``Dataset.from_dict``.
            Callers in this notebook pass the keys "question", "answer",
            "contexts" and "ground_truth", each holding a list.
        metrics: Optional list of ragas metric objects to compute. When omitted,
            the notebook's standard set is used (faithfulness, answer_similarity,
            context_precision, context_utilization, context_recall,
            context_relevancy, answer_relevancy, context_entity_recall).

    Returns:
        The evaluation result converted with ``to_pandas()``.
    """
    if metrics is None:
        # Default set; answer_correctness is intentionally left out, as before.
        metrics = [faithfulness, answer_similarity,
                   context_precision, context_utilization, context_recall,
                   context_relevancy, answer_relevancy, context_entity_recall]
    # Use a separate name so the caller-supplied dict is not shadowed.
    eval_dataset = Dataset.from_dict(dataset)
    score = evaluate(eval_dataset,
                     llm=llm_model,
                     embeddings=embeddings_model,
                     metrics=metrics)
    return score.to_pandas()

In [None]:
from pathlib import Path
import os
import json
from datetime import datetime
class HistoryUtil():
    """Append evaluation records to a per-day JSON history file.

    The file lives at
    ``<home_path>/data/output/tool_04_rag_metrics/<YYYY_MM_DD>_rag_metrics.json``
    where the date is the UTC date at construction time. The containing folder
    is created on construction if it does not exist.
    """

    # Storage root used when no home_path is supplied (kept for backward compatibility).
    DEFAULT_HOME_PATH = 'C:/del/fs/notebookuc/STORAGE'

    def __init__(self, home_path=None):
        """Resolve (and create) the output folder and today's history file path.

        Args:
            home_path: Optional storage root. Defaults to ``DEFAULT_HOME_PATH``.
        """
        if home_path is None:
            home_path = self.DEFAULT_HOME_PATH
        user_nb_folder = f'{home_path}/data/output/tool_04_rag_metrics'
        os.makedirs(user_nb_folder, exist_ok=True)
        date_str = self._utc_now().strftime('%Y_%m_%d')
        self.__file_path = f'{user_nb_folder}/{date_str}_rag_metrics.json'

    @staticmethod
    def _utc_now():
        """Return the current UTC time (replacement for the deprecated datetime.utcnow())."""
        # Local import: the surrounding cell only imports `datetime` from the module.
        from datetime import timezone
        return datetime.now(timezone.utc)

    def save_record(self, record):
        """Append a copy of ``record`` (stamped with ``entry_time``) to the history file.

        Args:
            record: Dict of JSON-serialisable values. It is not modified; a shallow
                copy receives the extra ``entry_time`` key (UTC, millisecond precision).
        """
        file_path = self.__file_path
        records = []
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                records = json.load(f)
        # Trim microseconds (6 digits) down to milliseconds (3 digits).
        entry_time = self._utc_now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        _record = record.copy()
        _record['entry_time'] = entry_time
        records.append(_record)
        print(records)
        # The whole list is rewritten each time so the file stays a single JSON array.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(records, f, indent=4)

    def get_save_file_path(self, mask=True):
        """Return the history file path, or None if the file does not exist yet.

        Args:
            mask: When true (default), every occurrence of the current user's home
                directory string in the path is replaced with ``~`` for display.

        Returns:
            The path string (masked or not), or None when no file has been written.
        """
        file_path = self.__file_path
        if not os.path.exists(file_path):
            return None
        if not mask:
            return file_path
        home_path = str(Path.home())
        return file_path.replace(home_path, '~')
        

#### Unit Testing

In [None]:
def test_do_rag_eval():
    """Smoke-run do_rag_eval on one hand-written question/answer sample."""
    # Single-row, column-oriented sample; key order matches the original literal.
    sample = dict(
        contexts=[["Paris is the capital and most populous city of France."]],
        question=["What is the capital of France?"],
        answer=["Paris is the capital of France."],
        ground_truth=["Paris"],
    )
    do_rag_eval(sample)
# test_do_rag_eval()
def test_plot_metrics():
    """Pass a fixed set of metric values to RagEvaluationVisualizer.plot_graph."""
    # (metric name, value) pairs; faithfulness is deliberately None.
    metric_pairs = [
        ('answer_correctness', 0.1949719484),
        ('faithfulness', None),
        ('answer_similarity', 0.7798877935),
        ('context_precision', 0.0),
        ('context_utilization', 0.0),
        ('context_recall', 0.0),
        ('context_relevancy', 0.3333333333),
        ('answer_relevancy', 0.8321068844),
        ('context_entity_recall', 0.0),
    ]
    visualizer = RagEvaluationVisualizer()
    visualizer.plot_graph(dict(metric_pairs))
# test_plot_metrics()

## RAG Metrics Evaluation

In [None]:
# Sample 1a: the answer matches the ground truth and the context contains it.
_capital_city_perfect = {
    "contexts": [["Paris is the capital and most populous city of France."]],
    "question": ["What is the capital of France?"],
    "ground_truth": ["Paris"],
    "answer": ["Paris"],
}

# Sample 1b: contexts are unrelated to the question and the answer is wrong.
_capital_city_low = {
    "contexts": [[
        "France became part of the European Union on January 1, 1958.",
        "France co-founded the EU.",
        "France played a significant role in development of and growth of EU over the years.",
    ]],
    "question": ["What is the capital of Germany?"],
    "ground_truth": ["Berlin"],
    "answer": ["Boston"],
}

# Named sample datasets handed to RagEvaluationVisualizer.
SAMPLES = [
    {"name": "1a. Capital-city Perfect score", "dataset": _capital_city_perfect},
    {"name": "1b. Capital-city Low score", "dataset": _capital_city_low},
]

In [None]:
# Shared visualizer instance used by the submit callback and the UI cell below.
# It receives the predefined SAMPLES (presumably offered as presets in the form; confirm in RagEvaluationVisualizer).
rag_eval_visualizer = RagEvaluationVisualizer(SAMPLES)

def form_submit_button_clicked(_):
    """Handle a form submit: evaluate the entered sample, save it, and plot the metrics.

    Steps:
      1. Read the form fields from ``rag_eval_visualizer``.
      2. Split the contexts text on ``RagEvaluationVisualizer.CONTEXT_DELIMITER``
         and strip each piece.
      3. Run ``do_rag_eval`` inside the visualizer's output handler.
      4. Append the resulting row to the history file via ``HistoryUtil``.
      5. Send only the metric columns (everything that is not an input column)
         to ``set_output_graph``.

    Args:
        _: Callback argument supplied by the UI; unused.
    """
    context_delimiter = RagEvaluationVisualizer.CONTEXT_DELIMITER
    input_form_data_dict = rag_eval_visualizer.get_input_form_data()
    print('input_form_data_dict =', input_form_data_dict)
    contexts = [part.strip()
                for part in input_form_data_dict['contexts'].split(context_delimiter)]
    # Single-row, column-oriented dataset: each value is a one-element list.
    dataset_dict = {
        "question": [input_form_data_dict['question']],
        "answer": [input_form_data_dict['answer']],
        "contexts": [contexts],
        "ground_truth": [input_form_data_dict['ground_truth']],
    }
    output_handler = rag_eval_visualizer.get_output_handler()
    output_handler.clear_output()
    # If the handler's context manager swallows an exception raised by the
    # evaluation (ipywidgets.Output does so under IPython, after showing the
    # traceback), `df` would otherwise be unbound below.
    df = None
    with output_handler:
        df = do_rag_eval(dataset_dict)
    if df is None:
        # Evaluation failed; nothing to save or plot.
        return
    # Round-trip through JSON so values are plain JSON-compatible Python types.
    eval_dict = json.loads(df.iloc[0].to_json())
    print('eval_dict =', eval_dict)

    HistoryUtil().save_record(eval_dict)

    # Drop the input columns so only metric scores remain.
    metrics_dict = {k: v for k, v in eval_dict.items() if k not in dataset_dict}
    print('metrics_dict =', metrics_dict)
    rag_eval_visualizer.set_output_graph(metrics_dict)

In [None]:
# Register the submit handler first, then show the UI.
rag_eval_visualizer.on_form_submit_callback(form_submit_button_clicked)
rag_eval_visualizer.show_ui()

## History

In [None]:
# Show today's evaluation history, if any has been saved.
# One HistoryUtil instance serves both lookups so they refer to the same file.
_history_util = HistoryUtil()
# Masked path ('~' in place of the home directory) is for display only.
history_file_path = _history_util.get_save_file_path()
if not history_file_path:
    print('No history found')
else:
    print('History retrieved from:', history_file_path)
    # Read from the real path so loading does not depend on '~' being expanded.
    df = pd.read_json(_history_util.get_save_file_path(mask=False))
    display(df)