## This notebook evaluates SLM vs. LLM performance

In [1]:
import os
import time
import urllib.request
import json
import ssl
from deepeval.metrics import AnswerRelevancyMetric, ContextualRelevancyMetric, ContextualPrecisionMetric, ContextualRecallMetric
from deepeval.test_case import LLMTestCase
from deepeval.models.base_model import DeepEvalBaseLLM
from langchain_openai import AzureChatOpenAI

class AzureOpenAIEvaluator(DeepEvalBaseLLM):
    """Adapter that exposes an already-built chat model through the DeepEvalBaseLLM interface."""

    def __init__(self, model):
        # The chat model is created by the caller and only stored here.
        self.model = model

    def load_model(self):
        """Return the chat model handed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send `prompt` to the model synchronously and return the reply's `content`."""
        reply = self.load_model().invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Async counterpart of `generate`: awaits `ainvoke` and returns the reply's `content`."""
        llm = self.load_model()
        reply = await llm.ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Return the fixed display name for this evaluator model."""
        return "Custom Azure OpenAI Model"



In [2]:
import time
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score
import os
import time
import urllib.request
import json
import ssl
from deepeval.metrics import AnswerRelevancyMetric, ContextualRelevancyMetric, ContextualPrecisionMetric, ContextualRecallMetric
from deepeval.test_case import LLMTestCase
from deepeval.models.base_model import DeepEvalBaseLLM
from langchain_openai import AzureChatOpenAI
nltk.download('punkt')

class RAG_Evaluator():
    """Scores RAG responses with deepeval metrics and with ROUGE / BERTScore.

    Only the "deepeval" framework is acted on. With any other framework value no
    metric objects are created and every check_* method returns ("", "").
    """

    def __init__(self, framework):
        """Create the four deepeval metrics when `framework` is "deepeval".

        Parameters:
        framework (str): Name of the evaluation framework to use.
        """
        self.framework = framework
        if self.framework == "deepeval":
            # NOTE: azure_endpoint and openai_api_key are blank placeholders and
            # must be filled in before the metrics can call the judge model.
            custom_model = AzureChatOpenAI(
                openai_api_version="2023-03-15-preview",
                azure_deployment="gpt4omini",
                azure_endpoint="",
                openai_api_key="",
            )
            azure_openai = AzureOpenAIEvaluator(model=custom_model)
            self.answer_relevancy_metric = AnswerRelevancyMetric(
                model=azure_openai,
                threshold=0.7,
                include_reason=True
            )
            self.context_relevancy_metric = ContextualRelevancyMetric(
                model=azure_openai,
                threshold=0.7,
                include_reason=True
            )
            self.context_precision_metric = ContextualPrecisionMetric(
                threshold=0.7,
                model=azure_openai,
                include_reason=True
            )
            self.context_recall_metric = ContextualRecallMetric(
                threshold=0.7,
                model=azure_openai,
                include_reason=True
            )

    def _run_metric(self, metric_attr, **test_case_fields):
        """Measure one metric on a test case built from `test_case_fields`.

        Returns (score, reason) from the metric stored under `metric_attr`, or
        ("", "") when the framework is not "deepeval".
        """
        if self.framework != "deepeval":
            return "", ""
        metric = getattr(self, metric_attr)
        metric.measure(LLMTestCase(**test_case_fields))
        return metric.score, metric.reason

    @staticmethod
    def _expected_is_missing(expected_response):
        """True for "", None and float NaN (pandas' marker for an absent value)."""
        if expected_response is None:
            return True
        if isinstance(expected_response, float):
            return expected_response != expected_response  # NaN is the only float unequal to itself
        return isinstance(expected_response, str) and expected_response == ""

    def check_answer_relevancy(self, question, actual_output):
        """Return (score, reason) of the answer-relevancy metric for one answer."""
        return self._run_metric(
            "answer_relevancy_metric",
            input=question,
            actual_output=actual_output,
        )

    def check_contextual_relevancy(self, question, actual_output, retrieval_contexts):
        """Return (score, reason) of the contextual-relevancy metric."""
        return self._run_metric(
            "context_relevancy_metric",
            input=question,
            actual_output=actual_output,
            retrieval_context=retrieval_contexts,
        )

    def check_contextual_precision(self, question, actual_output, expected_output, retrieval_contexts):
        """Return (score, reason) of the contextual-precision metric."""
        return self._run_metric(
            "context_precision_metric",
            input=question,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_contexts,
        )

    def check_contextual_recall(self, question, actual_output, expected_output, retrieval_contexts):
        """Return (score, reason) of the contextual-recall metric."""
        return self._run_metric(
            "context_recall_metric",
            input=question,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_contexts,
        )

    def calculate_metrics(self, ground_truth, llm_response):
        """Compute ROUGE-1/2/L F-measures and BERTScore precision/recall/F1.

        Parameters:
        ground_truth (str): Reference answer.
        llm_response (str): Generated answer to compare against the reference.

        Returns:
        dict: Keys 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore Precision',
              'BERTScore Recall', 'BERTScore F1'.
        """
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge_scores = scorer.score(ground_truth, llm_response)

        # `score` is bert_score.score; candidates come first, references second.
        P, R, F1 = score([llm_response], [ground_truth], lang="en", verbose=False)

        return {
            'ROUGE-1': rouge_scores['rouge1'].fmeasure,
            'ROUGE-2': rouge_scores['rouge2'].fmeasure,
            'ROUGE-L': rouge_scores['rougeL'].fmeasure,
            'BERTScore Precision': P.item(),
            'BERTScore Recall': R.item(),
            'BERTScore F1': F1.item()
        }

    def perform_ragas_evaluation(self, question, expected_response, rag_response):
        """Run the four relevancy/precision/recall checks for one question.

        Parameters:
        question (str): The user question.
        expected_response: Ground-truth answer. "", None or NaN means "not
            available"; precision and recall are then reported as 0.0 / "NA".
        rag_response (dict): Must contain 'response_output' and 'source_documents'.

        Returns:
        dict: '<metric>_score' and '<metric>_reason' entries for answer_relevancy,
              contextual_relevancy, contextual_precision and contextual_recall.
        """
        rag_response_str = rag_response["response_output"]
        relevant_contexts = list(rag_response["source_documents"])

        answer_relevancy_score, answer_relevancy_reason = self.check_answer_relevancy(
            question, rag_response_str)
        contextual_relevancy_score, contextual_relevancy_reason = self.check_contextual_relevancy(
            question, rag_response_str, relevant_contexts)

        if self._expected_is_missing(expected_response):
            # Precision and recall both need a reference answer to compare against.
            contextual_precision_score, contextual_precision_reason = 0.0, "NA"
            contextual_recall_score, contextual_recall_reason = 0.0, "NA"
        else:
            contextual_precision_score, contextual_precision_reason = self.check_contextual_precision(
                question, rag_response_str, expected_response, relevant_contexts)
            contextual_recall_score, contextual_recall_reason = self.check_contextual_recall(
                question, rag_response_str, expected_response, relevant_contexts)

        return {
            'answer_relevancy_score': answer_relevancy_score,
            'answer_relevancy_reason': answer_relevancy_reason,
            'contextual_relevancy_score': contextual_relevancy_score,
            'contextual_relevancy_reason': contextual_relevancy_reason,
            'contextual_precision_score': contextual_precision_score,
            'contextual_precision_reason': contextual_precision_reason,
            'contextual_recall_score': contextual_recall_score,
            'contextual_recall_reason': contextual_recall_reason
        }

    def perform_NLP_evaluation(self, question, expected_response, rag_response):
        """Return the ROUGE / BERTScore dict for one response.

        `question` is accepted for signature parity with perform_ragas_evaluation
        but is not used.
        """
        return self.calculate_metrics(expected_response, rag_response["response_output"])

    def get_rag_response(self, question):
        """Call the chat endpoint for `question` and return the parsed reply.

        Returns:
        dict | None: On success a dict with 'response_output', 'user_question',
            'source_documents' (list of document texts) and 'infernce_time_llm'
            (seconds elapsed since this method was entered). None when the
            endpoint answers with an HTTP error; the error details are printed.

        Raises:
        Exception: If no API key is configured.
        """
        starttime = time.time()

        def allowSelfSignedHttps(allowed):
            """Bypass the server certificate verification on the client side."""
            # SECURITY: this replaces the process-wide default HTTPS context, so
            # certificate checks are disabled for every later urllib HTTPS call.
            if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
                ssl._create_default_https_context = ssl._create_unverified_context

        allowSelfSignedHttps(True)  # This is needed if you use self-signed certificates.

        def send_chat_input_LLM(question):
            """Send a chat input to the API and return the parsed response."""
            # API request payload
            data = {
                "chat_input": question,
                "chat_history": []  # Modify this if you have previous chat history
            }

            # API credentials
            api_key = ""  # Replace with your actual API key
            url = ""  # Replace with the actual API URL

            body = str.encode(json.dumps(data))

            if not api_key:
                raise Exception("A valid API key is required to invoke the endpoint.")

            headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + api_key}
            req = urllib.request.Request(url, body, headers)

            try:
                # The context manager closes the connection once the body is read.
                with urllib.request.urlopen(req) as response:
                    result = response.read().decode('utf-8')
            except urllib.error.HTTPError as error:
                print(f"The request failed with status code: {error.code}")
                print(error.info())
                print(error.read().decode("utf8", 'ignore'))
                return None

            chat_output = json.loads(result).get('chat_output', {})
            source_documents = chat_output.get('source_documents', [])

            return {
                'response_output': chat_output.get('response', 'No response found'),
                'user_question': chat_output.get('user_question', 'No question found'),
                'source_documents': [doc.get('text', 'No text found') for doc in source_documents],
                # Key spelling kept as-is: other cells read 'infernce_time_llm'.
                'infernce_time_llm': time.time() - starttime,
            }

        return send_chat_input_LLM(question)
def evaluate_responses(df, evaluator, evaluation_type='RAGAS', model_type='LLM'):
    """
    Scores the stored model responses row by row and adds one column per metric.

    The DataFrame is modified in place and the same object is returned.

    Parameters:
    df (pandas.DataFrame): Needs columns 'question', 'answer' and
        '<MODEL_TYPE>_response' (model_type upper-cased), the latter holding the
        response value passed to the evaluator.
    evaluator (RAG_Evaluator): Object providing perform_ragas_evaluation and
        perform_NLP_evaluation.
    evaluation_type (str): 'RAGAS' for the deepeval relevancy/precision/recall
        scores, 'NLP' for ROUGE and BERTScore.
    model_type (str): Specifies whether the responses come from 'LLM' or 'SLM';
        used as the column-name suffix.

    Returns:
    pandas.DataFrame: `df` with the raw metric dict column and the per-metric
    score columns added.

    Raises:
    ValueError: If evaluation_type is neither 'RAGAS' nor 'NLP'.
    """
    # The response column is looked up with the upper-cased model type.
    response_col = f'{model_type.upper()}_response'

    if evaluation_type == 'RAGAS':
        raw_col = f'deepeval_response_{model_type}'
        run_evaluation = evaluator.perform_ragas_evaluation
        metric_keys = (
            'answer_relevancy_score',
            'contextual_relevancy_score',
            'contextual_precision_score',
            'contextual_recall_score',
        )
    elif evaluation_type == 'NLP':
        raw_col = f'deepeval_NLP_response_{model_type}'
        run_evaluation = evaluator.perform_NLP_evaluation
        metric_keys = (
            'ROUGE-1',
            'ROUGE-2',
            'ROUGE-L',
            'BERTScore Precision',
            'BERTScore Recall',
            'BERTScore F1',
        )
    else:
        # Previously an unknown type fell through and returned df untouched,
        # which hid typos such as 'ragas' or 'nlp'.
        raise ValueError(
            f"Unsupported evaluation_type {evaluation_type!r}; expected 'RAGAS' or 'NLP'."
        )

    # One metrics dict per row.
    df[raw_col] = df.apply(
        lambda row: run_evaluation(row['question'], row['answer'], row[response_col]), axis=1)

    # Fan the dict out into one column per metric; spaces in metric names become
    # underscores in the column name (e.g. 'BERTScore F1' -> 'BERTScore_F1_LLM').
    for metric_key in metric_keys:
        column_name = f"{metric_key.replace(' ', '_')}_{model_type}"
        df[column_name] = df[raw_col].apply(lambda metrics, key=metric_key: metrics[key])

    return df



[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Read ground truth
import pandas as pd
import json
import warnings
import nest_asyncio

# Silence the "Event loop is already running" UserWarning and patch asyncio via
# nest_asyncio so nested event loops are allowed inside the notebook kernel.
warnings.filterwarnings("ignore", category=UserWarning, message=".*Event loop is already running.*")
nest_asyncio.apply()

# Path to the ground-truth Q&A file. Despite the .jsonl extension it holds a
# single JSON array, not one JSON object per line.
file_path = "lab_maintenance_100_qa.jsonl"

# Parse the whole file in one call; the result is a list of dictionaries.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)  # json.load() because the content is a JSON array, not JSON Lines

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(data)

# Preview the first rows
df.head()

# df=df.head(2)

Unnamed: 0,question,answer,chapter
0,What is the main purpose of the microplate rea...,The microplate reader is used to read the resu...,1
1,What type of test is a microplate reader prima...,It is primarily used for the ELISA (Enzyme-Lin...,1
2,Describe the wavelength range typically used b...,Microplate readers typically operate within a ...,1
3,What are the key components required for an EL...,"Key components include a microplate reader, mi...",1
4,What are the different phases involved in an E...,ELISA involves coating wells with antibodies/a...,1


In [4]:
# Build the evaluator with the deepeval backend. The Azure endpoint and API key
# inside RAG_Evaluator.__init__ are blank placeholders and must be filled in first.
evaluator = RAG_Evaluator(framework = "deepeval")

In [5]:

# Call Prompt flow once per ground-truth question and keep the raw response
# dict next to the question.
prefix = 'LLM'
df[f'{prefix}_response'] = df.apply(lambda row: evaluator.get_rag_response(row['question']), axis=1)

# get_rag_response() returns None when the endpoint answers with an HTTP error.
# Stop here with a clear message instead of failing below with
# "'NoneType' object is not subscriptable".
failed_requests = df[f'{prefix}_response'].isna()
if failed_requests.any():
    raise RuntimeError(
        f"{int(failed_requests.sum())} of {len(df)} requests returned no response; "
        "check the endpoint output above and re-run this cell."
    )

# Separate the response output and source documents into different columns
df[f'response_output_{prefix}'] = df[f'{prefix}_response'].apply(lambda x: x['response_output'])
df['context'] = df[f'{prefix}_response'].apply(lambda x: x['source_documents'])



In [7]:
# Copy the elapsed time recorded by get_rag_response() into its own column.
df['LLM_Responsetime'] = df[f'{prefix}_response'].apply(
    lambda response: response['infernce_time_llm']
)

In [8]:
df.head()

Unnamed: 0,question,answer,chapter,LLM_response,response_output_LLM,context,LLM_Responsetime
0,What is the main purpose of the microplate rea...,The microplate reader is used to read the resu...,1,{'response_output': 'The main purpose of the m...,The main purpose of the microplate reader is t...,[Title: data.pdfMAINTENANCE MANUAL FOR LABORAT...,2.023033
1,What type of test is a microplate reader prima...,It is primarily used for the ELISA (Enzyme-Lin...,1,{'response_output': 'A microplate reader is pr...,A microplate reader is primarily used for read...,[Title: data.pdfMAINTENANCE MANUAL FOR LABORAT...,1.482003
2,Describe the wavelength range typically used b...,Microplate readers typically operate within a ...,1,{'response_output': 'The wavelength range typi...,The wavelength range typically used by a micro...,[Title: data.pdfMAINTENANCE MANUAL FOR LABORAT...,1.365792
3,What are the key components required for an EL...,"Key components include a microplate reader, mi...",1,{'response_output': 'The key components requir...,The key components required for an ELISA test ...,[Title: data.pdfTABLE OF FIGURES\nviiiTable of...,1.322968
4,What are the different phases involved in an E...,ELISA involves coating wells with antibodies/a...,1,{'response_output': 'The ELISA technique invol...,The ELISA technique involves the following pha...,[Title: data.pdfMAINTENANCE MANUAL FOR LABORAT...,3.635758


In [9]:
# Save the questions, raw responses and response times to CSV before the
# evaluation runs (the DataFrame index is written as the first column).
df.to_csv("data_time_response.csv")

In [43]:
%%capture
# Score the LLM responses with the deepeval metrics (evaluation_type 'RAGAS').
# evaluate_responses adds the score columns to df itself and returns that same
# object, so df_eval and df refer to one DataFrame.
df_eval = evaluate_responses(df,evaluator,'RAGAS','LLM')

In [1]:
# Overwrites the CSV saved earlier with the frame that now includes the deepeval scores.
# Needs df_eval from the evaluation cell above: run on a fresh kernel, this cell
# raises NameError (as in the recorded output).
df_eval.to_csv("data_time_response.csv")

NameError: name 'df_eval' is not defined

In [45]:
%%capture
# Add the ROUGE and BERTScore columns (evaluation_type 'NLP') on top of the
# deepeval scores already present in df_eval.
final_output = evaluate_responses(df_eval,evaluator,'NLP','LLM')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
final_output

Unnamed: 0,question,answer,chapter,LLM_response,response_output_LLM,context,deepeval_response_LLM,answer_relevancy_score_LLM,contextual_relevancy_score_LLM,contextual_precision_score_LLM,contextual_recall_score_LLM,deepeval_NLP_response_LLM,ROUGE-1_LLM,ROUGE-2_LLM,ROUGE-L_LLM,BERTScore_Precision_LLM,BERTScore_Recall_LLM,BERTScore_F1_LLM
0,What is the main purpose of the microplate rea...,The microplate reader is used to read the resu...,1,{'response_output': 'The main purpose of the m...,The main purpose of the microplate reader is t...,[Title: data.pdfMAINTENANCE MANUAL FOR LABORAT...,"{'answer_relevancy_score': 0.6666666666666666,...",0.666667,0.666667,1.0,1.0,"{'ROUGE-1': 0.6909090909090908, 'ROUGE-2': 0.5...",0.690909,0.528302,0.618182,0.911905,0.947884,0.929547
1,What type of test is a microplate reader prima...,It is primarily used for the ELISA (Enzyme-Lin...,1,{'response_output': 'A microplate reader is pr...,A microplate reader is primarily used for read...,[Title: data.pdfMAINTENANCE MANUAL FOR LABORAT...,"{'answer_relevancy_score': 0.6666666666666666,...",0.666667,1.0,1.0,1.0,"{'ROUGE-1': 0.5098039215686274, 'ROUGE-2': 0.1...",0.509804,0.163265,0.431373,0.894282,0.897778,0.896027
