## This notebook evaluates SLM vs. LLM performance

In [1]:
import pandas as pd
# Load the evaluation dataset from the CSV file in the working directory.
df = pd.read_csv("data_final.csv")
# Keep only the columns the evaluators consume. The explicit .copy() makes
# df_new an independent frame, so a later cell can add a column to it without
# pandas emitting SettingWithCopyWarning.
df_new = df[['question', 'context', 'response', 'ground_truth']].copy()
# Also export the same records as JSON Lines (one JSON object per row).
df_new.to_json('output.jsonl', orient='records', lines=True)


In [60]:
# Preview the first rows of the selected columns (head() defaults to five rows).
df_new.head()

Unnamed: 0,question,context,response,ground_truth
0,What is the main purpose of the microplate rea...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The main purpose of the microplate reader is t...,The microplate reader is used to read the resu...
1,What type of test is a microplate reader prima...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,A microplate reader is primarily used for read...,It is primarily used for the ELISA (Enzyme-Lin...
2,Describe the wavelength range typically used b...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The wavelength range typically used by a micro...,Microplate readers typically operate within a ...
3,What are the key components required for an EL...,['Title: data.pdfTABLE OF FIGURES\nviiiTable o...,The key components required for an ELISA test ...,"Key components include a microplate reader, mi..."
4,What are the different phases involved in an E...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The ELISA technique involves the following pha...,ELISA involves coating wells with antibodies/a...


In [61]:
# Azure credential helper and the evaluators provided by azure.ai.evaluation
import math
import os

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import (
    RelevanceEvaluator, GroundednessEvaluator, CoherenceEvaluator,
    FluencyEvaluator, RougeScoreEvaluator, GleuScoreEvaluator,
    BleuScoreEvaluator, MeteorScoreEvaluator, SimilarityEvaluator,
    F1ScoreEvaluator, ViolenceEvaluator, SexualEvaluator,
    SelfHarmEvaluator, HateUnfairnessEvaluator, IndirectAttackEvaluator,
    ProtectedMaterialEvaluator,RougeType
)

class RAG_Evaluator:
    """Bundle of Azure AI evaluators used to score one RAG answer at a time.

    The constructor builds one instance of every evaluator and stores it on
    ``self``; the ``check_*`` methods call those instances and return their
    raw results unchanged.
    """

    def __init__(self):
        """Create the model configuration and instantiate every evaluator.

        Secrets are read from the environment instead of being hard-coded:
        ``AZURE_SUBSCRIPTION_ID`` and ``AZURE_OPENAI_API_KEY``. An unset
        variable falls back to the empty string.
        """
        azure_ai_project = {
            "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID", ""),
            "resource_group_name": "rg-meemankgpt",
            "project_name": "ws-meemankraft",
        }
        credential = DefaultAzureCredential()

        # Configuration handed to the model-backed evaluators below.
        self.model_config = {
            "azure_endpoint": "https://aoi-meemank.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview",
            "api_key": os.environ.get("AZURE_OPENAI_API_KEY", ""),
            "azure_deployment": "gpt-4o-mini",
            "api_version": "2024-08-01-preview",
        }

        # Evaluators constructed from the model configuration.
        self.relevance_evaluator = RelevanceEvaluator(self.model_config)
        self.groundedness_evaluator = GroundednessEvaluator(self.model_config)
        self.coherence_evaluator = CoherenceEvaluator(self.model_config)
        self.fluency_evaluator = FluencyEvaluator(self.model_config)
        self.similarity_evaluator = SimilarityEvaluator(self.model_config)

        # Evaluators that compare the response with a ground-truth answer.
        self.rouge_evaluator = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
        self.gleu_evaluator = GleuScoreEvaluator()
        self.bleu_evaluator = BleuScoreEvaluator()
        self.meteor_evaluator = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
        self.f1_evaluator = F1ScoreEvaluator()

        # Content-safety evaluators, constructed from the credential and the
        # project description. check_all_evaluators does not call them at
        # the moment.
        self.violence_evaluator = ViolenceEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        self.sexual_evaluator = SexualEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        self.self_harm_evaluator = SelfHarmEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        self.hate_unfairness_evaluator = HateUnfairnessEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        self.indirect_attack_evaluator = IndirectAttackEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        self.protected_material_evaluator = ProtectedMaterialEvaluator(credential=credential, azure_ai_project=azure_ai_project)

    @staticmethod
    def _has_ground_truth(ground_truth):
        """Return True when ``ground_truth`` can serve as a reference answer."""
        if ground_truth is None:
            return False
        # A missing cell read by pandas arrives as float NaN, which is truthy;
        # treat it the same as "no ground truth".
        if isinstance(ground_truth, float) and math.isnan(ground_truth):
            return False
        return bool(ground_truth)

    def check_relevance_evaluator(self, response, context, query):
        """Print the inputs, run the relevance evaluator and return its result."""
        print(f"Relevance Evaluation\nResponse: {response}\nContext: {context}\nQuery: {query}")
        score = self.relevance_evaluator(response=response, context=context, query=query)
        return score

    def check_all_evaluators(self, response, context, query, ground_truth=None):
        """Run the enabled evaluators and collect their results.

        Args:
            response: Generated answer to score.
            context: Retrieved context the answer is checked against.
            query: The question that was asked.
            ground_truth: Optional reference answer. When it is ``None``,
                empty or NaN, the reference-based evaluators are skipped.

        Returns:
            dict mapping an evaluator name (``'relevance'``, ``'rouge'``, ...)
            to whatever that evaluator returned.
        """
        scores = {}

        # Evaluators that work from the response, context and/or query alone.
        scores['relevance'] = self.relevance_evaluator(response=response, context=context, query=query)
        scores['groundedness'] = self.groundedness_evaluator(response=response, context=context)
        scores['coherence'] = self.coherence_evaluator(response=response, query=query)
        scores['fluency'] = self.fluency_evaluator(response=response, query=query)

        # Evaluators that need a reference answer.
        if self._has_ground_truth(ground_truth):
            scores['rouge'] = self.rouge_evaluator(response=response, ground_truth=ground_truth)
            scores['gleu'] = self.gleu_evaluator(response=response, ground_truth=ground_truth)
            scores['bleu'] = self.bleu_evaluator(response=response, ground_truth=ground_truth)
            scores['meteor'] = self.meteor_evaluator(response=response, ground_truth=ground_truth)
            scores['similarity'] = self.similarity_evaluator(response=response, query=query, ground_truth=ground_truth)
            scores['f1'] = self.f1_evaluator(response=response, ground_truth=ground_truth)

        # Content-safety evaluators are currently disabled:
        # scores['violence'] = self.violence_evaluator(response=response, query=query)
        # scores['sexual'] = self.sexual_evaluator(response=response, query=query)
        # scores['self_harm'] = self.self_harm_evaluator(response=response, query=query)
        # scores['hate_unfairness'] = self.hate_unfairness_evaluator(response=response, query=query)
        # scores['indirect_attack'] = self.indirect_attack_evaluator(response=response, context=context)
        # scores['protected_material'] = self.protected_material_evaluator(response=response, query=query)

        return scores

# Build the evaluator once; the evaluate_row function in a later cell uses
# this module-level `evaluator` object.
evaluator = RAG_Evaluator()

# response = "The Alpine Explorer Tent is the most waterproof."
# context = """From our product list,
# the alpine explorer tent is the most waterproof.
# The Adventure Dining Table has a higher weight."""
# query = "Which tent is the most waterproof?"
# ground_truth = "The Alpine Explorer Tent is waterproof."

# # Call the method to check all evaluators and print the scores
# evaluation_scores = evaluator.check_all_evaluators(response, context, query, ground_truth)
# print(evaluation_scores)


In [62]:
def evaluate_row(row):
    """Score one DataFrame row with every enabled evaluator.

    Reads the 'question', 'response', 'context' and 'ground_truth' entries of
    ``row`` and forwards them to the module-level ``evaluator`` (which must
    already be initialized). Returns the value produced by
    ``evaluator.check_all_evaluators``.
    """
    wanted = ('question', 'response', 'context', 'ground_truth')
    query, response, context, ground_truth = (row[col] for col in wanted)
    return evaluator.check_all_evaluators(response, context, query, ground_truth)

# Score every row and attach the results as a new 'evaluation_scores' column.
# assign() returns a new frame rather than writing the column into the
# existing one in place, which avoids pandas' SettingWithCopyWarning when
# df_new is a selection taken from another DataFrame.
df_new = df_new.assign(evaluation_scores=df_new.apply(evaluate_row, axis=1))

# Print the updated DataFrame with evaluation scores
print(df_new)

In [63]:
# Display the frame again, now including the evaluation_scores column added in the previous cell.
df_new

Unnamed: 0,question,context,response,ground_truth,evaluation_scores
0,What is the main purpose of the microplate rea...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The main purpose of the microplate reader is t...,The microplate reader is used to read the resu...,"{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
1,What type of test is a microplate reader prima...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,A microplate reader is primarily used for read...,It is primarily used for the ELISA (Enzyme-Lin...,"{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
2,Describe the wavelength range typically used b...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The wavelength range typically used by a micro...,Microplate readers typically operate within a ...,"{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
3,What are the key components required for an EL...,['Title: data.pdfTABLE OF FIGURES\nviiiTable o...,The key components required for an ELISA test ...,"Key components include a microplate reader, mi...","{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
4,What are the different phases involved in an E...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The ELISA technique involves the following pha...,ELISA involves coating wells with antibodies/a...,"{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
...,...,...,...,...,...
80,What kind of maintenance is required for dispe...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The routine maintenance for dispensers focuses...,Maintenance includes cleaning the dispensing t...,"{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
81,How should the volume accuracy of a dispenser ...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,I don't know.,Volume accuracy should be checked by dispensin...,"{'relevance': {'gpt_relevance': 1.0}, 'grounde..."
82,What factors could affect the accuracy of a la...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,Several factors could affect the accuracy of a...,Factors include worn or damaged dispensing tip...,"{'relevance': {'gpt_relevance': 5.0}, 'grounde..."
83,How can a laboratory dispenser be calibrated?,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,I don't know.,Calibration involves dispensing a specific vol...,"{'relevance': {'gpt_relevance': 1.0}, 'grounde..."
