## This notebook is to evaluate SLM Vs LLM Performance

In [4]:
import pandas as pd
# df = pd.read_csv("slm_phi3_pytorch_16bits_inference_v1.csv")
df = pd.read_csv("data_LLM_BENCMARK.csv")
df_new = df[['question','context','response','ground_truth']]
df_new.to_json('output.jsonl', orient='records', lines=True)


In [5]:
df_new.head()

Unnamed: 0,question,context,response,ground_truth
0,What is the main purpose of the microplate rea...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The main purpose of the microplate reader is t...,The microplate reader is used to read the resu...
1,What type of test is a microplate reader prima...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,A microplate reader is primarily used for read...,It is primarily used for the ELISA (Enzyme-Lin...
2,Describe the wavelength range typically used b...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The wavelength range typically used by a micro...,Microplate readers typically operate within a ...
3,What are the key components required for an EL...,['Title: data.pdfTABLE OF FIGURES\nviiiTable o...,The key components required for an ELISA test ...,"Key components include a microplate reader, mi..."
4,What are the different phases involved in an E...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The ELISA technique involves the following pha...,ELISA involves coating wells with antibodies/a...


In [14]:
# Assuming the required evaluators are imported from azure.ai.evaluation
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import (
    RelevanceEvaluator, GroundednessEvaluator, CoherenceEvaluator,
    FluencyEvaluator, RougeScoreEvaluator, GleuScoreEvaluator,
    BleuScoreEvaluator, MeteorScoreEvaluator, SimilarityEvaluator,
    F1ScoreEvaluator, ViolenceEvaluator, SexualEvaluator,
    SelfHarmEvaluator, HateUnfairnessEvaluator, IndirectAttackEvaluator,
    ProtectedMaterialEvaluator,RougeType
)

class RAG_Evaluator:
    def __init__(self):
        # Assuming model_config is required for some evaluators
        azure_ai_project = {
                "subscription_id": "62a476f9-9f3d-443f-991f-c2970691f3c9",
                "resource_group_name": "rg-meemankgpt",
                "project_name": "ws-meemankraft"
            }
        credential = DefaultAzureCredential()

        self.model_config = {
                    "azure_endpoint": "https://aoi-meemank.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview",
                    "api_key": "772624b8d13c4d19a12e77108401695e",
                    "azure_deployment":"gpt-4o-mini",
                    "api_version": "2024-08-01-preview",
            }
        
        # Initialize evaluators based on the table
        self.relevance_evaluator = RelevanceEvaluator(self.model_config)
        self.groundedness_evaluator = GroundednessEvaluator(self.model_config)
        self.coherence_evaluator = CoherenceEvaluator(self.model_config)
        self.fluency_evaluator = FluencyEvaluator(self.model_config)
        self.rouge_evaluator = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
        self.gleu_evaluator = GleuScoreEvaluator()
        self.bleu_evaluator = BleuScoreEvaluator()
        self.meteor_evaluator = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
        self.similarity_evaluator = SimilarityEvaluator(self.model_config)
        self.f1_evaluator = F1ScoreEvaluator()
        # self.violence_evaluator = ViolenceEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        # self.sexual_evaluator = SexualEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        # self.self_harm_evaluator = SelfHarmEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        # self.hate_unfairness_evaluator = HateUnfairnessEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        # self.indirect_attack_evaluator = IndirectAttackEvaluator(credential=credential, azure_ai_project=azure_ai_project)
        # self.protected_material_evaluator = ProtectedMaterialEvaluator(credential=credential, azure_ai_project=azure_ai_project)

    def check_relevance_evaluator(self, response, context, query):
        print(f"Relevance Evaluation\nResponse: {response}\nContext: {context}\nQuery: {query}")
        score = self.relevance_evaluator(response=response, context=context, query=query)
        return score

    def check_all_evaluators(self, response, context, query, ground_truth=None):
        # Collect evaluation scores
        scores = {}

        # For evaluators that need response, context, and query
        scores['relevance'] = self.relevance_evaluator(response=response, context=context, query=query)
        scores['groundedness'] = self.groundedness_evaluator(response=response, context=context)
        scores['coherence'] = self.coherence_evaluator(response=response, query=query)
        scores['fluency'] = self.fluency_evaluator(response=response, query=query)
        
        # Evaluators that need ground_truth
        if ground_truth:
            scores['rouge'] = self.rouge_evaluator(response=response, ground_truth=ground_truth)
            scores['gleu'] = self.gleu_evaluator(response=response, ground_truth=ground_truth)
            scores['bleu'] = self.bleu_evaluator(response=response, ground_truth=ground_truth)
            scores['meteor'] = self.meteor_evaluator(response=response, ground_truth=ground_truth)
            scores['similarity'] = self.similarity_evaluator(response=response,query=query, ground_truth=ground_truth)
            scores['f1'] = self.f1_evaluator(response=response, ground_truth=ground_truth)
        
        # # # # Evaluators for content safety
        # scores['violence'] = self.violence_evaluator(response=response, query=query)
        # scores['sexual'] = self.sexual_evaluator(response=response, query=query)
        # scores['self_harm'] = self.self_harm_evaluator(response=response, query=query)
        # scores['hate_unfairness'] = self.hate_unfairness_evaluator(response=response, query=query)
        # scores['indirect_attack'] = self.indirect_attack_evaluator(response=response, context=context)
        # scores['protected_material'] = self.protected_material_evaluator(response=response, query=query)

        return scores

    def evaluate_for_dataframe(self, df):
        # Iterate over each row in the DataFrame
        for index, row in df.iterrows():
            response = row['response']
            context = row['context']
            query = row['question']
            ground_truth = row.get('ground_truth', None)

            # Get the evaluation scores for each row
            evaluation_scores = self.check_all_evaluators(response, context, query, ground_truth)

            # Append the scores to the DataFrame for the corresponding row
            for key, value in evaluation_scores.items():
                df.at[index, key] = value

        return df
# Example usage
evaluator = RAG_Evaluator()

In [15]:
df_new

Unnamed: 0,question,context,response,ground_truth,relevance,groundedness,coherence,fluency,rouge,gleu,bleu,meteor,similarity,f1
0,What is the main purpose of the microplate rea...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The main purpose of the microplate reader is t...,The microplate reader is used to read the resu...,,,,,,,,,,
1,What type of test is a microplate reader prima...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,A microplate reader is primarily used for read...,It is primarily used for the ELISA (Enzyme-Lin...,,,,,,,,,,
2,Describe the wavelength range typically used b...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The wavelength range typically used by a micro...,Microplate readers typically operate within a ...,,,,,,,,,,
3,What are the key components required for an EL...,['Title: data.pdfTABLE OF FIGURES\nviiiTable o...,The key components required for an ELISA test ...,"Key components include a microplate reader, mi...",,,,,,,,,,
4,What are the different phases involved in an E...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The ELISA technique involves the following pha...,ELISA involves coating wells with antibodies/a...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,What kind of maintenance is required for dispe...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The routine maintenance for dispensers focuses...,Maintenance includes cleaning the dispensing t...,,,,,,,,,,
81,How should the volume accuracy of a dispenser ...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,I don't know.,Volume accuracy should be checked by dispensin...,,,,,,,,,,
82,What factors could affect the accuracy of a la...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,Several factors could affect the accuracy of a...,Factors include worn or damaged dispensing tip...,,,,,,,,,,
83,How can a laboratory dispenser be calibrated?,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,I don't know.,Calibration involves dispensing a specific vol...,,,,,,,,,,


In [16]:
# df_new = df_new.head(4)
# df_new

In [17]:
score_columns = ['relevance', 'groundedness', 'coherence', 'fluency', 'rouge', 'gleu', 'bleu', 'meteor', 'similarity', 'f1']
for col in score_columns:
    df_new[col] = None

# Evaluate for each row and update the DataFrame
updated_df = evaluator.evaluate_for_dataframe(df_new)

# Print the updated DataFrame with evaluation scores
print(updated_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[col] = None


                                             question  \
0   What is the main purpose of the microplate rea...   
1   What type of test is a microplate reader prima...   
2   Describe the wavelength range typically used b...   
3   What are the key components required for an EL...   
4   What are the different phases involved in an E...   
..                                                ...   
80  What kind of maintenance is required for dispe...   
81  How should the volume accuracy of a dispenser ...   
82  What factors could affect the accuracy of a la...   
83      How can a laboratory dispenser be calibrated?   
84  What are the typical troubleshooting steps for...   

                                              context  \
0   ['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...   
1   ['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...   
2   ['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...   
3   ['Title: data.pdfTABLE OF FIGURES\nviiiTable o...   
4   ['Title: data.pdfMAINTENAN

In [18]:
updated_df

Unnamed: 0,question,context,response,ground_truth,relevance,groundedness,coherence,fluency,rouge,gleu,bleu,meteor,similarity,f1
0,What is the main purpose of the microplate rea...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The main purpose of the microplate reader is t...,The microplate reader is used to read the resu...,{'gpt_relevance': 5.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 4.0},{'gpt_fluency': 4.0},"{'rouge_precision': 0.59375, 'rouge_recall': 0...",{'gleu_score': 0.37681159420289856},{'bleu_score': 0.3473525504903788},{'meteor_score': 0.7467789159086368},{'gpt_similarity': 5.0},{'f1_score': 0.6956521739130435}
1,What type of test is a microplate reader prima...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,A microplate reader is primarily used for read...,It is primarily used for the ELISA (Enzyme-Lin...,{'gpt_relevance': 5.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 4.0},{'gpt_fluency': 5.0},"{'rouge_precision': 0.3870967741935484, 'rouge...",{'gleu_score': 0.17164179104477612},{'bleu_score': 0.10046152640557755},{'meteor_score': 0.506305584946667},{'gpt_similarity': 5.0},{'f1_score': 0.4888888888888889}
2,Describe the wavelength range typically used b...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The wavelength range typically used by a micro...,Microplate readers typically operate within a ...,{'gpt_relevance': 5.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 5.0},{'gpt_fluency': 5.0},"{'rouge_precision': 0.46153846153846156, 'roug...",{'gleu_score': 0.20224719101123595},{'bleu_score': 0.1599738412076905},{'meteor_score': 0.6583025830258301},{'gpt_similarity': 4.0},{'f1_score': 0.5964912280701754}
3,What are the key components required for an EL...,['Title: data.pdfTABLE OF FIGURES\nviiiTable o...,The key components required for an ELISA test ...,"Key components include a microplate reader, mi...",{'gpt_relevance': 5.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 4.0},{'gpt_fluency': 5.0},"{'rouge_precision': 0.3888888888888889, 'rouge...",{'gleu_score': 0.11797752808988764},{'bleu_score': 0.09260304181144532},{'meteor_score': 0.6023455669635259},{'gpt_similarity': 5.0},{'f1_score': 0.5333333333333333}
4,What are the different phases involved in an E...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The ELISA technique involves the following pha...,ELISA involves coating wells with antibodies/a...,{'gpt_relevance': 5.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 5.0},{'gpt_fluency': 5.0},"{'rouge_precision': 0.0871559633027523, 'rouge...",{'gleu_score': 0.02106084243369735},{'bleu_score': 0.008706658268698726},{'meteor_score': 0.25312083095101967},{'gpt_similarity': 5.0},{'f1_score': 0.16243654822335027}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,What kind of maintenance is required for dispe...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,The routine maintenance for dispensers focuses...,Maintenance includes cleaning the dispensing t...,{'gpt_relevance': 4.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 5.0},{'gpt_fluency': 5.0},"{'rouge_precision': 0.10101010101010101, 'roug...",{'gleu_score': 0.023206751054852322},{'bleu_score': 0.00873167376108168},{'meteor_score': 0.26494708994709},{'gpt_similarity': 2.0},{'f1_score': 0.13861386138613863}
81,How should the volume accuracy of a dispenser ...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,I don't know.,Volume accuracy should be checked by dispensin...,{'gpt_relevance': 1.0},{'gpt_groundedness': 1.0},{'gpt_coherence': 1.0},{'gpt_fluency': 1.0},"{'rouge_precision': 0.0, 'rouge_recall': 0.0, ...",{'gleu_score': 0.00909090909090909},{'bleu_score': 0.00037568542726235045},{'meteor_score': 0.018796992481203003},{'gpt_similarity': 1.0},{'f1_score': 0.0}
82,What factors could affect the accuracy of a la...,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,Several factors could affect the accuracy of a...,Factors include worn or damaged dispensing tip...,{'gpt_relevance': 5.0},{'gpt_groundedness': 5.0},{'gpt_coherence': 5.0},{'gpt_fluency': 5.0},"{'rouge_precision': 0.03383458646616541, 'roug...",{'gleu_score': 0.008891928864569083},{'bleu_score': 0.003290604748875199},{'meteor_score': 0.14388489208633096},{'gpt_similarity': 1.0},{'f1_score': 0.06349206349206349}
83,How can a laboratory dispenser be calibrated?,['Title: data.pdfMAINTENANCE MANUAL FOR LABORA...,I don't know.,Calibration involves dispensing a specific vol...,{'gpt_relevance': 1.0},{'gpt_groundedness': 1.0},{'gpt_coherence': 1.0},{'gpt_fluency': 1.0},"{'rouge_precision': 0.0, 'rouge_recall': 0.0, ...",{'gleu_score': 0.011111111111111112},{'bleu_score': 0.0010212188701441197},{'meteor_score': 0.022624434389140267},{'gpt_similarity': 1.0},{'f1_score': 0.0}


In [19]:
updated_df.to_csv("SLM_FINETUNING_BENCMARK_LLM.csv")

In [62]:
# # Function to evaluate each row
# def evaluate_row(row):
#     query = row['question']
#     response = row['response']
#     context = row['context']
#     ground_truth = row['ground_truth']
    
#     # Call the evaluator function (make sure evaluator object is initialized)
#     evaluation_scores = evaluator.check_all_evaluators(response, context, query, ground_truth)
#     df = pd.DataFrame([evaluation_scores])
#     return df

# # Apply the evaluation function to each row and store the results in a new column
# df_new['evaluation_scores'] = df_new.apply(evaluate_row, axis=1)

# # Print the updated DataFrame with evaluation scores
# print(df_new)

In [86]:
# df_new