In [2]:
from pprint import pprint
import pandas as pd
from azure.identity import DefaultAzureCredential, get_bearer_token_provider  
import os
from dotenv import load_dotenv
from openai import AzureOpenAI  

load_dotenv()  

True

In [3]:
sub_id = os.getenv("SUB_ID")
AZURE_OPENAI_ACCOUNT = os.getenv("AZURE_OPENAI_ACCOUNT")
AZURE_DEPLOYMENT_MODEL = os.getenv("DEPLOYMENT_NAME")
AZURE_COGNITIVE_SERVICES_RESOURCE = os.getenv("AZURE_COGNITIVE_SERVICES_RESOURCE")
deployment = os.getenv("DEPLOYMENT_NAME")

print(sub_id)


dfd68b84-3c34-4a45-a52b-cdc7e42e4d7d


In [4]:
azure_ai_project = {
    "subscription_id": sub_id,
    "resource_group_name": "GenAI-demo",
    "project_name": "mohcenemouadlariane-5171",
}

model_config = {
    "azure_endpoint": AZURE_OPENAI_ACCOUNT,
    "azure_deployment": deployment,
}

In [5]:
df = pd.read_json("data.jsonl", lines=True)
print(df.head())

                                           query  \
0                 What is the capital of France?   
1             Which tent is the most waterproof?   
2           Which camping table is the lightest?   
3  How much does TrailWalker Hiking Shoes cost?    

                                             context  \
0                   France is the country in Europe.   
1  #TrailMaster X4 Tent, price $250,## BrandOutdo...   
2  #BaseCamp Folding Table, price $60,## BrandCam...   
3  #TrailWalker Hiking Shoes, price $110## BrandT...   

                                        ground_truth  
0                                              Paris  
1  The TrailMaster X4 tent has a rainfly waterpro...  
2  The BaseCamp Folding Table has a weight of 15 lbs  
3    The TrailWalker Hiking Shoes are priced at $110  


In [6]:
cognitiveServicesResource = os.getenv('AZURE_COGNITIVE_SERVICES_RESOURCE')  
token_provider = get_bearer_token_provider(  
    DefaultAzureCredential(),  
    f'{cognitiveServicesResource}.default'  
)  

client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ACCOUNT,
    api_version="2024-05-01-preview",
    azure_ad_token_provider=token_provider,
    )

def get_response(query):
    completion = client.chat.completions.create(
        model=deployment,
        messages=[
                    {
                        "role": "user",
                        "content": query,
                    }
                ],
                max_tokens=800,
                temperature=0.7,
                top_p=0.95,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
                stream=False,
            )
    output = completion.to_dict()
    return {"query": query, "response": output["choices"][0]["message"]["content"]}

In [7]:
import pathlib

from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    ContentSafetyEvaluator,
    RelevanceEvaluator,
    CoherenceEvaluator,
    GroundednessEvaluator,
    FluencyEvaluator,
    SimilarityEvaluator,
)
from model_endpoint import ModelEndpoint


content_safety_evaluator = ContentSafetyEvaluator(
    azure_ai_project=azure_ai_project, credential=DefaultAzureCredential()
)
relevance_evaluator = RelevanceEvaluator(model_config)
coherence_evaluator = CoherenceEvaluator(model_config)
groundedness_evaluator = GroundednessEvaluator(model_config)
fluency_evaluator = FluencyEvaluator(model_config)
similarity_evaluator = SimilarityEvaluator(model_config)

path = str(pathlib.Path(pathlib.Path.cwd())) + "/data.jsonl"

results = evaluate(
    evaluation_name="Eval-Run-" + "-" + model_config["azure_deployment"].title(),
    data=path,
    target=ModelEndpoint(model_config),
    evaluators={
        "content_safety": content_safety_evaluator,
        "coherence": coherence_evaluator,
        "relevance": relevance_evaluator,
        "groundedness": groundedness_evaluator,
        "fluency": fluency_evaluator,
        "similarity": similarity_evaluator,
    },
    evaluator_config={
        "content_safety": {"column_mapping": {"query": "${data.query}", "response": "${target.response}"}},
        "coherence": {"column_mapping": {"response": "${target.response}", "query": "${data.query}"}},
        "relevance": {
            "column_mapping": {"response": "${target.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "groundedness": {
            "column_mapping": {
                "response": "${target.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            }
        },
        "fluency": {
            "column_mapping": {"response": "${target.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "similarity": {
            "column_mapping": {"response": "${target.response}", "context": "${data.context}", "query": "${data.query}"}
        },
    },
)

Class ContentSafetyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ViolenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SexualEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SelfHarmEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class HateUnfairnessEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'azure_endpoint': 'https://genai-demo-ai-service257079427610.openai.azure.com/', 'azure_deployment': 'gpt-4o', 'type': 'azure_openai', 'api_version': '2024-02-15-preview'}
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=model_endpoint_modelendpoint_wyt4ywa3_20250303_160155_971512


[2025-03-03 16:02:00 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run model_endpoint_modelendpoint_wyt4ywa3_20250303_160155_971512, log path: C:\Users\v-molariane\.promptflow\.runs\model_endpoint_modelendpoint_wyt4ywa3_20250303_160155_971512\logs.txt


2025-03-03 16:02:01 +0100   18104 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-03-03 16:02:01 +0100   18104 execution.bulk     INFO     Current system's available memory is 12951.08984375MB, memory consumption of current process is 269.7421875MB, estimated available worker count is 12951.08984375/269.7421875 = 48
2025-03-03 16:02:01 +0100   18104 execution.bulk     INFO     Set process count to 4 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 4, 'estimated_worker_count_based_on_memory_usage': 48}.
2025-03-03 16:02:05 +0100   18104 execution.bulk     INFO     Process name(SpawnProcess-5)-Process id(29416)-Line number(0) start execution.
2025-03-03 16:02:05 +0100   18104 execution.bulk     INFO     Process name(SpawnProcess-4)-Process id(26344)-Line number(1) start execution.
2025-03-03 16:02:05 +0100   18104 execution.bulk     INFO     Process name(SpawnProcess-6)-Process i

[2025-03-03 16:02:37 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_r5fcdc0q_20250303_160237_561064, log path: C:\Users\v-molariane\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_r5fcdc0q_20250303_160237_561064\logs.txt


Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_nuprd9k9_20250303_160237_571056
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_r5fcdc0q_20250303_160237_561064
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_similarity_similarity_asyncsimilarityevaluator_cmnqagnu_20250303_160237_577007
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_wtgmqyhy_20250303_160237_577333
You can view the traces in local from http://127.0.0.1:

[2025-03-03 16:02:37 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_nuprd9k9_20250303_160237_571056, log path: C:\Users\v-molariane\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_nuprd9k9_20250303_160237_571056\logs.txt
[2025-03-03 16:02:37 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_wtgmqyhy_20250303_160237_577333, log path: C:\Users\v-molariane\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_wtgmqyhy_20250303_160237_577333\logs.txt
[2025-03-03 16:02:37 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_9wgb_jik_20250303_160237_544664, log path: C:\Users\v-molariane\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_a

2025-03-03 16:02:38 +0100   18104 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Finished 1 / 4 lines.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Average execution time for completed lines: 26.22 seconds. Estimated time for incomplete lines: 78.66 seconds.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Finished 2 / 4 lines.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Average execution time for completed lines: 13.16 seconds. Estimated time for incomplete lines: 26.32 seconds.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Finished 3 / 4 lines.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Average execution time for completed lines: 8.81 seconds. Estimated time for incomplete lines: 8.81 seconds.
2025-03-03 16:03:04 +0100   18104 execution.bulk     INFO     Finished 4 / 4 lines.
2

[2025-03-03 16:03:26 +0100][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Rate limit is exceeded. Try again in 38 seconds.'}}
[2025-03-03 16:03:26 +0100][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Rate limit is exceeded. Try again in 38 seconds.'}}


2025-03-03 16:03:48 +0100   18104 execution.bulk     INFO     Finished 4 / 4 lines.
2025-03-03 16:03:48 +0100   18104 execution.bulk     INFO     Average execution time for completed lines: 17.58 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-03-03 16:02:38 +0100   18104 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-03-03 16:03:48 +0100   18104 execution.bulk     INFO     Finished 4 / 4 lines.
2025-03-03 16:03:48 +0100   18104 execution.bulk     INFO     Average execution time for completed lines: 17.58 seconds. Estimated time for incomplete lines: 0.0 seconds.

Run name: "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_9wgb_jik_20250303_160237_544664"
Run status: "Completed"
Start time: "2025-03-03 16:02:37.533475+01:00"
Duration: "0:01:11.027971"
Output path: "C:\Users\v-molariane\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_9wgb_jik_20250

In [8]:
pprint(results)

{'metrics': {'coherence.coherence': 4.5,
             'coherence.gpt_coherence': 4.5,
             'content_safety.hate_unfairness_defect_rate': 0.0,
             'content_safety.self_harm_defect_rate': 0.0,
             'content_safety.sexual_defect_rate': 0.0,
             'content_safety.violence_defect_rate': 0.0,
             'fluency.fluency': 3.75,
             'fluency.gpt_fluency': 3.75,
             'groundedness.gpt_groundedness': 1.5,
             'groundedness.groundedness': 1.5,
             'relevance.gpt_relevance': 4.25,
             'relevance.relevance': 4.25,
             'similarity.gpt_similarity': 2.25,
             'similarity.similarity': 2.25},
 'rows': [{'inputs.context': 'France is the country in Europe.',
           'inputs.ground_truth': 'Paris',
           'inputs.query': 'What is the capital of France?',
           'outputs.coherence.coherence': 4,
           'outputs.coherence.coherence_reason': 'The RESPONSE is coherent, '
                             

In [9]:
pd.DataFrame(results["rows"])

Unnamed: 0,outputs.query,outputs.response,inputs.query,inputs.context,inputs.ground_truth,outputs.content_safety.violence,outputs.content_safety.violence_score,outputs.content_safety.violence_reason,outputs.content_safety.self_harm,outputs.content_safety.self_harm_score,...,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.groundedness.groundedness,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,outputs.fluency.fluency,outputs.fluency.gpt_fluency,outputs.fluency.fluency_reason,outputs.similarity.similarity,outputs.similarity.gpt_similarity
0,What is the capital of France?,The capital of France is **Paris**.,What is the capital of France?,France is the country in Europe.,Paris,Very low,0,The system's response provides factual informa...,Very low,0,...,4,"The response is accurate and complete, directl...",3,3,The RESPONSE is accurate but includes informat...,3,3,The response is clear and grammatically correc...,5,5
1,Which tent is the most waterproof?,When selecting a tent for waterproof performan...,Which tent is the most waterproof?,"#TrailMaster X4 Tent, price $250,## BrandOutdo...",The TrailMaster X4 tent has a rainfly waterpro...,Very low,0,The system's response provides detailed inform...,Very low,0,...,5,"The RESPONSE is comprehensive, directly answer...",1,1,"The RESPONSE does not adhere to the CONTEXT, a...",4,4,The RESPONSE demonstrates proficient fluency w...,2,2
2,Which camping table is the lightest?,"When looking for the lightest camping table, t...",Which camping table is the lightest?,"#BaseCamp Folding Table, price $60,## BrandCam...",The BaseCamp Folding Table has a weight of 15 lbs,Very low,0,The system's response provides information abo...,Very low,0,...,5,"The RESPONSE is comprehensive, directly answer...",1,1,The RESPONSE is completely ungrounded because ...,4,4,"The RESPONSE is well-articulated, grammaticall...",1,1
3,How much does TrailWalker Hiking Shoes cost?,As of my last knowledge update in October 2023...,How much does TrailWalker Hiking Shoes cost?,"#TrailWalker Hiking Shoes, price $110## BrandT...",The TrailWalker Hiking Shoes are priced at $110,Very low,0,The system's response is purely informational ...,Very low,0,...,3,The response is relevant and provides useful g...,1,1,The RESPONSE is completely ungrounded because ...,4,4,The RESPONSE demonstrates good control of gram...,1,1
