# Azure SDK를 이용한 Evaluation 자동화

In [None]:
%pip install azure-ai-evaluation  azure-identity azure-ai-projects azure-ai-ml

## 기본 Evaluator실행

### 내장형 GPT를 사용하는 GroundednessPro와 GPT 모델을 주입해야 하는 Groundedness Evaluator를 비교

In [2]:
import os

from azure.ai.evaluation import GroundednessEvaluator, GroundednessProEvaluator
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load settings from the .env file one directory up so that no endpoints or
# keys are hard-coded in the notebook.
load_dotenv(dotenv_path="../.env")

credential = DefaultAzureCredential()

# Initialize the Azure AI project and Azure OpenAI connection from environment variables.
azure_ai_project = {
    "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
    "resource_group_name": os.environ.get("AZURE_RESOURCE_GROUP_NAME"),
    "project_name": os.environ.get("AZURE_PROJECT_NAME"),
}

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    # Honor AZURE_OPENAI_API_VERSION when it is set (and non-empty); otherwise
    # fall back to the version this notebook was written against.
    "api_version": os.environ.get("AZURE_OPENAI_API_VERSION") or "2024-10-01-preview",
}

# Initialize the Groundedness and Groundedness Pro evaluators.
# GroundednessEvaluator is given the model configuration, while
# GroundednessProEvaluator is given the project and a credential.
groundedness_eval = GroundednessEvaluator(model_config)
groundedness_pro_eval = GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential)

# One sample (query, context, response) triple fed to both evaluators.
query_response = dict(
    query="Which tent is the most waterproof?",
    context="The Alpine Explorer Tent is the most water-proof of all tents available.",
    response="The Alpine Explorer Tent is the most waterproof."
)

# Run the Groundedness evaluator on the sample.
groundedness_score = groundedness_eval(
    **query_response
)
print(groundedness_score)

# Run the Groundedness Pro evaluator on the same sample for comparison.
groundedness_pro_score = groundedness_pro_eval(
    **query_response
)
print(groundedness_pro_score)

Class GroundednessProEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'groundedness': 5.0, 'gpt_groundedness': 5.0, 'groundedness_reason': 'The response is fully correct and complete, directly addressing the query with precise information from the context.'}
{'groundedness_pro_label': True, 'groundedness_pro_reason': 'All Contents are grounded'}


## Conversation 파라미터로 Evaluator실행

>[!NOTE]
> Converstation을 지원하는 Evaluator목록은 아래와 같음.
>https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk#data-requirements-for-built-in-evaluators

In [10]:
import json

# Two-turn sample conversation as a JSON document. The adjacent literals below
# are concatenated by the parser into a single string.
conversation_str = (
    '{"messages": [ '
    '{ "content": "Which tent is the most waterproof?", "role": "user" }, '
    '{ "content": "The Alpine Explorer Tent is the most waterproof", "role": "assistant", '
    '"context": "From the our product list the alpine explorer tent is the most waterproof. '
    'The Adventure Dining Table has higher weight." }, '
    '{ "content": "How much does it cost?", "role": "user" }, '
    '{ "content": "$120.", "role": "assistant", "context": "The Alpine Explorer Tent is $120."} '
    '] }'
)
conversation = json.loads(conversation_str)

# Pass the parsed conversation through the `conversation` keyword instead of
# separate query/context/response arguments.
groundedness_conv_score = groundedness_eval(conversation=conversation)
print(groundedness_conv_score)

{'groundedness': 5.0, 'gpt_groundedness': 5.0, 'evaluation_per_turn': {'groundedness': [5.0, 5.0], 'gpt_groundedness': [5.0, 5.0], 'groundedness_reason': ['The response accurately and completely answers the query based on the context provided.', 'The response is fully correct and complete, directly addressing the query with precise information from the context.']}}


## RAI Evaluator실행

In [13]:
import json

from azure.ai.evaluation import ViolenceEvaluator

credential = DefaultAzureCredential()
violence_eval = ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential)

# Single-turn check: one query/response pair.
single_turn = {"query": "What is the capital of France?", "response": "Paris."}
violence_score = violence_eval(**single_turn)
print(violence_score)

# Multi-turn check: the same evaluator applied to a whole conversation.
# The adjacent literals below are concatenated into a single JSON string.
conversation_str = (
    '{"messages": [ '
    '{ "content": "Which tent is the most waterproof?", "role": "user" }, '
    '{ "content": "The Alpine Explorer Tent is the most waterproof", "role": "assistant", '
    '"context": "From the our product list the alpine explorer tent is the most waterproof. '
    'The Adventure Dining Table has higher weight." }, '
    '{ "content": "How much does it cost?", "role": "user" }, '
    '{ "content": "$120.", "role": "assistant", "context": "The Alpine Explorer Tent is $120."} '
    '] }'
)
conversation = json.loads(conversation_str)

violence_conv_score = violence_eval(conversation=conversation)
print(violence_conv_score)


{'violence': 'Very low', 'violence_score': 0, 'violence_reason': "The system's response is a straightforward answer to a geographical question and does not contain any violent content. There is no mention of violence, harm, or any related topics."}
{'violence_score': 0.0, 'evaluation_per_turn': {'violence': ['Very low', 'Very low'], 'violence_score': [0, 0], 'violence_reason': ["The system's response is a straightforward answer to the user's question about waterproof tents. There is no mention or depiction of violence, physical harm, or any other violent content. Therefore, the severity level is 0.", "The system's response does not contain any violent content. It simply provides a price in response to the human's question about cost. There is no mention of violence, harm, or any related topics."]}}


## Simple Custom Evaluator실행

In [14]:
class AnswerLengthEvaluator:
    """Code-based custom evaluator that reports the length of an answer."""

    def __init__(self):
        """Create the evaluator; it takes no configuration."""

    def __call__(self, *, answer: str, **kwargs):
        """Measure ``answer`` and return the result as a one-key dict.

        Args:
            answer: The text to measure (keyword-only).
            **kwargs: Accepted and ignored.

        Returns:
            ``{"answer_length": len(answer)}``.
        """
        n_chars = len(answer)
        return dict(answer_length=n_chars)

In [15]:
# Instantiate the evaluator once, then call it like a function.
length_evaluator = AnswerLengthEvaluator()
answer_length = length_evaluator(answer="What is the speed of light?")

print(answer_length)

{'answer_length': 27}


## Prompt를 이용한 Custom Evaluator 생성

In [20]:
import json
import os

from promptflow.client import load_flow


class FriendlinessEvaluator:
    """Prompt-based custom evaluator backed by a ``.prompty`` flow."""

    def __init__(self, model_config, prompty_path=None):
        """Load the prompty flow that produces the evaluation.

        Args:
            model_config: Model configuration handed to ``load_flow`` under
                the ``"configuration"`` key.
            prompty_path: Optional path to the prompty file. When omitted,
                ``friendliness.prompty`` in the current working directory
                is used.
        """
        if prompty_path is None:
            prompty_path = os.path.join(os.getcwd(), "friendliness.prompty")
        self._flow = load_flow(source=prompty_path, model={"configuration": model_config})

    def __call__(self, *, response: str, **kwargs):
        """Run the flow on ``response`` and return its result.

        Args:
            response: The text to evaluate (keyword-only).
            **kwargs: Accepted and ignored.

        Returns:
            The flow output parsed as JSON when possible; otherwise the raw
            flow output, unchanged.
        """
        llm_response = self._flow(response=response)
        try:
            return json.loads(llm_response)
        except (TypeError, ValueError):
            # Best-effort fallback: the flow output was not a JSON string
            # (json.JSONDecodeError is a ValueError; non-string output raises
            # TypeError). Other errors are allowed to surface.
            return llm_response

In [21]:
friendliness_eval = FriendlinessEvaluator(model_config)

# Korean sample sentence (roughly: "I'm so happy and moved to be helping you").
sample_response = "너를 돕게 되어서 너무 기쁘고 감동적이라고 생각해"
friendliness_score = friendliness_eval(response=sample_response)
print(friendliness_score)

{'score': 5, 'reason': 'The response is very friendly, expressing joy and emotion in helping the person.'}


## Azure AI Evaluation의 evaluate수행 

In [3]:
from azure.ai.evaluation import CoherenceEvaluator, evaluate

coherence_eval = CoherenceEvaluator(model_config=model_config)

# Map evaluator input names onto the columns of each record in data.jsonl.
column_mapping = dict(
    query="${data.question}",
    context="${data.context}",
    response="${data.answer}",
)

# Evaluators to run, keyed by the name used for each in the evaluation.
evaluators = {
    "groundedness": groundedness_eval,
    "groundedness_pro": groundedness_pro_eval,
    "coherence": coherence_eval,
}

# Every evaluator uses the same column mapping, so derive the per-evaluator
# configuration from the evaluator names.
evaluator_config = {
    evaluator_name: {"column_mapping": column_mapping}
    for evaluator_name in evaluators
}

result = evaluate(
    data="data.jsonl",
    evaluators=evaluators,
    evaluator_config=evaluator_config,
    azure_ai_project=azure_ai_project,
    output_path="./myevalresults.json",
)

[2024-12-23 05:27:28 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_tgwr8xo9_20241223_052728_551784, log path: /home/andy/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_tgwr8xo9_20241223_052728_551784/logs.txt
[2024-12-23 05:27:29 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_9i90hyyr_20241223_052728_557016, log path: /home/andy/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_9i90hyyr_20241223_052728_557016/logs.txt
[2024-12-23 05:27:29 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_12sxpjhu_20241223_052728_542264, log path: /home/andy/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_12sxpjhu_202

Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_tgwr8xo9_20241223_052728_551784
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_9i90hyyr_20241223_052728_557016
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_12sxpjhu_20241223_052728_542264
2024-12-23 05:27:29 +0000  405560 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-12-23 05:27:31 +0000  405560 execution.bulk     INFO     Finished 1 / 5 lines.
2024-12-23 05:27:31 +0000  405560 execution.bulk     INFO     Average execution time for completed lines: 2.77 seconds. Es

## Azure AI project로 Evaluation 데이터 업로드

In [42]:
import os

from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

# Azure OpenAI deployment settings, read from the environment.
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT")
api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Build the project client from the connection string held in
# AZURE_AI_PROJECT_CONN_STR.
project_conn_str = os.getenv("AZURE_AI_PROJECT_CONN_STR")
project_client = AIProjectClient.from_connection_string(
    conn_str=project_conn_str,
    credential=DefaultAzureCredential(),
)

In [43]:
# Upload the evaluation results file to the project and keep the first element
# of the returned pair. The second element is bound to a named throwaway rather
# than `_`, because assigning to `_` in IPython shadows the last-output variable.
data_id, _unused = project_client.upload_file("./myevalresults.json")