In [1]:
from minsearch import Index, VectorSearch
from openai import OpenAI, APIConnectionError, RateLimitError, APIStatusError
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)
import pickle
import hashlib
from sentence_transformers import SentenceTransformer

In [2]:
# model for qa purpose in english
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Lets implement hybrid search

In [3]:
FILE_PATH = "vector_search_data.pkl"
loaded_data = None
try:
    with open(FILE_PATH, 'rb') as f:
        loaded_data = pickle.load(f)
        print(f"Data successfully unpickled from {FILE_PATH}")

    # Access the loaded data
    loaded_embeddings = loaded_data['embeddings']
    loaded_docs = loaded_data['documents']
except FileNotFoundError:
    print(f"Error: The file {FILE_PATH} was not found.")
except pickle.UnpicklingError as e:
    print(f"An error occurred during unpickling: {e}")

Data successfully unpickled from vector_search_data.pkl


In [4]:
from minsearch import VectorSearch, Index

In [5]:
index = Index(text_fields=['title', 'description', 'filename', 'section'],
              keyword_fields=[])
index.fit(loaded_docs)

vs = VectorSearch()
vs.fit(loaded_embeddings, loaded_docs)

<minsearch.vector.VectorSearch at 0x319636c60>

In [6]:
query = "How can I evaluate classification model results, and ensure numerical data is not drifted?"

In [7]:
import hashlib
from typing import List, Any, Callable
from pydantic_ai import Agent

In [8]:
def text_search(query: str) -> List[Any]:
    return index.search(query, num_results=5)

def vector_search(query: str) -> List[Any]:
    q = embedding_model.encode(query)
    return vs.search(q, num_results=5)

def hybrid_search(query: str) -> List[Any]:
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        text_to_hash = result['filename'] + ' ' + result['section'][0:250]
        encoded_string = text_to_hash.encode('utf-8')
        hash_object = hashlib.sha256(encoded_string)
        hex_digest = hash_object.hexdigest()
        if hex_digest not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results

In [9]:
system_prompt = """
You are a helpful assistant to read and understand technical docs for evidently ai. 

Use the search tool to find relevant information from the data before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""

In [10]:
agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[hybrid_search],
    model='gpt-4o-mini')

In [11]:
result = await agent.run(user_prompt=query)
print(result.output)

To evaluate classification model results and ensure numerical data is not drifted, you can follow these comprehensive steps outlined in the Evidently AI documentation:

### Evaluating Classification Model Results
1. **Use the Classification Quality Preset**: Utilize the `ClassificationPreset` to evaluate and visualize the performance of your classification tasks. This can be done for a single dataset or by comparing it against a reference dataset.
   
2. **Metrics and Visualizations**:
   - **Metrics**: You can measure various metrics including Accuracy, Precision, Recall, F1-score, ROC AUC, and LogLoss.
   - **Visualizations**: Generate visual aids like Class Representation, Confusion Matrix, Class Separation Quality, Probability Distribution, ROC Curve, and PR Curve.
   - If you include feature columns in the report, it will also show performance by column to assess different data segments.

3. **Testing and Diagnostics**:
   - Enable tests to automatically run checks that assess if 

In [26]:
from pydantic_ai.messages import ModelMessagesTypeAdapter
import json

In [27]:
def log_entry(agent, messages, source: str="user"):
    tools = []
    for ts in agent.toolsets:
        tools.extend(ts.tools.keys())
    
    messages = json.loads(messages)

    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tools,
        "messages": messages,
        "source": source
    }

In [29]:
entry = log_entry(
    agent=agent,
    messages=result.new_messages_json()
)

In [37]:
entry["messages"][-1]['timestamp']

'2025-12-28T02:00:17.110796Z'

In [30]:
from pathlib import Path
from datetime import datetime
import secrets

In [31]:
LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)

In [32]:
def serializer(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

In [52]:
def log_interaction(agent, messages, source: str="user"):

    entry = log_entry(
        agent=agent,
        messages=messages,
        source=source
    )

    ts = entry["messages"][-1]['timestamp']
    ts_obj = datetime.fromisoformat(ts.replace("Z", "+00:00"))
    ts_str = ts_obj.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(nbytes=3)

    filename = f"{agent.name}_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename

    with open(filepath, mode='w', encoding='utf-8') as f_out:
        json.dump(obj=entry, fp=f_out, default=serializer)

    return filepath

In [53]:
# Lets create the first log file
log_interaction(agent=agent, messages=result.new_messages_json())

PosixPath('logs/faq_agent_20251228_020017_a78065.json')

In [54]:
# Lets ask another question
user_question = "What evaluation metrics are relevant for image data?"

In [55]:
result = await agent.run(user_prompt=user_question)
print(result.output)

When evaluating image data, particularly in the context of classification tasks, several key evaluation metrics are commonly used to assess the performance of models. Here are the relevant metrics:

1. **Accuracy**: Measures the overall correctness of the model across all classes.
   
2. **Precision**: Indicates the proportion of true positive results in relation to the total predicted positives. It helps understand how many of the predicted classes were actually correct.

3. **Recall (Sensitivity)**: Measures the proportion of true positives identified by the model compared to the actual positives. It shows how well the model can identify actual instances of a class.

4. **F1 Score**: The harmonic mean of precision and recall, providing a single metric that balances both concerns, which is particularly useful in scenarios where class distribution is uneven.

5. **ROC Curve**: The Receiver Operating Characteristic curve visualizes the true positive rate against the false positive rate 

In [56]:
log_interaction(agent=agent, messages=result.new_messages_json())

PosixPath('logs/faq_agent_20251228_025937_ee3694.json')

# Lets add references

In [57]:
system_prompt = """
You are a helpful assistant for a course.  

Use the search tool to find relevant information from the document materials before answering questions.  

If you can find specific information through search, use it to provide accurate answers.

Always include references by citing the filename of the source material you used.  
When citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/evidentlyai/docs/tree/main"
Format: [LINK TITLE](FULL_GITHUB_LINK)

If the search doesn't return relevant results, let the user know and provide general guidance.  
""".strip()


In [58]:
# Create another version of agent, let's call it faq_agent_v2
agent = Agent(
    name="faq_agent_v2",
    instructions=system_prompt,
    tools=[hybrid_search],
    model='gpt-4o-mini'
)

In [59]:
# Lets ask another question
user_question = "What evaluation metrics are relevant for image data?"

In [60]:
result = await agent.run(user_prompt=user_question)
print(result.output)

For evaluating image data, particularly in the context of classification tasks, several metrics are commonly used to measure the performance of models. Here are some of the most relevant evaluation metrics for image classification:

1. **Accuracy**: The ratio of correctly predicted instances to the total instances. It provides a basic measure of performance.

2. **Precision**: Indicates the accuracy of the positive predictions. It is defined as the number of true positives divided by the sum of true positives and false positives.

3. **Recall (Sensitivity)**: Measures the ability to find all relevant instances, defined as the number of true positives divided by the sum of true positives and false negatives.

4. **F1 Score**: The harmonic mean of precision and recall, useful for dealing with class imbalances.

5. **ROC AUC (Receiver Operating Characteristic - Area Under Curve)**: This metric evaluates the trade-off between the true positive rate and false positive rate across different 

In [61]:
log_interaction(agent=agent, messages=result.new_messages_json())

PosixPath('logs/faq_agent_v2_20251228_030704_0134b1.json')

# Building the evaluation agent

In [63]:
# First we need the evaluation system prompt to build an eval agent with pydantic
# We will need the answer provided by agent, question of the user, log for further analysis
system_eval_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met. 

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do  
- answer_relevant: The response directly addresses the user's question  
- answer_clear: The answer is clear and correct  
- answer_citations: The response includes proper citations or sources when required  
- completeness: The response is complete and covers all key aspects of the request
- factual: The response was not hallucinated and covers actual facts
- tool_call_search: Is the search tool invoked? 

Output true/false for each check and provide a short explanation for your judgment.
""".strip()

In [116]:
# Providing the structure for output
from pydantic import BaseModel

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checks: list[EvaluationCheck]
    summary: str

In [117]:
# Now we will build the agent

In [118]:
evaluation_agent = Agent(
    model="gpt-5-nano",#Using a different model to avoid bias and ensure better results
    name="eval_agent",
    instructions=system_eval_prompt,
    output_type=EvaluationChecklist
)

In [119]:
user_prompt_format = """
<INSTRUCTIONS>{instructions}</INSTRUCTIONS>
<QUESTION>{question}</QUESTION>
<ANSWER>{answer}</ANSWER>
<LOG>{log}</LOG>
""".strip()

In [120]:
# To fill the user prompt for evaluation, we need a function to fetch the log data

In [121]:
def load_log_data(log_file: str):
    try:
        with open(log_file, 'r') as f_in:
            log_data = json.load(f_in)
            log_data['log_file'] = log_file
            return log_data
    except FileNotFoundError:
        print(f"{log_file} is not found in the desired location")
    

In [122]:
log_record_file = "logs/faq_agent_v2_20251228_030704_0134b1.json"
log_data = load_log_data(log_file=log_record_file)

In [123]:
instructions = log_data["system_prompt"][0]
question = log_data['messages'][0]['parts'][0]['content']
answer = log_data['messages'][-1]['parts'][0]['content']
log = json.dumps(log_data['messages'])

In [124]:
user_prompt = user_prompt_format.format(
    instructions=instructions,
    question=question,
    answer=answer,
    log=log)

In [109]:
# Now we will send the user formatted prompt to the eval agent

In [126]:
result = await evaluation_agent.run(user_prompt=user_prompt, output_type=EvaluationChecklist)

In [127]:
checklist = result.output

In [128]:
print(checklist.summary)

The answer provides a comprehensive, accurate list of image classification evaluation metrics with a sourced reference. It demonstrates alignment with the instruction to use search results and cite sources.


In [130]:
checklist

EvaluationChecklist(checks=[EvaluationCheck(check_name='instructions_follow', justification='The answer used information from search results and provided a source citation to a Classification Metrics document, aligning with the instruction to search and cite sources.', check_pass=True), EvaluationCheck(check_name='instructions_avoid', justification='No prohibited content or actions; the answer stays within educational guidance for evaluation metrics.', check_pass=True), EvaluationCheck(check_name='answer_relevant', justification='The response directly lists common image classification metrics (accuracy, precision, recall, F1, ROC AUC, confusion matrix, log loss, PR curve, class representation).', check_pass=True), EvaluationCheck(check_name='answer_clear', justification='Metrics are listed clearly with brief explanations for some of them, making the answer easy to follow.', check_pass=True), EvaluationCheck(check_name='answer_citations', justification='Includes a citation link to the s

In [133]:
for check in checklist.checks:
    print(check)
    print()

check_name='instructions_follow' justification='The answer used information from search results and provided a source citation to a Classification Metrics document, aligning with the instruction to search and cite sources.' check_pass=True

check_name='instructions_avoid' justification='No prohibited content or actions; the answer stays within educational guidance for evaluation metrics.' check_pass=True

check_name='answer_relevant' justification='The response directly lists common image classification metrics (accuracy, precision, recall, F1, ROC AUC, confusion matrix, log loss, PR curve, class representation).' check_pass=True

check_name='answer_clear' justification='Metrics are listed clearly with brief explanations for some of them, making the answer easy to follow.' check_pass=True

check_name='answer_citations' justification='Includes a citation link to the source: [Classification Metrics](https://github.com/evidentlyai/docs/tree/main/docs-main/metrics/explainer_classification.

In [136]:
# We can reduce the tokens by simplifying the logs, and send less unnecessary info

In [135]:
def simplify_log_messages(messages):
    log_simplified = []

    for m in messages:
        parts = []
    
        for original_part in m['parts']:
            part = original_part.copy()
            kind = part['part_kind']
    
            if kind == 'user-prompt':
                del part['timestamp']
            if kind == 'tool-call':
                del part['tool_call_id']
            if kind == 'tool-return':
                del part['tool_call_id']
                del part['metadata']
                del part['timestamp']
                # Replace actual search results with placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'
            if kind == 'text':
                del part['id']
    
            parts.append(part)
    
        message = {
            'kind': m['kind'],
            'parts': parts
        }
    
        log_simplified.append(message)
    return log_simplified

In [137]:
async def evaluate_log_record(eval_agent, log_record):
    messages = log_record['messages']

    instructions = log_data["system_prompt"][0]
    question = messages[0]['parts'][0]['content']
    answer = messages[-1]['parts'][0]['content']

    log_simplified = simplify_log_messages(messages)
    log = json.dumps(log_simplified)

    user_prompt = user_prompt_format.format(
        instructions=instructions,
        question=question,
        answer=answer,
        log=log)

    result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)
    return result.output 

In [139]:
log_record_file = "logs/faq_agent_20251228_025937_ee3694.json"
log_record = load_log_data(log_file=log_record_file)
eval1 =  await evaluate_log_record(eval_agent=evaluation_agent,
                    log_record=log_record)

In [140]:
print(eval1.summary)
print()
for check in eval1.checks:
    print(check)
    print()

The answer provides a solid list of common image classification metrics and is clear and factual, but it lacks citations and misses some important metrics (e.g., top-5 accuracy, per-class metrics for multi-class, and mAP in detection). The user’s instruction to cite source materials was not fulfilled.

check_name='instructions_follow' justification='User asked to use the search tool and cite sources in the answer; the provided answer did not include citations from source files.' check_pass=False

check_name='instructions_avoid' justification='No disallowed content; the response adheres to allowed content.' check_pass=True

check_name='answer_relevant' justification='The answer lists common evaluation metrics relevant to image data and classification tasks.' check_pass=True

check_name='answer_clear' justification="The list is organized with bullet points and explanations for each metric; it's clear." check_pass=True

check_name='answer_citations' justification='No citations were provid

In [167]:
question_generation_prompt = """
You are question generation assistant agent, whose goal is to generate questions associated 
with MLOPs and LLMOPs part of evaluation and monitoring, while leveraging evidently ai. 
These questions should be strictly based on how to and why evaluate
and monitor machine learning models for text, image, and structure data, or how to and why evaluate llms in production, 
so students and practioners of evidently ai, will be leveraging these questions to build monitoring and evaluation pipelines.
Feel free to use common questions about evidently ai asked in stack overflow, reddit or github.

The questions should:

- Be natural and varied in style
- Range from simple to complex
- Include both specific technical questions and general course questions
- There should be no number preceding the question

Maximum number of questions should not be more than 20.
There will be some sample docs that I will be providing with <Example></Example>
"""

In [148]:
FILE_PATH = "vector_search_data.pkl"
loaded_data = None
try:
    with open(FILE_PATH, 'rb') as f:
        loaded_data = pickle.load(f)
        print(f"Data successfully unpickled from {FILE_PATH}")

    # Access the loaded data
    loaded_docs = loaded_data['documents']
except FileNotFoundError:
    print(f"Error: The file {FILE_PATH} was not found.")
except pickle.UnpicklingError as e:
    print(f"An error occurred during unpickling: {e}")

Data successfully unpickled from vector_search_data.pkl


In [153]:
import random

random.seed(42)
sample = random.sample(loaded_docs, 10)

In [168]:
data = [f"<Example>{data}</Example>" for data in sample]
user_prompt = """Based on the data below can you generate maximum of 20 questions
Makes sure list doesnt have any numbering.
<EXAMPLES>{examples}</EXAMPLES>
""".format(examples=user_prompt)

In [169]:
class QuestionsList(BaseModel):
    questions: list[str]

question_generator = Agent(
    name="question_generator",
    instructions=question_generation_prompt,
    model='gpt-4o-mini',
    output_type=QuestionsList
)

In [170]:
result = await question_generator.run(user_prompt, output_type=QuestionsList)
questions = result.output.questions

In [171]:
questions

['Why is it essential to evaluate machine learning models in production environments?',
 'What challenges can arise during the monitoring of large language models (LLMs)?',
 'How can data drift affect the performance of machine learning models?',
 'What are the best practices for setting up an evaluation pipeline for text data models?',
 'In what ways can evaluation metrics differ between image and structured data models?',
 'How does synthetic data contribute to the evaluation of machine learning systems?',
 'What factors should be considered when determining the evaluation metrics for a specific machine learning model?',
 'How can the performance of an LLM be continuously monitored over time?',
 'What is the role of confusion matrices in evaluating machine learning models?',
 'How can visualizations enhance the understanding of model performance during evaluations?',
 'What specific metrics can be used to evaluate the accuracy of image classification models?',
 'Why is regression tes

In [172]:
# Now we will iterate over each question and generate some data

In [174]:
from tqdm.auto import tqdm

for q in tqdm(questions):
    print(q)

    result = await agent.run(user_prompt=q)
    print(result.output)

    log_interaction(
        agent,
        result.new_messages_json(),
        source='ai-generated'
    )

    print()


  0%|          | 0/20 [00:00<?, ?it/s]

Why is it essential to evaluate machine learning models in production environments?
Evaluating machine learning models in production environments is essential for several reasons:

1. **Performance Monitoring**: Continuous monitoring of model performance is crucial to ensure that models maintain their accuracy and effectiveness over time. Models may degrade due to changes in input data patterns, a phenomenon known as data drift. Regular evaluations help identify these issues early [source](https://github.com/evidentlyai/docs/tree/main/docs-main/docs/platform/evals_overview.mdx).

2. **Quality Assurance**: Evaluation processes help in ensuring the quality of predictions. This includes checking for accuracy in classification or regression outputs and assessing the quality of input data, such as the presence of missing values or out-of-range features [source](https://github.com/evidentlyai/docs/tree/main/docs-main/quickstart_ml.mdx).

3. **Regression Testing**: When models are updated or 

In [177]:
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    log_record = load_log_data(log_file=log_file)
    if log_record['source'] != 'ai-generated':
        continue
    eval_set.append(log_record)

In [184]:
eval_results = []
for eval_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent=evaluation_agent,
                                      log_record=eval_record)
    eval_results.append((eval_result, eval_record))

  0%|          | 0/20 [00:00<?, ?it/s]

In [185]:
import pandas as pd

In [198]:
rows = []
for data in eval_results:
    row = {"log_file": data[1]["log_file"],
           "source":  data[1]["source"],
           "model": data[1]["model"]}
    checks = {c.check_name: c.check_pass for c in data[0].checks}
    row.update(checks)
    rows.append(row)

In [199]:
eval_df = pd.DataFrame(rows)

In [200]:
# Check the evaluation results
eval_df.mean(numeric_only=True)

instructions_follow    0.80
instructions_avoid     1.00
answer_relevant        1.00
answer_clear           1.00
answer_citations       0.75
completeness           1.00
factual                1.00
tool_call_search       0.95
dtype: float64

In [201]:
# Overall llm judge evaluation tells us , that the agent is quite reliable