In [None]:
# Install the notebook's third-party dependencies into the kernel's environment:
# the Qdrant client, LlamaIndex with its Qdrant/OpenAI integrations, and Hugging Face `datasets`.
# NOTE(review): versions are unpinned, so a re-run may pull newer, incompatible releases.
%pip install qdrant_client llama-index llama-index-vector-stores-qdrant llama-index-llms-openai llama-index-agent-openai
%pip install datasets

In [5]:
import nest_asyncio

# Later cells call `asyncio.run(main())` directly from the notebook; presumably this patch is
# what allows that inside the kernel's already-running event loop — TODO confirm.
nest_asyncio.apply()

In [7]:
import os
from llama_index.core.vector_stores.types import VectorStore
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient


def get_vector_store() -> VectorStore:
    """Build a Qdrant-backed vector store for the ``document`` collection.

    An async and a sync Qdrant client are created against the same cloud
    endpoint, both authenticated with the ``QDRANT_API_KEY`` environment
    variable, and handed to a single ``QdrantVectorStore``.

    Returns:
        A ``QdrantVectorStore`` bound to the ``document`` collection.
    """
    COLLECTION_NAME = "document"

    # Both clients talk to the same cluster, so the settings are described once.
    endpoint = "https://6a4f58ad-84bd-4868-b9ec-48fdb5c59ef3.europe-west3-0.gcp.cloud.qdrant.io:6333"
    connection = {"url": endpoint, "api_key": os.getenv('QDRANT_API_KEY')}

    async_client = AsyncQdrantClient(**connection)
    sync_client = QdrantClient(**connection)

    store = QdrantVectorStore(
        client=sync_client,
        aclient=async_client,
        collection_name=COLLECTION_NAME,
    )
    return store

# Notebook-wide vector store instance; the chat and evaluation cells below read this global.
qdrant_store = get_vector_store()



In [11]:
from qdrant_client import QdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse

def init_qdrant(collection_name: str = "document", vector_size: int = 1536):
    """Drop and recreate a Qdrant collection, returning the client used.

    WARNING: this is destructive — any points stored in ``collection_name``
    are lost.

    Args:
        collection_name: Collection to reset. Defaults to ``"document"``.
        vector_size: Dimensionality of the vectors the new collection stores.
            Defaults to 1536 (must match the embedding model in use).

    Returns:
        The ``QdrantClient`` connected to the cluster.

    Raises:
        UnexpectedResponse: If deleting the collection fails with any HTTP
            status other than 404 (collection not found).
    """
    client = QdrantClient(
        url="https://6a4f58ad-84bd-4868-b9ec-48fdb5c59ef3.europe-west3-0.gcp.cloud.qdrant.io:6333",
        # Read the key from the environment, like get_vector_store() does; the
        # previous `userdata.get(...)` relied on a name this notebook never imports.
        api_key=os.getenv('QDRANT_API_KEY'),
    )

    # Delete the collection if it exists. A missing collection is not an error here.
    try:
        deleted = client.delete_collection(collection_name=collection_name)
    except UnexpectedResponse as e:
        # Compare the HTTP status itself: matching "404" inside the message text
        # would also swallow unrelated errors that merely contain those digits.
        if e.status_code != 404:
            raise
        deleted = False

    # Only claim a deletion when the client reports that one happened.
    if deleted:
        print(f"Existing collection '{collection_name}' deleted.")
    else:
        print(f"Collection '{collection_name}' does not exist. Proceeding to create it.")

    # Create a fresh, empty collection using cosine distance.
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size, distance=models.Distance.COSINE
        ),
    )
    print(f"New collection '{collection_name}' created.")

    return client

# WARNING: destructive — every run of this cell drops and recreates the 'document' collection.
# The returned client is the cell's last expression, so its repr is shown as the output.
init_qdrant()

Existing collection 'document' deleted.
New collection 'document' created.


<qdrant_client.qdrant_client.QdrantClient at 0x7c7633bc31d0>

In [8]:
import os
from datetime import datetime
from typing import List, Optional
from llama_index.core.agent import AgentRunner
from llama_index.core.callbacks import CallbackManager
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import BaseTool, QueryEngineTool
from llama_index.core.chat_engine.types import ChatMessage

# Initialize basic components
def get_chat_engine(vector_store=None):
    """Create an agent-based chat engine, optionally equipped with a retrieval tool.

    Args:
        vector_store: Optional vector store. When given, an index is built on
            top of it and its query engine is exposed to the agent as a tool.

    Returns:
        An ``AgentRunner`` using ``Settings.llm`` and the system prompt from the
        ``SYSTEM_PROMPT`` environment variable (falling back to a generic one).
    """
    system_prompt = os.getenv("SYSTEM_PROMPT", "You are a helpful assistant.")
    agent_tools: List[BaseTool] = []

    # Without a vector store the agent simply runs with no tools.
    if vector_store is not None:
        retrieval_engine = VectorStoreIndex.from_vector_store(
            vector_store=vector_store
        ).as_query_engine()
        agent_tools.append(QueryEngineTool.from_defaults(query_engine=retrieval_engine))

    agent = AgentRunner.from_llm(
        llm=Settings.llm,
        tools=agent_tools,
        system_prompt=system_prompt,
        verbose=True,
    )
    return agent

# Function to simulate a chat interaction
async def chat_interaction(message: str, chat_history: List[ChatMessage] = None):
    """Send one message to a freshly built chat engine and return the reply text.

    Args:
        message: The user message to send.
        chat_history: Earlier conversation turns; treated as an empty
            conversation when omitted.

    Returns:
        The ``response`` attribute of the engine's chat response.
    """
    prior_turns = [] if chat_history is None else chat_history

    # A new engine is built on every call, backed by the notebook-wide `qdrant_store`.
    agent = get_chat_engine(qdrant_store)
    reply = await agent.achat(message, prior_turns)
    return reply.response

In [12]:
async def main():
    """Smoke-test the chat pipeline: one stand-alone message, then one with prior history."""
    # 1) A single message with no earlier conversation.
    greeting_reply = await chat_interaction("Hello, how are you?")
    print("Response:", greeting_reply)

    # 2) A follow-up question sent together with two earlier turns.
    prior_turns = [
        ChatMessage(role="user", content="What is AI?"),
        ChatMessage(role="assistant", content="AI is artificial intelligence..."),
    ]
    follow_up_reply = await chat_interaction("Tell me more about machine learning", prior_turns)
    print("Response with history:", follow_up_reply)

# Run the async smoke test defined above to completion.
# NOTE(review): calling `asyncio.run` from a notebook cell presumably depends on the
# `nest_asyncio.apply()` cell having been executed first — confirm.
import asyncio
asyncio.run(main())

Added user message to memory: Hello, how are you?
Response: Hello! I'm here and ready to assist you. How can I help you today?
Added user message to memory: Tell me more about machine learning
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Machine learning"}
Got output: Empty Response

Response with history: Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and models that allow computers to learn from and make predictions or decisions based on data. It involves the use of statistical techniques to enable machines to improve their performance on a specific task without being explicitly programmed. Machine learning is widely used in various applications such as image recognition, natural language processing, and recommendation systems.


In [19]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Global LlamaIndex LLM; `get_chat_engine` reads it through `Settings.llm`.
# NOTE(review): this cell is positioned after the first chat test cell, so on a fresh
# Restart & Run All that test runs before this LLM is configured — consider moving it up.
Settings.llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
    max_tokens=512,
)

# Global embedding model.
# NOTE(review): `init_qdrant` creates the collection with size=1536; presumably that is this
# model's output dimension — confirm before switching models.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)


In [35]:
import os
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from datasets import load_dataset
from llama_index.core import VectorStoreIndex
from llama_index.core.chat_engine.types import ChatMessage
from llama_index.core.evaluation import (
    QueryResponseEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    ContextRelevancyEvaluator,
    CorrectnessEvaluator,
)
from llama_index.core.schema import Document
from tqdm import tqdm

class MedicalChatbotEvaluator:
    """Score a chat engine's answers on a medical QA dataset with LLM-based evaluators.

    For every question the chat engine is queried and the answer is judged on:

    * ``response_quality`` — correctness against the ground-truth answer
      (``CorrectnessEvaluator``; NOTE: scored on a 1–5 scale, unlike the others),
    * ``relevancy`` — whether the answer fits the query and retrieved contexts,
    * ``faithfulness`` — whether the answer is supported by the contexts,
    * ``context_relevancy`` — whether the contexts are relevant to the query.

    Questions that fail with an exception are recorded with ``response=None``
    and all scores set to ``0.0``; those rows therefore lower the averages.
    """

    # Per-question score columns that ``calculate_metrics`` aggregates.
    SCORE_COLUMNS = ('response_quality', 'relevancy', 'faithfulness', 'context_relevancy')

    def __init__(self, chat_engine, vector_store=None, fallback_top_k: int = 3):
        """
        Args:
            chat_engine: Object exposing ``async achat(message, chat_history)``.
            vector_store: Optional LlamaIndex vector store used to retrieve
                contexts when the chat response carries no source nodes.
            fallback_top_k: Number of nodes to retrieve in that fallback.
        """
        self.chat_engine = chat_engine
        self.vector_store = vector_store
        self.fallback_top_k = fallback_top_k
        # Built lazily on first use so no index is created unless it is needed.
        self._fallback_retriever = None

        # Initialize evaluators. The ground truth is compared with a
        # CorrectnessEvaluator: the legacy QueryResponseEvaluator is a relevancy
        # check that requires `contexts` and never looks at a reference answer.
        self.response_evaluator = CorrectnessEvaluator()
        self.relevancy_evaluator = RelevancyEvaluator()
        self.faithfulness_evaluator = FaithfulnessEvaluator()
        self.context_relevancy_evaluator = ContextRelevancyEvaluator()

    def _get_fallback_retriever(self):
        """Return (and cache) a retriever over ``self.vector_store``."""
        if self._fallback_retriever is None:
            index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store)
            self._fallback_retriever = index.as_retriever(
                similarity_top_k=self.fallback_top_k
            )
        return self._fallback_retriever

    async def _collect_contexts(self, question: str, response) -> List[str]:
        """Return the context texts behind ``response``, retrieving them if it has none."""
        # Test for a non-empty list: the attribute may exist but be empty when
        # the agent answered without calling the retrieval tool.
        nodes = getattr(response, 'source_nodes', None) or []
        if not nodes and self.vector_store is not None:
            nodes = await self._get_fallback_retriever().aretrieve(question)
        return [str(hit.node.get_content()) for hit in nodes]

    async def evaluate_single_response(self,
                                     question: str,
                                     ground_truth: str,
                                     chat_history: Optional[List["ChatMessage"]] = None) -> Dict:
        """Evaluate a single question-answer pair.

        Args:
            question: Question sent to the chat engine.
            ground_truth: Reference answer used for the correctness score.
            chat_history: Earlier turns to seed the conversation with. When
                omitted an empty history is sent, so questions evaluated one
                after another are not meant to see each other.

        Returns:
            A dict with the question, response, ground truth, the four scores
            and ``sources_used`` (number of context strings). On any exception
            the error is printed and a row with ``response=None`` and zeroed
            scores is returned instead of raising.
        """
        try:
            # Same default as chat_interaction(): an explicit empty history
            # rather than None.
            history = [] if chat_history is None else chat_history

            # Get response from chatbot
            response = await self.chat_engine.achat(question, history)
            answer = response.response

            contexts = await self._collect_contexts(question, response)

            # The context-based evaluators need at least one context; as a last
            # resort the answer itself is used (which makes those scores trivial).
            if not contexts:
                contexts = [answer]

            # Run evaluations
            response_quality = await self.response_evaluator.aevaluate(
                query=question,
                response=answer,
                reference=ground_truth
            )

            relevancy_score = await self.relevancy_evaluator.aevaluate(
                query=question,
                response=answer,
                contexts=contexts
            )

            faithfulness_score = await self.faithfulness_evaluator.aevaluate(
                query=question,
                response=answer,
                contexts=contexts
            )

            context_relevancy = await self.context_relevancy_evaluator.aevaluate(
                query=question,
                contexts=contexts
            )

            return {
                'question': question,
                'response': answer,
                'ground_truth': ground_truth,
                'response_quality': response_quality.score,
                'relevancy': relevancy_score.score,
                'faithfulness': faithfulness_score.score,
                'context_relevancy': context_relevancy.score,
                'sources_used': len(contexts)
            }
        except Exception as e:
            # Best effort: one failing question must not abort a long evaluation run.
            print(f"Error evaluating question: {question}")
            print(f"Error: {str(e)}")
            return {
                'question': question,
                'response': None,
                'ground_truth': ground_truth,
                'response_quality': 0.0,
                'relevancy': 0.0,
                'faithfulness': 0.0,
                'context_relevancy': 0.0,
                'sources_used': 0
            }

    async def evaluate_dataset(self,
                               dataset_name: str = "bigbio/med_qa",
                               split: str = "test",
                               config_name: str = "med_qa_en_source",
                               max_samples: Optional[int] = None):
        """Evaluate using a HuggingFace dataset.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            split: Dataset split to evaluate.
            config_name: Dataset configuration name passed to ``load_dataset``.
            max_samples: If given, only the first ``max_samples`` items are
                evaluated (each item triggers several LLM calls).

        Returns:
            A DataFrame with one row per evaluated item, or an empty DataFrame
            if the dataset could not be loaded.
        """
        try:
            # NOTE(security): trust_remote_code=True runs the dataset's own loading
            # script; only use dataset names you trust.
            dataset = load_dataset(dataset_name, config_name, split=split, trust_remote_code=True)
            if max_samples is not None:
                dataset = dataset.select(range(min(max_samples, len(dataset))))
        except Exception as e:
            print(f"Error loading dataset: {str(e)}")
            return pd.DataFrame()

        results = []
        for item in tqdm(dataset):
            try:
                # Adapt this according to your dataset structure
                question = item['question']
                ground_truth = item['answer']

                results.append(await self.evaluate_single_response(question, ground_truth))
            except Exception as e:
                # Typically a missing key in `item`; skip it and keep going.
                print(f"Error processing item: {item}")
                print(f"Error: {str(e)}")
                continue

        return pd.DataFrame(results)

    def calculate_metrics(self, results_df: pd.DataFrame) -> Dict:
        """Calculate aggregate metrics.

        Args:
            results_df: Frame produced by ``evaluate_dataset``.

        Returns:
            A dict with ``avg_<score>`` for every score column, then
            ``avg_sources_used``, then ``std_<score>`` for every score column.
            All values are ``0.0`` for an empty frame. Missing (``None``)
            scores are ignored by the mean and standard deviation.
        """
        columns = self.SCORE_COLUMNS

        if results_df.empty:
            metrics = {f'avg_{name}': 0.0 for name in columns}
            metrics['avg_sources_used'] = 0.0
            metrics.update({f'std_{name}': 0.0 for name in columns})
            return metrics

        # An evaluator may return score=None; coerce so such cells become NaN
        # instead of turning the whole column into a non-numeric one.
        scores = {name: pd.to_numeric(results_df[name], errors='coerce') for name in columns}

        metrics = {f'avg_{name}': scores[name].mean() for name in columns}
        metrics['avg_sources_used'] = results_df['sources_used'].mean()
        metrics.update({f'std_{name}': scores[name].std() for name in columns})
        return metrics

In [None]:
import matplotlib.pyplot as plt

async def main():
    """Run the full evaluation, print aggregate metrics and save the results.

    Side effects: writes ``evaluation_results.csv`` (per-question rows) and,
    when there is at least one result, ``evaluation_metrics.png`` (box plot of
    the score distributions) to the current working directory.
    """
    chat_engine = get_chat_engine(qdrant_store)
    evaluator = MedicalChatbotEvaluator(chat_engine, qdrant_store)
    results_df = await evaluator.evaluate_dataset()

    metrics = evaluator.calculate_metrics(results_df)

    print("\nOverall Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.3f}")

    results_df.to_csv("evaluation_results.csv", index=False)

    # evaluate_dataset() returns an empty, column-less frame when loading fails;
    # indexing the score columns below would then raise KeyError.
    if results_df.empty:
        print("No evaluation results to plot.")
        return

    metrics_to_plot = ['response_quality', 'relevancy', 'faithfulness', 'context_relevancy']
    # Drop missing scores per column: a NaN in the data leaves that box blank.
    score_series = [
        pd.to_numeric(results_df[metric], errors="coerce").dropna()
        for metric in metrics_to_plot
    ]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.boxplot(score_series, labels=metrics_to_plot)
    ax.set_title("Distribution of Evaluation Metrics")
    ax.set_ylabel("Score")
    fig.savefig("evaluation_metrics.png")
    plt.close(fig)

# Run the full evaluation to completion.
# NOTE(review): the `main` defined in this cell shadows the smoke-test `main` from the earlier cell.
import asyncio
asyncio.run(main())