# Clinical Intelligence System

The Clinical Intelligence System interprets clinical questions, retrieves relevant information from trusted medical sources, and generates factually accurate, context-aware responses.

In [0]:
%run ./.setup/learner_setup

In [0]:
# Import necessary libraries

import os
import openai
import pandas as pd
import numpy as np
import warnings
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain_community.retrievers import BM25Retriever
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval import evaluate
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
    GEval,
)

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")


In [0]:
# Authentication with the Azure API using client credentials.

import httpx

auth = "https://api.uhg.com/oauth2/token"
client_id = dbutils.secrets.get(scope = "AIML_Training", key = "client_id")
client_secret = dbutils.secrets.get(scope = "AIML_Training", key = "client_secret")
scope = "https://api.uhg.com/.default"
grant_type = "client_credentials"
async with httpx.AsyncClient() as client:
    body = {
        "grant_type": grant_type,
        "scope": scope,
        "client_id": client_id,
        "client_secret": client_secret,
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    resp = await client.post(auth, headers=headers, data=body, timeout=120)
    token = resp.json()["access_token"]

# Load deployment settings from the env file into the process environment.
load_dotenv("./Data/UAIS_vars.env")
 
# Required settings: a missing variable raises KeyError here, at configuration time.
AZURE_OPENAI_ENDPOINT = os.environ["MODEL_ENDPOINT"]
OPENAI_API_VERSION = os.environ["API_VERSION"]
CHAT_DEPLOYMENT_NAME = os.environ["MODEL_NAME"]
PROJECT_ID = os.environ["PROJECT_ID"]
EMBEDDINGS_DEPLOYMENT_NAME = os.environ["EMBEDDINGS_MODEL_NAME"]

# Raw OpenAI SDK client bound to the chat deployment; authenticates with the
# bearer token and sends the project id as a default header on every request.
chat_client = openai.AzureOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=OPENAI_API_VERSION,
        azure_deployment=CHAT_DEPLOYMENT_NAME,
        azure_ad_token=token,
        default_headers={
            "projectId": PROJECT_ID
        }
    )

# Same configuration, bound to the embeddings deployment.
# NOTE(review): the token is captured once here; presumably it expires, in which
# case this cell must be re-run to refresh the clients — confirm.
embeddings_client = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=OPENAI_API_VERSION,
    azure_deployment=EMBEDDINGS_DEPLOYMENT_NAME,
    azure_ad_token=token,
    default_headers={ 
        "projectId": PROJECT_ID
    }
) 


In [0]:
# LangChain chat model bound to the same endpoint, deployment, API version and
# token as the raw OpenAI chat client; it is the model handed to the DeepEval wrapper.

chat_model = AzureChatOpenAI(
    openai_api_version=OPENAI_API_VERSION,
    azure_deployment=CHAT_DEPLOYMENT_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token=token,
    default_headers={"projectId": PROJECT_ID},
)

In [0]:
# This function sends a prompt to the chat model and retrieves the response.

def get_response(prompt):
    """Send a single-turn user prompt to the chat deployment and return the reply text.

    Args:
        prompt: Text used as the content of the only (user) message.

    Returns:
        The ``message.content`` of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = chat_client.chat.completions.create(
        messages=[user_message],
        model=CHAT_DEPLOYMENT_NAME,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [0]:
# Adapter class that exposes the AzureChatOpenAI chat model through DeepEval's base LLM interface.

class AzureChatModelWrapper(DeepEvalBaseLLM):
    """Thin adapter around an already-constructed chat model for use as a DeepEval judge.

    The wrapped object only needs ``invoke`` / ``ainvoke`` methods whose result
    exposes a ``content`` attribute.
    """

    def __init__(self, model):
        # Keep a reference to the caller-supplied chat model; nothing else is configured here.
        self.model = model

    def load_model(self):
        """Return the wrapped chat model unchanged."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Synchronously run ``prompt`` through the model and return the reply's content."""
        reply = self.model.invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of ``generate``."""
        reply = await self.model.ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Return the fixed label reported for this model."""
        # NOTE(review): hard-coded label, independent of CHAT_DEPLOYMENT_NAME — confirm it matches the deployment.
        return "gpt-4o-mini_2024-07-18"


In [0]:
# Wrap the initialized LangChain chat model so the DeepEval metrics defined
# later in this notebook can use it as their judge model (`model=wrapped_model`).

wrapped_model = AzureChatModelWrapper(chat_model)

## Dataset Loading and Preprocessing 

In [0]:
# Load the knowledge-base CSV; each value of its 'context' column becomes one document.
data = pd.read_csv("./Data/capstone1_rag_dataset.csv")
documents = data['context'].tolist()
 
# Wrap each context string as a LangChain Document (no metadata is attached).
# NOTE(review): assumes every 'context' value is a non-null string — a NaN cell
# would be passed to Document as-is; confirm against the dataset.
docs = [Document(page_content=doc) for doc in documents]
# Displays the whole list as the cell output (can be long for a large corpus).
docs


## Embedding and Vector Store Creation

In this section, we initialize the AzureOpenAIEmbeddings to generate embeddings for the documents. These embeddings are then stored in a local ChromaDB vector store. The process involves checking if a vector store already exists and loading it, or creating a new one if it doesn't.



In [0]:
# Initialize the LangChain embeddings model on the embeddings deployment, using the
# same endpoint, API version, bearer token and projectId header as the chat clients.
# It is the embedding function used by the Chroma vector store.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDINGS_DEPLOYMENT_NAME,
    api_version=OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token=token,
    default_headers={
        "projectId": PROJECT_ID
    }
)

In [0]:
# create and store embeddings in a local ChromaDB vector store.

@retry(wait=wait_random_exponential(min=45, max=120), stop=stop_after_attempt(6))
def _create_vector_store(persist_directory, docs):
    """Embed ``docs`` into a new Chroma store under ``persist_directory`` and persist it.

    Only this build step carries the retry policy (random exponential wait
    between 45 and 120 seconds, at most 6 attempts) — presumably to ride out
    transient embedding-API failures.
    """
    vector_store = Chroma.from_documents(
        docs,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    vector_store.persist()
    return vector_store


def store_embeddings(persist_directory,docs=None):
    """Create or use existing vector store for embeddings.

    Args:
        persist_directory: Directory of the on-disk Chroma store.
        docs: Documents to embed when no store exists yet. May be omitted
            only when ``persist_directory`` already exists.

    Returns:
        The loaded or newly created Chroma vector store.

    Raises:
        ValueError: If no store exists at ``persist_directory`` and ``docs`` is
            empty or missing.
    """
    # Check if vector store already exists (isdir() is False for a missing path).
    if os.path.isdir(persist_directory):
        print(f"Loading existing vector store from {persist_directory}")
        # Load existing vector store
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        )

    # Validate outside the retried helper so a caller mistake fails immediately
    # instead of being retried with long waits.
    if not docs:
        raise ValueError(
            f"No vector store found at {persist_directory!r} and no documents were provided to build one."
        )

    # Create new vector store. The retry wraps only the build: retrying this whole
    # function could take the "load existing" branch above if a failed attempt had
    # already created the directory, returning an incomplete store.
    print(f"Creating new vector store in {persist_directory}")
    return _create_vector_store(persist_directory, docs)


In [0]:
# Build the Chroma store under this relative directory, or load it if the directory
# already exists — in that case `docs` is not re-embedded, so delete the directory
# to force a rebuild after the dataset changes.
persist_directory = "tmp/vector_embeddings_OPENAI"
vectorstore = store_embeddings(docs=docs, persist_directory=persist_directory)


## Retrieval Strategy Exploration

In this section, we explore different retrieval strategies to enhance document retrieval. We define a sample query and implement two retrieval methods: semantic search and hybrid search. 

Semantic search retrieves top-k semantically relevant documents using vector similarity, while hybrid search combines BM25 keyword matching with vector similarity for more diverse retrieval results. These strategies help in efficiently finding relevant documents for a given query.

In [0]:
# Example question used to demo the retrieval and reranking cells below.
sample_query = "Explain about Cohen syndrome"

In [0]:
# to retrieve top_k semantically relevant documents from ChromaDB using vector search.
   
def semantic_search(vectordb, query, top_k=3):
    """Return up to ``top_k`` vector-search hits with distinct ``page_content``.

    Args:
        vectordb: Object exposing ``similarity_search(query, k=...)``.
        query: Query text.
        top_k: Number of distinct documents wanted.

    Returns:
        Documents in the order the store returned them, keeping the first
        occurrence of each distinct text.
    """
    # Ask for twice as many hits so duplicates can be dropped without running short.
    candidates = vectordb.similarity_search(query, k=top_k * 2)

    # Dict keyed by text: insertion order preserves ranking, setdefault keeps the first hit.
    first_hit_by_text = {}
    for candidate in candidates:
        first_hit_by_text.setdefault(candidate.page_content, candidate)
        if len(first_hit_by_text) >= top_k:
            break

    return list(first_hit_by_text.values())

# Execute semantic search with the sample query and display the top 3 distinct hits.
results_semantic_search = semantic_search(vectordb = vectorstore, query=sample_query, top_k =3)
results_semantic_search

In [0]:
# to combine BM25 keyword matching with vector similarity for hybrid retrieval

def hybrid_search(vectordb, query, top_k=3):
    """
    Combines semantic and keyword search results for diverse retrieval.

    The result is assembled in three passes, never repeating a text:
      1. the first ``top_k // 2`` semantic hits;
      2. BM25 keyword hits whose text is not among the semantic hits;
      3. the remaining semantic hits, in rank order, until ``top_k`` is reached.

    Args:
        vectordb: Vector store exposing ``similarity_search(query, k=...)`` and
            ``get()`` (a mapping whose ``"documents"`` entry lists the stored texts).
        query: Query text.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` documents.
    """
    if top_k <= 0:
        return []

    # Get semantic search results
    semantic_results = vectordb.similarity_search(query, k=top_k * 2)
    semantic_contents = {doc.page_content for doc in semantic_results}

    # Get keyword search results over everything stored in the vector DB
    documents = [Document(page_content=doc) if isinstance(doc, str) else doc
                 for doc in vectordb.get()["documents"]]
    if documents:
        bm25_retriever = BM25Retriever.from_documents(documents)
        # The number of BM25 hits is the retriever's `k` setting, not a per-call argument.
        bm25_retriever.k = top_k * 2
        keyword_results = bm25_retriever.get_relevant_documents(query)
    else:
        # An empty store has nothing to index.
        keyword_results = []

    final_results = []
    used_contents = set()

    def _take(doc):
        """Append doc unless its text is already selected."""
        if doc.page_content not in used_contents:
            used_contents.add(doc.page_content)
            final_results.append(doc)

    # Take half from semantic results
    semantic_head = top_k // 2
    for doc in semantic_results[:semantic_head]:
        _take(doc)

    # Add unique keyword results
    for doc in keyword_results:
        if len(final_results) >= top_k:
            break
        if doc.page_content not in semantic_contents:
            _take(doc)

    # Fill remaining spots with the next-best semantic results. Iterating from the
    # end of the head avoids the negative slice start that used to leave the
    # result short whenever BM25 contributed fewer new documents than needed.
    for doc in semantic_results[semantic_head:]:
        if len(final_results) >= top_k:
            break
        _take(doc)

    return final_results

# Execute hybrid search with the sample query and display the top 3 results.
results_hybrid_search = hybrid_search(vectordb = vectorstore, query=sample_query, top_k =3)
results_hybrid_search


## Generation Pipeline Integration
In this section, we integrate various components to form a complete generation pipeline. This involves using the results from the retrieval strategies to rerank document chunks based on their relevance to a query. The top-ranked chunks are then used to generate a context-aware answer using a chat model. This integration ensures that the generated answers are both relevant and accurate, leveraging the strengths of both retrieval and generation techniques.

In [0]:
# GPT rerank function uses GPT to rerank retrieved document chunks based on their relevance to the query.

def rerank(query, retrieved_docs, top_k=3):
    """Ask the chat model to order retrieved chunks by relevance to ``query``.

    Args:
        query: The user question.
        retrieved_docs: Documents (objects with ``page_content``) to rank.
        top_k: Maximum number of chunks to return.

    Returns:
        The chunks the model named, most relevant first, limited to ``top_k``.
        Chunk numbers that are out of range or repeated are ignored; an empty
        list is returned when the model names no chunk (e.g. replies ``[]``)
        or when ``retrieved_docs`` is empty.
    """
    import re  # function-scope import: only the reply parser below needs it

    # Nothing to rank: skip the model call.
    if not retrieved_docs:
        return []

    # Step 1: Prepare the ranking prompt
    prompt = f"""You are an expert assistant helping to rank document chunks based on their relevance to the following question:

    Question: {query}

    Here are the chunks:

    """

    for i, doc in enumerate(retrieved_docs):
        prompt += f"Chunk {i+1}:\n{doc.page_content.strip()}\n\n"

    # Instructions go in once, after all chunks (previously repeated after every chunk).
    prompt += f"""Instructions:
    - Rank the chunks strictly based on their relevance to the question.
    - You may return fewer than {top_k} chunks if fewer are relevant.
    - If none of the chunks are relevant, return an empty list: []
    - Respond only with the chunk numbers in descending order of relevance, like this:
    Chunk 3, Chunk 1, Chunk 5
    Or, if no relevant chunks are found, respond with:
    []
    """

    # Step 2: Call GPT for reranking
    gpt_output = get_response(prompt)
    print("GPT Rerank Output:\n", gpt_output)

    # Step 3: Extract chunk numbers from the output. A regex tolerates trailing
    # punctuation, newlines instead of commas, and surrounding prose.
    chunk_order = []
    seen_indices = set()
    for number in re.findall(r"chunk\s*#?\s*(\d+)", gpt_output or "", flags=re.IGNORECASE):
        idx = int(number) - 1
        # Reject "Chunk 0" (would index from the end) and numbers past the last chunk.
        if 0 <= idx < len(retrieved_docs) and idx not in seen_indices:
            seen_indices.add(idx)
            chunk_order.append(idx)

    # Step 4: Return sorted chunk objects, capped at top_k
    return [retrieved_docs[i] for i in chunk_order[:top_k]]

# Execute reranking with the hybrid search results (rerank also prints the model's raw reply).
results_reranked = rerank(query = sample_query, retrieved_docs = results_hybrid_search)

In [0]:
# to generate an answer using the top-ranked document chunks and the chat model.

def generate_answer(query, top_chunks, model_name=CHAT_DEPLOYMENT_NAME):
    """Answer ``query`` using only the text of ``top_chunks``.

    Args:
        query: The user question.
        top_chunks: Documents (objects with ``page_content``) forming the context.
        model_name: Chat deployment to call; defaults to ``CHAT_DEPLOYMENT_NAME``.

    Returns:
        The model's answer text. When ``top_chunks`` is empty, the fixed refusal
        sentence from the prompt instructions is returned without calling the model.
    """
    # With no context the instructions allow only one answer; return it
    # deterministically rather than relying on the model to follow them.
    if not top_chunks:
        return 'The question cannot be answered using the available documents.'

    context = "\n\n".join([doc.page_content for doc in top_chunks])
    prompt = (
        f"You are an Expert Clinical AI assistant. Your task is to answer questions using **only** the information provided in the context below. Any information not found in the context must be excluded from your answer.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n\n"
        f"Instructions:\n"
        f"- First, identify the relevant facts from the context.\n"
        f"- Then, answer the question using only those facts.\n"
        f"- Do not use any external knowledge.\n"
        f"- Do not make assumptions or inferences beyond the context.\n"
        f"- If the answer is **not explicitly stated or directly inferable from the context**, respond exactly with: 'The question cannot be answered using the available documents.'\n"
        f"- If the question is out of scope or only partially answerable based on the context, clearly state the limitation and do not speculate.\n"
        f"- Before finalizing your answer, verify that every statement is directly supported by the context.\n\n"
        f"Answer:"
    )

    # temperature=0 is requested so the answer varies as little as possible between runs.
    response = chat_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        model=model_name
    )

    gpt_output = response.choices[0].message.content
    return gpt_output

In [0]:
# function to integrate semantic search, reranking, and answer generation into a single pipeline.
def master_function(user_query, vectordb, semantic_search, rerank, generate_answer):
    """Run retrieval, reranking and answer generation for one question.

    The three pipeline stages are passed in as callables, so any retriever,
    reranker or generator with the same keyword interface can be plugged in.

    Args:
        user_query: The question to answer.
        vectordb: Vector store handed to ``semantic_search``.
        semantic_search: Callable ``(query=..., vectordb=...)`` returning documents.
        rerank: Callable ``(query=..., retrieved_docs=...)`` returning the top chunks.
        generate_answer: Callable ``(query=..., top_chunks=...)`` returning the answer.

    Returns:
        A dict with keys ``"question"``, ``"retrieved_documents"`` (the
        ``page_content`` of the reranked chunks) and ``"generated_answer"``.
    """
    # Step 1: Perform semantic search to retrieve documents
    retrieved_docs = semantic_search(query=user_query, vectordb=vectordb)

    # Step 2: Rerank the retrieved documents
    top_chunks = rerank(query=user_query, retrieved_docs=retrieved_docs)
    # Extract page_content from the reranked Document objects
    top_chunks_content = [doc.page_content for doc in top_chunks]

    # Step 3: Generate an answer based on the top-ranked chunks
    answer = generate_answer(query=user_query, top_chunks=top_chunks)

    # Return a dictionary with the question, reranked documents, and generated answer
    return {
        "question": user_query,
        "retrieved_documents": top_chunks_content,
        "generated_answer": answer
    }

## Validating RAG pipeline on evaluation dataset
In this section, we validate the Retrieval-Augmented Generation (RAG) pipeline using an evaluation dataset. The process involves loading the dataset, extracting questions, and using the RAG pipeline to generate answers. The results are then compared against reference answers to assess the performance of the pipeline. This validation helps ensure that the system is generating accurate and contextually relevant responses.

In [0]:
# Load the validation dataset (questions with reference contexts and answers) into a pandas DataFrame.
val_data = pd.read_csv("./Data/capstone1_rag_validation.csv")

In [0]:
# Extract the questions, reference contexts, and reference answers as parallel lists
# (position i in each list belongs to row i of the validation dataset).
questions_list = val_data['question'].tolist()
reference_context_list = val_data['reference_context'].tolist()
answers_list = val_data['reference_answer'].tolist()

In [0]:
# Generate validation results: run the full pipeline once per validation question.
# Each iteration makes at least two chat-model calls (rerank + answer generation).
val_results = []
for question in questions_list:
    row_data = master_function(
        user_query=question, 
        vectordb = vectorstore,
        semantic_search=semantic_search, 
        rerank=rerank, 
        generate_answer=generate_answer
    )
    val_results.append(row_data)
# One row per question, in the same order as questions_list.
val_data_results = pd.DataFrame(val_results)
val_data_results

In [0]:
# precision metric function
def precision_at_k(gpt_response, expected_answer, k=3):
    """Score one pipeline result with DeepEval's ContextualPrecisionMetric.

    Args:
        gpt_response: Mapping with 'question', 'generated_answer' and
            'retrieved_documents' entries.
        expected_answer: Reference text passed as the test case's expected output.
        k: Only the first ``k`` retrieved documents are evaluated.

    Returns:
        The score of the first metric of the first test result.
    """
    case_fields = {
        "input": gpt_response['question'],
        "actual_output": gpt_response['generated_answer'],
        "expected_output": expected_answer,
        "retrieval_context": gpt_response['retrieved_documents'][:k],
    }
    case = LLMTestCase(**case_fields)

    # wrapped_model is the judge LLM; 0.6 is the metric's threshold.
    scorer = ContextualPrecisionMetric(
        model=wrapped_model,
        threshold=0.6,
        include_reason=True,
        verbose_mode=True,
    )

    report = evaluate([case], [scorer])
    first_case_result = report.test_results[0]
    return first_case_result.metrics_data[0].score

In [0]:
# recall metric function
def recall_at_k(gpt_response, expected_answer, k=3):
    """Score one pipeline result with DeepEval's ContextualRecallMetric.

    Args:
        gpt_response: Mapping with 'question', 'generated_answer' and
            'retrieved_documents' entries.
        expected_answer: Reference text passed as the test case's expected output.
        k: Only the first ``k`` retrieved documents are evaluated.

    Returns:
        The score of the first metric of the first test result.
    """
    case_fields = {
        "input": gpt_response['question'],
        "actual_output": gpt_response['generated_answer'],
        "expected_output": expected_answer,
        "retrieval_context": gpt_response['retrieved_documents'][:k],
    }
    case = LLMTestCase(**case_fields)

    # wrapped_model is the judge LLM; 0.6 is the metric's threshold.
    scorer = ContextualRecallMetric(
        model=wrapped_model,
        threshold=0.6,
        include_reason=True,
        verbose_mode=True,
    )

    report = evaluate([case], [scorer])
    first_case_result = report.test_results[0]
    return first_case_result.metrics_data[0].score

In [0]:
# answer relevancy metric function
def answer_relevancy_metric(gpt_response):
    """Score one pipeline result with DeepEval's AnswerRelevancyMetric.

    Args:
        gpt_response: Mapping with 'question' and 'generated_answer' entries.

    Returns:
        The score of the first metric of the first test result.
    """
    case_fields = {
        "input": gpt_response['question'],
        "actual_output": gpt_response['generated_answer'],
    }
    case = LLMTestCase(**case_fields)

    # wrapped_model is the judge LLM; 0.6 is the metric's threshold.
    scorer = AnswerRelevancyMetric(
        model=wrapped_model,
        threshold=0.6,
        include_reason=True,
        verbose_mode=True,
    )

    report = evaluate([case], [scorer])
    first_case_result = report.test_results[0]
    return first_case_result.metrics_data[0].score

In [0]:
# faithfulness metric function
def faithfulness_metric(gpt_response):
    """Score one pipeline result with DeepEval's FaithfulnessMetric.

    Args:
        gpt_response: Mapping with 'question', 'generated_answer' and
            'retrieved_documents' entries; all retrieved documents are used
            as the retrieval context.

    Returns:
        The score of the first metric of the first test result.
    """
    case_fields = {
        "input": gpt_response['question'],
        "actual_output": gpt_response["generated_answer"],
        "retrieval_context": gpt_response['retrieved_documents'],
    }
    case = LLMTestCase(**case_fields)

    # wrapped_model is the judge LLM; 0.6 is the metric's threshold.
    scorer = FaithfulnessMetric(
        model=wrapped_model,
        threshold=0.6,
        include_reason=True,
        verbose_mode=True,
    )

    report = evaluate([case], [scorer])
    first_case_result = report.test_results[0]
    return first_case_result.metrics_data[0].score

In [0]:
# Hallucination metric function
def hallucination_metric(gpt_response, expected_answer):
    """Score one pipeline result with DeepEval's HallucinationMetric.

    Args:
        gpt_response: Mapping with 'question' and 'generated_answer' entries.
        expected_answer: Reference text; wrapped in a one-element list and
            passed as the test case's context.

    Returns:
        The score of the first metric of the first test result.
    """
    case_fields = {
        "input": gpt_response['question'],
        "actual_output": gpt_response['generated_answer'],
        "context": [expected_answer],
    }
    case = LLMTestCase(**case_fields)

    # wrapped_model is the judge LLM; unlike the other metric helpers, verbose output is off.
    scorer = HallucinationMetric(
        model=wrapped_model,
        threshold=0.6,
        include_reason=True,
        verbose_mode=False,
    )

    report = evaluate([case], [scorer])
    first_case_result = report.test_results[0]
    return first_case_result.metrics_data[0].score

In [0]:
# Evaluating metrics: score every validation row with the five DeepEval helpers
# and attach the scores to the results table as new columns.
precision_scores = []
recall_scores = []
answer_relevancy_scores = []
faithfulness_scores = []
hallucination_scores = []

# This binds a second name to the same DataFrame (no copy), so the score columns
# added at the end of this cell also appear on val_data_results.
results_val = val_data_results

for index, row in results_val.iterrows():
    # The index label is used as a list position; this holds because the frame
    # was built with the default 0..n-1 index, in the same order as the lists.
    expected_answer = answers_list[index]
    expected_context = reference_context_list[index]
    gpt_response = row

    # Retriever Evaluation Metrics
    # (the reference *context*, not the reference answer, is passed as the expected output)
    precision = precision_at_k(gpt_response, expected_context, k=3)    
    recall = recall_at_k(gpt_response, expected_context, k=3)

    # Generator Evaluation Metrics:
    answer_relevancy = answer_relevancy_metric(gpt_response)
    faithfulness = faithfulness_metric(gpt_response)
    hallucination = hallucination_metric(gpt_response, expected_answer)

    precision_scores.append(precision)
    recall_scores.append(recall)
    answer_relevancy_scores.append(answer_relevancy)
    faithfulness_scores.append(faithfulness)
    hallucination_scores.append(hallucination)

# One score per row, in row order.
results_val['precision@K'] = precision_scores
results_val['recall@K'] = recall_scores
results_val['answer_relevancy_score'] = answer_relevancy_scores
results_val['faithfulness_score'] = faithfulness_scores
results_val['hallucination_score'] = hallucination_scores
results_val

## Predictions on test dataset
In this section, we focus on generating predictions for the test dataset. The process involves loading the test questions from a CSV file, applying the RAG pipeline to generate answers, and storing the results. This step is crucial for evaluating the model's performance on unseen data and preparing the results for submission.

In [0]:
# Load the test questions from a CSV file into a pandas DataFrame.
test_data = pd.read_csv("./Data/capstone1_rag_test_questions.csv")

In [0]:
# Questions to answer, in file order.
test_questions_list = test_data['question'].tolist()

In [0]:
# Generate test results: run the same pipeline (semantic search -> rerank -> answer)
# used for validation once per test question.
test_results = []
for question in test_questions_list:
    row_data = master_function(
        user_query=question, 
        vectordb = vectorstore,
        semantic_search=semantic_search, 
        rerank=rerank, 
        generate_answer=generate_answer
    )
    test_results.append(row_data)

In [0]:
# Convert test results to a DataFrame (columns: question, retrieved_documents, generated_answer)
final_test_df = pd.DataFrame(test_results)
# Export DataFrame to CSV in the current working directory, without the index column
final_test_df.to_csv('submission.csv',index=False)