# Clinical Intelligence System: RAG-based Capstone Project

This notebook demonstrates a Retrieval-Augmented Generation (RAG) pipeline for building a Clinical Intelligence System. The system leverages a set of 100 clinical documents as the exclusive source of truth, and uses advanced retrieval and generation techniques to answer clinical questions.

## Data Loading and Preprocessing

This section loads required libraries, sets up authentication, and prepares the environment for data ingestion and processing. It also defines utility functions for loading and preprocessing CSV data.

In [None]:
%run ./capstone_setup

In [None]:
# Import all required libraries and modules for RAG pipeline
import os
import openai
from dotenv import load_dotenv
import warnings
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import CSVLoader
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain_community.retrievers import BM25Retriever
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval import evaluate
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
    GEval,
)
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

warnings.filterwarnings("ignore")

In [None]:
# Authentication and environment variable setup for Azure OpenAI and UHG APIs
import httpx
auth = "https://api.uhg.com/oauth2/token"
client_id = dbutils.secrets.get(scope = "AIML_Training", key = "client_id")
client_secret = dbutils.secrets.get(scope = "AIML_Training", key = "client_secret")
scope = "https://api.uhg.com/.default"
grant_type = "client_credentials"
async with httpx.AsyncClient() as client:
    body = {
        "grant_type": grant_type,
        "scope": scope,
        "client_id": client_id,
        "client_secret": client_secret,
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    resp = await client.post(auth, headers=headers, data=body, timeout=120)
    token = resp.json()["access_token"]


# Pull the Azure OpenAI connection settings from the project's .env file.
load_dotenv("./Data/UAIS_vars.env")

# Each lookup raises KeyError when the variable is missing from the environment.
AZURE_OPENAI_ENDPOINT = os.environ["MODEL_ENDPOINT"]
OPENAI_API_VERSION = os.environ["API_VERSION"]
CHAT_DEPLOYMENT_NAME = os.environ["CHAT_MODEL_NAME"]
PROJECT_ID = os.environ["PROJECT_ID"]
EMBEDDINGS_DEPLOYMENT_NAME = os.environ["EMBEDDINGS_MODEL_NAME"]

# Raw OpenAI SDK client bound to the chat deployment; authenticates with the
# bearer token fetched above and tags every request with the project id header.
chat_client = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=CHAT_DEPLOYMENT_NAME,
    api_version=OPENAI_API_VERSION,
    azure_ad_token=token,
    default_headers={"projectId": PROJECT_ID},
)

# Same configuration, bound to the embeddings deployment instead.
embeddings_client = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=EMBEDDINGS_DEPLOYMENT_NAME,
    api_version=OPENAI_API_VERSION,
    azure_ad_token=token,
    default_headers={"projectId": PROJECT_ID},
)


In [None]:
# Echo the loaded configuration so a wrong or missing .env value is spotted early.
# Note: "Deployment Name" is the embeddings deployment, "Model Name" the chat deployment.
print(
    "\n".join(
        [
            "Embeddings Client Configuration:",
            f"Endpoint: {AZURE_OPENAI_ENDPOINT}",
            f"API Version: {OPENAI_API_VERSION}",
            f"Deployment Name: {EMBEDDINGS_DEPLOYMENT_NAME}",
            f"Model Name: {CHAT_DEPLOYMENT_NAME}",
            f"Project ID: {PROJECT_ID}",
        ]
    )
)

In [None]:
# LangChain chat model on the chat deployment (distinct from the raw `chat_client`);
# it is wrapped for DeepEval further down in the notebook.
chat_model = AzureChatOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=CHAT_DEPLOYMENT_NAME,
    openai_api_version=OPENAI_API_VERSION,
    azure_ad_token=token,
    default_headers={"projectId": PROJECT_ID},
)

In [None]:
from langchain.document_loaders import CSVLoader

def load_csv_with_langchain(csv_path, source_column=None):
    """Load a CSV file into LangChain documents using ``CSVLoader``.

    Args:
        csv_path: Path of the CSV file to load.
        source_column: Optional column name forwarded to ``CSVLoader`` as
            ``source_column``. When falsy, the loader is built without it.

    Returns:
        The documents produced by ``CSVLoader.load()``.
    """
    # Only forward `source_column` when the caller actually supplied one.
    loader_kwargs = {"source_column": source_column} if source_column else {}
    rows = CSVLoader(csv_path, **loader_kwargs).load()

    print(f"Successfully loaded {len(rows)} document rows from the CSV.")
    return rows


In [None]:
# LangChain embeddings model on the embeddings deployment; this is the object the
# Chroma vector stores in this notebook receive as their embedding function.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=EMBEDDINGS_DEPLOYMENT_NAME,
    api_version=OPENAI_API_VERSION,
    azure_ad_token=token,
    default_headers={"projectId": PROJECT_ID},
)

In [None]:
# Set up tiktoken cache and disable ChromaDB telemetry for compliance.
# The cache location is exported as an absolute path through TIKTOKEN_CACHE_DIR.
tiktoken_cache_dir = os.path.abspath("./.setup/tiktoken_cache/")
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir

# Due to UHG policies, we have to disable telemetry to use ChromaDB
# See here for more information: https://docs.trychroma.com/docs/overview/telemetry
os.environ["ANONYMIZED_TELEMETRY"] = "False"

In [None]:
# Define a function to create and store embeddings in a local ChromaDB vector store.

@retry(wait=wait_random_exponential(min=45, max=120), stop=stop_after_attempt(6))
def store_embeddings(persist_directory, docs=None):
    """Return a Chroma vector store for ``persist_directory``, building it if needed.

    The whole call is retried (up to 6 attempts, randomized exponential
    back-off) when it raises -- presumably to ride out embedding-API rate
    limits; confirm against the service's behaviour.

    Args:
        persist_directory: Directory where the Chroma store lives.
        docs: LangChain documents to embed. They are only used when no store
            exists yet; an existing non-empty directory is loaded as-is and
            ``docs`` are NOT added to it.

    Returns:
        The Chroma vector store, or ``None`` when ``docs`` is empty/missing.
    """
    if not docs:
        print('No documents provided')
        return None

    # A non-empty directory is treated as an already-built store.
    if os.path.isdir(persist_directory) and os.listdir(persist_directory):
        print(f"Loading existing vector store from {persist_directory}")
        vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        )
    else:
        print(f"Creating new vector store in {persist_directory}")
        # Embed the full documents (all at once, no batching) rather than bare
        # texts so each entry keeps its metadata -- in particular the "source"
        # field that get_processed_document_name() relies on.
        vector_store = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            persist_directory=persist_directory
        )
        vector_store.persist()
    return vector_store

In [None]:
def get_processed_document_name(persist_directory):
    """Collect the ``source`` metadata values present in a persisted Chroma store.

    Used to avoid re-embedding CSV files that were already processed.

    Args:
        persist_directory: Directory of the Chroma store to inspect.

    Returns:
        A set with the ``"source"`` value of every stored entry whose metadata
        is non-empty and contains that key.
    """
    store = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )

    # `get()` returns the stored records; only their metadata dicts are needed.
    stored_metadatas = store.get()["metadatas"]

    return {
        entry["source"]
        for entry in stored_metadatas
        if entry and "source" in entry
    }

In [None]:
def filter_new_csv(csv_path, persist_directory):
    """Return the CSV paths not yet recorded as a source in the vector store.

    Args:
        csv_path: A single path (``str``) or an iterable of paths.
        persist_directory: Directory of the Chroma store to check against.

    Returns:
        A list of the given paths that are absent from the store's recorded
        sources (order preserved).
    """
    already_embedded = get_processed_document_name(persist_directory)

    # Accept a lone path as well as a collection of paths.
    candidates = [csv_path] if isinstance(csv_path, str) else csv_path
    pending = [candidate for candidate in candidates if candidate not in already_embedded]

    if not pending:
        print("No new CSVs to process.")
    else:
        print(f"Found {len(pending)} new CSVs to process: {pending}")

    return pending

## Retrieval Strategies (Semantic, Thresholding, Hybrid, Reranking)

This section implements the retrieval strategies used to fetch relevant document chunks for a given query: pure semantic retrieval based on vector similarity, and a hybrid strategy that combines semantic search with BM25 keyword matching. Thresholding and reranking are not implemented here, but the code is structured so that they can be added.

In [None]:
# Semantic retrieval: fetch top-k most similar document chunks using vector similarity
# Can be extended to hybrid retrieval by combining with BM25 or other keyword-based methods
def semantic_retrieval(query, vectorstore, top_k=3):
    """Return up to ``top_k`` content-unique documents most similar to ``query``.

    Args:
        query: The search text.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        top_k: Number of distinct documents wanted.

    Returns:
        A list of documents in similarity order, with later documents whose
        ``page_content`` repeats an earlier one dropped.
    """
    # Over-fetch (2x) so duplicates can be discarded without running short.
    candidates = vectorstore.similarity_search(query, k=top_k * 2)

    # Insertion-ordered dict: the first document seen for each content wins.
    first_by_content = {}
    for candidate in candidates:
        first_by_content.setdefault(candidate.page_content, candidate)
        if len(first_by_content) >= top_k:
            break

    return list(first_by_content.values())

In [None]:
# Example: BM25 retrieval (strategy 2, if desired)

def bm25_rag_pipeline(query, vectorstore, top_k=3):
    """Hybrid retrieval: semantic candidates re-scored with BM25 keyword matching.

    The vector store supplies ``top_k * 2`` semantic candidates. BM25 is then
    run over those candidates only (it is a keyword re-ranker here, not an
    independent search over the whole corpus). The final list is assembled as:

    1. the best ``top_k // 2`` semantic hits,
    2. BM25-ranked candidates that are not already selected,
    3. remaining semantic hits, if slots are still free.

    Args:
        query: The search text.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` documents with distinct ``page_content``.
        Empty when ``top_k`` is not positive or nothing was retrieved.
    """
    if top_k <= 0:
        return []

    semantic_results = vectorstore.similarity_search(query, k=top_k * 2)
    if not semantic_results:
        # BM25 cannot be built over an empty corpus.
        return []

    # De-duplicate by content (first = best ranked) and remember the original
    # document so metadata survives the round trip through BM25.
    doc_by_content = {}
    for doc in semantic_results:
        doc_by_content.setdefault(doc.page_content, doc)
    semantic_unique = list(doc_by_content.values())

    # Keyword ranking over the semantic candidates. BM25 needs no embedding model.
    bm25_retriever = BM25Retriever.from_texts(
        texts=list(doc_by_content),
        k=top_k * 2
    )
    keyword_results = bm25_retriever.get_relevant_documents(query)

    # Take half from semantic results
    final_results = semantic_unique[:top_k // 2]
    selected_contents = {doc.page_content for doc in final_results}

    def _fill_from(candidates):
        """Append not-yet-selected candidates until ``top_k`` is reached."""
        for candidate in candidates:
            if len(final_results) >= top_k:
                return
            content = candidate.page_content
            if content in selected_contents:
                continue
            final_results.append(doc_by_content.get(content, candidate))
            selected_contents.add(content)

    # Add keyword results not already chosen, then top up with semantic order.
    _fill_from(keyword_results)
    _fill_from(semantic_unique)
    return final_results

In [None]:
# Generate an answer using the chat model, strictly based on retrieved context.
# The prompt instructs the model to only use the provided context.
# If the answer is not found in the context, a fallback message is returned.
def generate_answer(query, top_chunks, model_name=CHAT_DEPLOYMENT_NAME):
    """Ask the chat deployment to answer ``query`` using only ``top_chunks``.

    Args:
        query: The user's question.
        top_chunks: Retrieved documents; their ``page_content`` values are
            joined with blank lines to form the prompt context.
        model_name: Deployment/model name passed to the completion call.

    Returns:
        The message content of the first completion choice.
    """
    context = "\n\n".join(chunk.page_content for chunk in top_chunks)

    # The instruction pins the model to the context and defines the fallback reply.
    instructions = (
        "Answer (generate an answer strictly based on the above context; do not use your own knowledge. "
        "If the query is not covered in the context, respond with: 'No relevant content found in the provided csv data.'):"
    )
    prompt = f"Context:\n{context}\n\nQuestion: {query}\n{instructions}"

    completion = chat_client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content



In [None]:
import hashlib
import pandas as pd

def get_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The digest is only used as a change-detection fingerprint, not for
    security. The file is hashed in fixed-size chunks so large files are not
    loaded into memory in one piece.

    Args:
        file_path: Path of the file to fingerprint.

    Returns:
        The 32-character hex digest string.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter(callable, sentinel): keep reading until read() returns b"".
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def _read_stored_csv_hash(csv_hash_path):
    """Return the CSV hash recorded at ``csv_hash_path``, or None if the file is absent."""
    if not os.path.exists(csv_hash_path):
        return None
    with open(csv_hash_path, 'r') as f:
        return f.read().strip()


def _write_csv_hash(persist_directory, csv_hash_path, csv_hash):
    """Record ``csv_hash`` at ``csv_hash_path``, creating ``persist_directory`` if needed."""
    os.makedirs(persist_directory, exist_ok=True)
    with open(csv_hash_path, 'w') as f:
        f.write(csv_hash)


def csv_chatbot_pipeline(csv_path, user_query, persist_directory):
    """
    Full pipeline: Load CSV → Embed → Retrieve → Generate Answer.

    Embeddings are cached in ``persist_directory`` next to a ``csv_hash.txt``
    fingerprint of the CSV they were built from:

    * store present and fingerprint matches -> reuse the store;
    * store present but fingerprint differs or is missing -> rebuild the
      collection from the current CSV (the old entries are dropped first so
      stale or duplicate rows cannot be retrieved);
    * no store -> build it via ``store_embeddings``.

    Args:
        csv_path: Path of the knowledge-base CSV.
        user_query: The question to answer.
        persist_directory: Directory of the Chroma store.

    Returns:
        A dictionary with ``'context'`` (list of retrieved page contents),
        ``'question'`` and ``'AI_generated_response'``.

    Raises:
        ValueError: If the CSV yields no documents when a new store is built.
    """
    csv_hash_path = os.path.join(persist_directory, 'csv_hash.txt')
    current_csv_hash = get_file_hash(csv_path)

    # A non-empty directory is treated as an existing store.
    store_exists = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))

    if store_exists and _read_stored_csv_hash(csv_hash_path) == current_csv_hash:
        print(f"Using existing embeddings from {persist_directory}")
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        )
    elif store_exists:
        print(f"CSV file has changed. Updating embeddings in {persist_directory}")
        # Load first: if the CSV cannot be read, the old store stays intact.
        raw_docs = load_csv_with_langchain(csv_path)
        # Drop the outdated collection; appending would leave the previous
        # version's rows (and duplicates of unchanged rows) in the index.
        Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        ).delete_collection()
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        )
        vectorstore.add_documents(raw_docs)
        vectorstore.persist()
        _write_csv_hash(persist_directory, csv_hash_path, current_csv_hash)
    else:
        print(f"No existing embeddings found. Processing CSV...")
        raw_docs = load_csv_with_langchain(csv_path)
        vectorstore = store_embeddings(persist_directory, docs=raw_docs)
        if vectorstore is None:
            # store_embeddings returns None when there is nothing to embed.
            raise ValueError(f"No documents could be loaded from {csv_path}")
        _write_csv_hash(persist_directory, csv_hash_path, current_csv_hash)

    # Retrieve relevant docs with the hybrid semantic + BM25 strategy.
    # (semantic_retrieval(user_query, vectorstore) is the pure-semantic alternative.)
    retrieved = bm25_rag_pipeline(user_query, vectorstore)

    # Generate the answer using retrieved chunks
    answer = generate_answer(user_query, retrieved)

    # Format and return the response
    return {
        'context': [doc.page_content for doc in retrieved],
        'question': user_query,
        'AI_generated_response': answer
    }

In [None]:

def load_questions_from_csv(question_csv_path):
    """Read the ``question`` column of a CSV file into a plain list.

    Args:
        question_csv_path: Path of a CSV file that has a ``question`` column.

    Returns:
        The column's values as a Python list, in file order.
    """
    question_column = pd.read_csv(question_csv_path)['question']
    return question_column.tolist()
 
def batch_generate_answers_from_csv(csv_path, question_csv_path, persist_directory):
    """Answer every question listed in a CSV file.

    Args:
        csv_path: Path of the knowledge-base CSV handed to the pipeline.
        question_csv_path: Path of the CSV holding a ``question`` column.
        persist_directory: Directory of the Chroma store.

    Returns:
        One dict per question, in file order, with the keys ``'question'``
        and ``'AI_generated_response'``.
    """
    return [
        {
            'question': question,
            'AI_generated_response': csv_chatbot_pipeline(
                csv_path, question, persist_directory
            )['AI_generated_response'],
        }
        for question in load_questions_from_csv(question_csv_path)
    ]

## Validation using capstone1_rag_validation.csv

This section validates the RAG pipeline by comparing model-generated answers to human-annotated reference answers from the validation CSV. It uses DeepEval metrics for quantitative evaluation.

In [None]:
# Batch run: answer every question of the validation CSV and collect, per question,
# the retrieved contexts and the generated answer into `result_df`.
# Lift pandas' display limits so the full text of each cell is printed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

csv_path = "./Data/capstone1_rag_dataset.csv"
persist_directory = "./capstone_croma.db/"
# To run on the test questions instead, swap in the path below.
# question_csv_path = "./Data/capstone1_rag_test_questions.csv"
question_csv_path = "./Data/capstone1_rag_validation.csv"
# Load questions from CSV
questions = load_questions_from_csv(question_csv_path)
results = []
for q in questions:
    pipeline_result = csv_chatbot_pipeline(csv_path, q, persist_directory)
    retrieved_docs = pipeline_result['context']  # list of page_content strings built by csv_chatbot_pipeline
    results.append({
        'question': q,
        'retrieved_documents': retrieved_docs,
        'generated_answer': pipeline_result['AI_generated_response']
    })
# `result_df` is reused by the submission and evaluation cells below.
result_df = pd.DataFrame(results)
print(result_df)

In [None]:
# Select only the required columns for submission and write them to CSV
# (index=False keeps the DataFrame index out of the file).
submission_df = result_df[['question', 'retrieved_documents', 'generated_answer']]
submission_df.to_csv("./Data/submission.csv", index=False)

## Test Evaluation using capstone1_rag_test_questions.csv

This section demonstrates how to use DeepEval to test model answers against human (manual) answers from the validation CSV. The same approach can be used for the test set by switching the input CSV.

In [None]:

os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"


class AzureChatModelWrapper(DeepEvalBaseLLM):
    """Adapter that exposes a chat model through the ``DeepEvalBaseLLM`` interface.

    The wrapped object must provide ``invoke`` / ``ainvoke`` returning a
    message with a ``content`` attribute.
    """

    def __init__(self, model):
        # Keep a reference to the wrapped chat model.
        self.model = model

    def load_model(self):
        """Return the wrapped chat model."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Run ``prompt`` synchronously and return the reply text."""
        reply = self.model.invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Run ``prompt`` asynchronously and return the reply text."""
        reply = await self.model.ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Return the fixed display name reported for this model."""
        return "azure-gpt4o-mini"



In [None]:
# Wrap the LangChain chat model so it can be passed as `model=` to the DeepEval metrics below
wrapped_model = AzureChatModelWrapper(chat_model)

In [None]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualPrecisionMetric
from deepeval import evaluate

# Load validation data with manual answers
validation_df = pd.read_csv('./Data/capstone1_rag_validation.csv')

# Align result_df (model output) with validation_df (manual answers) by question.
# Questions without a model answer are skipped; if a question occurs more than
# once in result_df, only its first row is used.
test_cases = []
for idx, row in validation_df.iterrows():
    model_row = result_df[result_df['question'] == row['question']]
    if model_row.empty:
        continue
    model_row = model_row.iloc[0]
    retrieved_documents = model_row['retrieved_documents']
    # retrieval_context is passed as a list; wrap a lone string defensively.
    if isinstance(retrieved_documents, str):
        retrieved_documents = [retrieved_documents]
    test_cases.append(
        LLMTestCase(
            input=row['question'],
            actual_output=model_row['generated_answer'],
            expected_output=row['reference_answer'],
            retrieval_context=retrieved_documents
        )
    )

# Contextual precision, judged by the wrapped Azure chat model; a test case
# counts as successful at a score of 0.6 or above -- TODO confirm threshold semantics.
metric = ContextualPrecisionMetric(
    threshold=0.6,
    model=wrapped_model,
    include_reason=True,
    verbose_mode=True
)

# NOTE(review): this rebinds `results`, which previously held the list of answer
# dicts from the batch-run cell; `result_df` is unaffected.
results = evaluate(test_cases, [metric])

# Print summary
# NOTE(review): Q numbers follow the order of `results.test_results`; presumably
# that matches the order of `test_cases` -- verify before reading them as row numbers.
for i, res in enumerate(results.test_results):
    print(f'Q{i+1}:', res.metrics_data[0].success, '| Score:', res.metrics_data[0].score)
    print('Reason:', res.metrics_data[0].reason)
    print('-'*40)

In [None]:
# Single hand-built hallucination check: compares one generated answer against
# a fixed context snippet.
# NOTE(review): `result_df.iloc[0]` is assumed to be the answer to the Wagner
# syndrome question hard-coded in `input` -- confirm the first validation row
# is that question, otherwise answer and question do not belong together.
test_case = LLMTestCase(
    input='What are the main eye-related symptoms and genetic cause of Wagner syndrome?',
    actual_output=result_df.iloc[0]['generated_answer'],
    context=["described worldwide; about half of these individuals are from the syndrome areas of the eye"]
)

# Rebinds `metric` (previously the contextual-precision metric).
metric = HallucinationMetric(
    threshold=0.6,
    model=wrapped_model,
    include_reason=True,
    verbose_mode=True
)

# `result` (singular) is read by the print cell that follows.
result = evaluate([test_case], [metric])

In [None]:
# Report the outcome of the single hallucination test case (first test result,
# first metric).
print('Success:', result.test_results[0].metrics_data[0].success)
print('Score:', result.test_results[0].metrics_data[0].score)
print('Reason:', result.test_results[0].metrics_data[0].reason)

## Summary and Best Strategy

This notebook implemented a RAG-based Clinical Intelligence System using a set of 100 clinical documents as the exclusive knowledge base. The pipeline includes data loading, embedding, retrieval (semantic vector search combined with BM25 keyword matching), response generation, and evaluation using DeepEval metrics.

**Key Takeaways:**
- Semantic retrieval with vector similarity provides strong baseline performance.
- The system is easily extensible to hybrid or reranking strategies for improved accuracy.
- Evaluation against human-annotated answers enables robust validation of the approach.

For best results, consider experimenting with hybrid retrieval and advanced reranking strategies.