#### Importing Libraries

In [7]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain.chains import RetrievalQA
import os
import json
from datetime import datetime

#### Intializing and Accessing Pinecone Database and OpenAI

In [8]:
def initialize_components(openai_api_key, pinecone_api_key, index_name):
    
    """Initialize and return core components."""
    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index(index_name)

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

    vectorstore = PineconeVectorStore(index=index, embedding=embeddings, text_key="text")

    llm = ChatOpenAI(temperature=0.1, model="gpt-4", openai_api_key=openai_api_key)
    return index, vectorstore, llm

#### Single-step Retrieval

In [9]:
def single_step_retrieval(vectorstore, question, k=4):
    """Simple semantic search using embeddings."""
    results = vectorstore.similarity_search_with_score(question, k=k)
    return [(doc.page_content, score) for doc, score in results]

#### Multi stage Retrieval

In [10]:
def multi_step_retrieval(vectorstore, llm, question, k=4):
    """Multi-step retrieval using query decomposition."""
    decomposition_prompt = """
    Break down this medical question into 2-3 key sub-queries that would help gather comprehensive information:
    Question: {question}
    Return only the sub-queries, one per line.
    """
    messages = [{"role": "user", "content": decomposition_prompt.format(question=question)}]
    sub_queries = llm.invoke(messages).content.strip().split('\n')

    all_results = []
    for sub_query in sub_queries:
        results = vectorstore.similarity_search_with_score(sub_query, k=2)
        all_results.extend([(doc.page_content, score, sub_query) for doc, score in results])

    all_results.sort(key=lambda x: x[1], reverse=True)
    return [(content, score) for content, score, _ in all_results[:k]]

#### Concept Based Retrieval

In [11]:
def concept_based_retrieval(vectorstore, llm, question, k=4):
    """Concept-based retrieval using medical concept extraction."""
    concept_prompt = """
    Extract key medical concepts from this question and reformulate as a search query:
    Question: {question}
    Format: List the concepts and combine them into a search query.
    """
    messages = [{"role": "user", "content": concept_prompt.format(question=question)}]
    concept_query = llm.invoke(messages).content

    results = vectorstore.similarity_search_with_score(concept_query, k=k)
    return [(doc.page_content, score) for doc, score in results]

#### Hybrid Retrieval

In [12]:
def hybrid_retrieval(vectorstore, llm, question, k=4):
    """Combine semantic and concept-based retrieval."""
    semantic_results = single_step_retrieval(vectorstore, question, k=k//2)
    concept_results = concept_based_retrieval(vectorstore, llm, question, k=k//2)

    all_results = semantic_results + concept_results
    seen_contents = set()
    unique_results = []

    for content, score in all_results:
        if content not in seen_contents:
            seen_contents.add(content)
            unique_results.append((content, score))

    return sorted(unique_results, key=lambda x: x[1], reverse=True)[:k]


#### Experimenting Each Retrieval Strategies and saving the result

In [13]:
def run_experiment(openai_api_key, pinecone_api_key, index_name, question, options):
    """Run and compare all retrieval strategies."""
    index, vectorstore, llm = initialize_components(openai_api_key, pinecone_api_key, index_name)

    experiments = {
        "single_step": {
            "results": single_step_retrieval(vectorstore, question),
            "method": "Simple semantic search"
        },
        "multi_step": {
            "results": multi_step_retrieval(vectorstore, llm, question),
            "method": "Query decomposition and multi-step retrieval"
        },
        "concept_based": {
            "results": concept_based_retrieval(vectorstore, llm, question),
            "method": "Medical concept extraction and search"
        },
        "hybrid": {
            "results": hybrid_retrieval(vectorstore, llm, question),
            "method": "Combined semantic and concept-based approach"
        }
    }

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"retrieval_experiments_{timestamp}.json"

    results = {
        "question": question,
        "options": options,
        "experiments": experiments
    }

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {filename}")
    return results

#### Main Function

In [14]:
def main():
    """Main function to run the retrieval experiment."""
    sample_mcq = {
        "question": "A 32-year-old male patient with a history of substance abuse disorder (heroin) has presented with severe toothache for the last three days. His medical history reveals he is in active treatment for substance use disorder and he has been maintaining a drug-free lifestyle for the past six months. He's currently having a severe dental abscess in the lower right second molar confirmed by dental radiography. How would you manage his condition?",
        "options": [
            "A) Prescribe opioids for pain relief",
            "B) Consult with the patient's addiction specialist before prescribing opioids for pain management",
            "C) Prescribe non-opioid analgesics and refer the patient to a dental surgeon for abscess management",
            "D) Ignore the patient's request for pain relief due to his substance abuse history"
        ]
    }

    results = run_experiment(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        pinecone_api_key=os.environ.get("PINECONE_API_KEY"),
        index_name="medicalqabot",
        question=sample_mcq["question"],
        options=sample_mcq["options"]
    )

    print("\nExperiment Summary:")
    for method, data in results["experiments"].items():
        print(f"\n{method.replace('_', ' ').title()}:")
        print(f"Number of documents retrieved: {len(data['results'])}")
        print(f"Top document similarity score: {data['results'][0][1] if data['results'] else 'N/A'}")



In [15]:

if __name__ == "__main__":
    main()

  embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
  llm = ChatOpenAI(temperature=0.1, model="gpt-4", openai_api_key=openai_api_key)



Results saved to: retrieval_experiments_20250112_224045.json

Experiment Summary:

Single Step:
Number of documents retrieved: 4
Top document similarity score: 0.8422038

Multi Step:
Number of documents retrieved: 4
Top document similarity score: 0.85318786

Concept Based:
Number of documents retrieved: 4
Top document similarity score: 0.83429736

Hybrid:
Number of documents retrieved: 3
Top document similarity score: 0.8422038
