#### Importing Necessary Libraries

In [70]:
from langchain_openai import OpenAI, OpenAIEmbeddings,ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pinecone import Pinecone
import numpy as np
import json
import os

#### Acessing Pinecone database

In [71]:
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

## Pinecone Initialization and acessing 
pc = Pinecone(api_key=pinecone_api_key)
index_name="medicalqabot"

print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 382}},
 'total_vector_count': 382}


In [72]:
def initialize_services(index_name,openai_api_key):

    ## Pinecone Initialization and acessing 
    pc = Pinecone(api_key=pinecone_api_key)
    index_name=index_name
    index = pc.Index(index_name)

    ## llm initialization
    llm = ChatOpenAI(
        temperature=0.2,
        model_name="gpt-4",
        openai_api_key=openai_api_key
    )
    
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)


    return index, llm, embeddings

In [73]:
def get_mcq_prompt():
    """Return the chat prompt template for MCQ generation"""
    prompt_template = """
    Based on the following medical context, create a accurate and unique multiple choice question.
    The question should test important clinical concepts and decision-making.
    
    Context: {context}
    
    Consider these aspects when generating the question:
    1. Focus on specific clinical scenarios rather than general knowledge
    2. Include realistic laboratory values, imaging findings, or patient symptoms
    3. Make the scenario factual enough to test clinical decision-making
    4. Include time-sensitive elements or risk factors that affect the decision
    5. Consider including relevant comorbidities or complications
    
    Generate a multiple choice question in valid JSON format with the following structure:
    {{
        "question": "Create a detailed clinical scenario that includes:
                    - Specific patient demographics
                    - Precise symptoms and timeline
                    - Relevant lab values or imaging results
                    - Important comorbidities
                    - Any critical time factors",
        "options": [
            "A) A specific, detailed intervention or treatment",
            "B) An alternative approach with different timing or method",
            "C) A reasonable but suboptimal choice given the specifics",
            "D) A common misconception or clearly incorrect approach"
        ],
        "correct_answer": "Letter of correct answer (A, B, C, or D)",
        "reasoning": "Provide detailed explanation including:
                     1. Why the correct answer is optimal for this specific case
                     2. Why each incorrect option is inappropriate
                     3. Key clinical factors that influenced the decision
                     4. Any relevant guidelines or evidence supporting the choice"
    }}
    
    Requirements for uniqueness:
    1. Avoid basic or commonly tested scenarios
    2. Include unique combinations of symptoms or findings
    3. Make sure each option is distinct and specific
    4. Focus on nuanced clinical decision-making
    5. Include recent medical guidelines when relevant
    
    Return only the JSON object with no additional text or formatting.
    """
    return ChatPromptTemplate.from_template(prompt_template)


In [74]:
def retrieve_contexts(index, num_contexts=50):
    """Retrieve diverse contexts from Pinecone"""
    vectors = index.query(
        vector=[0.0] * 1536,  # dummy vector to get all records
        top_k=num_contexts,
        include_metadata=True
    )
    return [match.metadata.get('text', '') for match in vectors.matches]

def cosine_similarity(v1, v2):
    """Calculate cosine similarity between two vectors"""
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def calculate_relevance(embeddings, context, question):
    """Calculate semantic similarity between context and question"""
    context_embedding = embeddings.embed_query(context)
    question_embedding = embeddings.embed_query(question)
    return float(cosine_similarity(context_embedding, question_embedding))

In [75]:
def generate_questions(index, chat_llm, embeddings, num_questions=50):
    """Generate MCQs from medical contexts"""
    contexts = retrieve_contexts(index, num_questions)
    questions = []
    prompt = get_mcq_prompt()
    
    for i, context in enumerate(contexts):
        try:
            # Generate MCQ using ChatLLM
            response = chat_llm.invoke(prompt.format_messages(context=context))
            # Extract the content from AIMessage
            mcq_str = response.content
            # Parse the JSON string
            mcq = json.loads(mcq_str)
            
            # Add metadata for retrieval testing
            mcq['context_id'] = i
            mcq['relevance_score'] = calculate_relevance(embeddings, context, mcq['question'])
            
            questions.append(mcq)
            print(f"Successfully generated question {i+1}/{num_questions}")
            
        except Exception as e:
            print(f"Error generating question {i+1}: {str(e)}")
            continue
            
    return questions

In [76]:
def save_questions(questions, filename="medical_mcqs.json"):
    """Save generated questions to a JSON file"""
    with open(filename, 'w') as f:
        json.dump(questions, f, indent=4)

In [77]:
def main():
    """Main function to run the MCQ generation and testing"""

    # Set up credentials
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    index_name = "medicalqabot" 
    
    # Initialize services
    index, llm, embeddings = initialize_services(
        index_name,
        openai_api_key
    )
    
    # Generate questions
    questions = generate_questions(index, llm, embeddings, num_questions=50)
    
    # Save questions
    save_questions(questions)
    

In [78]:
if __name__ == "__main__":
    questions = main()

Error generating question 1: Invalid control character at: line 10 column 271 (char 1146)
Successfully generated question 2/50
Successfully generated question 3/50
Successfully generated question 4/50
Successfully generated question 5/50
Successfully generated question 6/50
Successfully generated question 7/50
Successfully generated question 8/50
Successfully generated question 9/50
Successfully generated question 10/50
Successfully generated question 11/50
Successfully generated question 12/50
Successfully generated question 13/50
Successfully generated question 14/50
Successfully generated question 15/50
Successfully generated question 16/50
Successfully generated question 17/50
Successfully generated question 18/50
Successfully generated question 19/50
Successfully generated question 20/50
Successfully generated question 21/50
Successfully generated question 22/50
Successfully generated question 23/50
Successfully generated question 24/50
Successfully generated question 25/50
Succes