<a href="https://colab.research.google.com/github/Mankind124/RagbasedMCQ/blob/main/MCQ_RAG_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import pandas as pd
import traceback
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SequentialChain
from langchain.callbacks import get_openai_callback
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
import pinecone

# Credentials: load variables from a .env file (if present), then read the
# three keys this script needs from the process environment.
load_dotenv()
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENV = (
    os.getenv(var_name)
    for var_name in ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_ENV")
)

# Chat model shared by the quiz-generation and quiz-review chains.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0.5,
)

# Configure the Pinecone client and name the index to write into.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV,
)
index_name = "your_index_name"  # Replace with your Pinecone index name

# Embedding model, constructed with no arguments (library defaults apply).
embeddings = HuggingFaceEmbeddings()

# Source material: read data.txt and cut it into chunks
# (chunk_size=1000, chunk_overlap=0).
loader = TextLoader("data.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks into the named Pinecone index; `docsearch` is the handle
# used later for similarity search.
docsearch = Pinecone.from_documents(
    texts,
    embeddings,
    index_name=index_name,
)

# Skeleton of the JSON answer the model is asked to imitate. Only entry "1"
# is spelled out; the prompt tells the model to use it as a formatting guide.
RESPONSE_JSON = {
    "1": dict(
        mcq="multiple choice question",
        # Four answer slots keyed "a".."d", each carrying the same placeholder.
        options={letter: "choice here" for letter in "abcd"},
        correct="correct answer",
        explanation=(
            "Explanation for why the correct answer is correct "
            "and why the other options are incorrect."
        ),
    ),
    # ... (repeat for 2 and 3)
}

# Prompt for the quiz-generation step.
# Placeholders filled at run time: {context} (the text the questions must
# conform to), {number} (question count, referenced twice), {subject},
# {tone} and {response_json} (the JSON skeleton the model should imitate).
# NOTE(review): the scenario skeleton mixes "he/she" with "She is [BEHAVIOR]"
# - confirm whether the pronoun inconsistency is intended.
# NOTE(review): the prompt asks for output "like RESPONSE_JSON" but never says
# to return JSON only, while the quiz text is later parsed as JSON - confirm
# the model reliably omits surrounding prose.
TEMPLATE = """
Text: {context}
You are an expert MCQ maker specialized in
generating unique, scenario-based multiple-choice questions (MCQs)
for medical exams such as USMLE, NBME, PANRE-LA, PANCE, and MOCAPEDS. Your job is to create a quiz of
{number} multiple choice questions for {subject} students in {tone} tone, ensuring relevance,
challenge, and alignment with current medical standards and knowledge. Make sure the questions
are not repeated and check all the questions to be conforming to the text as well. Make sure to format
your response like RESPONSE_JSON below and use it as a guide. Ensure to make {number} MCQs.

### GUIDELINES
- Generate clinically relevant scenarios.
- Ensure that each question has one correct answer and three plausible distractors.
- Use the latest medical guidelines and research from PubMed and UpToDate.
- Clearly explain the correct answer and why the other options are incorrect.
- Maintain an educational and professional tone.

### CONSTRAINTS
- Do not generate patient-identifiable information.
- Avoid overly complex medical jargon unless necessary for the question.
- Ensure all information is accurate and up-to-date.

### CLARIFICATION
- Clarify medical terminology when necessary to ensure understanding.
- Fill in any missing details in the scenarios to make them comprehensive.

### TEMPLATE FOR SCENARIOS
A [AGE]-year-old [GENDER] comes to the [SITE OF CARE] because he/she has had
[PRESENTING SYMPTOMS] for the past [DURATION]. The patient says [SUBJECTIVE SYMPTOMS].
She is [BEHAVIOR]. [VITAL SIGNS] The patient appears [OBSERVATION]. Physical examination shows [PE FINDINGS]. [LAB RESULTS]. Then [Question]?

### PERSONALIZATION
- Tailor the difficulty and topic of questions based on the specific exam and target audience.
- Provide up-to-date references when generating the items.

### RESPONSE_JSON
{response_json}
"""

# Wrap TEMPLATE as a prompt object; input_variables names the five
# placeholders that appear in the template text above.
quiz_generation_prompt = PromptTemplate(
    input_variables=["context", "number", "subject", "tone", "response_json"],
    template=TEMPLATE
)

# Prompt for the quiz-review step.
# Placeholders: {subject} (the target student group) and {quiz} (the quiz text
# to be reviewed). The model is asked for a complexity analysis of at most
# 50 words and, where needed, rewritten questions with an adjusted tone.
TEMPLATE2 = """
You are an expert English grammarian and writer. Given a Multiple Choice Quiz for {subject} students,
evaluate the complexity of the questions and give a complete analysis of the quiz. Use at most 50 words for complexity analysis.
If the quiz is not at par with the cognitive and analytical abilities of the students,
update the quiz questions which need to be changed and adjust the tone to perfectly fit the student abilities.

Quiz_MCQs:
{quiz}

Check from an expert English Writer of the above quiz:
"""

# Prompt object for the review step; its two inputs match the placeholders
# used in TEMPLATE2.
quiz_evaluation_prompt = PromptTemplate(input_variables=["subject", "quiz"], template=TEMPLATE2)

# Stage 1: generate the quiz; its text is exposed under the "quiz" key.
quiz_chain = LLMChain(
    llm=llm,
    prompt=quiz_generation_prompt,
    output_key="quiz",
    verbose=True,
)

# Stage 2: review the generated quiz; the critique is exposed under "review".
review_chain = LLMChain(
    llm=llm,
    prompt=quiz_evaluation_prompt,
    output_key="review",
    verbose=True,
)

# Pipeline: run generation, then review, and return both outputs.
generate_evaluate_chain = SequentialChain(
    chains=[quiz_chain, review_chain],
    input_variables=[
        "context",
        "number",
        "subject",
        "tone",
        "response_json",
    ],
    output_variables=["quiz", "review"],
    verbose=True,
)

# Define retrieval function
def retrieve_context(query, k=3):
    """Return the text of the ``k`` best-matching indexed chunks for ``query``.

    Runs a similarity search on the module-level ``docsearch`` store and
    joins the ``page_content`` of each hit with a newline, in the order the
    search returned them.
    """
    matches = docsearch.similarity_search(query, k=k)
    return "\n".join(match.page_content for match in matches)

# Quiz settings: question count, topic, and tone.
NUMBER = 5
SUBJECT = "Cardiovascular Disease"
TONE = "Easy"
query = f"Generate MCQs about {SUBJECT}"

# Fetch supporting passages for the topic before prompting the model.
context = retrieve_context(query)

# Values for the generation prompt, keyed by its placeholder names.
chain_inputs = dict(
    context=context,
    number=NUMBER,
    subject=SUBJECT,
    tone=TONE,
    response_json=json.dumps(RESPONSE_JSON),
)

# Run generate-then-review inside the callback so usage is recorded on `cb`.
with get_openai_callback() as cb:
    response = generate_evaluate_chain(chain_inputs)

# Report what the run consumed.
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost: {cb.total_cost}")

# Process and format results


def _parse_quiz_json(raw_quiz):
    """Decode the model's quiz text into a dict of question entries.

    The text is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, so a reply wrapped in Markdown
    code fences or surrounded by a sentence of prose is still usable rather
    than discarding an already paid-for completion.

    Raises:
        ValueError: if there is no quiz text, or the decoded JSON is not an
            object.
        json.JSONDecodeError: if no parseable JSON object can be found.
    """
    if not isinstance(raw_quiz, str) or not raw_quiz.strip():
        raise ValueError("The chain returned no quiz text to parse.")

    try:
        parsed = json.loads(raw_quiz)
    except json.JSONDecodeError:
        start = raw_quiz.find("{")
        end = raw_quiz.rfind("}")
        if start == -1 or end < start:
            # Nothing that looks like a JSON object: surface the original error.
            raise
        parsed = json.loads(raw_quiz[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError("Expected the quiz JSON to be an object keyed by question number.")
    return parsed


quiz = response.get("quiz")
quiz_dict = _parse_quiz_json(quiz)

# Create DataFrame: one row per question, with the options flattened into a
# single "a: ... | b: ..." string. The question-number keys are not needed.
quiz_table_data = [
    {
        "MCQ": entry["mcq"],
        "Choices": " | ".join(
            f"{option}: {option_value}"
            for option, option_value in entry["options"].items()
        ),
        "Correct": entry["correct"],
    }
    for entry in quiz_dict.values()
]

quiz_df = pd.DataFrame(quiz_table_data)

# Export to CSV. The file name follows SUBJECT so changing the topic does not
# overwrite (or mislabel) another subject's quiz; for "Cardiovascular Disease"
# this yields the same "Cardiovascular_Disease_MCQs.csv" as before.
csv_filename = f"{SUBJECT.replace(' ', '_')}_MCQs.csv"
quiz_df.to_csv(csv_filename, index=False)

print(f"MCQs have been generated and saved to '{csv_filename}'")

# Example output (for demonstration purposes)
# This is a fixed, hand-written sample of one scenario-style MCQ with its
# answer; it is not taken from `response` and is printed as-is below.
example_output = """
A 65-year-old male comes to the emergency department because he has had chest pain for the past 2 hours.
The patient describes the pain as a heavy, squeezing sensation radiating to his left arm and jaw. He has a history of hypertension,
hyperlipidemia, and smoking. On examination, he appears anxious and diaphoretic. His vital signs are as follows: BP 160/90 mmHg, HR 100 bpm,
RR 20 breaths per minute, and SpO2 95% on room air. An electrocardiogram (ECG) shows ST-segment elevation in leads II, III, and aVF.

Which of the following is the most appropriate next step in management?

A. Administer sublingual nitroglycerin and monitor for response
B. Administer aspirin and arrange for immediate percutaneous coronary intervention (PCI)
C. Perform a chest X-ray to rule out other causes of chest pain
D. Administer a beta-blocker to reduce heart rate and myocardial oxygen demand

Answer: B. Administer aspirin and arrange for immediate percutaneous coronary intervention (PCI)
"""

# Show the sample under a heading line.
print("Example Output:")
print(example_output)