In [1]:
import json
from tqdm import tqdm

## Data Ingestion

In [2]:
# Load the raw Q&A dataset from JSON.
# The encoding is pinned to UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open("./../dataset/medical_qa_documents_with_id.json", "r", encoding="utf-8") as f:
    docs_raw = json.load(f)

In [3]:
# Quick look at the first top-level entry only: its "documents" value holds the Q&A records.
documents = docs_raw[0]["documents"]  # get the list of Q&A documents

print("Total documents loaded:", len(documents))
# Note: despite the label, this prints the keys of the top-level entry docs_raw[0],
# not the keys of an individual Q&A record.
print("Keys in document:", docs_raw[0].keys())

Total documents loaded: 14443
Keys in document: dict_keys(['document_info', 'documents'])


In [4]:
# Flatten the "documents" list of every top-level entry into one list of Q&A records.
documents = [doc for docs_info in docs_raw for doc in docs_info['documents']]

In [5]:
# Display the first record to see which fields each Q&A document carries.
documents[0]

{'question': 'Who is at risk for Lymphocytic Choriomeningitis (LCM)?',
 'answer': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents. Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.',
 'qtype': 'susceptibility',
 'id': 'f72c0d85'}

### Testing the Groq API

In [6]:
import os
from getpass import getpass

# Never hardcode the API key in the notebook: it would be saved with the file
# and leak through version control or shared copies. Reuse a key that is
# already set in the environment and only prompt (without echo) if it is missing.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

In [7]:
from groq import Groq

# No credentials are passed explicitly; presumably the client reads the
# GROQ_API_KEY environment variable set in the previous cell — confirm against the Groq SDK docs.
client = Groq()

In [8]:
# Sample question used to smoke-test the LLM on its own, without any retrieved context.
q = "what is malaria?"

In [9]:
# Smoke test: one chat-completion request carrying only the sample question as a user message.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": q}],
    model="openai/gpt-oss-20b",
)

In [10]:
# Text of the first choice returned by the model.
response.choices[0].message.content

'**Malaria** is a life‑threatening infectious disease caused by parasites of the genus *Plasmodium*. The most common species that infect humans are *P. falciparum*, *P. vivax*, *P. ovale*, *P. malariae*, and, more rarely, *P. knowlesi*.  \n\n### How it’s transmitted\n- **Mosquito bite**: The parasite is transmitted when an infected female *Anopheles* mosquito (the “malaria mosquito”) bites a human and injects sporozoites into the bloodstream.\n- **Other routes** (rare): blood transfusion, organ transplant, mother‑to‑child during pregnancy or delivery, or sharing of contaminated needles.\n\n### Life cycle (simplified)\n1. **Sporozoites** enter the bloodstream → travel to the liver.  \n2. **Liver stage**: parasites replicate in hepatocytes.  \n3. **Blood stage**: merozoites burst out of liver cells → invade red blood cells, multiply, and cause them to rupture.  \n4. **Release of new merozoites** → infect more red blood cells → cycle repeats.  \n\n### Symptoms\n- Fever (often spiking ever

## Indexing with Elasticsearch

In [12]:
from elasticsearch import Elasticsearch

In [13]:
# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch(
    ["http://localhost:9200"],   # must include scheme http://
    request_timeout=60           # increase timeout in case ES is slow
)

# Print the server's info response as a connectivity check.
print(es_client.info())

{'name': '25cfe1f5474e', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'buIn5x-TT1C4org_uA0YJw', 'version': {'number': '8.19.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'aa0a7826e719b392e7782716b323c4fb8fa3b392', 'build_date': '2025-09-16T22:06:03.940754111Z', 'build_snapshot': False, 'lucene_version': '9.12.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [16]:
# Index definition: one primary shard and no replicas, with full-text fields
# for the question and answer and a keyword field for the question category.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "question": {"type": "text"},
            "qtype": {"type": "keyword"}
        }
    }
}

index_name = "medical-questions"

# Make the cell re-runnable: creating an index that already exists raises an
# error, so drop any index left over from a previous run first.
# ignore_unavailable=True turns the delete into a no-op when there is nothing to drop.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)
print(es_client.cluster.health())


{'cluster_name': 'docker-cluster', 'status': 'green', 'timed_out': False, 'number_of_nodes': 1, 'number_of_data_nodes': 1, 'active_primary_shards': 1, 'active_shards': 1, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 0, 'unassigned_primary_shards': 0, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 100.0}


In [17]:
# To rebuild the index from scratch, uncomment the delete call below and re-run the create cell.
# es_client.indices.delete(index=index_name, ignore=[400, 404])
# Show the shard table for the index; right after creation it should report 0 docs.
es_client.cat.shards(index=index_name, v=True)

TextApiResponse('index             shard prirep state   docs store dataset ip         node\nmedical-questions 0     p      STARTED    0  227b    227b 172.17.0.2 25cfe1f5474e\n')

In [18]:
# Reminder of the record shape that is about to be indexed.
documents[0]

{'question': 'Who is at risk for Lymphocytic Choriomeningitis (LCM)?',
 'answer': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents. Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.',
 'qtype': 'susceptibility',
 'id': 'f72c0d85'}

In [19]:
# Index the records one request at a time, with a progress bar.
# NOTE(review): no explicit id is passed, so re-running this cell presumably
# stores the same records again as new documents — confirm before re-running.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████| 14443/14443 [06:00<00:00, 40.01it/s]


## RAG with Elasticsearch

In [20]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for a question and its retrieved documents.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``question`` and ``answer``
            keys; each one becomes an entry under ``CONTEXT:``.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = """
You are a professional medical assistant.
Answer the QUESTION using only the CONTEXT provided from verified medical sources.
If the answer is not available in the CONTEXT, say "I'm not sure based on the available information."

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    # Every entry ends with a blank line; the final strip() removes the trailing one.
    entries = [
        f"question: {hit['question']}\nanswer: {hit['answer']}\n\n"
        for hit in search_results
    ]
    return template.format(question=query, context="".join(entries)).strip()

In [21]:
def llm(prompt, model="openai/gpt-oss-20b"):
    """Send a single-turn prompt to the Groq chat API and return the reply text.

    Args:
        prompt: Full prompt text, sent as one ``user`` message.
        model: Identifier of the chat model to call. The default is the model
            this notebook already used, so existing ``llm(prompt)`` calls
            behave exactly as before.

    Returns:
        The ``content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [22]:
def elastic_search(query, qtype):
    """Retrieve up to five indexed documents matching a query within one category.

    Args:
        query: Free-text search string, matched against the ``question^3``,
            ``answer`` and ``qtype`` fields with a ``best_fields`` multi_match.
        qtype: Value used in the ``term`` filter on the ``qtype`` field.

    Returns:
        A list with the ``_source`` dict of each hit, in the order returned.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "answer", "qtype"],
            "type": "best_fields",
        }
    }
    category_filter = {"term": {"qtype": qtype}}
    request_body = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": category_filter}},
    }

    response = es_client.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [25]:
def elastic_search_rag(query, qtype):
    """Answer a question with retrieval-augmented generation.

    Searches for documents matching ``query`` within the ``qtype`` category,
    builds a prompt from them, and returns the LLM's reply.
    """
    retrieved = elastic_search(query, qtype)
    return llm(build_prompt(query, retrieved))

In [26]:
# End-to-end check of the RAG pipeline on a sample question.
query = "what is malaria?"
qtype = 'information'  # category value passed to the qtype filter of the search
answer = elastic_search_rag(query, qtype)
answer

'Malaria is a serious disease caused by a parasite that you contract when an infected mosquito bites you. It is a major cause of death worldwide, especially in developing countries with warm climates, and is almost eliminated in the United States. Four related parasites cause malaria, with the most deadly type found in sub‑Saharan Africa. Symptoms include chills, fever, flu‑like symptoms, vomiting, diarrhea, and jaundice, and a blood test can diagnose it. Treatment depends on the type of parasite and the region of infection, and prevention involves medication, insect repellant, protective clothing, and mosquito nets.'