In [None]:
#cell 1
import textwrap
import chromadb
import numpy as np
import pandas as pd
import re
import json
import os
import time

from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai
from google.genai import types

# PubMedQA 평가를 위한 라이브러리 추가
from datasets import load_dataset
from rouge_score import rouge_scorer

client = genai.Client(api_key='AIzaSyAwn8VQxHpkS-TCAKChibfo4joRobyFgpU') #API


for m in client.models.list():
  if 'embedContent' in m.supported_actions:
    print(m.name)

print(os.getcwd())

  from .autonotebook import tqdm as notebook_tqdm


models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
/Users/euijin/AI_team_project/AI-project


In [None]:
#cell 2
import requests 
import xml.etree.ElementTree as ET 

class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    EMBEDDING_MODEL_ID = "models/embedding-001"
    title = "Custom query"

    response = client.models.embed_content(
        model=EMBEDDING_MODEL_ID,
        contents=input,
        config=types.EmbedContentConfig(
          task_type="retrieval_document",
          title=title
        )
    )
    return [e.values for e in response.embeddings]

def preprocess_metadata(metadata):
    new_metadata = {}
    for k, v in metadata.items():
        if isinstance(v, list):
            new_metadata[k] = ", ".join(map(str, v))
        else:
            new_metadata[k] = v
    return new_metadata

def batch_add(collection, documents, metadatas, ids, batch_size=100):
    for i in range(0, len(documents), batch_size):
        batch_docs = documents[i:i+batch_size]
        batch_metas = metadatas[i:i+batch_size]
        batch_ids = ids[i:i+batch_size]
        collection.add(
            documents=batch_docs,
            metadatas=batch_metas,
            ids=batch_ids
        )

def create_chroma_db(json_data, name):
    chroma_client = chromadb.Client()
    try:
        chroma_client.delete_collection(name)
        print(f"Existing collection '{name}' deleted.")
    except Exception as e:
        print(f"No existing collection '{name}' to delete or error during deletion: {e}")

    db_collection = chroma_client.create_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction()
    )

    documents = [item["text"] for item in json_data]
    metadatas = [preprocess_metadata(item["metadata"]) for item in json_data]
    ids = [str(i) for i in range(len(json_data))]

    batch_add(db_collection, documents, metadatas, ids, batch_size=100)

    print(f"Successfully created and populated '{name}' with {len(documents)} documents.")
    return db_collection

def load_bioasq_task_b_data(file_paths):
    all_data = []
    for file_path in file_paths:
        print(f"Loading BioASQ data from {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            if 'questions' in json_data:
                all_data.extend(json_data['questions'])
            else:
                print(f"Warning: 'questions' key not found in {file_path}. Skipping.")
    print(f"Total BioASQ questions loaded: {len(all_data)}")
    return all_data

def create_bioasq_embeddings(bioasq_data, collection_name="bioasq_qa_answers"):
    chroma_client = chromadb.Client()

    try:
        chroma_client.delete_collection(collection_name)
        print(f"Existing collection '{collection_name}' deleted.")
    except Exception as e:
        print(f"No existing collection '{collection_name}' to delete or error during deletion: {e}")

    db_bioasq_qa_collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=GeminiEmbeddingFunction()
    )

    documents = []
    metadatas = []
    ids = []

    for i, entry in enumerate(bioasq_data):
        question_text = entry.get('body', entry.get('question', ''))
        ideal_answer = entry.get('ideal_answer', '')
        exact_answer = entry.get('exact_answer', '')
        type_of_question = entry.get('type', 'unknown')
        id_val = entry.get('id', str(i))

        document_text = f"Question: {question_text} Answer: {ideal_answer}"

        documents.append(document_text)

        raw_meta = {
            "question": question_text,
            "type": type_of_question,
            "exact_answer": exact_answer,
            "ideal_answer": ideal_answer,
            "id": id_val
        }
        processed_meta = preprocess_metadata(raw_meta)
        metadatas.append(processed_meta)

        ids.append(id_val)

    batch_add(db_bioasq_qa_collection, documents, metadatas, ids, batch_size=100)
    print(f"Successfully created and populated '{collection_name}' with {len(documents)} documents.")
    return db_bioasq_qa_collection


#PubMed API 연동 함수들
NCBI_EUTILS_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
COMMON_API_PARAMS = "retmode=xml&email=your_email@example.com"

def fetch_pubmed_uids_by_query(query, retmax=500):
    """주어진 쿼리로 PubMed UID를 검색하고 반환합니다."""
    esearch_url = f"{NCBI_EUTILS_BASE_URL}esearch.fcgi?db=pubmed&term={query}&retmax={retmax}&{COMMON_API_PARAMS}"
    try:
        response = requests.get(esearch_url)
        response.raise_for_status() # HTTP 에러 발생 시 예외 발생
        root = ET.fromstring(response.content)
        uids = [id_elem.text for id_elem in root.findall(".//Id")]
        return uids
    except requests.exceptions.RequestException as e:
        print(f"Error fetching UIDs for '{query}': {e}")
        return []
    except ET.ParseError as e:
        print(f"Error parsing XML for UIDs for '{query}': {e}")
        # print(f"Response content: {response.content.decode()}") # 디버깅용
        return []


def fetch_pubmed_article_details(uids):
    """UID 목록으로 PubMed 초록 및 상세 정보를 가져와 RAG 형식으로 반환합니다."""
    if not uids:
        return []

    chunk_size = 200
    all_articles_data = []

    for i in range(0, len(uids), chunk_size):
        chunk_uids = uids[i:i + chunk_size]
        efetch_url = f"{NCBI_EUTILS_BASE_URL}efetch.fcgi?db=pubmed&id={','.join(chunk_uids)}&{COMMON_API_PARAMS}"

        try:
            response = requests.get(efetch_url)
            response.raise_for_status()
            root = ET.fromstring(response.content)

            for pubmed_article in root.findall(".//PubmedArticle"):
                pmid = "N/A"
                title = ""
                abstract = ""
                pub_year = "Unknown"
                journal_title = "Unknown Journal"

                try:
                    # PMID 추출 (PubmedArticleSet/PubmedArticle/MedlineCitation/PMID)
                    pmid_elem = pubmed_article.find(".//MedlineCitation/PMID")
                    if pmid_elem is not None and pmid_elem.text:
                        pmid = pmid_elem.text.strip()

                    # 제목 추출 (PubmedArticleSet/PubmedArticle/MedlineCitation/Article/ArticleTitle)
                    article_title_elem = pubmed_article.find(".//Article/ArticleTitle")
                    if article_title_elem is not None and article_title_elem.text:
                        title = article_title_elem.text.strip()

                    # 초록 추출 (PubmedArticleSet/PubmedArticle/MedlineCitation/Article/Abstract/AbstractText)
                    abstract_texts = []
                    abstract_elems = pubmed_article.findall(".//Abstract/AbstractText")
                    if abstract_elems: # AbstractText 요소가 존재할 경우에만 처리
                        for abs_text in abstract_elems:
                            if abs_text is not None and abs_text.text: # 개별 AbstractText 요소의 텍스트 확인
                                abstract_texts.append(abs_text.text.strip())
                    abstract = " ".join(abstract_texts)

                    # 발행 연도 추출 (PubDate/Year 또는 PubDate/MedlineDate)
                    pub_year_elem = pubmed_article.find(".//PubDate/Year")
                    if pub_year_elem is not None and pub_year_elem.text:
                        pub_year = pub_year_elem.text.strip()
                    else:
                        pub_medline_date_elem = pubmed_article.find(".//PubDate/MedlineDate")
                        if pub_medline_date_elem is not None and pub_medline_date_elem.text:
                            pub_year = pub_medline_date_elem.text.split(' ')[0].strip() # '2023 Fall' -> '2023'

                    # 저널 제목 추출 (Journal/Title)
                    journal_title_elem = pubmed_article.find(".//Journal/Title")
                    if journal_title_elem is not None and journal_title_elem.text:
                        journal_title = journal_title_elem.text.strip()

                    document_text = f"Title: {title}. Abstract: {abstract}."
                    if not document_text.strip(): # 제목과 초록이 모두 없으면 스킵
                        continue

                    all_articles_data.append({
                        "text": document_text,
                        "metadata": {
                            "source": "PubMed",
                            "pmid": pmid,
                            "title": title,
                            "abstract": abstract,
                            "publication_year": pub_year,
                            "journal": journal_title
                        }
                    })
                except Exception as art_e:
                    print(f"Error parsing article (PMID: {pmid} / Raw Title: {title[:50]}...): {art_e}")
                    # print(f"Problematic XML part (if available): {ET.tostring(pubmed_article, encoding='unicode')[:500]}...") # 디버깅용
                    continue
            time.sleep(0.3) # API 호출 간 지연
        except requests.exceptions.RequestException as req_e:
            print(f"Error fetching details for UIDs chunk: {req_e}")
            time.sleep(1) # 오류 시 더 긴 지연
        except ET.ParseError as parse_e:
            print(f"Error parsing XML for details chunk: {parse_e}")
            # print(f"Response content (truncated): {response.content.decode()[:500]}...") # 디버깅용
            time.sleep(1) # 오류 시 더 긴 지연
    return all_articles_data

In [25]:
# Cell 3
# PubMedQA 데이터셋 로드 (평가에서만 사용)
pubmedqa_dataset = load_dataset("pubmed_qa", "pqa_labeled")

# Primary ChromaDB 초기화 (disease_rag_with_metadata.json 파일 필요)
db = None
try:
    with open('disease_rag_with_metadata.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    db = create_chroma_db(data, "my_collection")
except FileNotFoundError:
    print("Error: 'disease_rag_with_metadata.json' not found. Primary ChromaDB cannot be initialized.")
except Exception as e:
    print(f"Error initializing primary ChromaDB ('my_collection'): {e}")


# BioASQ Task B 데이터셋 파일 경로 지정 (추가된 파일 포함)
bioasq_file_paths = [
    'training5b.json',
    'training6b.json',
    'training7b.json',
    'training8b.json',
    'training9b.json',
    'training10b.json',
    'training11b.json',
    'training12b.json',
    'training13b.json'
]

db_bioasq_qa = None
try:
    all_bioasq_data = load_bioasq_task_b_data(bioasq_file_paths)
    db_bioasq_qa = create_bioasq_embeddings(all_bioasq_data, "bioasq_qa_answers")
except Exception as e:
    print(f"Error initializing BioASQ ChromaDB: {e}")
    db_bioasq_qa = None

# db_pubmedqa_long_answers는 RAG에 포함되지 않으므로 None으로 유지합니다.
db_pubmedqa_long_answers = None

No existing collection 'my_collection' to delete or error during deletion: Collection [my_collection] does not exists


  embedding_function=GeminiEmbeddingFunction()


Successfully created and populated 'my_collection' with 337 documents.
Loading BioASQ data from training5b.json...
Loading BioASQ data from training6b.json...
Loading BioASQ data from training7b.json...
Loading BioASQ data from training8b.json...
Loading BioASQ data from training9b.json...
Loading BioASQ data from training10b.json...
Loading BioASQ data from training11b.json...
Loading BioASQ data from training12b.json...
Loading BioASQ data from training13b.json...
Total BioASQ questions loaded: 33174
No existing collection 'bioasq_qa_answers' to delete or error during deletion: Collection [bioasq_qa_answers] does not exists


  embedding_function=GeminiEmbeddingFunction()


Successfully created and populated 'bioasq_qa_answers' with 33174 documents.


In [28]:
# Cell 4

print("\n--- Collecting PubMed data and creating a dedicated PubMed ChromaDB ---")

#PubMed 검색 약 300여 개의 쿼리
pubmed_search_queries = [
    "mitochondria programmed cell death plants",
    "strabismus amblyopia visual acuity",
    "syncope infants water induced urticaria",
    "transanal pull-through long-term results",
    "mammography screening tailored interventions women",
    "double balloon enteroscopy efficacy safety community setting",
    "emergency general surgery laparotomy mortality",
    "sleep disorders reporting heterogeneity",
    "HDL cholesterol mutations carotid intima-media thickness",
    "hospital preparedness multiple casualty incidents prediction",
    "ARDS children malignancy prognosis prediction",
    "secondhand smoke NICU infants health disparities",
    "nomogram prostate cancer biochemical recurrence outcomes",
    "cardiovascular risk resting heart rate West African population",
    "retroperitoneoscopic nephrectomy teaching model practice",
    "pharmacotherapy effectiveness",
    "gene therapy clinical trials",
    "immunotherapy cancer",
    "neurodegenerative diseases mechanisms",
    "diagnostic imaging accuracy",
    "epidural analgesia labor effective standard",
    "HER2 immunoreactivity prognostic urothelial carcinoma",
    "halofantrine ototoxic",
    "visceral adipose tissue measurement volume",
    "necrotizing fasciitis hyperbaric oxygenation therapy",
    "Hawkins sign necrosis astragalus fractures",
    "mandatory general surgery rotation clerkship",
    "acupuncture phonotraumatic vocal pathologies",
    "aneurysm repair aged 80 subarachnoid hemorrhage",
    "general practice IT innovation uptake",
    "hepatocellular carcinoma prognosis well differentiated",
    "Papanicolaou smears follow-up patient adherence",
    "biomolecular identification allergenic pollen aerobiological monitoring",
    "diabetes mellitus FDG-PET cervical cancer diagnosis efficacy",
    "excimer laser keratorefractive surgery corneas biomechanical wound healing",
    "radiotherapy rectal cancer prognosis pelvic exenteration",
    "surgeon detect early lymphedema reliably",
    "colorectal cancer synchronous liver metastases management",
    "motion perception deficit schizophrenia eye-tracking",
    "transgastric endoscopic splenectomy possible",
    "Fournier's gangrene dangerous",
    "kidney implantation elderly donors young recipients",
    "provider service networks Medicaid expenditures",
    "carotid artery stenosis coronary artery bypass surgery",
    "direct mesocolon invasion T4 gastric cancer staging",
    "injury severity heterotopic ossification acetabulum fractures",
    "statins stroke clinical outcome",
    "processing fluency participant information sheets recruitment",
    "sternal fracture children overlooked",
    "androgens sexual desire women correlation",
    "immediate breast reconstruction adjuvant chemotherapy compromise",
    "human papillomavirus pterygium risk factor",
    "PRISM predict PICU stay length",
    "transcatheter aortic valve implantation predilatation omitted",
    "autoerotic asphyxiation lethal outcome",
    "major depression alcohol use disorder adolescence comorbidity outcomes",
    "cold preparation use young children FDA warnings",
    "laryngeal mask supreme manual laypersons operate",
    "mesopic pupil size cobalt blue light slit-lamp biomicroscopy",
    "circumcision childhood",
    "colonoscopy acute diverticulitis management",
    "instrumental activities daily living predict dementia",
    "neuroendoscope ventriculoperitoneal shunt infection",
    "body perception parents children physicians body image",
    "phonological awareness specialized training preschool child",
    "streptococcal infection multiple sclerosis relationship",
    "2-methoxyestradiol reduce chemotherapeutics ovarian cancer",
    "joint line positions contralateral knee revision surgery surgery planning",
    "tibial component mechanical alignment unicompartmental knee replacement",
    "tumour expression VEGF venous invasion survival renal cell carcinoma",
    "injury poisoning mortality young men prevention factors",
    "antenatal corticosteroid administration pregnancy continuation",
    "obesity OSA severity CPAP machines",
    "prior preterm birth predict risk subsequent pregnancy",
    "aripiprazole pathological gambling risk factor",
    "immune suppression lysosomotropic amines cyclosporine T-cell responses",
    "induction chemotherapy nasopharyngeal carcinoma management",
    "contralateral hydrocele neonatal testicular torsion",
    "pedestrians street crossing decisions accuracy visually impaired",
    "Crohn's disease diagnosis",
    "Chaalia Pan Masala harmful health",
    "multi-modal cervical physical therapy tinnitus cervicogenic",
    "sputum systemic inflammation asthma phenotypes paucigranulocytic",
    "HIV STD control Jamaica",
    "Panton-Valentine leucocidin Staphylococcus aureus bacteraemia UK",
    "impaired fasting blood glucose preoperative mortality CABG",
    "positron emission tomography rectal cancer management",
    "accurate tidal volume manual resuscitator",
    "cigarette tax rate retail prices",
    "vertical lines distal esophageal mucosa esophagitis children",
    "hypoglycaemia increase cardiovascular events risk",
    "radiographic transition zone aganglionosis Hirschsprung's disease",
    "dexamethasone oral prednisone pediatric asthma exacerbations",
    "mammographic screening Sami ethnicity outcome",
    "electrochemiluminescence assays prediction type 1 diabetes",
    "antimicrobial prescribing expert agreement",
    "prostate cancer oligometastases favorable subset",
    "cycloplegic autorefraction young adults mandatory",
    "ultrasonography hepatocellular carcinoma screening prognosis",
    "Department of Transportation stroke survivors driving data",
    "neoadjuvant imatinib gastrointestinal stromal tumors Kit mutation",
    "bezafibrate prevent colon cancer coronary artery disease",
    "self-efficacy transformational leadership sleep quality",
    "microbial contamination hematopoietic cell transplantation outcomes",
    "CA 19-9 levels pancreaticoduodenectomy contraindication",
    "perioperative care animal model abdominal surgery fasting",
    "physicians side effects ACE inhibitors awareness",
    "residual fundus laparoscopic sleeve gastrectomy revision surgery",
    "physician estimates asthma severity black white patients",
    "laparoscopic surgery atrial fibrillation foregut surgery",
    "Main Gate Syndrome mass-casualty victim surge management",
    "communication disorders musical messages",
    "rheumatoid arthritis methotrexate folic acid long term",
    "National Institutes of Health Stroke Scale favor left hemisphere strokes",
    "novel surgical approach temporomandibular joint complications",
    "Young-Burgess classification pelvic ring fractures mortality",
    "vitamin D deficiency pediatric celiac disease",
    "unsafe sexual behaviour HIV-infected individuals increasing",
    "cholestasis small bowel atresia investigation necessary",
    "financial incentives cost-effective smoking cessation pregnancy",
    "medical students quality assurance programmes day surgery",
    "head neck paragangliomas volumetric analysis",
    "three-dimensional ultrasound-validated breast lesions histological assessment",
    "bone thickness inter-radicular space miniscrew placement mandibular",
    "general practice selection scores predict success MRCGP",
    "Deformity Angular Ratio spinal cord monitoring alerts pediatric scoliosis",
    "age personality disorder coping style psychiatric inpatients",
    "autoxidation carbohydrates lipids uremic plasma oxidative stress",
    "high-risk human papillomaviruses HPV breast milk detection",
    "quaternary cytoreductive surgery ovarian cancer",
    "chemotherapy survival advanced non-small cell lung carcinoma pneumologists",
    "topical ropivacaine post-tonsillectomy morbidity pediatric",
    "pain clinically relevant general adult psychiatry",
    "concomitant anterior apical repair midurethral sling mixed incontinence",
    "atypical antipsychotics adjunctive therapy depression cost savings",
    "anticoagulated intracerebral hemorrhage patients",
    "diagnostic therapeutic ureteroscopy dilatation ureteral meatus",
    "managed care low income usual source of care",
    "knee extensor strength dynamic stability ambulation Parkinson's disease",
    "distance to provider barrier care Medicaid cancer patients",
    "folic acid congenital heart defects Down syndrome",
    "mental health differences francophone non-francophone Manitoba",
    "type 1 diabetes mellitus Achilles tendon response 10km run",
    "fragility esophageal mucosa eosinophilic esophagitis sign",
    "cup-cage reconstruction oversized cups THA acetabular fractures stability",
    "pulmonary valve replacement adults late after repair tetralogy of fallot",
    "xanthogranulomatous cholecystitis premalignant condition",
    "TDP-43 type atrophy frontotemporal lobar degeneration",
    "oncoplastic surgery contraindication accelerated partial breast radiation",
    "child bipolar I disorder diagnostic characteristics team sample",
    "basal metabolic rate Chinese prediction equations applicability",
    "rugby headgear prevent concussion",
    "spinal subdural hematoma ruptured intracranial aneurysm sequela",
    "shape analysis differentiate carotid artery thrombus atherosclerotic plaque CTA",
    "dementia aphasia motor neuron disease underrecognised association",
    "solid culture tuberculosis influence clinical decision making India",
    "diffusion-weighted echo-planar MR imaging parotid gland tumors histologic subtypes",
    "endothelin-1 hemodynamic changes hemodialysis role",
    "risk factors suicidal behavior affective disorder polarity",
    "multidisciplinary breast cancer clinics effectiveness",
    "international normalised ratio INR reliable",
    "health status disease activity damage SLE patients associations",
    "PSA repeatedly fluctuating levels avoid biopsy reassurance",
    "zero central line-associated bloodstream infection rate sustainable",
    "gastric electrical stimulation GI symptoms healthcare benefits",
    "pulp chamber pulpotomy permanent treatment",
    "transverse apex coronal apex levels adolescent idiopathic scoliosis",
    "Child Health Computing System identify cerebral palsy children",
    "public awareness campaigns effects",
    "breast cancer prognosis inherited",
    "nuchal translucency thickness first trimester predict GDM onset",
    "arginine vasopressin V1a receptor microsatellites hypersexuality children bipolar disorder",
    "Stage I non-small cell lung carcinoma early stage",
    "Residency Selection Criteria predict Orthopaedic Surgery Residency Performance",
    "optimism survival optimistic outlook better survival advanced ages",
    "better to be big",
    "arch form sagittal molar relationship Bolton tooth-size discrepancy",
    "cold knife conization LEEP same procedure",
    "pectins cold acclimation de-acclimation winter oil-seed rape plants",
    "updating emotional content working memory depression-specific deficit",
    "ultrasound imaging puncture facilitate internal jugular vein cannulation",
    "prostate specific antigen adjusted body mass index",
    "obstructive sleep apnea affect aerobic fitness",
    "intracerebroventricular injection metformin AICAR plasma melatonin AMPK",
    "literacy cerebral hemispherectomy isolated right hemisphere read",
    "gender dimorphism injury hemorrhagic shock hormonal differences",
    "communication terminally ill patients taught",
    "delays time to primary treatment breast cancer impact survival",
    "increased carotid artery pulsatility resistance indexes young obese males",
    "acceptance rates national preventive home visit programme older people socially imbalanced",
    "open access publishing increase scientific articles impact",
    "in vivo visualization pyloric mucosal hypertrophy infants hypertrophic pyloric stenosis etiologic role",
    "early adopter drugs exist",
    "high blood pressure reduce chronic low back pain risk",
    "responsibility affect public valuation health care interventions",
    "nasal fractures closed reduction satisfying",
    "improvements outreach clinical family community-based services child survival",
    "spontaneous remission polyarteritis nodosa occur",
    "gluten tolerance adult patients celiac disease 20 years diagnosis",
    "symptoms predict COPD smokers",
    "search engine diagnostic tool difficult immunological allergologic cases Google useful",
    "uniformity evidence-based treatments practice",
    "approved doctors medical referees UK seafarer's fitness agree",
    "hepatorenal syndrome missing prognostic factors",
    "vaginal dose assessment image-guided brachytherapy cervical cancer dose-point evaluation",
    "prescriptions proxy asthma children good choice",
    "familial transmission drinking patterns young adulthood persist",
    "fenofibrate sleep apnoea syndrome role proof of concept",
    "mental imagery functional magnetic resonance imaging predict recovery disorders consciousness",
    "nurse cystoscopist feasible option",
    "cardiovascular evaluation necessary beta-blocker therapy infantile hemangiomas",
    "specialty care improved survival congestive heart failure",
    "polymyalgia rheumatica prognosis predicted disease onset",
    "ascitis volume anthropometric measurements hospitalized alcoholic cirrhotics",
    "audit identify maternal mortality different settings rich poor",
    "lymphadenectomy modified neoadjuvant chemotherapy cervical cancer extent",
    "Wound Cultures Information Microbiology Blood Cultures Severe Burn Patients",
    "systematic use patient-rated depression severity monitoring helpful feasible clinical psychiatry",
    "treadmill training post stroke secondary benefits",
    "lunar position influence time of delivery",
    "oral endotracheal intubation efficacy impaired helicopter environment",
    "prostatic syndrome pleural effusion different diseases",
    "telemedicine type 1 diabetes technology glycaemic control",
    "controlled ovarian stimulation intrauterine insemination unexplained non-conception multiple pregnancies",
    "half-dose contrast-enhanced three-dimensional MR angiography abdominal aorta pelvis",
    "digital tomosynthesis alternative noncontrast CT nephrolithiasis follow-up",
    "high cumulative insulin exposure atherosclerosis type 1 diabetes risk factor",
    "hepatitis G virus TT virus cryptogenic chronic liver disease",
    "older patients self-management intervention Netherlands differ",
    "preoperative locoregional staging gastric cancer MRI place",
    "familiar teammates request accept more backup",
    "transsphenoidal pituitary surgery Cushing's disease predict outcome",
    "bedside assessment reliably exclude aspiration acute stroke",
    "UK radiologists satisfied training suspected child abuse",
    "artefacts 24-h pharyngeal oesophageal pH monitoring simplification feasible",
    "cell microenvironment mediastinal lymph nodes predict metastases non-small cell lung cancer",
    "wandering physically nonaggressive agitation equivalent",
    "size-reducing ascending aortoplasty external reinforcement aortic surgery option",
    "depression diagnosis antidepressant prescribing vary by location",
    "routinely collected ambulance data assaults reduction community violence",
    "volume change uterine myomas pregnancy grow",
    "laparoscopic adrenalectomy safe effective adrenal masses larger than 7 cm",
    "upstream solutions supplemental security income program reduce disability elderly",
    "profiling quality of care peer review role",
    "eyelid-parotid metastasis screen coexisting masses",
    "CT ordering practices change educate residents radiation exposure",
    "context deprivation all-cause mortality relationship",
    "stress increase imitation drinking behavior",
    "school food policy Dutch primary schools improvement",
    "atopy patch test house dust mites specific atopic dermatitis",
    "juvenile osteochondritis dissecans growth disturbance secondary physis epiphysis",
    "preoperative statins reduce atrial fibrillation coronary artery bypass grafting",
    "uniform basal endometrial gene expression profile implantation window pregnant ICSI",
    "decisional algorithms replace global introspection individual causality assessment ADRs",
    "arterial line sampling activated plasma thromboplastin time cardiac surgery",
    "timing initial surfactant treatment chronic lung disease mortality premature infants",
    "clinician assessment acute chest syndrome febrile sickle cell disease accurate",
    "Outcome Feedback Emergency Medicine Training Programs apply deliberate practice theory",
    "multiple SNP testing BRCA2 BRCA1 female carriers improve risk prediction models clinical assessment",
    "special interest laparoscopy affect treatment acute cholecystitis",
    "progression valvar aortic stenosis predict accurately",
    "automatic transmission improve driving behavior older drivers",
    "birth characteristics low intellectual performance early adulthood socioeconomic factors",
    "scintigraphy guideline method determining amputation levels diabetic foot",
    "determination between complete incomplete traumatic spinal cord injury clinically relevant",
    "improvements survival gynaecological cancer Anglia region centralisation multidisciplinary management",
    "symptoms matter patients for phase I clinical trials",
    "vitamin D insufficiency deficiency related osteochondritis dissecans",
    "episodic migraineurs selectively attend headache-related visual stimuli",
    "mitral replacement repair functional mitral regurgitation dilated ischemic cardiomyopathy",
    "nontriploid partial hydatidiform moles exist",
    "cholecystectomy indication concomitant splenectomy mild hereditary spherocytosis",
    "surgery radiation therapy impact survival extrapulmonary small cell cancers",
    "MR Diagnosis Bone Metastases 1.5 T 3 T STIR Imaging Omitted",
    "histologic evaluation testicular remnant vanishing testes syndrome surgical management necessary",
    "pain intensity predict poor opioid response cancer patients",
    "prerecorded lecture VODcasts affect lecture attendance first-year pre-clinical Graduate Entry Medicine students",
    "intrauterine influence obesity",
    "Assessing Patient Reported Outcomes Measures Phone Interviews Patient Self-Survey Clinic Measuring Same Thing",
    "cytokeratin immunoreactivity useful diagnosis short-segment Barrett's oesophagus Korea",
    "routine dissection station 9 lymph nodes necessary primary lung cancer",
    "cutaneous melanoma multiethnic population different disease",
    "marital status living arrangement mortality gender vary",
    "somatostatin confer insulinostatic effects neuromedin u rat pancreas",
    "Kell alloimmunization pregnancy fetal thrombocytopenia",
    "computer-aided diagnosis CAD MR-mammography MRM whole lesion time curve distribution analysis",
    "vaccine protection elderly Austrian seniors adequately protected vaccinations",
    "Vitamin D supplementation regulatory T cells healthy subjects autoimmune diseases treatment",
    "gynecological cancer alarm symptoms specialist care lifestyle socioeconomic status",
    "elective re-siting intravenous cannulae decrease peripheral thrombophlebitis",
    "third trimester ultrasound predict presentation first twin delivery",
    "risk factors major depression midlife community sample women prior major depression",
    "totally implantable venous access device placement interventional radiologists prophylactic antibiotics necessary",
    "starting insulin type 2 diabetes continue oral hypoglycemic agents",
    "clinical identifiers early-stage primary idiopathic adhesive capsulitis real picture",
    "delaying surgery immature adolescent idiopathic scoliosis patients progressive curve fusion levels",
    "bactericidal activity 3 cutaneous mucosal antiseptic solutions interfering substances Improvement NF EN 13727 European Standard",
    "empiric treatment uncomplicated urinary tract infection fluoroquinolones older women Israel lost treatment option",
    "risk calculators accurately predict surgical site occurrences",
    "EMS Providers Provide Appropriate Tidal Volumes Simulated Adult-sized Patient Pediatric-sized Bag-Valve-Mask",
    "students' scores preclerkship clinical performance examinations predict fail senior clinical performance examination",
    "Recovery Outcome Measures Place Culture Attitudes Faith",
    "surgeon familiarization current evidence change practice",
    "advanced epithelial ovarian carcinoma Thai women offer second-look laparotomy",
    "high-sensitivity C-reactive protein associated carotid atherosclerosis healthy Koreans",
    "validation 2009 TNM version renal cell carcinoma further improvements needed",
    "interstitial fluid concentrations meropenem equivalent plasma concentrations critically ill patients continuous renal replacement therapy",
    "Sensation Return Nasal Tip Microfat Grafting",
    "older men benefit curative therapy localized prostate cancer",
    "pituitary apoplexy histological features influence clinical presentation outcome",
    "African American women require fewer calories maintain weight",
    "Emergency double-balloon enteroscopy combined real-time viewing capsule endoscopy feasible combined approach acute overt-obscure gastrointestinal bleeding",
    "Cardiopulmonary bypass temperature does not affect postoperative euthyroid sick syndrome",
    "body dysmorphic disorder psychotic subtype",
    "hospice care nursing homes improve pain management end of life",
    "human resources development goal West Africa training ophthalmologist diplomates improved",
    "binge drinking early pregnancy increase psychomotor deficits risk",
    "tranexamic acid reduce desmopressin-induced hyperfibrinolysis",
    "ultrasound-scored synovitis depend pharmacokinetics subcutaneous anti-TNF agents rheumatoid arthritis",
    "counter sampling medical provider education alter prescribing behavior",
    "Global Longitudinal Pathway medical education curriculum influenced medical students' skills attitudes culturally diverse populations",
    "failed IUD insertions community practice under-recognized problem",
    "Type II supracondylar humerus fractures treated nonoperatively",
    "promise specialty pharmaceuticals worth price",
    "invasive diagnosis nosocomial pneumonia off-hours delay treatment",
    "undescended testes age at orchiopexy affect survival testis",
    "lipids blood pressure diabetes smoking confer equal risk myocardial infarction women men",
    "family practice residency teaching sites reflect community practice",
    "midwives' competence affected working rural location",
    "effective orifice area patient aortic annulus area ratio compare different bioprostheses",
    "pediatric VCUG academic children's hospital radiographic scout image necessary",
    "endometrial polyps pre-menopausal women similar post-menopausal women",
    "metabolic syndrome heart dimensions hypertensive patients",
    "idiopathic misty mesentery multidetector computed tomography obesity triggering cause",
    "pediatric concussion patients compliant discharge instructions",
    "zeolite hemostatic agent beneficial reducing blood loss arterial injury",
    "treatment as prevention resource-limited settings maintain HIV viral load suppression",
    "octogenarians high risk carotid endarterectomy",
    "mammography quality standards act affected mammography quality North Carolina",
    "hospitals provide lower quality care weekends",
    "screening history invasive cervical cancer academic medical center updated guidelines miss cancers",
    "oral mucocele ranula human immunodeficiency virus-related salivary gland disease",
    "incidence gbs carrier rates pregnant women northern Israel increase",
    "dedicated discharge coordinator improve hospital discharge quality",
    "dobutamine stress echocardiography induce cardiac troponin elevation",
    "cigarettes cinema parental restriction R-rated movie viewing reduce adolescent smoking susceptibility",
    "laboratories reporting serum quantitative hCG results correctly",
    "steroids aminoglycoside-containing ear drops reduce cochlear toxicity",
    "regular primary care clinician improve quality preventive care young children",
    "performance selection processes predict dental student performance",
    "reconsider lobectomy low-risk paediatric thyroid cancer time",
    "advance care planning model feasible community palliative care",
    "preoperative serum C-reactive protein levels predict definitive pathological stage clinically localized prostate cancer",
    "sub-classification low-grade cerebellar astrocytoma clinically meaningful",
    "CLASS voluntary public insurance program realistic meet long-term support service needs adults disabilities",
    "intraoperative neuromonitoring better functional outcome open TME",
    "predict head neck cancer survivors develop fears of recurrence",
    "secular growth acceleration appear fetal life",
    "hippocampal atrophy MRI predict cognitive decline",
    "chemoradiotherapy locally advanced squamous cell carcinoma esophagus surgical resection required",
    "quantitative left ventricular regional wall motion change fibrous tissue resection endomyocardial fibrosis",
    "bridge experience long-term implantable left ventricular assist devices alternative transplantation",
    "occupational nuclear power plant radiation affect conception pregnancy",
    "Lloyd-Davies position Trendelenburg disaster waiting to happen",
    "patient outcome compromised initial experience robot-assisted radical cystectomy",
    "Retromandibular Transparotid Approach reliable option surgical treatment condylar fractures",
    "appendectomy timing delayed surgery increase complications",
    "vitamin D deficiency CKD patients ergocalciferol K DOQI guidelines adequate",
    "femoral version intramedullary nailing trauma-trained non-trauma trained surgeons difference",
    "Viral Co-Infection influence severity acute respiratory infection children",
    "clinical studies elucidate connection length storage transfused red blood cells clinical outcomes",
    "cardiogenic shock acute myocardial infarction elderly patients admission tertiary center improve survival",
    "learning needs postpartum women socioeconomic status matter",
    "pitfalls urinary stone identification CT attenuation values different scanner models",
    "laminoplasty outcomes degenerative stenosis ossification posterior longitudinal ligament difference",
    "immunohistochemical assessment steroid hormone receptors anal canal implications anal incontinence",
    "acute fibrinous organizing pneumonia expression immune dysregulation",
    "hypotension coronary disease profound hypotensive events cause myocardial ischaemic events",
    "HIV1 2 point of care test sputum screening TB HIV co-infection Central India",
    "inhaled corticosteroids affect perception dyspnea bronchoconstriction asthma",
    "routine chest radiography transbronchial biopsy necessary",
    "safe to perform rectal anastomosis gynaecological debulking surgery without diverting stoma",
    "angiotensin-converting enzyme-1 ACE-1 gene polymorphism chronic kidney disease hypertensive patients",
    "laparoscopic antireflux surgery improve quality life gastro-oesophageal reflux disease medical therapy",
    "semi-closed endarterectomy superficial femoral artery short venous bypass limb-threatening ischemia alternative",
    "biofeedback training psychophysiological responses enhance athletes sport performance",
    "cytokines epilepsy role",
    "cue-induced behavioural activation novel model alcohol craving",
    "routine offering influenza vaccination office-based settings reduce racial ethnic disparities adult influenza vaccination",
    "colorectal cancer young patients distinct clinical entity",
    "implant retention recommended treatment infected TKA",
    "long-term significance postictal psychotic episodes predictive interictal psychotic episodes",
    "primary care physicians underprescribe antibiotics peptic ulcer disease",
    "blunt trauma intoxicated patients computed tomography abdomen necessary",
    "chronic progressive cervical myelopathy HTLV-I infection Variant form HAM TSP",
    "gender difference survival resected non-small cell lung cancer histology-related phenomenon",
    "lumbar drainage postoperative cerebrospinal fluid fistula spine surgery effective",
    "obesity risk factor wheezing adolescents",
    "end-tidal carbon dioxide measurement correlate arterial carbon dioxide extremely low birth weight infants first week life",
    "CA72-4 useful biomarker differential diagnosis ovarian endometrioma epithelial ovarian cancer",
    "Preservation PCL cruciate-retaining TKA tibial tuberosity reliable predictor PCL footprint location",
    "English antibiotic awareness campaigns changed public's knowledge attitudes antibiotic use",
    "increased nerve length treatment volume improve trigeminal neuralgia radiosurgery",
    "affinity column-mediated immunoassay method suitable alternative microparticle enzyme immunoassay method blood tacrolimus assay",
    "tumor depth included prognostication soft tissue sarcoma",
    "amoxapine atypical antipsychotic",
    "topical N-acetylcysteine application myringotomy cause severe otorrhea",
    "non-HDL-cholesterol better predictor long-term outcome acute myocardial infarction compared LDL-cholesterol",
    "intrapartum vibroacoustic stimulation effective predictor fetal acidosis",
    "puberty young adolescent alcohol use parents moderating role",
    "early postoperative oral intake limited laparoscopy",
    "loss of consciousness predict neuropsychological decrements concussion"
]

chroma_client = chromadb.Client()
collection_name_pubmed = "pubmed_research_articles"

# 기존 컬렉션 삭제 후 생성 (초기화 목적)
try:
    chroma_client.delete_collection(collection_name_pubmed)
    print(f"Existing collection '{collection_name_pubmed}' deleted.")
except Exception as e:
    print(f"No existing collection '{collection_name_pubmed}' to delete or error during deletion: {e}")

db_pubmed_research = chroma_client.create_collection(
    name=collection_name_pubmed,
    embedding_function=GeminiEmbeddingFunction()
)
print(f"Collection '{collection_name_pubmed}' created.")

total_pubmed_articles_added = 0
# 이미 추가된 PMID를 추적하여 중복 방지
existing_pubmed_ids = set()

for query_term in pubmed_search_queries:
    print(f"\nSearching PubMed for: '{query_term}'")
    uids = fetch_pubmed_uids_by_query(query_term, retmax=100) # 각 쿼리당 100개 초록
    if not uids:
        print(f"No UIDs found for '{query_term}'. Skipping.")
        continue

    articles_data = fetch_pubmed_article_details(uids)

    new_docs, new_metas, new_ids = [], [], []
    for article in articles_data:
        pmid = article["metadata"]["pmid"]
        if pmid != "N/A" and pmid not in existing_pubmed_ids: # 유효한 PMID이고 중복이 아닐 경우
            new_docs.append(article["text"])
            new_metas.append(preprocess_metadata(article["metadata"]))
            new_ids.append(pmid)
            existing_pubmed_ids.add(pmid)

    if new_ids:
        print(f"Adding {len(new_ids)} new PubMed articles to '{collection_name_pubmed}'...")
        batch_add(db_pubmed_research, new_docs, new_metas, new_ids, batch_size=100)
        total_pubmed_articles_added += len(new_ids)
        print(f"Successfully added {len(new_ids)} articles.")
    else:
        print("No new PubMed articles to add from this query term (possibly all duplicates or parsing error).")
    time.sleep(1) # 각 검색 쿼리 사이에 1초 지연 (API 제한 준수)

print(f"\nTotal PubMed articles added to '{collection_name_pubmed}': {total_pubmed_articles_added}")
print("PubMed data collection and embedding complete.")


--- Collecting PubMed data and creating a dedicated PubMed ChromaDB ---
No existing collection 'pubmed_research_articles' to delete or error during deletion: Collection [pubmed_research_articles] does not exists
Collection 'pubmed_research_articles' created.

Searching PubMed for: 'mitochondria programmed cell death plants'


  embedding_function=GeminiEmbeddingFunction()


Adding 100 new PubMed articles to 'pubmed_research_articles'...
Successfully added 100 articles.

Searching PubMed for: 'strabismus amblyopia visual acuity'
Adding 99 new PubMed articles to 'pubmed_research_articles'...
Successfully added 99 articles.

Searching PubMed for: 'syncope infants water induced urticaria'
Adding 2 new PubMed articles to 'pubmed_research_articles'...
Successfully added 2 articles.

Searching PubMed for: 'transanal pull-through long-term results'
Adding 86 new PubMed articles to 'pubmed_research_articles'...
Successfully added 86 articles.

Searching PubMed for: 'mammography screening tailored interventions women'
Adding 100 new PubMed articles to 'pubmed_research_articles'...
Successfully added 100 articles.

Searching PubMed for: 'double balloon enteroscopy efficacy safety community setting'
No UIDs found for 'double balloon enteroscopy efficacy safety community setting'. Skipping.

Searching PubMed for: 'emergency general surgery laparotomy mortality'
Adding

In [29]:
# Cell 5
def get_relevant_passage(query, db, n_results=5): # 이 함수는 단일 DB 검색용으로 유지
  if db is None:
      return []
  results = db.query(query_texts=[query], n_results=n_results, include=['documents', 'metadatas', 'distances']) # distances 포함
  passages = []
  if results and results['documents'] and results['documents'][0]:
      for i in range(len(results['documents'][0])):
          doc = results['documents'][0][i]
          meta = results['metadatas'][0][i]
          distance = results['distances'][0][i] # 거리 값 포함

          if db.name == "my_collection": # db_general에 해당
              passage_text = f"일반 질병 정보: {doc} (나이: {meta.get('age', '정보 없음')}, 성별: {meta.get('gender', '정보 없음')}, 혈압: {meta.get('blood_pressure', '정보 없음')}, 콜레스테롤: {meta.get('cholesterol', '정보 없음')})"
          elif db.name == "bioasq_qa_answers": # db_qa에 해당
              passage_text = f"BioASQ Q&A: {doc}"
          elif db.name == "pubmed_research_articles": # db_pubmed_research에 해당
              passage_text = f"PubMed 논문 초록: {doc}"
          else:
              passage_text = doc # 알 수 없는 DB

          passages.append({'text': passage_text, 'distance': distance, 'source_db': db.name})
  return passages

# get_relevant_passage_intelligent 함수를 다중 DB 검색 및 통합으로 변경
def get_relevant_passage_intelligent(query, db_general, db_qa, db_pubmed_research, n_results=5):
    all_retrieved_passages = []

    # 각 DB에서 독립적으로 검색하고 결과를 통합
    if db_general is not None:
        general_passages = get_relevant_passage(query, db_general, n_results)
        for p in general_passages:
            all_retrieved_passages.append(p)
            print(f"Retrieved from general DB (dist={p['distance']:.4f}): {p['text'][:50]}...")

    if db_qa is not None:
        qa_passages = get_relevant_passage(query, db_qa, n_results)
        for p in qa_passages:
            all_retrieved_passages.append(p)
            print(f"Retrieved from BioASQ DB (dist={p['distance']:.4f}): {p['text'][:50]}...")

    if db_pubmed_research is not None:
        pubmed_passages = get_relevant_passage(query, db_pubmed_research, n_results)
        for p in pubmed_passages:
            all_retrieved_passages.append(p)
            print(f"Retrieved from PubMed DB (dist={p['distance']:.4f}): {p['text'][:50]}...")
    
    # 거리(유사도) 기준으로 정렬합니다. (거리가 낮을수록 더 관련 높음)
    all_retrieved_passages.sort(key=lambda x: x['distance'])

    # LLM에게 전달할 최종 Passage는 상위 N개만 선택
    # 여기서는 각 DB에서 n_results만큼 가져온 후 통합 정렬하여 최대 n_results * DB_COUNT 개가 될 수 있음
    # LLM 컨텍스트 길이를 고려하여 최종 반환 개수를 제한
    # 일단은 정렬된 모든 Passage를 반환하되, 너무 많으면 프롬프트에서 잘리도록
    # LLM 프롬프트에 전달할 때는 'text' 필드만 추출
    return all_retrieved_passages[:n_results*3] # 최대 3 * n_results 개의 passage 반환 (모든 DB에서 가져오므로)

In [30]:
# Cell 6 
# db가 None이 아닐 경우에만 get() 호출
if db is not None:
    sample_data = db.get(include=['documents', 'embeddings'], limit=3)
    df = pd.DataFrame({
        "IDs": sample_data['ids'],
        "Documents": sample_data['documents'],
        "Embeddings": [str(emb)[:50] + "..." for emb in sample_data['embeddings']]
    })
    print("--- Sample Primary ChromaDB (my_collection) Data ---")
    print(df)
else:
    print("Primary ChromaDB (db) is not initialized, skipping sample_data display.")

# db_bioasq_qa가 None이 아닐 경우에만 get() 호출
if db_bioasq_qa is not None:
    sample_bioasq_data = db_bioasq_qa.get(include=['documents', 'embeddings', 'metadatas'], limit=3)
    df_bioasq = pd.DataFrame({
        "IDs": sample_bioasq_data['ids'],
        "Documents": sample_bioasq_data['documents'],
        "Metadatas": sample_bioasq_data['metadatas'],
        "Embeddings": [str(emb)[:50] + "..." for emb in sample_bioasq_data['embeddings']]
    })
    print("\n--- Sample BioASQ ChromaDB (bioasq_qa_answers) Data ---")
    print(df_bioasq)
else:
    print("BioASQ ChromaDB (db_bioasq_qa) is not initialized, skipping sample_bioasq_data display.")

# db_pubmed_research 컬렉션 데이터 샘플 추가
if 'db_pubmed_research' in globals() and db_pubmed_research is not None:
    sample_pubmed_research_data = db_pubmed_research.get(include=['documents', 'embeddings', 'metadatas'], limit=3)
    df_pubmed_research = pd.DataFrame({
        "IDs": sample_pubmed_research_data['ids'],
        "Documents": sample_pubmed_research_data['documents'],
        "Metadatas": sample_pubmed_research_data['metadatas'],
        "Embeddings": [str(emb)[:50] + "..." for emb in sample_pubmed_research_data['embeddings']]
    })
    print("\n--- Sample PubMed Research ChromaDB (pubmed_research_articles) Data ---")
    print(df_pubmed_research)
else:
    print("\nPubMed Research ChromaDB (db_pubmed_research) is not initialized, skipping sample data display.")


# 예시 사용: 모든 DB를 함께 사용하는 지능형 검색 테스트
# 일반 질병 쿼리 예시
query_general = "나이: 30세, 성별: Female, 증상: 기침, 콧물, 인후통"
passages_general = get_relevant_passage_intelligent(query_general, db, db_bioasq_qa, db_pubmed_research, 5)
print("\n--- Passages for General Query (using intelligent search) ---")
for p in passages_general:
    print(p)

print("-" * 50)

# 의학 논문/QA 쿼리 예시 (BioASQ 및 PubMed Research DB 활용)
query_medical = "Can tailored interventions increase mammography use among HMO women? A clinical study"
passages_medical = get_relevant_passage_intelligent(query_medical, db, db_bioasq_qa, db_pubmed_research, 5)
print("\n--- Passages for Medical QA Query (using intelligent search) ---")
for p in passages_medical:
    print(p)

print("-" * 50)

# 일반적인 증상 쿼리 (세 가지 DB 중 더 적합한 DB 사용)
query_symptom = "Fever, Fatigue, Difficulty Breathing"
passages_symptom = get_relevant_passage_intelligent(query_symptom, db, db_bioasq_qa, db_pubmed_research, 5)
print("\n--- Passages for Symptom Query (using intelligent search) ---")
for p in passages_symptom:
    print(p)

# LLM 답변 생성 예시 (Geminai.ipynb의 make_prompt 함수 -> 이름 변경)
# make_prompt_for_general_chat 함수는 관련 정보의 텍스트만 필요로 합니다.
# get_relevant_passage_intelligent는 이제 딕셔너리 리스트를 반환하므로, 'text' 필드만 추출하여 전달합니다.
def make_prompt_for_general_chat(query, relevant_passages_texts): # 함수 인자명을 명확히 하고, 일반 채팅용임을 표시
  escaped = " ".join([p.replace("'", "").replace('"', "").replace("\n", " ") for p in relevant_passages_texts]) # text_only_passages 사용
  prompt = f"""
  당신은 사용자의 증상과 개인 프로필 정보를 기반으로 질병을 설명하고, **일상생활에서 할 수 있는 구체적이고 실용적인 조언을 상세하게 제공하는** 의료 상담 도우미입니다. 당신의 답변은 정보가 풍부하고 친절하며, **극단적이거나 심각한 질병을 직접적으로 진단하거나 추천하는 뉘앙스를 피해야 합니다.** 답변은 최소 100단어 이상으로 작성해 주세요.

  **단계별 지시사항:**
  1. 사용자 질문을 이해하고 핵심 증상(예: 기침, 콧물)을 정확하고 상세하게 파악하세요.
  2. 제공된 '관련 정보 (PASSAGE)'를 면밀히 검토하여 사용자 증상과 가장 밀접하게 일치하는 질병(들)을 식별하되, **데이터에 기반한 질병 연관성을 언급하되 불필요하게 심각성을 강조하지 마세요.**
  3. **여러 질병이 검색될 경우, 가장 일반적이거나 흔한 질환(예: 감기, 알레르기)을 우선적으로 상세히 설명하고, 그 다음으로 관련된 다른 질병들도 간략하게 제시하세요.**
  4. 나이, 성별, 혈압, 콜레스테롤 수치와 같은 환자 프로필 정보가 있다면, 이를 **답변의 서론 부분에 해당 질병이 특정 프로필의 환자에게서 관찰될 수 있는 '사례'로 자연스럽게 통합하여 설명의 깊이를 더하세요.** 질병 진단의 직접적인 근거로 오해되지 않도록 주의하세요.
  5. 답변은 정보가 풍부하고 명확하며 친절하게 작성하며, 다음 **상세 권장 출력 형식**을 따르되, **세부적인 구문은 모델의 자연스러운 생성에 맡기세요.**

  **상세 권장 출력 형식:**
  안녕하세요! [사용자 질문에서 파악된 증상]이(가) 있으시군요. 불편하시겠지만, 몇 가지 가능한 원인과 생활 속 대처법을 함께 알아보겠습니다.

  (선택적: 임상 데이터에 따르면, [관련 정보의 나이]세 [관련 정보의 성별] 환자 중 [관련 정보의 혈압] 혈압과 [관련 정보의 콜레스테롤] 콜레스테롤 수치를 가진 분들에게서 [해당 질병과 연결된 증상]이 관찰된 사례가 있습니다.) 이러한 증상들은 [관련 정보에서 찾은 가장 일반적이고 가능성 높은 질병]과 관련이 있을 수 있습니다.

  [질병에 대한 간략한 추가 설명 (2-3문장)]. 이 질병의 일반적인 경과나 특징에 대해 간략히 설명해 주세요.

  이럴 때는 다음과 같은 생활 습관 개선을 통해 증상 완화에 도움을 줄 수 있습니다:
  - **충분한 휴식:** 몸이 회복하는 데 필요한 시간을 주세요.
  - **수분 섭취:** 따뜻한 물, 차 등을 자주 마셔 목을 촉촉하게 유지하고 탈수를 예방하세요.
  - **실내 환경 관리:** 적절한 실내 습도를 유지하고 환기를 자주 해주세요.
  - **영양가 있는 음식 섭취:** 면역력 강화를 위해 비타민과 미네랄이 풍부한 음식을 드세요.
  - [추가적인 일반적인 조언 1 (예: 스트레스 관리, 가벼운 운동 등)]
  - [추가적인 일반적인 조언 2 (예: 마스크 착용, 손 씻기 등)]

  만약 [사용자 질문에서 파악된 증상] 외에 다른 불편한 증상이 있거나, 현재 증상이 나아지지 않고 오히려 심해진다면 [다른 관련 질병]일 수도 있습니다. (이때, 극단적인 질병은 가급적 언급하지 않거나, \\\"드물게는 ~일 수도 있습니다\\\"와 같이 조심스러운 표현을 사용하세요.)

  더 궁금한 점이 있으시면 언제든지 다시 질문해주세요. 항상 건강하시길 바랍니다.

  아래는 참고할 수 있는 임상 데이터입니다:
  - 사용자 질문 (QUESTION): \\\"{query}\\\"
  - 관련 정보 (PASSAGE): \\\"{escaped}\\\"

  **주의사항:**
  - 병원 방문 및 전문적인 상담을 직접적으로 권유하는 문구는 최종 답변에 포함하지 마세요.
  - PASSAGE에 영어 단어가 포함되어 있다면, 괄호 안에 한글 뜻을 함께 제공해 주세요.
  - **제공된 정보 내에서 '기침'과 '습진'의 연관성이 있더라도, '기침'이라는 증상에 더 일반적이고 흔한 질병(예: Common Cold, Influenza)이 있다면 이를 우선적으로 고려하여 답변하세요.**
  - **'말라리아'와 같이 심각한 질병은 사용자가 직접적으로 언급하지 않는 한, 일반적인 증상만으로는 추천하지 마세요.**

  ANSWER:
  """.format(query=query, relevant_passages=escaped)
  return prompt

query = "나이: 25세, 성별: Male, 혈압: Normal, 콜레스테롤: Normal, 증상 : 심각한 공부하기 싫음"
passages_from_retrieval = get_relevant_passage_intelligent(query, db, db_bioasq_qa, db_pubmed_research, 5)

text_only_passages_for_prompt = [p['text'] for p in passages_from_retrieval]
prompt = make_prompt_for_general_chat(query, text_only_passages_for_prompt) # make_prompt -> make_prompt_for_general_chat 로 함수명 변경

MODEL_ID = "gemini-2.0-flash"
answer = client.models.generate_content(
    model = MODEL_ID,
    contents = prompt
)

if answer and hasattr(answer, 'text') and isinstance(answer.text, str):
    final_answer_text = answer.text
    
    final_answer = final_answer_text.split("ANSWER:")
    if len(final_answer) > 1:
        final_answer = final_answer[1].strip()
    else:
        final_answer = final_answer_text.strip()
elif answer and hasattr(answer, 'text') and isinstance(answer.text, list):
    final_answer_text = " ".join(answer.text)
    final_answer = final_answer_text.split("ANSWER:")
    if len(final_answer) > 1:
        final_answer = final_answer[1].strip()
    else:
        final_answer = final_answer_text.strip()
else:
    final_answer = "LLM 응답 생성 실패 또는 빈 응답"

print("\n--- LLM 답변 ---")
print(final_answer)

--- Sample Primary ChromaDB (my_collection) Data ---
  IDs                                          Documents  \
0   0  증상: Fever, Fatigue, Difficulty Breathing이(가) 있...   
1   1  증상: Cough, Fatigue이(가) 있고, 나이: 25세, 성별: Female...   
2   2  증상: Cough, Fatigue이(가) 있고, 나이: 25세, 성별: Female...   

                                          Embeddings  
0  [ 2.24745572e-02 -7.10671246e-02 -3.98193412e-...  
1  [ 6.27511144e-02 -6.39859959e-02 -7.34391063e-...  
2  [ 5.98195456e-02 -6.23048060e-02 -7.82496259e-...  

--- Sample BioASQ ChromaDB (bioasq_qa_answers) Data ---
                        IDs  \
0  55031181e9bde69634000014   
1  55046d5ff8aee20f27000007   
2  54e25eaaae9738404b000017   

                                           Documents  \
0  Question: Is Hirschsprung disease a mendelian ...   
1  Question: List signaling molecules (ligands) t...   
2  Question: Is the protein Papilin secreted? Ans...   

                                           Metadatas  \
0  {'id': '55031181e9bd

In [None]:
# Cell 7
import time
import re # 정규 표현식 사용을 위해 re 모듈 임포트

def make_short_answer_prompt_for_pubmedqa(query, relevant_passages):
    escaped_passages = " ".join([p.replace("'", "").replace('"', "").replace("\\n", " ") for p in relevant_passages])

    if not escaped_passages or escaped_passages == "제공된 관련 정보가 없습니다.":
        escaped_passages = "제공된 관련 정보가 없습니다."

    prompt = f"""
    당신은 의학적 질문에 대해 'YES', 'NO', 'MAYBE'로만 답변하는 의료 AI 도우미입니다.
    당신은 **오직 제공된 '참고 정보'에 기반하여 답변해야 합니다.** '참고 정보'에는 일반 질병 정보, BioASQ Q&A 데이터, 그리고 PubMed 논문 초록이 포함될 수 있습니다. 만약 'MAYBE'로 답변한다면 그것은 질문에 답하는 데 실패했음을 의미하며, 이는 바람직하지 않습니다. **가능하다면 'YES'나 'NO'로 답변하려고 최선을 다해야 합니다.**

    **단계별 추론 과정 (Chain-of-Thought):**
    1.  **질문 분석:** '의학적 질문'이 무엇을 묻고 있는지 핵심 요소를 파악하세요.
    2.  **정보 탐색 및 매칭:** '참고 정보 (PASSAGE)'를 면밀히 검토하여 '의학적 질문'에 대한 **직접적이고 명확한 긍정적 증거나 부정적 증거가 있는지** 찾으세요. 특히 PubMed 논문 초록 정보가 있다면, 해당 연구 결과가 질문에 직접적인 답을 제공하는지 주의 깊게 확인하세요. 만약 '참고 정보'가 "제공된 관련 정보가 없습니다."라고 되어 있다면, 이를 명확히 인지하고 다음 단계를 진행하세요.
    3.  **판단 근거 도출 (제공된 정보에 엄격하게 기반):**
        * 만약 '참고 정보'에서 질문에 대한 **직접적이고 명확한 긍정적 증거**를 찾았다면, 그 근거를 요약하여 제시하고 'YES'라고 판단합니다.
        * 만약 '참고 정보'에서 질문에 대한 **직접적이고 명확한 부정적 증거** 또는 질문과 **명백히 상반되는 내용**을 찾았다면, 그 근거를 요약하여 제시하고 'NO'라고 판단합니다.
        * 만약 '참고 정보'에 질문에 대한 **직접적이고 명확한 긍정적 또는 부정적 증거가 전혀 없다면**, LLM의 일반적인 지식을 사용한 추론을 시도하지 말고, 단순히 '참고 정보에 명확한 근거가 없습니다.'라고 명시하세요. 이 경우 'MAYBE'로 결정될 것입니다.
    4.  **최종 답변 결정 (단 하나의 단어):** 위 분석과 판단 근거를 바탕으로 다음 지침을 **반드시** 따르세요.
        * **최종 답변:** 'YES', 'NO', 'MAYBE' 중 오직 하나의 단어만 선택하여 이 태그 뒤에 **아무것도 추가하지 않고** 출력하세요.
        * 예시: `최종 답변: YES`
        * **절대 다른 설명이나 문구를 덧붙이지 마세요.**

    **의학적 질문 (QUESTION):** \"{query}\"
    **참고 정보 (PASSAGE):** \"{escaped_passages}\"

    ANSWER:
    **단계별 분석 및 판단:**
    """
    return prompt

def evaluate_accuracy_pubmedqa(llm_response: str, ground_truth_decision: str) -> bool:
    normalized_llm_response = llm_response.strip().lower()
    normalized_ground_truth = ground_truth_decision.strip().lower()

    extracted_answer = "maybe" # 기본값 설정

    match = re.search(r'최종 답변:\s*(yes|no|maybe)', normalized_llm_response, re.DOTALL)
    if match:
        extracted_answer = match.group(1).strip()
    else:
        # 가장 마지막에 나오는 명확한 YES/NO/MAYBE를 찾기 위해
        # 단어 경계를 사용하는 re.search + 순서대로 확인
        if re.search(r'\byes\b', normalized_llm_response):
            extracted_answer = 'yes'
        elif re.search(r'\bno\b', normalized_llm_response):
            extracted_answer = 'no'
        elif re.search(r'\bmaybe\b', normalized_llm_response):
            extracted_answer = 'maybe'
        # 다른 경우 (아무것도 없거나 여러 개 혼재) 기본값 'maybe' 유지

    if extracted_answer not in ['yes', 'no', 'maybe']:
        extracted_answer = 'maybe' # 최종 유효성 검사

    return extracted_answer == normalized_ground_truth

print("\nLoading PubMedQA dataset...")
pubmedqa_dataset = load_dataset("pubmed_qa", "pqa_labeled")
pubmedqa_test_data = pubmedqa_dataset['train']
print(f"PubMedQA test set loaded with {len(pubmedqa_test_data)} examples.")


all_accuracies = []
example_evaluations = []

num_samples_to_evaluate = 50
num_samples_to_show_examples = 25

print(f"\nEvaluating on {num_samples_to_evaluate} PubMedQA samples for Accuracy (using PubMed, BioASQ, General DBs with strict CoT and no aggressive post-processing)...")

for i, entry in enumerate(pubmedqa_test_data):
    if i >= num_samples_to_evaluate:
        break

    question = entry['question']
    ground_truth_decision = entry['final_decision']

    #동적 검색 로직
    initial_relevant_passages_with_meta = []
    initial_text_passages = []
    parsed_first_answer = "maybe" # 초기 LLM 응답 파싱 결과 (maybe로 초기화)
    llm_first_response = "최종 답변: MAYBE" # LLM의 첫 번째 응답 (기본값)
    final_llm_response_for_eval = "최종 답변: MAYBE" # 최종 평가에 사용될 LLM 응답
    final_passages_used = [] # 최종 평가에 사용될 Passage

    try:
        # 1. 초기 RAG 검색 (모든 DB에서 가져옴)
        print(f"Sample {i}: Initial RAG search...")
        initial_relevant_passages_with_meta = get_relevant_passage_intelligent(question, db, db_bioasq_qa, db_pubmed_research, 5)

        initial_text_passages = [p['text'] for p in initial_relevant_passages_with_meta]
        
        # 2. LLM에게 초기 정보로 답변을 시도
        initial_prompt = make_short_answer_prompt_for_pubmedqa(question, initial_text_passages)
        
        MODEL_ID = "gemini-2.0-flash"
        response = client.models.generate_content(
            model=MODEL_ID,
            contents=initial_prompt
        )
        llm_first_response = response.text.strip()
        time.sleep(2) # API 호출 간 2초 지연 (Quota exceeded 방지 위해 증가)

        # LLM의 첫 번째 답변에서 'YES', 'NO', 'MAYBE'를 파싱
        # evaluate_accuracy_pubmedqa 함수 내부의 파싱 로직을 여기에도 적용
        match_first = re.search(r'최종 답변:\s*(yes|no|maybe)', llm_first_response.lower(), re.DOTALL)
        if match_first:
            parsed_first_answer = match_first.group(1).strip()
        else:
            # 태그 없이 'yes', 'no', 'maybe' 단독 출현 시
            if re.search(r'\byes\b', llm_first_response.lower()) and not re.search(r'\bno\b', llm_first_response.lower()):
                parsed_first_answer = 'yes'
            elif re.search(r'\bno\b', llm_first_response.lower()) and not re.search(r'\byes\b', llm_first_response.lower()):
                parsed_first_answer = 'no'
            elif re.search(r'\bmaybe\b', llm_first_response.lower()):
                parsed_first_answer = 'maybe'
            else:
                parsed_first_answer = 'maybe' # 최종적으로도 파싱 안 되면 maybe

        final_llm_response_for_eval = llm_first_response # 초기 응답으로 설정
        final_passages_used = initial_relevant_passages_with_meta # 초기 Passage로 설정
        
    except Exception as e:
        if "429 RESOURCE_EXHAUSTED" in str(e):
            print(f"Quota exceeded (initial) for Sample {i}. Retrying after 5 seconds...")
            time.sleep(5)
            try: # 재시도
                response = client.models.generate_content(model=MODEL_ID, contents=initial_prompt)
                llm_first_response = response.text.strip()
                time.sleep(2) # 재시도 후에도 지연
                
                # 재시도 성공 시 첫 답변 파싱
                match_first = re.search(r'최종 답변:\s*(yes|no|maybe)', llm_first_response.lower(), re.DOTALL)
                if match_first:
                    parsed_first_answer = match_first.group(1).strip()
                else:
                    if re.search(r'\byes\b', llm_first_response.lower()) and not re.search(r'\bno\b', llm_first_response.lower()): parsed_first_answer = 'yes'
                    elif re.search(r'\bno\b', llm_first_response.lower()) and not re.search(r'\byes\b', llm_first_response.lower()): parsed_first_answer = 'no'
                    elif re.search(r'\bmaybe\b', llm_first_response.lower()): parsed_first_answer = 'maybe'
                    else: parsed_first_answer = 'maybe'

                final_llm_response_for_eval = llm_first_response
                final_passages_used = initial_relevant_passages_with_meta
            except Exception as e_retry:
                print(f"Error on retry (initial) for Sample {i}: {e_retry}")
        else:
            print(f"Error processing initial response for Sample {i}: {e}")

    # 3. 정보 부족 판단 및 동적 추가 검색 (parsed_first_answer가 'maybe'인 경우)
    if parsed_first_answer == 'maybe':
        print(f"Sample {i}: Initial LLM response is MAYBE. Attempting dynamic PubMed retrieval for additional info.")
        
        dynamic_pubmed_uids = fetch_pubmed_uids_by_query(question, retmax=5) # 질문당 추가 5개 초록
        dynamic_pubmed_articles = fetch_pubmed_article_details(dynamic_pubmed_uids)
        time.sleep(1) # PubMed API 호출 간 1초 지연

        if dynamic_pubmed_articles:
            new_passages_from_dynamic = []
            for art in dynamic_pubmed_articles:
                new_passages_from_dynamic.append({
                    'text': art['text'],
                    'distance': 0.0,
                    'source_db': 'PubMed_Dynamic'
                })
            
            combined_passages_with_meta = initial_relevant_passages_with_meta + new_passages_from_dynamic
            combined_passages_with_meta.sort(key=lambda x: x['distance'])

            combined_text_passages = [p['text'] for p in combined_passages_with_meta]
            final_prompt = make_short_answer_prompt_for_pubmedqa(question, combined_text_passages)

            # 4. LLM에게 재시도 (추가된 정보로)
            try:
                print(f"  Sample {i}: Re-evaluating LLM with combined passages.")
                response = client.models.generate_content(
                    model=MODEL_ID,
                    contents=final_prompt
                )
                final_llm_response_for_eval = response.text.strip()
                final_passages_used = combined_passages_with_meta # 평가에 사용된 최종 Passage 업데이트
                time.sleep(2) # API 호출 간 2초 지연 (증가)
            except Exception as e:
                print(f"Error processing dynamic re-evaluation for Sample {i}: {e}")
        else:
            print(f"  Sample {i}: No additional relevant PubMed articles found dynamically for this query.")
    
    # 최종 정확도 평가
    is_correct = evaluate_accuracy_pubmedqa(final_llm_response_for_eval, ground_truth_decision)
    all_accuracies.append(is_correct)

    if i < num_samples_to_show_examples:
        example_evaluations.append({
            'question': question,
            'ground_truth_decision': ground_truth_decision,
            'llm_raw_response': final_llm_response_for_eval,
            'is_correct': is_correct,
            'relevant_passages_used': final_passages_used
        })

    print(f"Sample {i} finished. Result: {'Correct' if is_correct else 'Incorrect'} (Ground Truth: {ground_truth_decision}, LLM: {final_llm_response_for_eval.split('최종 답변:')[-1].strip() if '최종 답변:' in final_llm_response_for_eval.lower() else final_llm_response_for_eval[:30]})")


print(f"\n--- PubMedQA 평가 예시 (첫 {num_samples_to_show_examples}개, 정확도) ---")
for ex in example_evaluations:
    print(f"질문: {ex['question']}")
    print(f"정답 (Ground Truth Decision): {ex['ground_truth_decision']}")
    print(f"LLM 원본 응답:\n{ex['llm_raw_response']}")

    temp_llm_response = ex['llm_raw_response'].strip().lower()
    extracted_answer_for_display = "maybe"
    final_answer_tag_start = temp_llm_response.rfind('최종 답변:')
    if final_answer_tag_start != -1:
        answer_raw_text_for_display = temp_llm_response[final_answer_tag_start + len('최종 답변:'):].strip()
        words_for_display = answer_raw_text_for_display.split()
        if words_for_display:
            extracted_answer_for_display = words_for_display[0].strip().lower()
        else:
            extracted_answer_for_display = 'maybe'
    else:
        if 'yes' == temp_llm_response:
            extracted_answer_for_display = 'yes'
        elif 'no' == temp_llm_response:
            extracted_answer_for_display = 'no'
        elif 'maybe' == temp_llm_response:
            extracted_answer_for_display = 'maybe'
        else:
            extracted_answer_for_display = 'maybe'

    if extracted_answer_for_display not in ['yes', 'no', 'maybe']:
        extracted_answer_for_display = 'maybe'

    print(f"LLM 추출 답변: {extracted_answer_for_display}")
    print(f"정확도 일치 여부: {ex['is_correct']}")
    print(f"사용된 관련 정보:")
    for passage_info in ex['relevant_passages_used']:
        source_db = passage_info.get('source_db', 'Unknown DB')
        distance = passage_info.get('distance', 'N/A')
        text_preview = passage_info['text'][:100] + "..." if len(passage_info['text']) > 100 else passage_info['text']
        print(f"  - [{source_db}, Dist: {distance:.4f}]: {text_preview}")
    print("-" * 50)

if all_accuracies:
    total_correct = sum(all_accuracies)
    accuracy = total_correct / len(all_accuracies)
    print("\n--- PubMedQA 평가 결과 (정확도) ---")
    print(f"평가 샘플 수: {len(all_accuracies)}")
    print(f"정확도: {accuracy:.4f}")
else:
    print("\nNo evaluation results to display.")


Loading PubMedQA dataset...
PubMedQA test set loaded with 1000 examples.

Evaluating on 50 PubMedQA samples for Accuracy (using PubMed, BioASQ, General DBs with strict CoT and no aggressive post-processing)...
Sample 0: Initial RAG search...
Retrieved from general DB (dist=0.6623): 일반 질병 정보: 증상: Fatigue이(가) 있고, 나이: 70세, 성별: Male, 혈...
Retrieved from general DB (dist=0.6714): 일반 질병 정보: 증상: Fatigue이(가) 있고, 나이: 50세, 성별: Male, 혈...
Retrieved from general DB (dist=0.6733): 일반 질병 정보: 증상: Fatigue이(가) 있고, 나이: 30세, 성별: Male, 혈...
Retrieved from general DB (dist=0.6761): 일반 질병 정보: 증상: Fatigue이(가) 있고, 나이: 45세, 성별: Female,...
Retrieved from general DB (dist=0.6779): 일반 질병 정보: 증상: Fatigue이(가) 있고, 나이: 55세, 성별: Female,...
Retrieved from BioASQ DB (dist=0.5232): BioASQ Q&A: Question: Is alternative splicing of a...
Retrieved from BioASQ DB (dist=0.5512): BioASQ Q&A: Question: What are the results of loss...
Retrieved from BioASQ DB (dist=0.5735): BioASQ Q&A: Question: Can mitochondria transfer fr...
