# imports

In [None]:
import pandas as pd

In [None]:
# df = pd.read_csv("../src/data/raw/discharge.csv", nrows=1000)
# print(df.head())

          note_id  subject_id   hadm_id note_type  note_seq  \
0  10000032-DS-21    10000032  22595853        DS        21   
1  10000032-DS-22    10000032  22841357        DS        22   
2  10000032-DS-23    10000032  29079034        DS        23   
3  10000032-DS-24    10000032  25742920        DS        24   
4  10000084-DS-17    10000084  23052089        DS        17   

             charttime            storetime  \
0  2180-05-07 00:00:00  2180-05-09 15:26:00   
1  2180-06-27 00:00:00  2180-07-01 10:15:00   
2  2180-07-25 00:00:00  2180-07-25 21:42:00   
3  2180-08-07 00:00:00  2180-08-10 05:43:00   
4  2160-11-25 00:00:00  2160-11-25 15:09:00   

                                                text  
0   \nName:  ___                     Unit No:   _...  
1   \nName:  ___                     Unit No:   _...  
2   \nName:  ___                     Unit No:   _...  
3   \nName:  ___                     Unit No:   _...  
4   \nName:  ___                    Unit No:   __...  


In [None]:
from pathlib import Path

# Project root: the parent of the directory the notebook kernel runs in.
BASE_DIR = Path().resolve().parent

# Location handed to Retriever as its index directory.
VECTOR_DIR = BASE_DIR.joinpath("src", "data", "vector_store")

In [None]:
from app.retrieval.search import Retriever

# First probe query used by the search cells below.
query = "patient diagnosed with acute myocardial infarction"

# Build the retriever on top of the on-disk vector store directory.
retriever = Retriever(VECTOR_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Retriever loaded successfully.


In [None]:
import faiss
import pickle
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

class Retriever:
    """Semantic search over a FAISS index with pickled per-vector metadata.

    Reads ``poc_index.index`` and ``poc_metadata.pkl`` from ``index_dir`` and
    embeds queries with a SentenceTransformer model before searching.

    NOTE(review): an earlier cell of this notebook imports a ``Retriever``
    from ``app.retrieval.search``; this definition shadows that name from
    this cell onward.
    """

    # Prefix prepended to every query before it is embedded.
    # NOTE(review): the BGE v1.5 model card appears to document
    # "Represent this sentence for searching relevant passages: " as the
    # query instruction. Confirm before changing; it alters the scores.
    QUERY_INSTRUCTION = "Represent this sentence for retrieval: "

    def __init__(self, index_dir: Path, model_name: str = "BAAI/bge-base-en-v1.5"):
        """Locate the index files, load the embedding model, then the index.

        Args:
            index_dir: Directory containing ``poc_index.index`` and
                ``poc_metadata.pkl``. A plain ``str`` is accepted as well.
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            FileNotFoundError: If the index or metadata file is missing.
        """
        index_dir = Path(index_dir)
        self.index_path = index_dir / "poc_index.index"
        self.meta_path = index_dir / "poc_metadata.pkl"

        # Fail fast on a bad path before paying for the model load.
        self._check_files()
        self.model = SentenceTransformer(model_name)

        self._load_index()

    def _check_files(self):
        """Raise FileNotFoundError if either on-disk artifact is missing."""
        if not self.index_path.exists():
            raise FileNotFoundError(f"Index not found at {self.index_path}")
        if not self.meta_path.exists():
            raise FileNotFoundError(f"Metadata not found at {self.meta_path}")

    def _load_index(self):
        """Read the FAISS index and the pickled metadata into memory.

        Raises:
            FileNotFoundError: If the index or metadata file is missing.
        """
        self._check_files()

        self.index = faiss.read_index(str(self.index_path))
        # SECURITY: unpickling can execute arbitrary code. Only point this
        # class at metadata files produced by a trusted pipeline.
        with open(self.meta_path, "rb") as f:
            self.metadata = pickle.load(f)
        print("Retriever loaded successfully.")

    def search(self, query: str, k: int = 3):
        """Return up to ``k`` metadata entries closest to ``query``.

        Args:
            query: Free-text query; ``QUERY_INSTRUCTION`` is prepended before
                embedding.
            k: Number of neighbours requested from the index.

        Returns:
            A list of dicts with keys ``"score"``, ``"text"`` and
            ``"note_id"``, in the order returned by the index. Slots the
            index reports as ``-1`` are skipped, so the list may hold fewer
            than ``k`` items.
        """
        query_text = f"{self.QUERY_INSTRUCTION}{query}"

        query_embedding = self.model.encode([query_text], normalize_embeddings=True)
        # Hand FAISS a C-contiguous float32 matrix regardless of what the
        # encoder returned.
        query_matrix = np.ascontiguousarray(query_embedding, dtype="float32")
        distances, indices = self.index.search(query_matrix, k)

        results = []
        for score, idx in zip(distances[0], indices[0]):
            if idx == -1:  # no neighbour available for this slot
                continue
            meta = self.metadata[int(idx)]
            results.append({
                # Described by the original author as an L2 distance (lower is
                # better). NOTE(review): this depends on how the index was
                # built, which is not shown here.
                "score": float(score),
                "text": meta.get("chunk_text"),
                "note_id": meta.get("note_id"),
            })
        return results

In [None]:
# Fetch the three closest chunks for the current query.
results = retriever.search(query=query, k=3)

In [None]:
import json

# Pretty-print the hits (score, text, note_id) as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "score": 0.4247196614742279,
    "text": "Status: Clear and coherent. Level of Consciousness: Alert and interactive. Activity Status: Ambulatory - Independent. Discharge Instructions: Dear Dr. , was a pleasure taking care of you during this admission. You came to the hospital for chest pressure. You were found to have an elevation in your cardiac enzymes. You had a cardiac catheterization that showed the coronaries were clean, but you had mild anterolateral and mid inferior hypokinesis of the left ventricle. We are unsure what caused these findings, but are concerned that you either had an obstruction that resolved or a coronary vasospasm. You were feeling better, and we started you on a low dose of Metoprolol. You should have cardiology follow up, which Dr. set up. You should additionally have a repeat cardiac echo and can consider having your cardiologist order a cardiac MRI for you within the next two weeks to further evaluate for possible myocardial infarction. If you wou

In [None]:
# Probe: past-medical-history phrasing.
query = "history of atrial fibrillation"
results = retriever.search(query, k=3)

# Show the hits as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "score": 0.5971420407295227,
    "text": "CARDIAC HISTORY: # Atrial fibrillation (on coumadin) # Coronary artery disease # with EF 40% -PERCUTANEOUS CORONARY INTERVENTIONS: cardiac catheterization at in showing \"small vessel disease\", cardiac cath in showing two vessel disease without any intervention 3. OTHER PAST MEDICAL HISTORY: # History of Non-Hodgkin's lymphoma # Multinodular Goiter # Chronic Low Back Pain # s/p hysterectomy # s/p bilateral knee replacements # s/p bilateral eye surgery Social History: Family History: Diabetes; Grandmother died of MI at . Father: MI in , Mother: died before her of \"heart condition that was undiagnosed\" Physical Exam: ADMISSION PHYSICAL EXAM: Vitals - T 98 BP 154/77 HR 77 RR 22 SaO2 95% on 2L GENERAL: Elderly obese female appears mildly dyspneic HEENT: EOMI, MMM. Thyroid not palpable. No nodules palpated in neck. Swallow mechanism normal. CARDIAC: Irreg irreg, no m/r/g. LUNG: Distant lung sounds. Absence lung sounds at bilateral bases

In [None]:
# Probe: symptom description in patient-reported wording.
query = "patient reports chest pain radiating to left arm"
results = retriever.search(query, k=3)

# Show the hits as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "score": 0.41463416814804077,
    "text": "Name: Unit No: Admission Date: Discharge Date: Date of Birth: Sex: F Service: MEDICINE Allergies: Penicillins / Vasotec Attending: . Chief Complaint: Chest pain Major Surgical or Invasive Procedure: None History of Present Illness: HISTORY OF PRESENT ILLNESS: with a past medical history significant for CAD, chronic chest wall pain, dementia who presents with left-sided chest pain. The patient reports that she has had similar chest pain intermittently for many years. She notes, however, that she had an new component this morning that was a sharp left-sided pain with radiation into her left shoulder which she has not experienced in the past.At its peak, the pain was a intensity. The pain spontaneously resolved this morning after several minutes according to her report. Denies shortness of breath, nausea, vomiting, diaphoresis, orthopnea. The patient denies any fevers, cough, abdominal pain, urinary or bowel symptoms. In the ED, initial

In [None]:
# Probe: medication names and abbreviations.
query = "patient on ASA and Plavix"
results = retriever.search(query, k=3)

# Show the hits as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "score": 0.49249863624572754,
    "text": "to lab on and a DES was placed to LAD. - cont atorvastatin 80 mg and ASA 81 mg - Plavix 75mg daily. - Continue Metoprolol, Losarten #2 HTN: clinically stable - Continue LOsarten, Metoprolol, Isosorbide #3 HLD: cont atorvastatin 80 mg #4 SSS s/p pacer: tele stable. History of AFIB, on Xarelto - Now needs triple therapy - Reviewed with team best plan, given CLL and risk for bleeding and best evidence. Fellow recommending to continue Xarelto with ASA and Plavix and for his Cardiologist to determine whether to convert to Coumadin longer term #5 CLL: currently on ibutinib po daily. Per OMR notes pt has missed some doses lately. This medicine can lead to bleeding, thrombycytopenia and neutropenia. Appears to be tol well so far. WBC was 50 when initiated. Of note, no interaction with clopidogrel. restart three days post-procedure. #6 Cognitive impairment: unclear how severe this is as he remembers most of his meds and seems oriented at pres

# 1. Test: Specific Procedure & Volume (Targets Chunk 4 & 5)
# This tests whether the retriever can find specific numbers and procedural context.

In [None]:
from pathlib import Path

# NOTE(review): this repeats the path setup from an earlier cell of this notebook.
# Project root: the parent of the directory the notebook kernel runs in.
BASE_DIR = Path().resolve().parent

# Location handed to Retriever as its index directory.
VECTOR_DIR = BASE_DIR.joinpath("src", "data", "vector_store")