In [1]:
"""
NOTEBOOK 03: EMBEDDING GENERATION & VECTOR STORE
Convert de-identified clinical notes into embeddings and build patient-specific vector store
"""

# Install required packages
!pip install -q sentence-transformers chromadb

import os
import json
from datetime import datetime
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths
PROJECT_ROOT = "/content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project"
INPUT_DIR = f"{PROJECT_ROOT}/02_data_preprocessing/outputs/deidentified_notes"
OUTPUT_DIR = f"{PROJECT_ROOT}/03_knowledge_base/outputs"
EMBEDDINGS_DIR = f"{OUTPUT_DIR}/embeddings"
VECTOR_STORE_DIR = f"{OUTPUT_DIR}/vector_store"

os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)

# Load patient metadata
with open(f"{PROJECT_ROOT}/01_data_generation/outputs/patient_metadata.json", 'r') as f:
    metadata = json.load(f)

patients = metadata['patients']
print(f"‚úÖ Setup complete")
print(f"üìÇ Input: {INPUT_DIR}")
print(f"üìÇ Output: {OUTPUT_DIR}")
print(f"üìä Patients: {len(patients)}")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m52.0/52.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m21.1/21.1 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m278.2/278.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m17.4/17.4 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[2K 



Mounted at /content/drive
‚úÖ Setup complete
üìÇ Input: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentified_notes
üìÇ Output: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs
üìä Patients: 10


In [2]:
# Load sentence transformer model
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
print(f"üì• Loading embedding model: {MODEL_NAME}")

embedding_model = SentenceTransformer(MODEL_NAME)

# Test the model
test_text = "Patient has Type 2 Diabetes with HbA1c of 8.2%"
test_embedding = embedding_model.encode(test_text)

print(f"‚úÖ Model loaded successfully")
print(f"üìä Embedding dimension: {len(test_embedding)}")
print(f"üìä Sample embedding (first 10 values): {test_embedding[:10]}")

üì• Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Model loaded successfully
üìä Embedding dimension: 384
üìä Sample embedding (first 10 values): [ 0.03516354  0.03772797 -0.06244111  0.03789256 -0.08139446 -0.02656122
  0.05103768  0.10170711 -0.06696238 -0.05255044]


In [3]:
def chunk_clinical_note(note_text, chunk_size=512, overlap=50):
    """
    Split clinical note into semantic chunks
    Strategy: Split by sections (CHIEF COMPLAINT, HISTORY, etc.)
    and further by character limit
    """
    # Define section headers in clinical notes
    section_headers = [
        "CHIEF COMPLAINT:",
        "HISTORY OF PRESENT ILLNESS:",
        "HISTORY:",
        "CURRENT MEDICATIONS:",
        "VITAL SIGNS:",
        "PHYSICAL EXAMINATION:",
        "LAB RESULTS:",
        "LABORATORY RESULTS:",
        "ASSESSMENT:",
        "PLAN:"
    ]

    chunks = []
    current_section = None
    current_text = ""

    for line in note_text.split('\n'):
        # Check if this line is a section header
        is_header = any(header in line for header in section_headers)

        if is_header:
            # Save previous section if it exists
            if current_text.strip():
                chunks.append({
                    'section': current_section,
                    'text': current_text.strip()
                })
            # Start new section
            current_section = line.strip()
            current_text = line + '\n'
        else:
            current_text += line + '\n'

            # If chunk gets too large, split it
            if len(current_text) > chunk_size:
                chunks.append({
                    'section': current_section,
                    'text': current_text.strip()
                })
                current_text = ""

    # Add final chunk
    if current_text.strip():
        chunks.append({
            'section': current_section,
            'text': current_text.strip()
        })

    return chunks

# Test chunking
sample_patient = patients[0]
sample_file = f"{INPUT_DIR}/{sample_patient['patient_id']}/visit_1_{sample_patient['visit_dates'][0]}.txt"
with open(sample_file, 'r') as f:
    sample_note = f.read()

sample_chunks = chunk_clinical_note(sample_note)
print(f"‚úÖ Chunking strategy defined")
print(f"üìä Sample note split into {len(sample_chunks)} chunks")
print(f"\nFirst chunk preview:")
print(f"Section: {sample_chunks[0]['section']}")
print(f"Text (first 150 chars): {sample_chunks[0]['text'][:150]}...")

‚úÖ Chunking strategy defined
üìä Sample note split into 8 chunks

First chunk preview:
Section: None
Text (first 150 chars): CLINICAL NOTE
[PATIENT_NAME] (ID: patient_001)
Date: [DATE] | Age: 69 | Gender:...


In [4]:
# Process all clinical notes and generate embeddings
all_chunks = []
chunk_id = 0

print("üìù Processing clinical notes and generating embeddings...")

for patient in patients:
    patient_id = patient['patient_id']
    patient_dir = f"{INPUT_DIR}/{patient_id}"

    if not os.path.exists(patient_dir):
        continue

    for filename in sorted(os.listdir(patient_dir)):
        if not filename.endswith('.txt'):
            continue

        # Read note
        with open(f"{patient_dir}/{filename}", 'r') as f:
            note_text = f.read()

        # Extract visit date from filename (format: visit_1_YYYY-MM-DD.txt)
        visit_date = filename.split('_')[-1].replace('.txt', '')
        visit_number = filename.split('_')[1]

        # Chunk the note
        chunks = chunk_clinical_note(note_text)

        for chunk_idx, chunk in enumerate(chunks):
            # Generate embedding
            embedding = embedding_model.encode(chunk['text'])

            # Store chunk with metadata
            all_chunks.append({
                'chunk_id': f"{patient_id}_visit{visit_number}_chunk{chunk_idx}",
                'patient_id': patient_id,
                'visit_date': visit_date,
                'visit_number': int(visit_number),
                'chunk_index': chunk_idx,
                'section': chunk['section'],
                'text': chunk['text'],
                'embedding': embedding.tolist()
            })
            chunk_id += 1

print(f"‚úÖ Generated embeddings for {len(all_chunks)} chunks")
print(f"üìä Average chunks per patient: {len(all_chunks)/len(patients):.1f}")

# Show distribution
chunks_per_patient = {}
for chunk in all_chunks:
    pid = chunk['patient_id']
    chunks_per_patient[pid] = chunks_per_patient.get(pid, 0) + 1

print(f"\nüìä Chunks per patient:")
for pid, count in list(chunks_per_patient.items())[:5]:
    print(f"  ‚Ä¢ {pid}: {count} chunks")

üìù Processing clinical notes and generating embeddings...
‚úÖ Generated embeddings for 288 chunks
üìä Average chunks per patient: 28.8

üìä Chunks per patient:
  ‚Ä¢ patient_001: 32 chunks
  ‚Ä¢ patient_002: 24 chunks
  ‚Ä¢ patient_003: 32 chunks
  ‚Ä¢ patient_004: 32 chunks
  ‚Ä¢ patient_005: 32 chunks


In [5]:
# Initialize ChromaDB
print("üóÑÔ∏è  Initializing ChromaDB vector store...")

client = chromadb.PersistentClient(path=VECTOR_STORE_DIR)

# Create or get collection
collection = client.get_or_create_collection(
    name="clinical_notes",
    metadata={"description": "Patient clinical notes with embeddings"}
)

# Add all chunks to vector store
print(f"üì• Adding {len(all_chunks)} chunks to vector store...")

# Prepare data for ChromaDB
ids = [chunk['chunk_id'] for chunk in all_chunks]
embeddings = [chunk['embedding'] for chunk in all_chunks]
documents = [chunk['text'] for chunk in all_chunks]
metadatas = [{
    'patient_id': chunk['patient_id'],
    'visit_date': chunk['visit_date'],
    'visit_number': chunk['visit_number'],
    'section': chunk['section'] if chunk['section'] else 'unknown',
    'chunk_index': chunk['chunk_index']
} for chunk in all_chunks]

# Add to collection
collection.add(
    ids=ids,
    embeddings=embeddings,
    documents=documents,
    metadatas=metadatas
)

print(f"‚úÖ Vector store created successfully")
print(f"üìä Collection size: {collection.count()} chunks")

üóÑÔ∏è  Initializing ChromaDB vector store...
üì• Adding 288 chunks to vector store...
‚úÖ Vector store created successfully
üìä Collection size: 288 chunks


In [6]:
# Test retrieval with patient filtering
test_patient_id = "patient_001"
test_query = "What medications is the patient taking?"

print(f"üîç Testing retrieval for {test_patient_id}")
print(f"üìù Query: '{test_query}'")

# Generate query embedding
query_embedding = embedding_model.encode(test_query)

# Retrieve with patient filter
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=3,
    where={"patient_id": test_patient_id}  # CRITICAL: Patient isolation
)

print(f"\n‚úÖ Retrieved {len(results['documents'][0])} chunks")
print(f"\nüìÑ Top retrieved chunks:")

for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
), 1):
    print(f"\n--- Chunk {i} (Similarity: {1-distance:.3f}) ---")
    print(f"Patient: {metadata['patient_id']}")
    print(f"Visit: {metadata['visit_date']}, Section: {metadata['section']}")
    print(f"Text: {doc[:200]}...")

üîç Testing retrieval for patient_001
üìù Query: 'What medications is the patient taking?'

‚úÖ Retrieved 3 chunks

üìÑ Top retrieved chunks:

--- Chunk 1 (Similarity: 0.168) ---
Patient: patient_001
Visit: 2024-09-24, Section: CURRENT MEDICATIONS:
Text: CURRENT MEDICATIONS:
  ‚Ä¢ Rosuvastatin 10mg
  ‚Ä¢ Atorvastatin 20mg...

--- Chunk 2 (Similarity: 0.078) ---
Patient: patient_001
Visit: 2024-05-01, Section: CURRENT MEDICATIONS:
Text: CURRENT MEDICATIONS:
  ‚Ä¢ Montelukast 10mg
  ‚Ä¢ Fluticasone 250mcg...

--- Chunk 3 (Similarity: 0.072) ---
Patient: patient_001
Visit: 2024-09-02, Section: CURRENT MEDICATIONS:
Text: CURRENT MEDICATIONS:
  ‚Ä¢ Atorvastatin 20mg...


In [7]:
# CRITICAL TEST: Ensure no cross-patient data leakage
print("üîí Testing patient data isolation...")

test_queries = [
    "What are the patient's lab results?",
    "What medications is the patient on?",
    "What is the patient's primary diagnosis?"
]

isolation_passed = True

for patient in patients[:3]:  # Test first 3 patients
    patient_id = patient['patient_id']

    for query in test_queries:
        query_embedding = embedding_model.encode(query)

        # Retrieve with patient filter
        results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=5,
            where={"patient_id": patient_id}
        )

        # Check that ALL results belong to the correct patient
        for metadata in results['metadatas'][0]:
            if metadata['patient_id'] != patient_id:
                print(f"‚ùå ISOLATION BREACH: Query for {patient_id} returned data from {metadata['patient_id']}")
                isolation_passed = False

if isolation_passed:
    print("‚úÖ Patient isolation test PASSED")
    print("   All queries correctly filtered by patient_id")
else:
    print("‚ö†Ô∏è  Patient isolation test FAILED")
    print("   Cross-patient data leakage detected!")

üîí Testing patient data isolation...
‚úÖ Patient isolation test PASSED
   All queries correctly filtered by patient_id


In [8]:
# Save embeddings to file (for backup/analysis)
embeddings_file = f"{EMBEDDINGS_DIR}/all_embeddings.json"
with open(embeddings_file, 'w') as f:
    json.dump(all_chunks, f, indent=2)

# Save chunking configuration
chunking_config = {
    "strategy": "semantic_section_based",
    "chunk_size": 512,
    "overlap": 50,
    "section_headers_used": [
        "CHIEF COMPLAINT", "HISTORY", "CURRENT MEDICATIONS",
        "VITAL SIGNS", "LAB RESULTS", "ASSESSMENT", "PLAN"
    ],
    "total_chunks": len(all_chunks),
    "chunks_per_patient": chunks_per_patient
}

config_file = f"{OUTPUT_DIR}/chunking_config.json"
with open(config_file, 'w') as f:
    json.dump(chunking_config, f, indent=2)

print(f"‚úÖ Saved embeddings to: {embeddings_file}")
print(f"‚úÖ Saved config to: {config_file}")

‚úÖ Saved embeddings to: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs/embeddings/all_embeddings.json
‚úÖ Saved config to: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs/chunking_config.json


In [9]:
# MLOps artifact logging
mlops_log = {
    "notebook": "03_embedding_generation",
    "execution_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "data_version": "v1.0",
    "model": {
        "name": MODEL_NAME,
        "dimension": len(test_embedding),
        "type": "sentence-transformers"
    },
    "inputs": {
        "deidentified_notes_dir": INPUT_DIR,
        "num_patients": len(patients)
    },
    "outputs": {
        "embeddings_file": embeddings_file,
        "vector_store_dir": VECTOR_STORE_DIR,
        "chunking_config": config_file
    },
    "statistics": {
        "total_chunks": len(all_chunks),
        "avg_chunks_per_patient": round(len(all_chunks)/len(patients), 2),
        "vector_store_size": collection.count()
    },
    "chunking": chunking_config,
    "patient_isolation_test": "PASSED" if isolation_passed else "FAILED"
}

mlops_log_path = f"{OUTPUT_DIR}/mlops_embedding_log.json"
with open(mlops_log_path, 'w') as f:
    json.dump(mlops_log, f, indent=2)

print("\n" + "="*70)
print("‚úÖ NOTEBOOK 03 COMPLETE")
print("="*70)
print(f"""
üìÇ Outputs:
  ‚Ä¢ Vector store: {VECTOR_STORE_DIR}
  ‚Ä¢ Embeddings backup: {embeddings_file}
  ‚Ä¢ Chunking config: {config_file}
  ‚Ä¢ MLOps log: {mlops_log_path}

üìä Summary:
  ‚Ä¢ Total chunks: {len(all_chunks)}
  ‚Ä¢ Embedding dimension: {len(test_embedding)}
  ‚Ä¢ Model: {MODEL_NAME}
  ‚Ä¢ Patient isolation: {'‚úÖ PASSED' if isolation_passed else '‚ùå FAILED'}
""")


‚úÖ NOTEBOOK 03 COMPLETE

üìÇ Outputs:
  ‚Ä¢ Vector store: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs/vector_store
  ‚Ä¢ Embeddings backup: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs/embeddings/all_embeddings.json
  ‚Ä¢ Chunking config: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs/chunking_config.json
  ‚Ä¢ MLOps log: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/03_knowledge_base/outputs/mlops_embedding_log.json

üìä Summary:
  ‚Ä¢ Total chunks: 288
  ‚Ä¢ Embedding dimension: 384
  ‚Ä¢ Model: sentence-transformers/all-MiniLM-L6-v2
  ‚Ä¢ Patient isolation: ‚úÖ PASSED

