In [4]:
import sys
sys.path.insert(0, '..')

from pathlib import Path
import json

import importlib
if 'src.services.faiss_service' in sys.modules:
    importlib.reload(sys.modules['src.services.faiss_service'])
if 'src.utils.document_chunker' in sys.modules:
    importlib.reload(sys.modules['src.utils.document_chunker'])

from src.services.faiss_service import FAISSService
from src.utils.document_chunker import DocumentChunker

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print("Initializing services...\n")
faiss_service = FAISSService()
chunker = DocumentChunker()
print(f"\n✓ Services initialized")
print(f"Chunk size: {chunker.chunk_size} chars")
print(f"Chunk overlap: {chunker.chunk_overlap} chars")

Initializing services...

Loading embedding model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Created new FAISS index with dimension 384
Initialized empty metadata store

✓ Services initialized
Chunk size: 1200 chars
Chunk overlap: 200 chars


In [6]:
stats = faiss_service.get_stats()

print("=" * 60)
print("FAISS INDEX STATISTICS (BEFORE)")
print("=" * 60)
print(f"Total Documents: {stats['total_documents']}")
print(f"Index Dimension: {stats['index_dimension']}")
print(f"Embedding Model: {stats['embedding_model']}")
print("=" * 60)

FAISS INDEX STATISTICS (BEFORE)
Total Documents: 0
Index Dimension: 384
Embedding Model: all-MiniLM-L6-v2


In [7]:
guidance_dir = Path("../data/compliance/guidance")

print("Loading guidance documents from:", guidance_dir.absolute())
print()

if not guidance_dir.exists():
    print("✗ Guidance directory not found")
else:
    files = list(guidance_dir.glob("*.txt"))
    print(f"Found {len(files)} guidance document(s):")
    for f in files:
        print(f"  - {f.name}")

Loading guidance documents from: c:\projects\HealthOps\backend\notebooks\..\data\compliance\guidance

Found 2 guidance document(s):
  - GDPR_Healthcare.txt
  - HIPAA_Guidelines.txt


In [8]:
all_chunks_data = chunker.chunk_directory(str(guidance_dir), "*.txt")

print("=" * 60)
print("DOCUMENT CHUNKING RESULTS")
print("=" * 60)

total_chunks = 0
for doc_id, chunks in all_chunks_data.items():
    stats = chunker.get_chunk_stats(chunks)
    total_chunks += stats['total_chunks']
    
    print(f"\nDocument: {doc_id}")
    print(f"  Total Chunks: {stats['total_chunks']}")
    print(f"  Sections: {stats['sections']}")
    print(f"  Avg Chunk Size: {stats['avg_chunk_size']:.0f} chars")
    print(f"  Min/Max: {stats['min_chunk_size']}/{stats['max_chunk_size']} chars")
    print(f"  Total Characters: {stats['total_chars']}")

print(f"\n{'=' * 60}")
print(f"Total chunks across all documents: {total_chunks}")
print("=" * 60)

DOCUMENT CHUNKING RESULTS

Document: GDPR_Healthcare
  Total Chunks: 55
  Sections: 55
  Avg Chunk Size: 118 chars
  Min/Max: 0/276 chars
  Total Characters: 6478

Document: HIPAA_Guidelines
  Total Chunks: 32
  Sections: 32
  Avg Chunk Size: 181 chars
  Min/Max: 0/462 chars
  Total Characters: 5780

Total chunks across all documents: 87


In [9]:
print("Sample chunks from first document:\n")

first_doc = list(all_chunks_data.keys())[0]
chunks = all_chunks_data[first_doc]

print(f"Document: {first_doc}")
print(f"Showing first 3 chunks:\n")

for i, chunk in enumerate(chunks[:3], 1):
    print(f"Chunk {i}:")
    print(f"  ID: {chunk['chunk_id']}")
    print(f"  Section: {chunk['section_title']}")
    print(f"  Type: {chunk['chunk_type']}")
    print(f"  Size: {chunk['char_count']} chars")
    print(f"  Text: {chunk['text'][:200]}...")
    print()

Sample chunks from first document:

Document: GDPR_Healthcare
Showing first 3 chunks:

Chunk 1:
  ID: GDPR_Healthcare_sec0
  Section: # GDPR Healthcare Compliance Guide
  Type: section
  Size: 0 chars
  Text: ...

Chunk 2:
  ID: GDPR_Healthcare_sec1
  Section: ## 1. Introduction to GDPR in Healthcare
  Type: section
  Size: 247 chars
  Text: The General Data Protection Regulation (GDPR) sets strict requirements for how healthcare organizations process personal data of EU citizens. Healthcare data is considered a special category of person...

Chunk 3:
  ID: GDPR_Healthcare_sec2
  Section: ## 2. Key Principles
  Type: section
  Size: 0 chars
  Text: ...



In [10]:
print("Adding all chunks to FAISS index...\n")

documents_to_add = []

for doc_id, chunks in all_chunks_data.items():
    for chunk in chunks:
        documents_to_add.append({
            "id": chunk['chunk_id'],
            "text": chunk['text'],
            "document_type": "compliance_guidance",
            "metadata": {
                "source_document": doc_id,
                "source_file": chunk['source_file'],
                "section_title": chunk['section_title'],
                "section_index": chunk['section_index'],
                "chunk_index": chunk['chunk_index'],
                "chunk_type": chunk['chunk_type'],
                "char_count": chunk['char_count']
            }
        })

result = faiss_service.add_documents_batch(documents_to_add)

print("=" * 60)
print("FAISS INDEXING RESULTS")
print("=" * 60)
print(f"Status: {'SUCCESS' if result['success'] else 'FAILED'}")
print(f"Message: {result['message']}")
if result['success']:
    print(f"Documents Added: {result['documents_added']}")
    print(f"Total Documents: {result['total_documents']}")
print("=" * 60)

Adding all chunks to FAISS index...

Saved FAISS index and metadata
FAISS INDEXING RESULTS
Status: SUCCESS
Message: Added 87 documents successfully
Documents Added: 87
Total Documents: 87


In [11]:
queries = [
    "What are the requirements for protecting patient health information?",
    "How should we handle data breaches?",
    "What encryption is required for health records?",
    "What are GDPR patient rights?"
]

for query in queries:
    print("\n" + "=" * 60)
    print(f"QUERY: {query}")
    print("=" * 60)
    
    result = faiss_service.search(query, top_k=3, document_type="compliance_guidance")
    
    if result['success']:
        print(f"\nFound {len(result['results'])} relevant chunks:\n")
        
        for idx, doc in enumerate(result['results'], 1):
            print(f"{idx}. Chunk ID: {doc['document_id']}")
            print(f"   Similarity: {doc['similarity_score']:.4f}")
            print(f"   Source: {doc['metadata'].get('source_document', 'N/A')}")
            print(f"   Section: {doc['metadata'].get('section_title', 'N/A')}")
            print(f"   Text: {doc['text'][:200]}...")
            print()
    else:
        print(f"Search failed: {result['message']}")


QUERY: What are the requirements for protecting patient health information?

Found 3 relevant chunks:

1. Chunk ID: HIPAA_Guidelines_sec8
   Similarity: 0.6227
   Source: HIPAA_Guidelines
   Section: ### 3.3 Individual Rights
   Text: Patients have the right to:
- Access their health information
- Request amendments to their health information
- Receive an accounting of disclosures
- Request restrictions on certain uses and disclos...

2. Chunk ID: GDPR_Healthcare_sec3
   Similarity: 0.6099
   Source: GDPR_Healthcare
   Section: ### 2.1 Lawfulness, Fairness, and Transparency
   Text: Healthcare organizations must process personal data lawfully, fairly, and in a transparent manner. Patients must be informed about how their data is being used....

3. Chunk ID: HIPAA_Guidelines_sec3
   Similarity: 0.5724
   Source: HIPAA_Guidelines
   Section: ### 2.1 Definition
   Text: Protected Health Information (PHI) is any information about health status, provision of healthcare, or payment for hea

In [12]:
final_stats = faiss_service.get_stats()

print("\n" + "=" * 60)
print("FINAL STATISTICS")
print("=" * 60)
print(f"Total Chunks Indexed: {final_stats['total_documents']}")
print(f"Embedding Model: {final_stats['embedding_model']}")
print(f"Vector Dimension: {final_stats['index_dimension']}")
print(f"\nDocument Types:")
for doc_type, count in final_stats['document_types'].items():
    print(f"  - {doc_type}: {count}")
print(f"\nIndex Location: {final_stats['index_path']}")
print("=" * 60)
print("\n✓ Guidance documents chunked and indexed successfully!")
print("✓ Ready for semantic search across all compliance sections!")


FINAL STATISTICS
Total Chunks Indexed: 87
Embedding Model: all-MiniLM-L6-v2
Vector Dimension: 384

Document Types:
  - compliance_guidance: 87

Index Location: data\compliance\faiss_index

✓ Guidance documents chunked and indexed successfully!
✓ Ready for semantic search across all compliance sections!
