# Privacy-Preserving RAG Example

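This notebook demonstrates the Privacy-Preserving RAG architecture, which protects sensitive information during retrieval and generation. The architecture incorporates techniques such as differential privacy, secure multi-party computation, and data anonymization; the cells below exercise the differential-privacy budget (`epsilon`, `delta`), PII detection, anonymization, and per-document access controls.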

In [None]:
# Import required modules
import numpy as np
import hashlib
from datetime import datetime
from src.rag_specialized.privacy_preserving.privacy_preserving_rag import (
    PrivacyPreservingRAG, PrivacyDocument, PrivacyQuery, PrivacyConfig, PrivacyLevel
)

## Initialize the Privacy-Preserving RAG System

In [None]:
# Gather the privacy settings in one place so they are easy to tune:
# epsilon/delta are the (approximate) differential-privacy parameters, and the
# two flags switch on PII detection and anonymization.
privacy_settings = {
    "epsilon": 1.0,
    "delta": 1e-5,
    "enable_pii_detection": True,
    "enable_anonymization": True,
}
config = PrivacyConfig(**privacy_settings)

# Build the RAG system from that configuration and echo the DP parameters
privacy_rag = PrivacyPreservingRAG(config=config)
print("Privacy-Preserving RAG system initialized!")
print(f"Privacy config: epsilon={config.epsilon}, delta={config.delta}")

## Create Sample Privacy-Aware Documents

In [None]:
# Sample corpus: one document for each privacy level used in this notebook.
# Only the PII and CONFIDENTIAL documents carry explicit access controls.

# Contains personally identifiable information (name, email, phone)
pii_doc = PrivacyDocument(
    id="doc1",
    content="John Smith is our lead engineer at Microsoft. His email is john.smith@company.com and phone is 555-123-4567.",
    privacy_level=PrivacyLevel.PII,
    access_controls=["admin", "hr"],
    metadata={"department": "engineering", "role": "lead"},
)

# Publicly available company facts
public_doc = PrivacyDocument(
    id="doc2",
    content="Microsoft Corporation is a technology company headquartered in Redmond, WA. Founded in 1975 by Bill Gates and Paul Allen.",
    privacy_level=PrivacyLevel.PUBLIC,
    metadata={"category": "company_info", "founded": 1975},
)

# Financial report restricted to executive/finance
confidential_doc = PrivacyDocument(
    id="doc3",
    content="Our Q4 financial report shows revenue of $50M. Contact CFO Jane Doe at jane.doe@company.com for details.",
    privacy_level=PrivacyLevel.CONFIDENTIAL,
    access_controls=["executive", "finance"],
    metadata={"report": "Q4_2023", "revenue": "$50M"},
)

# Internal survey summary
internal_doc = PrivacyDocument(
    id="doc4",
    content="Employee satisfaction survey results show 85% positive feedback. Survey ID: SURV-2023-001.",
    privacy_level=PrivacyLevel.INTERNAL,
    metadata={"survey_type": "satisfaction", "score": 85},
)

documents = [pii_doc, public_doc, confidential_doc, internal_doc]

# One summary line per document: id, privacy level, and content length
print(f"Created {len(documents)} sample privacy-aware documents")
for position, doc in enumerate(documents, start=1):
    print(f"  Doc {position}: {doc.id} - {doc.privacy_level.value} - {len(doc.content)} chars")

## Add Documents to the RAG System

In [None]:
# Hand the sample corpus to the RAG system; the value it returns is reported
# below as the number of documents added.
num_added = privacy_rag.add_documents(documents)
print(f"Added {num_added} privacy-aware documents to the system")

## Create and Execute Privacy-Aware Queries

In [None]:
# Create a privacy-aware query for public information
public_query = PrivacyQuery(
    text="What is Microsoft Corporation?",
    user_id="user123",
    required_privacy_level=PrivacyLevel.PUBLIC
)

# Build a deterministic placeholder embedding from the query text.
# The MD5 digest bytes are decoded as unsigned integers and scaled into [0, 1]
# instead of being reinterpreted as raw float32 bit patterns: arbitrary bit
# patterns can decode to NaN, +/-Inf, or magnitudes that overflow float32
# similarity arithmetic. This is a stand-in for a real embedding model and
# carries no semantic meaning.
embedding_dim = 384  # vector length used by every query in this notebook
query_text_hash = hashlib.md5(public_query.text.encode()).hexdigest()
digest_values = np.frombuffer(bytes.fromhex(query_text_hash), dtype=np.uint8)
query_embedding = np.zeros(embedding_dim, dtype=np.float32)  # zero-padded tail
query_embedding[:digest_values.size] = digest_values / 255.0

# Execute the query, asking for the top 2 documents
public_result = privacy_rag.query(public_query, query_embedding, k=2)

# Report the answer together with the privacy accounting for this query
print("Public Information Query:")
print(f"Query: {public_query.text}")
print(f"Answer: {public_result.answer}")
print(f"Privacy preserved: {public_result.privacy_preserved}")
print(f"Privacy techniques applied: {[tech.value for tech in public_result.privacy_techniques_applied]}")
print(f"Privacy budget consumed: {public_result.privacy_budget_consumed:.3f}")
print(f"Confidence: {public_result.confidence:.3f}")
print(f"Latency: {public_result.latency_ms:.2f}ms")
print(f"Sources: {len(public_result.sources)} documents retrieved")

In [None]:
# Create a privacy-aware query for confidential information
# Note: This user doesn't have access to confidential documents
confidential_query = PrivacyQuery(
    text="What are the Q4 financial results?",
    user_id="regular_user",  # This user doesn't have finance access
    required_privacy_level=PrivacyLevel.CONFIDENTIAL
)

# Build a deterministic placeholder embedding from the query text.
# The MD5 digest bytes are decoded as unsigned integers and scaled into [0, 1]
# instead of being reinterpreted as raw float32 bit patterns: arbitrary bit
# patterns can decode to NaN, +/-Inf, or magnitudes that overflow float32
# similarity arithmetic. This is a stand-in for a real embedding model and
# carries no semantic meaning.
embedding_dim = 384  # vector length used by every query in this notebook
conf_query_hash = hashlib.md5(confidential_query.text.encode()).hexdigest()
conf_digest_values = np.frombuffer(bytes.fromhex(conf_query_hash), dtype=np.uint8)
conf_query_embedding = np.zeros(embedding_dim, dtype=np.float32)  # zero-padded tail
conf_query_embedding[:conf_digest_values.size] = conf_digest_values / 255.0

# Execute the query, asking for the top 2 documents
conf_result = privacy_rag.query(confidential_query, conf_query_embedding, k=2)

print("\nConfidential Information Query (without access):")
print(f"Query: {confidential_query.text}")
print(f"Answer: {conf_result.answer}")
print(f"Privacy preserved: {conf_result.privacy_preserved}")
print(f"Sources: {len(conf_result.sources)} documents retrieved")

In [None]:
# Create a privacy-aware query for confidential information with proper access
# Simulate a user with finance access by adding 'finance_user' to every
# document that already lists 'finance'.
# - `or []` tolerates documents whose access_controls is unset/None (two of the
#   sample documents are created without that argument).
# - The membership check makes the grant idempotent, so re-running this cell
#   does not append duplicate entries.
# NOTE(review): this mutates the same document objects that were passed to
# `privacy_rag` earlier; whether that system observes the change depends on how
# add_documents stores them - confirm if the original system must stay untouched.
for doc in documents:
    current_controls = doc.access_controls or []
    if 'finance' in current_controls and 'finance_user' not in current_controls:
        doc.access_controls.append('finance_user')  # Grant access to finance_user

# Re-initialize the system with updated access controls
privacy_rag_updated = PrivacyPreservingRAG(config=config)
privacy_rag_updated.add_documents(documents)

finance_query = PrivacyQuery(
    text="What are the Q4 financial results?",
    user_id="finance_user",  # This user has finance access
    required_privacy_level=PrivacyLevel.CONFIDENTIAL
)

# Build a deterministic placeholder embedding from the query text.
# The MD5 digest bytes are decoded as unsigned integers and scaled into [0, 1]
# instead of being reinterpreted as raw float32 bit patterns: arbitrary bit
# patterns can decode to NaN, +/-Inf, or magnitudes that overflow float32
# similarity arithmetic. This is a stand-in for a real embedding model and
# carries no semantic meaning.
embedding_dim = 384  # vector length used by every query in this notebook
finance_query_hash = hashlib.md5(finance_query.text.encode()).hexdigest()
finance_digest_values = np.frombuffer(bytes.fromhex(finance_query_hash), dtype=np.uint8)
finance_query_embedding = np.zeros(embedding_dim, dtype=np.float32)  # zero-padded tail
finance_query_embedding[:finance_digest_values.size] = finance_digest_values / 255.0

# Execute the query against the re-initialized system, asking for the top document
finance_result = privacy_rag_updated.query(finance_query, finance_query_embedding, k=1)

print("\nConfidential Information Query (with access):")
print(f"Query: {finance_query.text}")
print(f"Answer: {finance_result.answer}")
print(f"Privacy preserved: {finance_result.privacy_preserved}")
print(f"Sources: {len(finance_result.sources)} documents retrieved")

## Examine Privacy Protection Mechanisms

In [None]:
# Fetch the status report of the original system (`privacy_rag`) and print it
# section by section: first the privacy-budget figures, then corpus statistics.
privacy_status = privacy_rag.get_privacy_status()
print("Privacy Status:")

budget = privacy_status['budget_status']
print(f"  Current epsilon: {budget['current_epsilon']}")
print(f"  Max epsilon: {budget['max_epsilon']}")
print(f"  Remaining budget: {budget['remaining_budget']}")

doc_stats = privacy_status['statistics']
print(f"  Total documents: {doc_stats['total_documents']}")
print(f"  Documents with PII: {doc_stats['documents_with_pii']}")
print(f"  Access-controlled documents: {doc_stats['access_controlled_documents']}")

## Performance Analysis

In [None]:
# Perform multiple queries to analyze performance
# Each entry is (required privacy level, query text)
queries = [
    (PrivacyLevel.PUBLIC, "What is Microsoft Corporation?"),
    (PrivacyLevel.PUBLIC, "Who founded Microsoft?"),
    (PrivacyLevel.INTERNAL, "What do employee surveys show?"),
    (PrivacyLevel.PUBLIC, "Tell me about the company")
]

# Per-query measurements collected from each result
latencies = []
confidences = []
privacy_preserved_flags = []

embedding_dim = 384  # vector length used by every query in this notebook

for req_privacy_level, query_text in queries:
    query = PrivacyQuery(
        text=query_text,
        user_id="test_user",
        required_privacy_level=req_privacy_level
    )

    # Build a deterministic placeholder embedding from the query text.
    # The MD5 digest bytes are decoded as unsigned integers and scaled into
    # [0, 1] instead of being reinterpreted as raw float32 bit patterns, which
    # can decode to NaN, +/-Inf, or magnitudes that overflow float32 similarity
    # arithmetic. This is a stand-in for a real embedding model.
    query_hash = hashlib.md5(query_text.encode()).hexdigest()
    digest_values = np.frombuffer(bytes.fromhex(query_hash), dtype=np.uint8)
    query_emb = np.zeros(embedding_dim, dtype=np.float32)  # zero-padded tail
    query_emb[:digest_values.size] = digest_values / 255.0

    result = privacy_rag.query(query, query_emb, k=2)
    latencies.append(result.latency_ms)
    confidences.append(result.confidence)
    privacy_preserved_flags.append(result.privacy_preserved)

# Aggregate the measurements across all queries
print(f"Average query latency: {np.mean(latencies):.2f}ms")
print(f"Latency std deviation: {np.std(latencies):.2f}ms")
print(f"Average confidence: {np.mean(confidences):.3f}")
print(f"Privacy preserved in {sum(privacy_preserved_flags)}/{len(privacy_preserved_flags)} queries")

## Summary

In this notebook, we explored the Privacy-Preserving RAG architecture:

1. **Initialization**: Created an instance of the PrivacyPreservingRAG system with privacy configuration
2. **Privacy-Aware Documents**: Added documents with different privacy levels and access controls
3. **Privacy Queries**: Executed queries with privacy considerations and access controls
4. **Privacy Mechanisms**: Examined privacy protection mechanisms and status
5. **Performance Evaluation**: Measured query latency, confidence, and privacy preservation

The Privacy-Preserving RAG system successfully protected sensitive information while providing appropriate responses based on user access rights and privacy requirements.