# Graph-Enhanced RAG Example

This notebook demonstrates the Graph-Enhanced RAG architecture that leverages knowledge graphs to improve retrieval and generation. It builds entity-relation graphs from documents and uses graph-based reasoning to enhance the RAG process.

In [None]:
# Import required modules
import numpy as np
import hashlib
from datetime import datetime
from src.rag_specialized.graph_enhanced.graph_enhanced_rag import (
    GraphEnhancedRAG, GraphDocument, GraphQuery, Entity, EntityType, Relation, RelationType
)

## Initialize the Graph-Enhanced RAG System

In [None]:
# Construct the Graph-Enhanced RAG system used by every later cell.
# GRAPH_WEIGHT is forwarded unchanged as the constructor's `graph_weight`
# keyword (presumably the graph-vs-vector balance -- confirm against the class).
GRAPH_WEIGHT = 0.5
graph_rag = GraphEnhancedRAG(graph_weight=GRAPH_WEIGHT)
print("Graph-Enhanced RAG system initialized!")

## Create Sample Graph Documents

In [None]:
# Sample corpus described as (id, content, metadata) records; each record is
# turned into a GraphDocument below.
document_specs = [
    (
        "doc1",
        "John Smith works at Microsoft Corporation. He is a software engineer located in Seattle, Washington.",
        {"source": "employee_directory", "department": "engineering"},
    ),
    (
        "doc2",
        "Microsoft Corporation is headquartered in Redmond, Washington. The company was founded by Bill Gates and Paul Allen.",
        {"source": "company_info", "category": "about"},
    ),
    (
        "doc3",
        "Seattle is a major city in Washington state. It is known for its tech industry and coffee culture.",
        {"source": "city_guide", "category": "location"},
    ),
    (
        "doc4",
        "Apple Inc. is a technology company headquartered in Cupertino, California. It was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne.",
        {"source": "company_info", "category": "about"},
    ),
]

documents = [
    GraphDocument(id=doc_id, content=doc_content, metadata=doc_metadata)
    for doc_id, doc_content, doc_metadata in document_specs
]

# Report how many documents were built and the size of each one.
print(f"Created {len(documents)} sample graph documents")
for position, document in enumerate(documents, start=1):
    print(f"  Doc {position}: {document.id} - {len(document.content)} chars")

## Add Documents to the RAG System

In [None]:
# Hand the sample corpus to the RAG system and report the value that
# add_documents() returns.
num_added = graph_rag.add_documents(documents)
print("Added {} graph documents to the system".format(num_added))

## Create and Execute Graph Queries

In [None]:
# Create a graph query about employment
employment_query = GraphQuery(
    text="Where does John Smith work?",
    hops=2,  # Allow up to 2 hops in the graph
    include_related=True
)

# Create a simple, deterministic placeholder embedding for the query.
#
# The MD5 digest is 16 bytes. Reinterpreting those bytes as float32 produces
# four arbitrary IEEE-754 bit patterns, which can be NaN/inf or so large that
# squaring them (norms, dot products) overflows. Reading the digest as uint8
# and scaling into [0, 1] keeps every component finite while remaining a pure
# function of the query text.
EMBEDDING_DIM = 384  # length the vector is zero-padded to (presumably the retriever's dimension -- confirm)
query_text_hash = hashlib.md5(employment_query.text.encode()).hexdigest()
digest_values = np.frombuffer(bytes.fromhex(query_text_hash), dtype=np.uint8).astype(np.float32) / 255.0
query_embedding = np.zeros(EMBEDDING_DIM, dtype=np.float32)
n_used = min(digest_values.size, EMBEDDING_DIM)
query_embedding[:n_used] = digest_values[:n_used]

# Execute the query
employment_result = graph_rag.query(employment_query, query_embedding, k=2)

print("Employment Query:")
print(f"Query: {employment_query.text}")
print(f"Answer: {employment_result.answer}")
print(f"Entities mentioned: {len(employment_result.entities_mentioned)}")
print(f"Relations discovered: {len(employment_result.relations_discovered)}")
print(f"Reasoning paths: {len(employment_result.reasoning_paths)}")
print(f"Confidence: {employment_result.confidence:.3f}")
print(f"Latency: {employment_result.latency_ms:.2f}ms")
print(f"Sources: {len(employment_result.sources)} documents retrieved")

In [None]:
# Create a graph query about company founders
founder_query = GraphQuery(
    text="Who founded Microsoft Corporation?",
    hops=3,  # Allow more hops for complex reasoning
    include_related=True
)

# Create a simple, deterministic placeholder embedding for the query.
#
# The MD5 digest is 16 bytes. Reinterpreting those bytes as float32 produces
# four arbitrary IEEE-754 bit patterns, which can be NaN/inf or so large that
# squaring them (norms, dot products) overflows. Reading the digest as uint8
# and scaling into [0, 1] keeps every component finite while remaining a pure
# function of the query text.
EMBEDDING_DIM = 384  # length the vector is zero-padded to (presumably the retriever's dimension -- confirm)
founder_query_hash = hashlib.md5(founder_query.text.encode()).hexdigest()
founder_digest_values = np.frombuffer(bytes.fromhex(founder_query_hash), dtype=np.uint8).astype(np.float32) / 255.0
founder_query_embedding = np.zeros(EMBEDDING_DIM, dtype=np.float32)
founder_n_used = min(founder_digest_values.size, EMBEDDING_DIM)
founder_query_embedding[:founder_n_used] = founder_digest_values[:founder_n_used]

# Execute the query
founder_result = graph_rag.query(founder_query, founder_query_embedding, k=2)

print("\nFounder Query:")
print(f"Query: {founder_query.text}")
print(f"Answer: {founder_result.answer}")
print(f"Entities mentioned: {len(founder_result.entities_mentioned)}")
print(f"Relations discovered: {len(founder_result.relations_discovered)}")
# Unlike the other cells, this prints the reasoning paths themselves rather than their count.
print(f"Reasoning paths: {founder_result.reasoning_paths}")

In [None]:
# Create a location-based graph query
location_query = GraphQuery(
    text="What state is Seattle in?",
    hops=2,
    include_related=True
)

# Create a simple, deterministic placeholder embedding for the query.
#
# The MD5 digest is 16 bytes. Reinterpreting those bytes as float32 produces
# four arbitrary IEEE-754 bit patterns, which can be NaN/inf or so large that
# squaring them (norms, dot products) overflows. Reading the digest as uint8
# and scaling into [0, 1] keeps every component finite while remaining a pure
# function of the query text.
EMBEDDING_DIM = 384  # length the vector is zero-padded to (presumably the retriever's dimension -- confirm)
location_query_hash = hashlib.md5(location_query.text.encode()).hexdigest()
location_digest_values = np.frombuffer(bytes.fromhex(location_query_hash), dtype=np.uint8).astype(np.float32) / 255.0
location_query_embedding = np.zeros(EMBEDDING_DIM, dtype=np.float32)
location_n_used = min(location_digest_values.size, EMBEDDING_DIM)
location_query_embedding[:location_n_used] = location_digest_values[:location_n_used]

# Execute the query
location_result = graph_rag.query(location_query, location_query_embedding, k=2)

print("\nLocation Query:")
print(f"Query: {location_query.text}")
print(f"Answer: {location_result.answer}")
print(f"Entities mentioned: {len(location_result.entities_mentioned)}")
print(f"Sources: {len(location_result.sources)} documents retrieved")

## Explore the Knowledge Graph Structure

In [None]:
# Pull the knowledge graph off the retriever and report its size.
kg = graph_rag.retriever.knowledge_graph
print("Knowledge Graph Statistics:")
print(f"  Number of entities: {len(kg.entities)}")
print(f"  Number of relations: {len(kg.relations)}")
print(f"  Number of nodes in graph: {len(kg.graph.nodes)}")
print(f"  Number of edges in graph: {len(kg.graph.edges)}")

# Show up to five entities as "name (type)".
print("\nSample Entities:")
sample_entities = list(kg.entities.items())[:5]
for position, (_entity_id, entity) in enumerate(sample_entities, start=1):
    print(f"  {position}. {entity.name} ({entity.type.value})")

# Show up to five relations as "subject --type--> object".
# An endpoint whose id is not present in kg.entities is rendered as 'Unknown'.
print("\nSample Relations:")
sample_relations = list(kg.relations.items())[:5]
for position, (_relation_id, relation) in enumerate(sample_relations, start=1):
    if relation.subject_id in kg.entities:
        subject_name = kg.entities[relation.subject_id].name
    else:
        subject_name = 'Unknown'
    if relation.object_id in kg.entities:
        object_name = kg.entities[relation.object_id].name
    else:
        object_name = 'Unknown'
    print(f"  {position}. {subject_name} --{relation.type.value}--> {object_name}")

## Performance Analysis

In [None]:
# Perform multiple queries to analyze performance
queries = [
    "Where does John Smith work?",
    "Who founded Microsoft?",
    "What is Seattle known for?",
    "Where is Microsoft headquartered?",
    "Who founded Apple Inc.?"
]

# Length every placeholder embedding is zero-padded to
# (presumably the retriever's dimension -- confirm).
EMBEDDING_DIM = 384

latencies = []
confidences = []
entity_counts = []

for query_text in queries:
    query = GraphQuery(text=query_text, hops=2, include_related=True)

    # Create a deterministic placeholder embedding from the MD5 digest.
    # The 16 digest bytes are read as uint8 and scaled into [0, 1]; viewing
    # them as raw float32 instead would yield arbitrary bit patterns (possibly
    # NaN/inf or overflow-prone magnitudes) that could poison similarity
    # scores and the aggregate statistics reported below.
    query_hash = hashlib.md5(query_text.encode()).hexdigest()
    digest_values = np.frombuffer(bytes.fromhex(query_hash), dtype=np.uint8).astype(np.float32) / 255.0
    query_emb = np.zeros(EMBEDDING_DIM, dtype=np.float32)
    n_used = min(digest_values.size, EMBEDDING_DIM)
    query_emb[:n_used] = digest_values[:n_used]

    # Run the query and record the per-query metrics exposed on the result.
    result = graph_rag.query(query, query_emb, k=2)
    latencies.append(result.latency_ms)
    confidences.append(result.confidence)
    entity_counts.append(len(result.entities_mentioned))

print(f"Average query latency: {np.mean(latencies):.2f}ms")
print(f"Latency std deviation: {np.std(latencies):.2f}ms")
print(f"Average confidence: {np.mean(confidences):.3f}")
print(f"Average entities mentioned: {np.mean(entity_counts):.1f}")

## Summary

In this notebook, we explored the Graph-Enhanced RAG architecture:

1. **Initialization**: Created an instance of the GraphEnhancedRAG system
2. **Graph Documents**: Added documents that form entity-relation graphs
3. **Graph Queries**: Executed queries that leverage graph-based reasoning
4. **Knowledge Graph Analysis**: Examined the structure of the knowledge graph
5. **Performance Evaluation**: Measured query latency, confidence, and entity discovery

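The Graph-Enhanced RAG system processed graph-based queries and returned responses while leveraging entity relationships and multi-hop reasoning. Note that the query embeddings in this notebook are hash-derived placeholders rather than outputs of a real embedding model, so the retrieval results are illustrative only.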