In [None]:
# Import libraries
import json
import os

from langchain_community.graphs import Neo4jGraph

from graph_rag_query import GraphRAGQuery
from graph_rag_embeddings import EntityEmbeddings
from graph_rag_hybrid import HybridRetriever
from graph_rag_context import ContextBuilder

In [None]:
# Setup Neo4j connection.
# Connection settings are read from the environment so that no secret is stored
# in the notebook, its outputs, or its version history.
# SECURITY: a real password was previously hard-coded in this cell. Treat it as
# leaked and rotate it in the Neo4j console.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://0c367113.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Fail fast with an actionable message instead of an opaque driver error.
if not NEO4J_PASSWORD:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set. Export it (and optionally NEO4J_URI / "
        "NEO4J_USERNAME) before running this notebook."
    )

graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

print("K·∫øt n·ªëi Neo4j th√†nh c√¥ng!")

## Step 1: Initialize All Components

In [None]:
# Initialize the components built in the previous steps.
# Order matters: the hybrid retriever is constructed from the graph-query and
# embeddings objects created just before it.
print("Loading components...")

# Step 1: Graph queries (constructed from the `graph` connection)
graph_query = GraphRAGQuery(graph)
print("Graph queries ready")

# Step 2: Embeddings (constructed from the same `graph` connection)
embeddings = EntityEmbeddings(graph)
print("Embeddings ready")

# Step 3: Hybrid retriever
hybrid = HybridRetriever(graph_query, embeddings)
print("Hybrid retriever ready")

# Step 4: Context builder
# NOTE(review): the unit of max_context_length (characters vs tokens) is not
# visible from this notebook — confirm against ContextBuilder.
builder = ContextBuilder(max_context_length=8000)
print("Context builder ready")

## Step 2: Test Basic Context Building

In [None]:
# Retrieve context for the question
question = "Ai l√† ng∆∞·ªùi ch·ªâ huy chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß?"

# Echo the question followed by one blank line
print("Question:", question, end="\n\n")

# Step 1: Hybrid retrieval (question passed positionally, tuning knobs by keyword)
retrieval_context = hybrid.retrieve(question, top_k=10, vector_top_k=5, expansion_depth=1)

In [None]:
# Step 2: Build the structured context from the retrieval result
context = builder.build_rag_context(
    question,
    retrieval_context,
    max_entities=10,
    max_relationships=15,
)

# Report the headline fields of the context, one "Label: value" line each
print("=== STRUCTURED CONTEXT ===")
for _label, _field in (
    ("Question", "question"),
    ("Question type", "question_type"),
    ("Entities", "entity_count"),
    ("Relationships", "relationship_count"),
    ("Summary", "context_summary"),
):
    print(f"{_label}: {context[_field]}")

## Step 3: Format for Gemini - Q&A Style

In [None]:
# Format the context for a Q&A task
qa_prompt = builder.format_for_gemini(context, prompt_type="qa", include_instructions=True)

# Banner, then the prompt itself
print("=" * 70, "GEMINI PROMPT - Q&A STYLE", "=" * 70, sep="\n")
print(qa_prompt)

# Blank line + closing rule, then size statistics
print()
print("=" * 70)
print("Prompt length:", len(qa_prompt), "characters")
print("Estimated tokens:", builder.estimate_token_count(qa_prompt))

## Step 4: Test Different Prompt Types

In [None]:
# Exercise the "summary" prompt type
summary_question = "T√≥m t·∫Øt v·ªÅ chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß"

# Retrieve, then structure the retrieval result
summary_retrieval = hybrid.retrieve(question=summary_question, top_k=10)
summary_context = builder.build_rag_context(
    question=summary_question,
    retrieval_context=summary_retrieval,
)

summary_prompt = builder.format_for_gemini(
    summary_context, prompt_type="summary", include_instructions=True
)

# Banner followed by the generated prompt
print("=" * 70, "GEMINI PROMPT - SUMMARY STYLE", "=" * 70, sep="\n")
print(summary_prompt)

In [None]:
# Exercise the "explain" prompt type
explain_question = "Gi·∫£i th√≠ch vai tr√≤ c·ªßa V√µ Nguy√™n Gi√°p trong kh√°ng chi·∫øn"

# Retrieve with a deeper graph expansion, then structure the result
explain_retrieval = hybrid.retrieve(question=explain_question, top_k=12, expansion_depth=2)
explain_context = builder.build_rag_context(
    question=explain_question,
    retrieval_context=explain_retrieval,
)

explain_prompt = builder.format_for_gemini(
    explain_context, prompt_type="explain", include_instructions=True
)

# Banner followed by the generated prompt
print("=" * 70, "GEMINI PROMPT - EXPLANATION STYLE", "=" * 70, sep="\n")
print(explain_prompt)

In [None]:
# Exercise the "timeline" prompt type
timeline_question = "C√°c s·ª± ki·ªán quan tr·ªçng t·ª´ 1945 ƒë·∫øn 1954"

# Retrieve, then structure the retrieval result
timeline_retrieval = hybrid.retrieve(question=timeline_question, top_k=15)
timeline_context = builder.build_rag_context(
    question=timeline_question,
    retrieval_context=timeline_retrieval,
)

timeline_prompt = builder.format_for_gemini(
    timeline_context, prompt_type="timeline", include_instructions=True
)

# Banner followed by the generated prompt
print("=" * 70, "GEMINI PROMPT - TIMELINE STYLE", "=" * 70, sep="\n")
print(timeline_prompt)

## Step 5: Test Token Limit Management

In [None]:
# Test with a deliberately large context
large_question = "L·ªãch s·ª≠ kh√°ng chi·∫øn ch·ªëng Ph√°p t·ª´ 1945 ƒë·∫øn 1954"

# Wide retrieval: more results, deeper expansion, paths included
large_retrieval = hybrid.retrieve(
    question=large_question,
    top_k=20,
    expansion_depth=2,
    include_paths=True,
)

# Raise the entity / relationship caps accordingly
large_context = builder.build_rag_context(
    question=large_question,
    retrieval_context=large_retrieval,
    max_entities=20,
    max_relationships=30,
)

# Format and check the resulting size
large_prompt = builder.format_for_gemini(context=large_context, prompt_type="qa")

print("=== LARGE CONTEXT ===")
print("Entities:", large_context['entity_count'])
print("Relationships:", large_context['relationship_count'])
print("Prompt length:", len(large_prompt), "characters")
print("Estimated tokens:", builder.estimate_token_count(large_prompt))

In [None]:
# Truncate when the prompt is too long
MAX_TOKENS = 6000  # Gemini free tier: 8K context; keep to 6K to be safe

if builder.estimate_token_count(large_prompt) <= MAX_TOKENS:
    # Within budget: nothing to do
    print("‚úÖ Context size OK!")
else:
    print("‚ö†Ô∏è Context qu√° d√†i! Truncating...\n")

    # Shrink the structured context, then re-render the prompt from it
    truncated_context = builder.truncate_to_token_limit(large_context, max_tokens=MAX_TOKENS)
    truncated_prompt = builder.format_for_gemini(context=truncated_context, prompt_type="qa")

    print("=== AFTER TRUNCATION ===")
    print("Entities:", truncated_context['entity_count'])
    print("Relationships:", truncated_context['relationship_count'])
    print("Prompt length:", len(truncated_prompt), "characters")
    print("Estimated tokens:", builder.estimate_token_count(truncated_prompt))

## Step 6: Test Multi-Turn Conversation Context

In [None]:
# Simulated conversation history: earlier (question, answer) turns, oldest first
conversation_history = [
    {"question": asked, "answer": answered}
    for asked, answered in (
        (
            "Ai l√† l√£nh ƒë·∫°o kh√°ng chi·∫øn ch·ªëng Ph√°p?",
            "H·ªì Ch√≠ Minh l√† ng∆∞·ªùi l√£nh ƒë·∫°o ch√≠nh c·ªßa phong tr√†o kh√°ng chi·∫øn ch·ªëng Ph√°p, v·ªõi vai tr√≤ Ch·ªß t·ªãch n∆∞·ªõc v√† l√£nh t·ª• c·ªßa ƒê·∫£ng C·ªông s·∫£n Vi·ªát Nam.",
        ),
        (
            "V√µ Nguy√™n Gi√°p ch·ªâ huy nh·ªØng chi·∫øn d·ªãch n√†o?",
            "ƒê·∫°i t∆∞·ªõng V√µ Nguy√™n Gi√°p ch·ªâ huy nhi·ªÅu chi·∫øn d·ªãch quan tr·ªçng, ƒë·∫∑c bi·ªát l√† Chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß nƒÉm 1954.",
        ),
    )
]

# Current (follow-up) question
followup_question = "Chi·∫øn d·ªãch ƒë√≥ di·ªÖn ra nh∆∞ th·∫ø n√†o?"

# Retrieve and structure context for the follow-up question only
followup_retrieval = hybrid.retrieve(question=followup_question, top_k=10)
followup_context = builder.build_rag_context(
    question=followup_question,
    retrieval_context=followup_retrieval,
)

# Combine the history with the current context into one prompt
multi_turn_prompt = builder.create_multi_turn_context(
    conversation_history=conversation_history,
    current_context=followup_context,
    max_history=2,
)

print("=" * 70, "MULTI-TURN CONVERSATION PROMPT", "=" * 70, sep="\n")
print(multi_turn_prompt)

## Step 7: Test Few-Shot Examples

In [None]:
# Few-shot examples: worked (question, answer) pairs to prepend to the prompt
examples = [
    {"question": sample_q, "answer": sample_a}
    for sample_q, sample_a in (
        (
            "Ai l√† Ch·ªß t·ªãch n∆∞·ªõc Vi·ªát Nam D√¢n ch·ªß C·ªông h√≤a?",
            "H·ªì Ch√≠ Minh l√† Ch·ªß t·ªãch n∆∞·ªõc Vi·ªát Nam D√¢n ch·ªß C·ªông h√≤a, ƒë·ªìng th·ªùi l√† ng∆∞·ªùi s√°ng l·∫≠p v√† l√£nh ƒë·∫°o ƒê·∫£ng C·ªông s·∫£n Vi·ªát Nam.",
        ),
        (
            "Chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß k·∫øt th√∫c khi n√†o?",
            "Chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß k·∫øt th√∫c v√†o ng√†y 7/5/1954, ƒë√°nh d·∫•u th·∫Øng l·ª£i quy·∫øt ƒë·ªãnh c·ªßa qu√¢n ƒë·ªôi Vi·ªát Nam trong kh√°ng chi·∫øn ch·ªëng Ph√°p.",
        ),
    )
]

# Base prompt for the question under test
test_question = "Vi·ªát Minh ƒë∆∞·ª£c th√†nh l·∫≠p nƒÉm n√†o?"
test_retrieval = hybrid.retrieve(question=test_question, top_k=8)
test_context = builder.build_rag_context(
    question=test_question,
    retrieval_context=test_retrieval,
)
base_prompt = builder.format_for_gemini(context=test_context, prompt_type="qa")

# Attach the examples to the base prompt
few_shot_prompt = builder.add_examples_to_prompt(
    base_prompt=base_prompt,
    examples=examples,
    max_examples=2,
)

print("=" * 70, "FEW-SHOT PROMPT", "=" * 70, sep="\n")
print(few_shot_prompt)

## Step 8: Compare Prompt Styles

In [None]:
# Compare the different prompt formats for the same question
compare_question = "T·∫ßm quan tr·ªçng c·ªßa chi·∫øn th·∫Øng ƒêi·ªán Bi√™n Ph·ªß"

compare_retrieval = hybrid.retrieve(compare_question, top_k=10, expansion_depth=2)
compare_context = builder.build_rag_context(compare_question, compare_retrieval)

prompt_types = ["qa", "summary", "explain"]

# Number of leading characters shown for each prompt
PREVIEW_CHARS = 500

for ptype in prompt_types:
    # Instructions are omitted so only the context/question layout is compared
    prompt = builder.format_for_gemini(compare_context, prompt_type=ptype, include_instructions=False)
    tokens = builder.estimate_token_count(prompt)

    # Only mark the preview as truncated when text was actually cut off;
    # previously "..." was appended even to prompts shorter than the limit.
    preview = prompt[:PREVIEW_CHARS]
    if len(prompt) > PREVIEW_CHARS:
        preview += "..."

    print(f"\n{'='*70}")
    print(f"PROMPT TYPE: {ptype.upper()}")
    print(f"{'='*70}")
    print(f"Length: {len(prompt)} chars")
    print(f"Tokens: ~{tokens}")
    print(f"\nPreview (first {PREVIEW_CHARS} chars):")
    print(preview)

## Step 9: Test Context Quality Analysis

In [None]:
# Analyze context quality
def analyze_context_quality(context: dict) -> dict:
    """Compute simple quality metrics for a structured RAG context.

    Args:
        context: Mapping that may hold an ``'entities'`` list and a
            ``'relationships'`` list. Each entity is a mapping from which the
            keys ``'type'``, ``'relevance_score'`` and ``'description'`` are
            read. Missing or ``None`` collections are treated as empty; a
            missing ``'type'`` is counted as ``'Unknown'``; a missing or
            ``None`` ``'relevance_score'`` counts as 0; a missing
            ``'description'`` counts as "no description".

    Returns:
        A dict with:
            ``entity_count``: number of entities.
            ``entity_types``: ``{type: occurrences}``.
            ``avg_relevance``: mean ``relevance_score`` (0 when no entities).
            ``description_coverage``: percentage of entities with a truthy
                description (0 when no entities).
            ``relationship_count``: number of relationships.
            ``relationship_density``: relationships per entity (0 when no
                entities).
    """
    # `or []` also covers an explicit None stored under the key.
    entities = context.get('entities') or []
    relationships = context.get('relationships') or []
    entity_total = len(entities)

    # Entity diversity: how many entities of each type
    entity_types = {}
    for entity in entities:
        etype = entity.get('type', 'Unknown')
        entity_types[etype] = entity_types.get(etype, 0) + 1

    if entity_total:
        # Average relevance; absent/None scores contribute 0 instead of raising
        avg_relevance = sum((entity.get('relevance_score') or 0) for entity in entities) / entity_total
        # Description coverage, as a percentage
        with_desc = sum(1 for entity in entities if entity.get('description'))
        desc_coverage = with_desc / entity_total * 100
        # Relationship density: relationships per entity
        rel_density = len(relationships) / entity_total
    else:
        # No entities: every ratio is defined as 0 to avoid division by zero
        avg_relevance = 0
        desc_coverage = 0
        rel_density = 0

    return {
        'entity_count': entity_total,
        'entity_types': entity_types,
        'avg_relevance': avg_relevance,
        'description_coverage': desc_coverage,
        'relationship_count': len(relationships),
        'relationship_density': rel_density
    }

# Run the quality analysis over several different questions
test_questions = [
    "Ai ch·ªâ huy qu√¢n ƒë·ªôi?",
    "Chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß di·ªÖn ra ·ªü ƒë√¢u?",
    "C√°c s·ª± ki·ªán nƒÉm 1954"
]

print("=== CONTEXT QUALITY ANALYSIS ===")
# The loop uses cell-specific names on purpose: reusing `question` / `context`
# here would overwrite the notebook-level variables of the same name, so that
# re-running an earlier cell which reads them would silently use this loop's
# last question instead.
for quality_question in test_questions:
    quality_retrieval = hybrid.retrieve(quality_question, top_k=10)
    quality_context = builder.build_rag_context(quality_question, quality_retrieval)
    quality = analyze_context_quality(quality_context)

    print(f"\nQuestion: {quality_question}")
    print(f"  Entities: {quality['entity_count']}")
    print(f"  Avg relevance: {quality['avg_relevance']:.3f}")
    print(f"  Description coverage: {quality['description_coverage']:.1f}%")
    print(f"  Relationships: {quality['relationship_count']}")
    print(f"  Rel. density: {quality['relationship_density']:.2f}")
    print(f"  Entity types: {quality['entity_types']}")

## Step 10: Save Sample Prompts

In [None]:
# Save sample prompts for later reference
sample_prompts = {}

# (question, prompt type) pairs to render
sample_questions = [
    ("Ai l√£nh ƒë·∫°o kh√°ng chi·∫øn ch·ªëng Ph√°p?", "qa"),
    ("T√≥m t·∫Øt chi·∫øn d·ªãch ƒêi·ªán Bi√™n Ph·ªß", "summary"),
    ("Gi·∫£i th√≠ch vai tr√≤ c·ªßa V√µ Nguy√™n Gi√°p", "explain"),
]

# Cell-specific loop names: reusing `question` / `context` / `prompt` here
# would overwrite notebook-level variables of the same name and make earlier
# cells non-reproducible when re-run.
for sample_question, sample_type in sample_questions:
    sample_retrieval = hybrid.retrieve(sample_question, top_k=10)
    sample_context = builder.build_rag_context(sample_question, sample_retrieval)
    sample_prompt = builder.format_for_gemini(sample_context, prompt_type=sample_type)

    # Key = prompt type + first 30 characters of the question
    sample_prompts[f"{sample_type}_{sample_question[:30]}"] = {
        'question': sample_question,
        'type': sample_type,
        'prompt': sample_prompt,
        'token_estimate': builder.estimate_token_count(sample_prompt)
    }

# Save to file; ensure_ascii=False keeps non-ASCII text readable in the JSON
with open('sample_prompts.json', 'w', encoding='utf-8') as f:
    json.dump(sample_prompts, f, ensure_ascii=False, indent=2)

print("‚úÖ Saved sample prompts to sample_prompts.json")
print("\nSummary:")
# One fixed-width line per saved prompt: type, question (first 40 chars), tokens
for entry in sample_prompts.values():
    print(f"  {entry['type']:10s} - {entry['question'][:40]:40s} - ~{entry['token_estimate']} tokens")

## Summary

✅ **Bước 4 hoàn thành!**

**Đã implement:**
1. ✅ `build_rag_context()` - Structure retrieval results
2. ✅ 4 prompt types cho Gemini:
   - **Q&A**: Trả lời câu hỏi trực tiếp
   - **Summary**: Tóm tắt thông tin
   - **Explain**: Giải thích chi tiết
   - **Timeline**: Xây dựng timeline sự kiện
3. ✅ Token management:
   - Estimate token count
   - Auto truncate nếu quá limit
4. ✅ Multi-turn conversation support
5. ✅ Few-shot examples integration
6. ✅ Context quality analysis

**Prompt Format:**
- Instructions → Context (Entities + Relationships + Paths) → Question → Answer
- Structured và dễ đọc cho Gemini
- Tiếng Việt tự nhiên
- Source citations (nếu có)

**Token Limits:**
- Target: ~4000-6000 tokens cho context
- Gemini free: 8K context window
- Auto truncate nếu exceed

**Next:** Bước 5 & 6 - RAG Prompts + LLM Integration (connect với Gemini API) 🚀