In [4]:
import sys
sys.path.append("..")  # add project root, not src

from src.document_loader import DocumentProcessor
from src.config import config

In [6]:

def create_sample_document():
    """Create a sample document for testing"""
    sample_content = """
    GraphRAG System Implementation

    This document describes the implementation of a GraphRAG system using LangChain, Gemini, and Neo4j.

    Key Components:
    1. Document Processing - Uses semantic chunking to process documents
    2. Knowledge Graph Creation - Extracts entities and relationships using LLM
    3. Vector Storage - Stores embeddings for similarity search
    4. Hybrid Retrieval - Combines graph traversal with vector search

    Technologies Used:
    - LangChain: Framework for building LLM applications
    - Gemini: Google's large language model for text processing
    - Neo4j: Graph database for storing relationships
    - Python: Programming language for implementation

    Benefits:
    - Better context understanding through relationships
    - More accurate retrieval through graph traversal  
    - Explainable results through graph structure
    """
    
    sample_file = config.DOCUMENTS_DIR / "sample_graphrag.docx"
    
    try:
        from docx import Document as DocxDocument
        doc = DocxDocument()
        doc.add_paragraph(sample_content)
        doc.save(sample_file)
        print(f"✅ Sample document created: {sample_file}")
    except ImportError:
        # Fallback - create a text file
        with open(config.DOCUMENTS_DIR / "sample_graphrag.txt", "w") as f:
            f.write(sample_content)
        print("✅ Sample text file created (install python-docx for .docx support)")

# Create sample if no documents exist
if not list(config.DOCUMENTS_DIR.glob("*.docx")):
    print("No documents found. Creating sample document...")
    create_sample_document()

No documents found. Creating sample document...
✅ Sample document created: /home/mushfiq/Desktop/graphrag_with_neo4j/notebooks/../documents/sample_graphrag.docx


In [7]:
processor = DocumentProcessor()
documents = processor.load_documents()

if documents:
    print("\n📊 Document Statistics:")
    print(f"Total documents: {len(documents)}")
    print(f"Average chunk length: {sum(len(doc.page_content) for doc in documents) // len(documents)}")
    
    # Show sample chunks
    print("\n📝 Sample chunks:")
    for i, doc in enumerate(documents[:3]):
        print(f"Chunk {i+1} (from {doc.metadata['source']}):")
        print(f"  Content: {doc.page_content[:200]}...")
        print(f"  Metadata: {doc.metadata}")
        print()
else:
    print("⚠️  No documents loaded. Please add .docx files to the documents folder.")

Found 1 documents to process...
Processing: sample_graphrag.docx
  Created 2 chunks
Total: 2 document chunks loaded

📊 Document Statistics:
Total documents: 2
Average chunk length: 434

📝 Sample chunks:
Chunk 1 (from sample_graphrag.docx):
  Content: GraphRAG System Implementation

    This document describes the implementation of a GraphRAG system using LangChain, Gemini, and Neo4j. Key Components:
    1....
  Metadata: {'source': 'sample_graphrag.docx', 'chunk_id': 0, 'doc_id': 'sample_graphrag'}

Chunk 2 (from sample_graphrag.docx):
  Content: Document Processing - Uses semantic chunking to process documents
    2. Knowledge Graph Creation - Extracts entities and relationships using LLM
    3. Vector Storage - Stores embeddings for similari...
  Metadata: {'source': 'sample_graphrag.docx', 'chunk_id': 1, 'doc_id': 'sample_graphrag'}



In [8]:
import pickle

if documents:
    with open('../processed_documents.pkl', 'wb') as f:
        pickle.dump(documents, f)
    print("✅ Documents saved for later use")

✅ Documents saved for later use
