In [1]:
pip install -r requirements_conda.txt

Collecting lightrag-hku (from -r requirements_conda.txt (line 7))
  Using cached lightrag_hku-1.3.9-py3-none-any.whl.metadata (69 kB)
Collecting ollama (from -r requirements_conda.txt (line 8))
  Using cached ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Collecting neo4j (from -r requirements_conda.txt (line 11))
  Using cached neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting networkx (from -r requirements_conda.txt (line 12))
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting pandas (from -r requirements_conda.txt (line 15))
  Downloading pandas-2.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pypdf2 (from -r requirements_conda.txt (line 16))
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jupyterlab (from -r requirements_conda.txt (line 19))
  Downloading jupyterlab-4.4.4-py3-none-any.whl.metadata (16 kB)
Collecting aiohttp (from lightrag-hku->-r requirements_conda.txt (line 7))
  Downloading aiohttp-3.12

In [5]:
import asyncio
import getpass
import os
import xml.etree.ElementTree as ET

from neo4j import GraphDatabase
from PyPDF2 import PdfReader

# --- CORRECTED IMPORTS from the lightrag library ---
# These are the official, supported import paths from the demo scripts.
from lightrag import LightRAG, QueryParam
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug

# --- Main Configuration ---
PDF_FOLDER = "docs"  # Folder that is scanned for the source PDF files
WORKING_DIR = "./pa_rag_storage"  # Directory for LightRAG's local files

# --- Ollama Model Configuration ---
OLLAMA_GENERATION_MODEL = "gemma3:4b"
OLLAMA_EMBEDDING_MODEL = "nomic-embed-text"
OLLAMA_EMBEDDING_DIM = 768  # The output dimension for nomic-embed-text

# --- Neo4j Database Configuration ---
# The connection settings are exported as environment variables (presumably
# picked up by the "Neo4JStorage" graph backend selected during ingestion).
# setdefault keeps any value that is already exported in the shell, so the
# notebook can be pointed at another database without editing this cell.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

# The password must never be stored in the notebook: it would be saved with the
# file and leak through version control or sharing. Export NEO4J_PASSWORD before
# starting Jupyter, or type it at the prompt below (input is not echoed).
if not os.environ.get("NEO4J_PASSWORD"):
    os.environ["NEO4J_PASSWORD"] = getpass.getpass("Neo4j password: ")

os.makedirs(WORKING_DIR, exist_ok=True)

print("Configuration and corrected imports are set.")

Configuration and corrected imports are set.


In [None]:
# --- Helper function to read PDFs ---
def get_text_from_pdfs(folder_path):
    """Extract and concatenate the text of every PDF file found in a folder.

    Directory entries are visited in sorted name order so that repeated runs
    build the same corpus; ``os.listdir`` on its own gives no ordering
    guarantee. Only entries whose name ends in ``.pdf`` (case-insensitive)
    are opened.

    Args:
        folder_path: Path of the directory that contains the PDF files.

    Returns:
        One string holding the text of each page followed by a newline, or an
        empty string when the folder contains no PDF files.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist (raised by
            ``os.listdir``).
    """
    page_texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        print(f"Reading PDF: {filename}")
        # Keep the file open while iterating: pages are read through the handle.
        with open(os.path.join(folder_path, filename), 'rb') as pdf_file:
            reader = PdfReader(pdf_file)
            for page in reader.pages:
                # A page without extractable text must contribute an empty
                # string; a None result would otherwise raise TypeError on "+".
                page_texts.append((page.extract_text() or "") + "\n")
    # Joining once avoids repeatedly copying a growing string inside the loop.
    return "".join(page_texts)

async def run_lightrag_ingestion():
    """Ingest the PDFs in ``PDF_FOLDER`` into LightRAG using Neo4j graph storage.

    The coroutine builds an Ollama-backed embedding wrapper and a ``LightRAG``
    instance, initializes its storages and the pipeline status, extracts the
    text of all PDFs with ``get_text_from_pdfs`` and inserts it as a single
    document with the fixed id ``"pa_regulations_corpus"``. Progress is
    reported with ``print``.

    Returns:
        None. The function returns early, without inserting anything, when no
        text could be extracted from the PDFs.

    Note:
        The storages are finalized in a ``finally`` clause so that the
        resources they hold (e.g. the Neo4j connection) are released whether
        ingestion succeeds, stops early or raises.
    """
    # --- Initialize LightRAG with Ollama using the correct functions ---
    embedding_function_wrapper = EmbeddingFunc(
        embedding_dim=OLLAMA_EMBEDDING_DIM,
        max_token_size=8192,
        func=lambda texts: ollama_embed(texts, embed_model=OLLAMA_EMBEDDING_MODEL)
    )

    # Configure the LLM to have a larger context window, as shown in the demo
    llm_kwargs = {
        "options": {"num_ctx": 8192},  # Set context window
        "timeout": 600  # Set a longer timeout for complex tasks
    }

    rag_instance = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=ollama_model_complete,
        llm_model_name=OLLAMA_GENERATION_MODEL,
        llm_model_kwargs=llm_kwargs,
        embedding_func=embedding_function_wrapper,
        graph_storage="Neo4JStorage",
        # Reduced chunk size to make the LLM's extraction job easier
        chunk_token_size=512,
        chunk_overlap_token_size=50
    )

    try:
        print("--- Initializing LightRAG Storages (connecting to Neo4j...) ---")
        await rag_instance.initialize_storages()
        await initialize_pipeline_status()

        print("--- Reading PA Regulation PDFs ---")
        document_text = get_text_from_pdfs(PDF_FOLDER)
        # strip(): a corpus made only of the per-page newlines (e.g. scanned
        # PDFs without a text layer) carries nothing worth ingesting.
        if not document_text.strip():
            print("No text found. Halting.")
            return

        documents_to_insert = [document_text]
        document_ids = ["pa_regulations_corpus"]

        print(f"\n--- Ingesting document with ID '{document_ids[0]}' into LightRAG... ---")
        await rag_instance.ainsert(documents_to_insert, ids=document_ids)

        print("\n--- Ingestion Complete! ---")
        print("Your knowledge graph has been generated and loaded directly into Neo4j.")
    finally:
        # Release storage resources on every exit path (success, early return, error).
        await rag_instance.finalize_storages()

# Run the asynchronous ingestion process.
# Top-level ``await`` works here because the notebook kernel (IPython) executes the
# cell inside its own running event loop; a plain Python script would have to call
# asyncio.run(run_lightrag_ingestion()) instead.
await run_lightrag_ingestion()

INFO: Process 94157 Shared-Data already initialized (multiprocess=False)
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_storage/vdb_entities.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_storage/vdb_relationships.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_storage/vdb_chunks.json'} 0 data
INFO: Process 94157 storage namespace already initialized: [full_docs]
INFO: Process 94157 storage namespace already initialized: [text_chunks]
INFO: Process 94157 storage namespace already initialized: [llm_response_cache]
INFO: Process 94157 storage namespace already initialized: [doc_status]
INFO: Process 94157 storage namespace already initialized: [full_docs]
INFO: Process 94157 storage namespace already initialized: [text_chunks]
INFO: Process 94157 storage namespace already initialized: [llm_response_cache]
INFO: Process 94157 

--- Initializing LightRAG Storages (connecting to Neo4j...) ---
--- Reading PA Regulation PDFs ---
Reading PDF: Legge regionale n_37_2014 artt. 20-21-22.pdf
Reading PDF: Direttiva 2014_25_UE.pdf
Reading PDF: Direttiva 2014_23_UE.pdf
Reading PDF: Decreto Legislativo 7 marzo 2005_agg_L_147_2013.pdf
Reading PDF: L. 27 Dicembre 2006 n.296 (Finanziaria 2007).pdf
Reading PDF: L. 23 Dicembre 2000 n.388 (Finanziaria 2001).pdf
