In [1]:
pip install -r requirements_conda.txt

Collecting lightrag-hku (from -r requirements_conda.txt (line 7))
  Using cached lightrag_hku-1.3.9-py3-none-any.whl.metadata (69 kB)
Collecting ollama (from -r requirements_conda.txt (line 8))
  Using cached ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Collecting neo4j (from -r requirements_conda.txt (line 11))
  Using cached neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting networkx (from -r requirements_conda.txt (line 12))
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting pandas (from -r requirements_conda.txt (line 15))
  Downloading pandas-2.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pypdf2 (from -r requirements_conda.txt (line 16))
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jupyterlab (from -r requirements_conda.txt (line 19))
  Downloading jupyterlab-4.4.4-py3-none-any.whl.metadata (16 kB)
Collecting aiohttp (from lightrag-hku->-r requirements_conda.txt (line 7))
  Downloading aiohttp-3.12

In [1]:
import asyncio
import os
from PyPDF2 import PdfReader
from neo4j import GraphDatabase
import xml.etree.ElementTree as ET
import logging 
import sys
# --- CORRECTED IMPORTS from the lightrag library ---
# These are the official, supported import paths from the demo scripts.
from lightrag import LightRAG, QueryParam
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
from prompt_it import PROMPTS_IT
# --- Main Configuration ---
PDF_FOLDER = "docs"
WORKING_DIR = "./pa_rag_storage" # Directory for LightRAG's local files

# --- Ollama Model Configuration ---
OLLAMA_GENERATION_MODEL = "gemma3:1b"
OLLAMA_EMBEDDING_MODEL = "nomic-embed-text"
OLLAMA_EMBEDDING_DIM = 768 # The output dimension for nomic-embed-text

# --- Neo4j Database Configuration ---
os.environ["NEO4J_URI"] = "neo4j://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "giacomo3234" 

os.makedirs(WORKING_DIR, exist_ok=True)

print("Configuration and corrected imports are set.")

# Get the root logger for the 'lightrag' library
lightrag_logger = logging.getLogger("lightrag")

# Set the logging level to INFO or DEBUG
# - INFO: Shows major steps (e.g., "Processing chunk 1/10", "Extracting entities"). This is a good default.
# - DEBUG: Shows extremely detailed information, including the exact prompts being sent to the LLM. Very useful for debugging.
lightrag_logger.setLevel(logging.INFO)

# Make sure the logs are printed to your notebook's output
# We check if a handler already exists to avoid duplicate messages
if not any(isinstance(handler, logging.StreamHandler) for handler in lightrag_logger.handlers):
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    lightrag_logger.addHandler(handler)

# --- Helper function to read PDFs ---
def get_text_from_pdfs(folder_path):
    all_text = ""
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.pdf'):
            print(f"Reading PDF: {filename}")
            with open(os.path.join(folder_path, filename), 'rb') as f:
                reader = PdfReader(f)
                for page in reader.pages:
                    all_text += page.extract_text() + "\n"
    return all_text

print("Logging configured. You will now see detailed progress from the LightRAG library.")

[94m2025-07-07 16:16:25 - pipmaster.package_manager - INFO - Targeting pip associated with Python: /Users/giacomo/Documents/kg+llm_task2_nlp/.conda/bin/python | Command base: /Users/giacomo/Documents/kg+llm_task2_nlp/.conda/bin/python -m pip[0m
[94m2025-07-07 16:16:25 - pipmaster.package_manager - INFO - Targeting pip associated with Python: /Users/giacomo/Documents/kg+llm_task2_nlp/.conda/bin/python | Command base: /Users/giacomo/Documents/kg+llm_task2_nlp/.conda/bin/python -m pip[0m
[94m2025-07-07 16:16:25 - pipmaster.async_package_manager - INFO - [Async] Initialized for Python: /Users/giacomo/Documents/kg+llm_task2_nlp/.conda/bin/python[0m


Configuration and corrected imports are set.
Logging configured. You will now see detailed progress from the LightRAG library.


In [2]:
# --- Helper function to read PDFs ---
def get_text_from_pdfs(folder_path):
    all_text = ""
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.pdf'):
            print(f"Reading PDF: {filename}")
            with open(os.path.join(folder_path, filename), 'rb') as f:
                reader = PdfReader(f)
                for page in reader.pages:
                    all_text += page.extract_text() + "\n"
    return all_text

async def run_lightrag_ingestion():
    # --- Initialize LightRAG with Ollama using the correct functions ---
    embedding_function_wrapper = EmbeddingFunc(
        embedding_dim=OLLAMA_EMBEDDING_DIM,
        max_token_size=4096,
        func=lambda texts: ollama_embed(texts, embed_model=OLLAMA_EMBEDDING_MODEL)
    )
    
    # Configure the LLM to have a larger context window, as shown in the demo
    llm_kwargs = {
        "options": {"num_ctx": 4096}, # Set context window
        "timeout": 300 # Set a longer timeout for complex tasks
    }

    rag_instance = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=ollama_model_complete,
        llm_model_name=OLLAMA_GENERATION_MODEL,
        llm_model_kwargs=llm_kwargs,
        llm_model_max_async=1,
        embedding_func=embedding_function_wrapper,
        graph_storage="Neo4JStorage",
        # --- NEW: Reduce the chunk size to make the LLM's job easier ---
        chunk_token_size=512,  # <<< REDUCED CHUNK SIZE
        chunk_overlap_token_size=50,
        addon_params={
        "language": PROMPTS_IT["DEFAULT_LANGUAGE"],
        "entity_types": PROMPTS_IT["DEFAULT_ENTITY_TYPES"],
        "entity_extraction_prompt": PROMPTS_IT["entity_extraction"],
        "entity_extraction_examples": PROMPTS_IT["entity_extraction_examples"]
        }
    )


    print("--- Initializing LightRAG Storages (connecting to Neo4j...) ---")
    await rag_instance.initialize_storages()
    await initialize_pipeline_status()

    print("--- Reading PA Regulation PDFs ---")
    document_text = get_text_from_pdfs(PDF_FOLDER)
    if not document_text:
        print("No text found. Halting.")
        return

    documents_to_insert = [document_text]
    document_ids = ["pa_regulations_corpus"]
    
    print(f"\n--- Ingesting document with ID '{document_ids[0]}' into LightRAG... ---")
    await rag_instance.ainsert(documents_to_insert, ids=document_ids)
    
    print("\n--- Ingestion Complete! ---")
    print("Your knowledge graph has been generated and loaded directly into Neo4j.")

# Run the asynchronous ingestion process
await run_lightrag_ingestion()

INFO: Process 6428 Shared-Data created for Single Process
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_storage/vdb_entities.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_storage/vdb_relationships.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_storage/vdb_chunks.json'} 0 data
INFO: Process 6428 initialized updated flags for namespace: [full_docs]
INFO: Process 6428 ready to initialize storage namespace: [full_docs]


--- Initializing LightRAG Storages (connecting to Neo4j...) ---
2025-07-07 16:16:27,265 - lightrag - INFO - Process 6428 KV load full_docs with 0 records


INFO: Process 6428 initialized updated flags for namespace: [text_chunks]
INFO: Process 6428 ready to initialize storage namespace: [text_chunks]


2025-07-07 16:16:27,266 - lightrag - INFO - Process 6428 KV load text_chunks with 0 records


INFO: Process 6428 initialized updated flags for namespace: [entities]
INFO: Process 6428 initialized updated flags for namespace: [relationships]
INFO: Process 6428 initialized updated flags for namespace: [chunks]
INFO: Process 6428 initialized updated flags for namespace: [llm_response_cache]
INFO: Process 6428 ready to initialize storage namespace: [llm_response_cache]


2025-07-07 16:16:27,267 - lightrag - INFO - Process 6428 KV load llm_response_cache with 0 records


INFO: Process 6428 initialized updated flags for namespace: [doc_status]
INFO: Process 6428 ready to initialize storage namespace: [doc_status]


2025-07-07 16:16:27,315 - lightrag - INFO - Process 6428 doc status load doc_status with 2 records


INFO: Process 6428 storage namespace already initialized: [full_docs]
INFO: Process 6428 storage namespace already initialized: [text_chunks]
INFO: Process 6428 storage namespace already initialized: [llm_response_cache]
INFO: Process 6428 storage namespace already initialized: [doc_status]


2025-07-07 16:16:27,331 - lightrag - INFO - Chunk-entity-relation at neo4j://localhost:7687 not found. try to create specified database.
2025-07-07 16:16:27,331 - lightrag - INFO - Chunk-entity-relation at neo4j://localhost:7687 not found. try to create specified database.
2025-07-07 16:16:27,360 - lightrag - INFO - Connected to None at neo4j://localhost:7687
2025-07-07 16:16:27,360 - lightrag - INFO - Connected to None at neo4j://localhost:7687
2025-07-07 16:16:27,368 - lightrag - INFO - Storage Initialization completed!


INFO: Process 6428 Pipeline namespace initialized


--- Reading PA Regulation PDFs ---
Reading PDF: Legge regionale n_37_2014 artt. 20-21-22.pdf
Reading PDF: Direttiva 2014_25_UE.pdf
Reading PDF: Direttiva 2014_23_UE.pdf
Reading PDF: Decreto Legislativo 7 marzo 2005_agg_L_147_2013.pdf
Reading PDF: L. 27 Dicembre 2006 n.296 (Finanziaria 2007).pdf
Reading PDF: L. 23 Dicembre 2000 n.388 (Finanziaria 2001).pdf
Reading PDF: dPR 5 ottobre 2010_207_agg_DM_infrastrutture_24apr2014.pdf
Reading PDF: Direttiva 2014_24_UE.pdf
Reading PDF: D.Lgs. 50_2016.pdf
Reading PDF: DGR_17_2024_01_22_signed_signed.pdf
Reading PDF: Decreto legislativo 12 aprile  2006_163_agg_DL_24apr2014_n_66.pdf
Reading PDF: L. 23 Dicembre 1999 n.488 (Finanziaria 2000).pdf
Reading PDF: BURP_n.177_del_17112008.pdf
Reading PDF: DELIBERAZIONE DELLA GIUNTA REGIONALE 21 marzo 2017 n_354.pdf
Reading PDF: Dir.1999 93 CE del Parlamento Europeo e del Consiglio.pdf

--- Ingesting document with ID 'pa_regulations_corpus' into LightRAG... ---
2025-07-07 16:16:44,927 - lightrag - INFO - No 

In [10]:
pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Using cached google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.175.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Using cached google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting protobuf (from google-generativeai)
  Using cached protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting tqdm (from google-generativeai)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-

In [None]:
import os
import asyncio
from PyPDF2 import PdfReader


from lightrag import LightRAG, QueryParam
from lightrag.utils import EmbeddingFunc
import google.generativeai as genai
from prompt_it import PROMPTS_IT
import numpy as np

# --- Step 3: Configure Gemini API Key ---
# Best practice in Kaggle is to use "Secrets" (Add-ons -> Secrets)
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# GEMINI_API_KEY = user_secrets.get_secret("GEMINI_API_KEY")

# For a quick test, you can paste it here, but this is not recommended for sharing.
GEMINI_API_KEY = "AIzaSyBzof1iaPDCMamcu4Re7-0UPWZrJydKbT0" # <--- PASTE YOUR KEY HERE
genai.configure(api_key=GEMINI_API_KEY)

# --- Step 4: Create the Gemini Adapter Functions ---

# --- Adapter for Text Generation ---
async def gemini_generator(prompt: str, **kwargs) -> str:
    """
    An async wrapper for the Gemini Pro API for text generation.
    """
    try:
        request_options = {"timeout": 120}
        model = genai.GenerativeModel('gemini-2.5-flash-lite-preview-06-17')
        response = await model.generate_content_async(
            prompt,
            request_options=request_options
        )
        return response.text
    except Exception as e:
        print(f"Error during Gemini generation: {e}")
        return ""

# --- Adapter for Text Embedding ---
# Note: Google has a specific model for embeddings
GEMINI_EMBEDDING_MODEL = "models/embedding-001"
GEMINI_EMBEDDING_DIM = 768 # The dimension for embedding-001

async def gemini_embedder(texts: list[str]) -> np.ndarray:
    """
    An async wrapper for the Gemini Embedding API with a timeout.
    """
    try:
        request_options = {"timeout": 60} # Embeddings are usually faster

        result = await genai.embed_content_async(
            model="models/embedding-001",
            content=texts,
            task_type="retrieval_document",
            request_options=request_options
        )
        return np.array(result['embedding'])
    except Exception as e:
        print(f"Error during Gemini embedding: {e}")
        return np.array([])


# --- Step 5: Initialize LightRAG with the Gemini Adapters ---

# --- Configuration ---
PDF_FOLDER = "./docs" # Assumes this is set from a previous cell
WORKING_DIR = "./pa_rag_gemini_storage_07_07" # Directory for LightRAG's local files
os.makedirs(WORKING_DIR, exist_ok=True)


async def run_gemini_ingestion():
    # --- Initialize LightRAG with our new Gemini functions ---
    embedding_function_wrapper = EmbeddingFunc(
        embedding_dim=GEMINI_EMBEDDING_DIM,
        max_token_size=4096,
        func=gemini_embedder # Pass our custom embedder
    )

    rag_instance = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=gemini_generator, # Pass our custom generator
        embedding_func=embedding_function_wrapper,
        # You can still use Neo4j for storage
        graph_storage="Neo4JStorage",
        llm_model_max_async=1, 
        addon_params={
        "language": PROMPTS_IT["DEFAULT_LANGUAGE"],
        "entity_types": PROMPTS_IT["DEFAULT_ENTITY_TYPES"],
        "entity_extraction_prompt": PROMPTS_IT["entity_extraction"],
        "entity_extraction_examples": PROMPTS_IT["entity_extraction_examples"]
        }
    )

    print("--- Initializing LightRAG Storages (for Gemini) ---")
    await rag_instance.initialize_storages()

    print("--- Initializing Pipeline Status ---")
    await initialize_pipeline_status()


    print("--- Initializing LightRAG Storages (connecting to Neo4j...) ---")
    await rag_instance.initialize_storages()
    await initialize_pipeline_status()

    print("--- Reading PA Regulation PDFs ---")
    document_text = get_text_from_pdfs(PDF_FOLDER)
    if not document_text:
        print("No text found. Halting.")
        return

    documents_to_insert = [document_text]
    document_ids = ["pa_regulations_corpus_lmstudio"]
    
    print(f"\n--- Ingesting document with ID '{document_ids[0]}' into LightRAG... ---")
    await rag_instance.ainsert(documents_to_insert, ids=document_ids)
    
    print("\n--- Ingestion Complete! ---")
    print("Your knowledge graph has been generated and loaded directly into Neo4j.")

# Run the asynchronous ingestion process
await run_gemini_ingestion()

INFO: Process 3761 Shared-Data already initialized (multiprocess=False)
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_gemini_storage_07_07/vdb_entities.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_gemini_storage_07_07/vdb_relationships.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': './pa_rag_gemini_storage_07_07/vdb_chunks.json'} 0 data
INFO: Process 3761 storage namespace already initialized: [full_docs]
INFO: Process 3761 storage namespace already initialized: [text_chunks]
INFO: Process 3761 storage namespace already initialized: [llm_response_cache]
INFO: Process 3761 storage namespace already initialized: [doc_status]
INFO: Process 3761 storage namespace already initialized: [full_docs]
INFO: Process 3761 storage namespace already initialized: [text_chunks]
INFO: Process 3761 storage namespace already initialized: [llm_respo

--- Initializing LightRAG Storages (for Gemini) ---
2025-07-07 16:10:33,340 - lightrag - INFO - Chunk-entity-relation at neo4j://localhost:7687 not found. try to create specified database.
2025-07-07 16:10:33,340 - lightrag - INFO - Chunk-entity-relation at neo4j://localhost:7687 not found. try to create specified database.
2025-07-07 16:10:33,377 - lightrag - INFO - Connected to None at neo4j://localhost:7687
2025-07-07 16:10:33,379 - lightrag - INFO - Connected to None at neo4j://localhost:7687
--- Initializing Pipeline Status ---
--- Initializing LightRAG Storages (connecting to Neo4j...) ---
--- Reading PA Regulation PDFs ---
Reading PDF: Legge regionale n_37_2014 artt. 20-21-22.pdf
Reading PDF: Direttiva 2014_25_UE.pdf
Reading PDF: Direttiva 2014_23_UE.pdf
Reading PDF: Decreto Legislativo 7 marzo 2005_agg_L_147_2013.pdf
Reading PDF: L. 27 Dicembre 2006 n.296 (Finanziaria 2007).pdf
Reading PDF: L. 23 Dicembre 2000 n.388 (Finanziaria 2001).pdf
Reading PDF: dPR 5 ottobre 2010_207_agg

CancelledError: 

Error during Gemini generation: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash-lite"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 58
}
]
2025-07-07 16:14:02,001 - lightrag - INFO - Chunk 33 of 1676 extracted 0 Ent + 0 Rel
2025-07-07 16:14:02,002 - lightrag - INFO - Storage Finalization completed!
Error during Gemini generation: 429 You exceeded your current quota, please check your plan and billing details. For more information on this 