In [1]:
pip install --upgrade --quiet  langchain langchain-neo4j langchain-openai langchain-experimental neo4j pypdf2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from PyPDF2 import PdfReader

# --- LangChain Imports ---
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI # We use this class to connect to any OpenAI-compatible server

# --- Neo4j Imports ---
from neo4j import GraphDatabase

LM_STUDIO_API_BASE = "http://localhost:1234/v1"
LM_STUDIO_API_KEY = "not-needed"

# --- Neo4j Database Configuration ---
NEO4J_URI = "neo4j://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "giacomo3234" # <--- IMPORTANT: SET YOUR PASSWORD

print("Configuration set.")

Configuration set.


In [None]:
from PyPDF2 import PdfReader
import re
import os

def clean_text_for_markdown_pypdf2(text):
    """Cleans text for better Markdown display, handling multiple newlines."""
    if text is None:
        return ""
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Consolidate paragraph breaks
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # Replace single newlines with space
    return text.strip()

def pypdf2_to_markdown_chunks(pdf_path):
    """
    Extracts text from a PDF using PyPDF2 and splits it per page.
    
    Returns:
        list of str: Each string is a Markdown formatted page.
    """
    try:
        reader = PdfReader(pdf_path)
    except Exception as e:
        return [f"<!-- Error opening PDF {pdf_path}: {e} -->"]

    chunks = []
    for page_num, page in enumerate(reader.pages):
        page_md = f"\n## Page {page_num + 1}\n\n"
        try:
            # Extract and clean the page text
            page_text = page.extract_text()
            cleaned_text = clean_text_for_markdown_pypdf2(page_text)
            if cleaned_text:
                page_md += f"### Text on Page {page_num + 1}\n" + cleaned_text + "\n"
            else:
                page_md += f"<!-- No text found on Page {page_num + 1} -->\n"
        except Exception as e:
            page_md += f"<!-- Error extracting text from Page {page_num + 1}: {e} -->\n"
        chunks.append(page_md)
    return chunks

if __name__ == "__main__":
    folder_path = "docs"  # Folder containing PDF files
    list_docs_md = []
    
    if not os.path.exists(folder_path):
        print(f"The folder '{folder_path}' does not exist. Please provide a valid folder path.")
    else:
        for file_name in os.listdir(folder_path):
            if file_name.endswith(".pdf"):
                pdf_file_path = os.path.join(folder_path, file_name)
                print(f"Processing file: {pdf_file_path}")
                # Get each page separately, as Markdown content
                page_chunks = pypdf2_to_markdown_chunks(pdf_file_path)
                for chunk in page_chunks:
                    # Create a Document for each page with associated PDF name in metadata
                    doc = Document(page_content=chunk, metadata={"pdf_name": file_name})
                    list_docs_md.append(doc)
                    
    print("Total documents created:", len(list_docs_md))

Processing file: docs/Legge regionale n_37_2014 artt. 20-21-22.pdf
Processing file: docs/Direttiva 2014_25_UE.pdf
Processing file: docs/Direttiva 2014_23_UE.pdf
Processing file: docs/Decreto Legislativo 7 marzo 2005_agg_L_147_2013.pdf
Processing file: docs/L. 27 Dicembre 2006 n.296 (Finanziaria 2007).pdf
Processing file: docs/L. 23 Dicembre 2000 n.388 (Finanziaria 2001).pdf
Processing file: docs/dPR 5 ottobre 2010_207_agg_DM_infrastrutture_24apr2014.pdf
Processing file: docs/Direttiva 2014_24_UE.pdf
Processing file: docs/D.Lgs. 50_2016.pdf
Processing file: docs/DGR_17_2024_01_22_signed_signed.pdf
Processing file: docs/Decreto legislativo 12 aprile  2006_163_agg_DL_24apr2014_n_66.pdf


In [None]:
# --- Initialize the LangChain LLM, pointing to LM Studio ---
llm = ChatOpenAI(
    temperature=0,
    base_url=LM_STUDIO_API_BASE,
    api_key=LM_STUDIO_API_KEY,
    model_name="google/gemma3:4b" 
)

llm_transformer = LLMGraphTransformer(llm=llm)

print("LangChain components initialized and pointing to LM Studio.")

LangChain components initialized and pointing to LM Studio.


In [24]:
print(list_docs_md[0].page_content)


## Page 1

### Text on Page 1
REPUBBLICA ITALIANA ANNO XLV BARI, 8 AGOSTO 2014 n. 109BOLLETTINO UFFICIALE della Regione Puglia Leggi e regolamenti regionali VOLUME PRIMO 2014.08. 19  09:09:08 +02'00'



In [None]:
print("--- Starting graph extraction... ---")
print("This may take a while depending on the document size and your computer's performance.")

# The transformer converts the text documents into graph documents (nodes and relationships)
graph_documents = await llm_transformer.aconvert_to_graph_documents(list_docs_md)

# --- Inspect the results ---
# The result is a list, but we only processed one large document.
main_graph = graph_documents[0]

print(f"\n--- Extraction Complete! ---")
print(f"Number of nodes: {len(main_graph.nodes)}")
print(f"Number of relationships: {len(main_graph.relationships)}")

print("\n--- Sample Nodes ---")
for node in main_graph.nodes[:5]:
    print(node)

print("\n--- Sample Relationships ---")
for rel in main_graph.relationships[:5]:
    print(rel)

--- Starting graph extraction... ---
This may take a while depending on the document size and your computer's performance.


In [None]:
def upload_graph_to_neo4j(graph_doc, uri, user, password):
    """Uploads a LangChain GraphDocument to Neo4j."""
    driver = GraphDatabase.driver(uri, auth=(user, password))
    
    # Use a set to avoid duplicate node creation queries
    nodes = {node.id: node for node in graph_doc.nodes}
    
    with driver.session(database="neo4j") as session:
        # Clear existing data
        print("Clearing existing Neo4j database...")
        session.run("MATCH (n) DETACH DELETE n")
        
        # Create uniqueness constraint for faster merging
        session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (e:__Entity__) REQUIRE e.id IS UNIQUE")
        
        # Upload nodes
        print(f"Uploading {len(nodes)} nodes...")
        upload_nodes_query = """
        UNWIND $nodes AS node_data
        MERGE (e:__Entity__ {id: node_data.id})
        // Dynamically set the label and properties
        // The label is capitalized from the node type
        // The node id is also set as a 'name' property for easy viewing
        WITH e, node_data
        CALL apoc.create.setLabels(e, [apoc.text.upper(node_data.type)]) YIELD node
        SET node.name = node_data.id
        RETURN count(node)
        """
        session.run(upload_nodes_query, nodes=list(nodes.values()))
        
        # Upload relationships
        print(f"Uploading {len(graph_doc.relationships)} relationships...")
        upload_rels_query = """
        UNWIND $rels AS rel_data
        MATCH (source:__Entity__ {id: rel_data.source.id})
        MATCH (target:__Entity__ {id: rel_data.target.id})
        // Use the relationship type as the graph relationship type
        CALL apoc.create.relationship(source, rel_data.type, {}, target) YIELD rel
        RETURN count(rel)
        """
        session.run(upload_rels_query, rels=[rel.dict() for rel in graph_doc.relationships])
        
        # Remove the generic __Entity__ label now that specific labels are set
        session.run("MATCH (e:__Entity__) REMOVE e:__Entity__")
        
    driver.close()
    print("Upload to Neo4j complete!")

# --- Run the upload process ---
print("\n--- Starting upload to Neo4j... ---")
upload_graph_to_neo4j(main_graph, NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)