In [1]:
import os
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_neo4j import Neo4jGraph
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

In [2]:
# Bring the values defined in the local .env file into scope
load_dotenv()

# Neo4j connection settings and the OpenAI key.
# Each lookup yields None when the variable is not set.
# NOTE(review): "NEO4J_PASSWORDD" (double D) is spelled the same way in the
# variable name and the environment key; it is kept because the connection
# cell reads this exact name — confirm the spelling matches the .env file.
NEO4J_URL = os.environ.get("NEO4J_URL")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORDD = os.environ.get("NEO4J_PASSWORDD")

# Two database names are read; only NEO4J_DATABASE2 is used by the visible cells.
NEO4J_DATABASE1 = os.environ.get("NEO4J_DATABASE1")
NEO4J_DATABASE2 = os.environ.get("NEO4J_DATABASE2")

# Not referenced by name in the visible cells — presumably picked up from the
# environment by the OpenAI client; verify before removing.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
# Connect to Neo4j; `graph` is the handle every later cell works through.
graph = Neo4jGraph(
    # where to connect
    url=NEO4J_URL,
    database=NEO4J_DATABASE2,
    # credentials
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORDD,
    # request the detailed schema summary (example values, value ranges)
    enhanced_schema=True,
)

ClientError: {code: Neo.ClientError.Database.DatabaseNotFound} {message: Graph not found: pdfs}

In [5]:
# WARNING: destructive. The Cypher below matches every node in the connected
# database and deletes it together with its relationships.
print("Clearing existing graph for a fresh start...")
# Left as the cell's final expression so the query result is displayed.
graph.query(query="MATCH (n) DETACH DELETE n")

Clearing existing graph for a fresh start...


[]

In [4]:
# Refresh the schema held on the graph object first, then print it, so the
# text shown is not a stale copy from before earlier cells modified the graph.
graph.refresh_schema()
print(graph.schema)

Node properties:
- **Document**
  - `id`: STRING Example: "9b2e3f806199be76c6b831ad6b4639be"
  - `text`: STRING Available options: ['Alen  Sultanic    FF  Swipe  File  Part  1       C', 'it   becomes   a   shortcut   to   ___________    ', 'And  it  has  nothing  to  do  with  your  _______', 'All   that   and   more   will   happen   for   yo', 'you   know   how   to   ________________   …     Y', '______________     And   Im   not   the   only   o', 'TIMELINE  LANGUAGE   First  you’ll___,  then  once', 'So   after   you   know   how   to   ______,   ___', 'ASSUMPTIVE  QUESTIONS   They  always  open  with  ', "OPENERS    The  Two  Types  of  Coaches   There's "]
  - `creator`: STRING Available options: ['PyPDF']
  - `creationdate`: STRING Available options: ['']
  - `producer`: STRING Available options: ['Skia/PDF m134 Google Docs Renderer']
  - `source`: STRING Available options: ['pdfs/1. Copywriting Swipe Files (Alen).pdf']
  - `total_pages`: INTEGER Min: 83, Max: 83
  - `page`: IN

In [7]:

# 1. Initialize LLM + transformer
# The chat model is handed to the graph transformer, which later turns
# document chunks into graph documents.
llm = ChatOpenAI(
    model_name="gpt-5-mini",
    temperature=0,
)
llm_transformer = LLMGraphTransformer(llm=llm)

# 2. Define your PDF list
# Only the first file is active; the other entries are kept commented out
# so they can be re-enabled one at a time.
pdf_files = [
    "pdfs/1. Copywriting Swipe Files (Alen).pdf",
    # "pdfs/2. Copywriting Swipe Files (Alen).pdf",
    # "pdfs/3. Copywriting Swipe Files (Alen).pdf",
    # "pdfs/4. FB Posts (Alen).pdf",
    # "pdfs/5. FB Posts (Alen).pdf",
]

# 3. Define splitter
# NOTE: no custom length function is supplied, so both sizes below are
# measured in characters (the splitter's default), not tokens.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,  # text shared by neighbouring chunks to keep context
    chunk_size=1000,    # maximum length of a single chunk
)

# 4. Process each PDF separately
for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    pages = loader.load()

    # Split into chunks (Recursive handles pages seamlessly)
    chunks = splitter.split_documents(pages)
    print(f"{pdf} → {len(chunks)} chunks")


    # Convert chunks into graph docs
    graph_documents = []
    for i, chunk in enumerate(chunks, start=1):  # start=1 makes it 1-based index
        chunk_doc = [Document(
            page_content=chunk.page_content,
            metadata={"source": pdf, **chunk.metadata}  # preserve page info
        )]
        chunk_graph_docs = await llm_transformer.aconvert_to_graph_documents(chunk_doc)
        print(f"Converted chunk No.{i} out of {len(chunks)} to {len(chunk_graph_docs)} graph docs")
        graph_documents.extend(chunk_graph_docs)

        if i == 10:  # Limit to first 10 chunks for testing
            print("Reached 10 chunks, stopping further processing for this PDF.")
            break

    # Push into graph
    print(f"pushing {len(graph_documents)} graph docs to Neo4j...")
    graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

    print(f"Finished processing {pdf}")

pdfs/1. Copywriting Swipe Files (Alen).pdf → 154 chunks
Converted chunk No.1 out of 154 to 1 graph docs
Converted chunk No.2 out of 154 to 1 graph docs
Converted chunk No.3 out of 154 to 1 graph docs
Converted chunk No.4 out of 154 to 1 graph docs
Converted chunk No.5 out of 154 to 1 graph docs
Converted chunk No.6 out of 154 to 1 graph docs
Converted chunk No.7 out of 154 to 1 graph docs
Converted chunk No.8 out of 154 to 1 graph docs
Converted chunk No.9 out of 154 to 1 graph docs
Converted chunk No.10 out of 154 to 1 graph docs
Reached 10 chunks, stopping further processing for this PDF.
pushing 10 graph docs to Neo4j...
Finished processing pdfs/1. Copywriting Swipe Files (Alen).pdf


In [None]:
# --- Disabled example (entire cell is commented out; nothing here runs) ---
# Reference snippet: converts one hand-written paragraph into graph documents
# and writes them to the graph. Uncomment every line below to run it.
# from langchain_core.documents import Document

# llm = ChatOpenAI(temperature=0, model_name="gpt-5")
# llm_transformer = LLMGraphTransformer(llm=llm)

# text = """
# Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
# She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
# Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
# She was, in 1906, the first woman to become a professor at the University of Paris.
# """
# documents = [Document(page_content=text)]
# graph_documents = await llm_transformer.aconvert_to_graph_documents(documents)
# graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

In [8]:
# Refresh the schema held on the graph object so the text printed below
# includes whatever the ingestion cell added.
graph.refresh_schema()
# sep="" keeps the label and the schema text directly adjacent.
print("Graph Schema:", graph.schema, sep="")

Graph Schema:Node properties:
- **Document**
  - `id`: STRING Example: "9b2e3f806199be76c6b831ad6b4639be"
  - `text`: STRING Available options: ['Alen  Sultanic    FF  Swipe  File  Part  1       C', 'it   becomes   a   shortcut   to   ___________    ', 'And  it  has  nothing  to  do  with  your  _______', 'All   that   and   more   will   happen   for   yo', 'you   know   how   to   ________________   …     Y', '______________     And   Im   not   the   only   o', 'TIMELINE  LANGUAGE   First  you’ll___,  then  once', 'So   after   you   know   how   to   ______,   ___', 'ASSUMPTIVE  QUESTIONS   They  always  open  with  ', "OPENERS    The  Two  Types  of  Coaches   There's "]
  - `creator`: STRING Available options: ['PyPDF']
  - `creationdate`: STRING Available options: ['']
  - `producer`: STRING Available options: ['Skia/PDF m134 Google Docs Renderer']
  - `source`: STRING Available options: ['pdfs/1. Copywriting Swipe Files (Alen).pdf']
  - `total_pages`: INTEGER Min: 83, Max: 83
 