## Introduction
This script, PDF2Graph, automates the process of extracting text from PDF documents, constructing a knowledge graph, and uploading it to a Neo4j graph database. The workflow includes splitting large PDF files, parsing them to extract relevant information, creating a property graph using the extracted data, and storing the graph in Neo4j for easy querying and analysis. This tool is ideal for transforming unstructured PDF content into structured graph data for further semantic search, reasoning, or visualization.



## Setup (Installs, Data, Models)

In [4]:
# !pip install llama-index
# !pip install llama-index-core==0.10.42
# !pip install llama-index-embeddings-openai
# !pip install llama-index-postprocessor-flag-embedding-reranker
# !pip install git+https://github.com/FlagOpen/FlagEmbedding.git
# !pip install llama-index-graph-stores-neo4j
# !pip install llama-parse


In [6]:
import nest_asyncio

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop and the parsing / indexing calls below drive async code — confirm.
nest_asyncio.apply()

In [1]:
import os
from getpass import getpass

# API access to llama-cloud.
# Do not hard-code the key in the notebook: reuse a key that is already
# exported in the environment and only prompt (without echoing) when none is
# set. Assigning "" unconditionally would overwrite a valid exported key.
if not os.environ.get("LLAMA_CLOUD_API_KEY"):
    os.environ["LLAMA_CLOUD_API_KEY"] = getpass(
        "Please enter your LlamaCloud API key: "
    )

#### Setup Model

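Here we use gpt-4o as the default LLM and OpenAI's `text-embedding-3-small` as the default embedding model. (The knowledge-graph extraction step further below configures its own LLM.)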

In [2]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Chat model and embedding model used by this notebook.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Register both on llama-index's global Settings object.
# NOTE(review): presumably components that are not given an explicit
# llm / embed_model fall back to these — confirm.
Settings.llm = llm
Settings.embed_model = embed_model

#### Load Data


In [3]:
# Source PDF to convert into a knowledge graph.
pdf_path = "D:/LLM project/test data/20190118_Rev003_Navy OH Foundation Report.pdf"

# Identifier derived from the PDF: base name without extension and spaces.
# os.path.splitext removes only the trailing extension (in any letter case),
# whereas str.replace(".pdf", "") would also delete ".pdf" in the middle of a
# name and would miss ".PDF".
pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0].replace(" ", "")

# Combined pickle of all parsed documents, stored in the "split_pdfs" folder.
pkl_filename = os.path.join("split_pdfs", pdf_filename + ".pkl")

In [7]:
from llama_parse import LlamaParse

# Parse the whole PDF in one LlamaParse call, requesting plain-text output.
# NOTE(review): the split-and-parse cell further down rebinds `parsed_docs`,
# so this call looks redundant when both cells are run — confirm.
parsed_docs = LlamaParse(result_type="text").load_data(pdf_path)

Started parsing the file under job_id cdedc71b-12ad-41ac-8d11-3d44b45a9c6d


In [2]:
"""
PDF Splitting and Parsing Automation

Description:
This script automates the process of splitting a large PDF file into smaller parts, 
parsing each part using LlamaParse to extract textual content, and saving the parsed 
data as serialized pickle (.pkl) files. Finally, it combines all parsed segments into 
a single .pkl file for downstream use, such as semantic search, vectorization, or RAG-based applications.

Dependencies:
- PyMuPDF (fitz)
- llama_parse
- pickle
- os

Usage:
Define `pdf_path`, `output_dir`, and `pkl_filename`, then run the `main()` function.
"""

import fitz 
import os
from llama_parse import LlamaParse
import pickle

def split_pdf(pdf_path, output_dir, num_splits=5):
    """Split a PDF into consecutive parts and save each part as its own file.

    Pages are divided evenly between the parts; any remainder goes to the last
    part. Parts are written to ``output_dir`` as ``split_1.pdf``,
    ``split_2.pdf``, ...

    Args:
        pdf_path: Path of the PDF to split.
        output_dir: Directory that receives the part files (created if missing).
        num_splits: Requested number of parts. If the PDF has fewer pages than
            this, one part per page is produced instead so no part is empty.

    Returns:
        Paths of the part files in page order; empty if the PDF has no pages.

    Raises:
        ValueError: If ``num_splits`` is less than 1.
    """
    if num_splits < 1:
        raise ValueError(f"num_splits must be at least 1, got {num_splits}")

    os.makedirs(output_dir, exist_ok=True)
    pdf_parts = []

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        # Never create more parts than pages: with pages_per_split == 0 every
        # part but the last would be an empty document, which cannot be saved.
        num_splits = min(num_splits, total_pages)
        if num_splits == 0:
            return pdf_parts
        pages_per_split = total_pages // num_splits

        for i in range(num_splits):
            start_page = i * pages_per_split
            # The last part also takes the pages left over by integer division.
            end_page = (i + 1) * pages_per_split if i < num_splits - 1 else total_pages

            split_pdf_path = os.path.join(output_dir, f"split_{i + 1}.pdf")
            new_doc = fitz.open()
            try:
                # Copy the whole page range in one call; to_page is inclusive.
                new_doc.insert_pdf(doc, from_page=start_page, to_page=end_page - 1)
                new_doc.save(split_pdf_path)
            finally:
                new_doc.close()
            pdf_parts.append(split_pdf_path)
    finally:
        # Release the source document even if a part fails to save.
        doc.close()

    return pdf_parts

def parse_and_save_pdfs(pdf_list, output_dir):
    """Parse each PDF with LlamaParse and pickle the result into ``output_dir``.

    The result for the i-th PDF (1-based) is written to ``parsed_<i>.pkl``.
    A PDF whose parsing or saving raises is reported on stdout and skipped, so
    the remaining files are still processed.

    Args:
        pdf_list: Paths of the PDF files to parse, in order.
        output_dir: Directory that receives the pickle files.

    Returns:
        Paths of the pickle files that were written successfully.
    """
    text_parser = LlamaParse(result_type="text")
    written = []

    for position, pdf_file in enumerate(pdf_list, start=1):
        try:
            parsed = text_parser.load_data(pdf_file)
            pickle_path = os.path.join(output_dir, f"parsed_{position}.pkl")
            with open(pickle_path, "wb") as sink:
                pickle.dump(parsed, sink)
        except Exception as err:
            # Best effort: report the failure and carry on with the next PDF.
            print(f"Error parsing {pdf_file}: {err}")
        else:
            written.append(pickle_path)

    return written

def combine_parsed_files(file_list, output_file):
    """Merge several pickled document lists into a single pickle file.

    Args:
        file_list: Paths of pickle files, each holding an iterable of documents.
        output_file: Path of the pickle file that receives the merged list.

    The merged list keeps the order of ``file_list``; an empty ``file_list``
    produces a pickle of an empty list.
    """
    merged = []
    for pickle_path in file_list:
        # Only load pickles produced by this pipeline; pickle is unsafe on
        # untrusted files.
        with open(pickle_path, "rb") as source:
            merged += pickle.load(source)

    with open(output_file, "wb") as sink:
        pickle.dump(merged, sink)

def main(pdf_path, output_dir, pkl_filename, num_splits=5):
    """Run the split -> parse -> combine pipeline for one PDF.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Working directory for the split PDFs and the per-part
            pickle files; created if missing.
        pkl_filename: Path of the combined pickle file. It is used exactly as
            given and is NOT joined onto ``output_dir``.
        num_splits: Number of parts the PDF is split into before parsing
            (default 5, the value that used to be hard-coded).
    """
    os.makedirs(output_dir, exist_ok=True)

    # The combined pickle may live outside output_dir; make sure its folder
    # exists so the final write does not fail after all the parsing work.
    pkl_dir = os.path.dirname(pkl_filename)
    if pkl_dir:
        os.makedirs(pkl_dir, exist_ok=True)

    pdf_parts = split_pdf(pdf_path, output_dir, num_splits=num_splits)
    parsed_files = parse_and_save_pdfs(pdf_parts, output_dir)
    combine_parsed_files(parsed_files, pkl_filename)


output_dir = "split_pdfs"
main(pdf_path, output_dir, pkl_filename)

# main() writes the combined pickle to pkl_filename exactly as given, and
# pkl_filename already contains the "split_pdfs" folder, so open it directly.
# Joining it onto output_dir again would look for
# split_pdfs/split_pdfs/<name>.pkl, which does not exist.
# The pickle was produced by this pipeline; never unpickle untrusted files.
with open(pkl_filename, "rb") as f:
    parsed_docs = pickle.load(f)

In [8]:
# Sanity check on what was loaded from the combined pickle.
print(f"parsed_docs type: {type(parsed_docs)}")  # should be list

parsed_docs type: <class 'list'>


In [1]:
import pickle
# Show which combined pickle is about to be loaded.
print(pkl_filename)
# Reload the parsed documents from disk (only unpickle files this notebook wrote).
with open(pkl_filename, "rb") as f:
    parsed_docs= pickle.load(f)

# Preview: the slice keeps at most the first 50 entries of parsed_docs.
print(parsed_docs[:50]) 
print(pkl_filename)

In [10]:
from copy import deepcopy
from llama_index.core.schema import TextNode, Document
from llama_index.core import VectorStoreIndex


def get_sub_docs(docs):
    """Break each document into one Document per ``"\\n---\\n"``-separated chunk.

    Args:
        docs: Documents exposing ``text`` and ``metadata``.

    Returns:
        A flat list of new Documents in input order; each one carries one chunk
        of text and its own deep copy of the parent document's metadata.
    """
    return [
        Document(text=chunk_text, metadata=deepcopy(parent.metadata))
        for parent in docs
        for chunk_text in parent.text.split("\n---\n")
    ]

In [13]:
# Split every parsed document on the "\n---\n" separator so each resulting
# Document holds one chunk (presumably one PDF page — confirm against the
# parser's output format).
sub_docs = get_sub_docs(parsed_docs)

#### Initialize Graph Store

Here we use Neo4j as the graph store.

In [3]:
from getpass import getpass

from llama_index.graph_stores.neo4j import Neo4jPGStore

# Read the Neo4j password without echoing it: input() would leave the
# password visible in the notebook output. The prompt text is unchanged.
password = getpass("Please enter your password for Neo4j: ")

# Property-graph store for the local Neo4j instance; this is the bridge
# between the Python notebook and the database.
graph_store = Neo4jPGStore(
    username="neo4j",
    password=password,
    url="bolt://localhost:7689",
)
# No separate vector store is set here.
vec_store = None

## Construct Knowledge Graph


In [15]:
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.neo4j import Neo4jPGStore
from llama_index.core import PropertyGraphIndex
from llama_index.core.schema import Document
from typing import Dict, List

In [20]:
class MultiGraphStore:
    """Registry of Neo4j property-graph stores keyed by graph name.

    Every store is created against the default ``neo4j`` database with the
    same credentials, so a graph name only identifies an entry in this
    registry.
    NOTE(review): since all entries point at one database, clearing one graph
    presumably also affects data written under other names — confirm.
    """

    def __init__(self, neo4j_config: Dict[str, str]):
        """Remember the connection settings and start with no stores.

        ``neo4j_config`` must provide the keys ``username``, ``password`` and
        ``url``; they are read when a store is set up.
        """
        self.neo4j_config = neo4j_config
        self.graphs = {}

    def setup_graph(self, graph_name: str):
        """Create and register a store for ``graph_name`` unless one exists."""
        if graph_name in self.graphs:
            print(f"Graph {graph_name} already exists")
            return

        settings = self.neo4j_config
        self.graphs[graph_name] = Neo4jPGStore(
            username=settings["username"],
            password=settings["password"],
            url=settings["url"],
            database="neo4j",  # Only use the default database
        )
        print(f"Graph {graph_name} setup successfully")

    def get_graph(self, graph_name: str):
        """Return the store registered as ``graph_name``, or None if absent."""
        return self.graphs.get(graph_name)

    def clear_graph(self, graph_name: str):
        """Clear the store registered as ``graph_name``; report unknown names."""
        if graph_name not in self.graphs:
            print(f"Graph {graph_name} does not exist")
            return

        # NOTE(review): relies on the store object exposing clear() — confirm.
        self.graphs[graph_name].clear()
        print(f"Graph {graph_name} cleared")

    def clear_all_graphs(self):
        """Clear every registered store, in registration order."""
        for name, store in self.graphs.items():
            store.clear()
            print(f"Graph {name} cleared")


In [18]:
def create_knowledge_graph_for_pdf(
    file_name: str, 
    processed_docs: List[Document], 
    neo4j_config: Dict
) -> PropertyGraphIndex:
    """
    Build a property-graph index from the documents and store it in Neo4j.

    The graph name is ``file_name`` reduced to letters, digits and
    underscores. It is written into every document's metadata under the key
    ``graph_name``; the documents are modified in place.

    Args:
        file_name (str): The name of the PDF file, used to name the graph.
        processed_docs (List[Document]): Documents to index.
        neo4j_config (Dict): Neo4j connection settings with the keys
            ``username``, ``password`` and ``url``.

    Returns:
        PropertyGraphIndex: The index built over ``processed_docs``.
    """
    store_registry = MultiGraphStore(neo4j_config)

    # Drop every character that is neither alphanumeric nor an underscore.
    safe_name = "".join(
        filter(lambda ch: ch.isalnum() or ch == "_", file_name)
    )

    # Register a Neo4j store under the sanitized name and fetch it back.
    store_registry.setup_graph(safe_name)
    pg_store = store_registry.get_graph(safe_name)

    # Tag each document with the graph it belongs to.
    for document in processed_docs:
        document.metadata = document.metadata or {}
        document.metadata["graph_name"] = safe_name

    embedder = OpenAIEmbedding(model_name="text-embedding-3-small")
    path_extractors = [
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ]

    index = PropertyGraphIndex.from_documents(
        processed_docs,
        embed_model=embedder,
        kg_extractors=path_extractors,
        property_graph_store=pg_store,
        show_progress=True,
    )

    print(f"Successfully created knowledge graph for {file_name}")
    return index


In [21]:

# Connection settings passed to create_knowledge_graph_for_pdf.
# `password` is the value entered at the Neo4j prompt earlier in the notebook.
neo4j_config = {
    "username": "neo4j",
    "password": password,
    "url": "bolt://localhost:7689",  
}

# Create and add a knowledge graph to Neo4j
index = create_knowledge_graph_for_pdf(pdf_filename, sub_docs, neo4j_config)


Graph Attentionisallyouneed setup successfully


Parsing nodes:   0%|          | 0/11 [00:00<?, ?it/s]

Extracting implicit paths: 100%|██████████| 13/13 [00:00<?, ?it/s]
Extracting paths from text: 100%|██████████| 13/13 [00:07<00:00,  1.69it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Generating embeddings: 100%|██████████| 3/3 [00:04<00:00,  1.63s/it]


Successfully created knowledge graph for Attentionisallyouneed
