# Creating Knowledge Graphs from PDF Files

Reference Document: https://bratanic-tomaz.medium.com/constructing-knowledge-graphs-from-text-using-openai-functions-096a6d010c17

In [None]:
# =============================
# **1. Import Necessary Libraries**
# =============================

import os
import json
from datetime import datetime
from typing import List, Dict, Any, Optional
from functools import partial

from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.graphs import Neo4jGraph
from langchain.schema import Document
from langchain.chains import GraphCypherQAChain
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering.stuff_prompt import CHAT_PROMPT
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain_community.graphs.graph_document import Node, Relationship, GraphDocument
from neo4j import GraphDatabase

from pydantic import BaseModel, Field
from tqdm import tqdm
from dotenv import load_dotenv

# =============================
# **2. Setup Environment Variables**
# =============================

# Load environment variables from a .env file, if one is present
load_dotenv()

# Neo4j connection parameters (each can be overridden through the environment)
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")  # Update as needed

# Optional inline override for the OpenAI API key. Leave it empty to keep the
# value that load_dotenv() or the shell already placed in OPENAI_API_KEY;
# previously the empty string was written unconditionally, which erased that
# value and made the check below fail every time.
# (Hardcoding API keys in scripts is not recommended.)
api_key = ""
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')  # Ensure this is set in your environment

if not OPENAI_API_KEY:
    print("Error: OPENAI_API_KEY is not set in the environment variables.")
    raise ValueError("OPENAI_API_KEY is required.")

# Mirror the resolved key under the alternate variable name this script has
# always populated, without overwriting a value that is already there.
os.environ.setdefault("API_KEY_OPENAI", OPENAI_API_KEY)

# =============================
# **3. Initialize OpenAI Embeddings and LLM**
# =============================

# Embedding client, used further down for the vector store and the QA chain
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# GPT-4 chat model shared by the extraction chain and the QA chain
# (temperature can be adjusted as needed)
llm = ChatOpenAI(model='gpt-4', temperature=0, api_key=OPENAI_API_KEY)

# =============================
# **4. Define Utility Functions**
# =============================

def flush():
    """
    Run the garbage collector once to release unreferenced memory.
    """
    from gc import collect
    collect()

# Start from a clean slate before the heavier cells run
flush()

# =============================
# **5. Define KnowledgeGraph Model and Mapping Functions**
# =============================

# Output schema handed to create_structured_output_chain in
# get_extraction_chain: one list of nodes plus one list of relationships.
# Node and Relationship are the classes imported from
# langchain_community.graphs.graph_document.
# NOTE(review): the class docstring and the Field descriptions are presumably
# included in the schema sent to the model, so treat them as runtime text —
# confirm before rewording them.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # Required field (`...` means no default): the extracted entities
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    # Required field: the extracted relationships between nodes
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

def format_property_key(s: str) -> str:
    """
    Convert a string to camelCase for property keys.

    Whitespace-separated words are joined, e.g. "birth date" -> "birthDate".
    A key that is already camelCase ("birthDate") is returned unchanged
    instead of being flattened to "birthdate".

    Args:
        s: Raw property key.

    Returns:
        The camelCase key, or `s` itself when it contains no words
        (empty or whitespace-only input).
    """
    words = s.split()
    if not words:
        return s
    head, *tail = words
    if head.isupper():
        # An all-caps first word ("NAME", "URL") is lowered as a whole.
        head = head.lower()
    else:
        # Drop only the leading capital so interior capitals survive.
        head = head[0].lower() + head[1:]
    return head + "".join(word.capitalize() for word in tail)

def props_to_dict(props: Optional[Any]) -> dict:
    """
    Normalise node/relationship properties into a plain dictionary.

    Two input shapes are accepted:
      * a mapping of key -> value, or
      * an iterable of objects exposing `.key` and `.value`.

    The mapping case matters because iterating a dict yields its string
    keys; treating those as property objects is what produced the
    "'str' object has no attribute 'value'" errors in the recorded run.

    Args:
        props: Properties in either shape above, or None/empty.

    Returns:
        A new dict whose keys have been passed through format_property_key.
    """
    properties = {}
    if not props:
        return properties
    if isinstance(props, dict):
        pairs = props.items()
    else:
        pairs = ((p.key, p.value) for p in props)
    for key, value in pairs:
        # str() guards against non-string keys before camelCasing
        properties[format_property_key(str(key))] = value
    return properties

def map_to_base_node(node: Node) -> Node:
    """
    Build a normalised copy of an extracted node.

    String ids are title-cased, the type is capitalised, and properties are
    run through props_to_dict. When the id is a string and no `name`
    property exists, the title-cased id is stored as `name` (helps Cypher
    statement generation).
    """
    raw_props = node.properties
    props = props_to_dict(raw_props) if raw_props else {}

    id_is_text = isinstance(node.id, str)
    base_id = node.id.title() if id_is_text else node.id

    if id_is_text and 'name' not in props:
        props["name"] = base_id

    return Node(id=base_id, type=node.type.capitalize(), properties=props)

def map_to_base_relationship(rel: Relationship) -> Relationship:
    """
    Build a normalised copy of an extracted relationship.

    Both endpoints go through map_to_base_node, properties go through
    props_to_dict, and the relationship type is carried over unchanged.
    """
    # Dict entries are evaluated top to bottom: endpoints first, then properties
    fields = {
        "source": map_to_base_node(rel.source),
        "target": map_to_base_node(rel.target),
        "type": rel.type,
        "properties": props_to_dict(rel.properties) if rel.properties else {},
    }
    return Relationship(**fields)

# =============================
# **6. Define OpenAI Extraction Chain**
# =============================

from langchain.chains.openai_functions import create_structured_output_chain
from langchain.prompts import ChatPromptTemplate

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """
    Build the structured-output chain that extracts a KnowledgeGraph.

    Args:
        allowed_nodes: Node labels to list in the system prompt; omitted when
            None or empty.
        allowed_rels: Relationship types to list in the system prompt; omitted
            when None or empty.

    Returns:
        The chain produced by create_structured_output_chain for the
        KnowledgeGraph schema, the module-level `llm`, and the prompt below.
    """
    # Optional schema hints; each one renders as an empty line when unset
    node_rule = ""
    if allowed_nodes:
        node_rule = '- **Allowed Node Labels:** ' + ", ".join(allowed_nodes)
    rel_rule = ""
    if allowed_rels:
        rel_rule = '- **Allowed Relationship Types:** ' + ", ".join(allowed_rels)

    system_instructions = f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"Person"**. Avoid using more specific terms like "Mathematician" or "Scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{node_rule}
{rel_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Handling Unrelated Questions
- **Recognizing Irrelevance**: If the question does not pertain to the provided schema or cannot be answered using the knowledge graph, respond with a clear and concise message indicating the lack of context.
- **Example Response:** "I don't have the context to answer that question."
## 6. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """

    # `{input}` below is a template variable filled in when the chain runs
    messages = [
        ("system", system_instructions),
        ("user", "Use the given format to extract information from the following input: {input}"),
        ("user", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)


# =============================
# **7. Define Function to Extract and Store Graph**
# =============================

def extract_and_store_graph(
    document: Document,
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ) -> GraphDocument:
    """
    Extract a knowledge graph from one document and write it to Neo4j.

    Args:
        document (Document): Chunk whose `page_content` is sent to the
            extraction chain.
        allowed_nodes (Optional[List[str]]): Node labels passed on to
            get_extraction_chain.
        allowed_rels (Optional[List[str]]): Relationship types passed on to
            get_extraction_chain.

    Returns:
        GraphDocument: The normalised nodes and relationships that were
        stored through the module-level `graph`.

    Raises:
        Exception: Any failure is printed and then re-raised to the caller.
    """
    try:
        # Ask the model for a KnowledgeGraph describing this chunk
        chain = get_extraction_chain(allowed_nodes, allowed_rels)
        extracted = chain.invoke(document.page_content)['function']

        # Normalise ids, types and properties before storage
        base_nodes = [map_to_base_node(n) for n in extracted.nodes]
        base_rels = [map_to_base_relationship(r) for r in extracted.rels]
        kg_document = GraphDocument(
            nodes=base_nodes,
            relationships=base_rels,
            source=document,
        )

        print(f"Storing GraphDocument with {len(kg_document.nodes)} nodes and {len(kg_document.relationships)} relationships.")
        graph.add_graph_documents([kg_document])
    except Exception as e:
        print(f"Error extracting and storing graph: {e}")
        raise e
    else:
        return kg_document

# =============================
# **8. Initialize Neo4j Graph**
# =============================

# Neo4j handle used by extract_and_store_graph and by the QA chain
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# =============================
# **9. Load PDF and Split into Documents**
# =============================

# Location of the PDF to ingest. Set the PDF_PATH environment variable to use
# a different file; the default keeps the path this notebook was written with.
pdf_path = os.getenv(
    "PDF_PATH",
    "/Users/ishukalra/Documents/kaya/nlp-hands-on-projects/data/Lyft-Annual-Report-2021.pdf",
)

# Initialize PDF Loader
loader = PyPDFLoader(pdf_path)

# Timer for the whole load + extraction run (read again in the metadata cell)
start_time = datetime.now()

# Load and split the PDF into pages
pages = loader.load_and_split()

# Define chunking strategy: 200-token chunks with a 20-token overlap
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)

# Only take the first 4 pages of the document
documents = text_splitter.split_documents(pages[:4])

print(f"Number of documents after splitting: {len(documents)}")

# =============================
# **10. Iterate Over Documents and Extract Graph**
# =============================

# Accumulators read later by the metadata cell
distinct_nodes = set()
relations = []

# Example schema restrictions handed to every extraction call
node_labels = ['Person', 'Organization', 'Event', 'Location']
rel_types = ['WORKS_AT', 'PART_OF', 'LOCATED_IN']

progress = tqdm(enumerate(documents), total=len(documents), desc="Processing Documents")
for i, d in progress:
    try:
        graph_document = extract_and_store_graph(
            d,
            allowed_nodes=node_labels,
            allowed_rels=rel_types,
        )

        # Unique node ids across all chunks
        distinct_nodes.update(node.id for node in graph_document.nodes)

        # Every relationship type, duplicates kept
        relations.extend(relation.type for relation in graph_document.relationships)

    except Exception as e:
        # Best effort: report the failing chunk and carry on with the next one
        print(f"Error processing document {i}: {e}")

# =============================
# **11. Collect Metadata and Update JSON File**
# =============================

end_time = datetime.now()

LLM = "OpenAI-GPT-4"
file = loader.file_path
processed_time = str(end_time - start_time)

# Summary of this run, appended to the comparison file below
llm_data = {
    "LLM": LLM,
    "File": file,
    "Processing Time": processed_time,
    "Node count": len(distinct_nodes),
    "Relation count": len(relations),
    "Nodes": list(distinct_nodes),
    "Relations": relations,
}

json_file_path = '../data/llm_comparision.json'

# Make sure the target directory is there before reading or writing
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

try:
    # Start from an empty history unless a readable file already exists
    data = []
    if os.path.exists(json_file_path):
        with open(json_file_path, 'r') as json_file:
            try:
                loaded = json.load(json_file)
            except json.JSONDecodeError:
                print(f"Warning: The file {json_file_path} is corrupted. Overwriting with new data.")
            else:
                # A lone object is wrapped so the history is always a list
                data = [loaded] if isinstance(loaded, dict) else loaded

    # Add this run and write the whole history back
    data.append(llm_data)
    with open(json_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    print("Metadata updated successfully.")
except Exception as e:
    print(f"Failed to update metadata JSON file: {e}")


Number of documents after splitting: 9


Processing Documents:   0%|          | 0/9 [00:00<?, ?it/s]

Storing GraphDocument with 0 nodes and 0 relationships.


Processing Documents:  11%|█         | 1/9 [00:03<00:25,  3.17s/it]

Storing GraphDocument with 5 nodes and 2 relationships.


Processing Documents:  33%|███▎      | 3/9 [00:34<01:17, 12.96s/it]

Error extracting and storing graph: 'str' object has no attribute 'value'
Error processing document 2: 'str' object has no attribute 'value'


Processing Documents:  44%|████▍     | 4/9 [01:01<01:32, 18.59s/it]

Storing GraphDocument with 5 nodes and 4 relationships.


Processing Documents:  56%|█████▌    | 5/9 [01:04<00:51, 12.86s/it]

Storing GraphDocument with 1 nodes and 0 relationships.


Processing Documents:  67%|██████▋   | 6/9 [01:25<00:47, 15.70s/it]

Storing GraphDocument with 5 nodes and 4 relationships.


Processing Documents:  78%|███████▊  | 7/9 [01:39<00:30, 15.07s/it]

Error extracting and storing graph: 'str' object has no attribute 'value'
Error processing document 6: 'str' object has no attribute 'value'


Processing Documents:  89%|████████▉ | 8/9 [01:49<00:13, 13.38s/it]

Storing GraphDocument with 3 nodes and 2 relationships.


Processing Documents: 100%|██████████| 9/9 [01:50<00:00, 12.32s/it]

Storing GraphDocument with 0 nodes and 0 relationships.
Metadata updated successfully.





In [None]:

# =============================
# **12. Initialize OpenAI Embeddings and Neo4jVector**
# =============================

# Manually create the vector index with correct syntax
def create_vector_index(uri, user, password, index_name, embedding_dimension,
                        similarity_metric='cosine', if_not_exists=True):
    """
    Create a vector index on `(:Chunk).embedding` in Neo4j.

    Args:
        uri (str): Neo4j URI.
        user (str): Neo4j username.
        password (str): Neo4j password.
        index_name (str): Name of the vector index. Backticks in the name are
            escaped before it is placed in the statement.
        embedding_dimension (int): Dimension of the embedding vectors; must
            convert to a positive integer.
        similarity_metric (str): Similarity function name. Only letters,
            digits and underscores are accepted; Neo4j 5 documents 'cosine'
            and 'euclidean'.
        if_not_exists (bool): When True (default) the statement carries
            IF NOT EXISTS, so re-running it against an existing index is a
            no-op instead of an error.

    Raises:
        ValueError: If the dimension is not a positive integer or the metric
            contains characters outside letters, digits and underscores.
        Exception: Any driver/database error is printed and re-raised.
    """
    # Validate everything that is interpolated into the statement before a
    # connection is opened.
    dimension = int(embedding_dimension)
    if dimension <= 0:
        raise ValueError(f"embedding_dimension must be positive, got {embedding_dimension!r}")

    metric = str(similarity_metric)
    if not metric or not all(ch.isalnum() or ch == "_" for ch in metric):
        raise ValueError(f"Invalid similarity_metric: {similarity_metric!r}")

    # Inside a backtick-quoted identifier a backtick is escaped by doubling it
    safe_name = str(index_name).replace("`", "``")
    guard = "IF NOT EXISTS " if if_not_exists else ""

    cypher_query = (
        f"CREATE VECTOR INDEX `{safe_name}` {guard}"
        f"FOR (m:Chunk) "
        f"ON (m.embedding) "
        f"OPTIONS {{ "
        f"indexConfig: {{ "
        f"`vector.dimensions`: {dimension}, "
        f'`vector.similarity_function`: "{metric}" '
        f"}} "
        f"}}"
    )
    print("Executing Cypher Query:")
    print(cypher_query)

    try:
        # Context managers close the session first and the driver second
        with GraphDatabase.driver(uri, auth=(user, password)) as driver:
            with driver.session() as session:
                # consume() waits for the statement to finish before returning
                session.run(cypher_query).consume()
    except Exception as e:
        print(f"Error creating vector index `{index_name}`: {e}")
        raise

    if if_not_exists:
        print(f"Vector index `{index_name}` created successfully (or already existed).")
    else:
        print(f"Vector index `{index_name}` created successfully.")

# Settings shared by the manual index creation and the vector store below
vector_index_name = 'KG_Enhanced_QnA_Biomedical'  # underscores rather than hyphens
embedding_size = 1536  # hardcoded embedding dimension

# Create the vector index manually
create_vector_index(
    uri=NEO4J_URI,
    user=NEO4J_USER,
    password=NEO4J_PASSWORD,
    index_name=vector_index_name,
    embedding_dimension=embedding_size,
    similarity_metric='cosine',
)

# Embed the chunks and store them under the same index name.
# NOTE(review): this presumably reuses the index created above rather than
# creating a second one — confirm against Neo4jVector.from_documents.
try:
    neo4j_vector = Neo4jVector.from_documents(
        documents,
        embeddings,
        index_name=vector_index_name,
        url=NEO4J_URI,
        username=NEO4J_USER,
        password=NEO4J_PASSWORD,
    )
except Exception as e:
    print(f"Error initializing Neo4jVector: {e}")
    raise e
else:
    print("Neo4jVector initialized successfully.")


Executing Cypher Query:
CREATE VECTOR INDEX `KG_Enhanced_QnA_Biomedical` FOR (m:Chunk) ON (m.embedding) OPTIONS { indexConfig: { `vector.dimensions`: 1536, `vector.similarity_function`: "cosine" } }
Vector index `KG_Enhanced_QnA_Biomedical` created successfully.
Neo4jVector initialized successfully.


In [None]:

# =============================
# **13. Perform Similarity Search**
# =============================

query = "Hi, tell me about the Lyft revenue"
vector_results = neo4j_vector.similarity_search(query, k=2)

# Print each hit, with a blank line between hits but not after the last one
last_index = len(vector_results) - 1
for i, res in enumerate(vector_results):
    print(f"Result {i+1}:")
    print(res.page_content)
    if i < last_index:
        print()

# Keep the first hit's text for later use; empty string when nothing matched
if not vector_results:
    print("No vector results found.")
    vector_result = ""
else:
    vector_result = vector_results[0].page_content


Result 1:

 
Delaware 20-8809830
(State or other jurisdiction of
incorporation or organization)(I.R.S. Employer
Identification No.)
185 Berry Street,  Suite 5000
San Francisco , California 94107
(Address of principal executive offices) (Zip Code)
Registrant’s telephone number, including area code: ( 844) 250-2773  
Securities registered pursuant to Section 12(b) of the Act: 
Title of each classTrading
Symbol(s) Name of each exchange on which registered
Class A common stock, par value of $0.00001 per share LYFT Nasdaq Global Select Market
Securities registered pursuant to Section 12(g) of the Act: None 
Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐

Result 2:
UNITED STATES 
SECURITIES AND EXCHANGE COMMISSION 
Washington, D.C. 20549 
FORM 10-K /A 
(Amendment No. 1)
 
(Mark One)
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 20

In [None]:
from langchain.chains.base import Chain

# =============================
# **14. Define and Run QA Chain**
# =============================

# Cypher used by Neo4jVectorChainCustom to fetch context for a question.
# Parameters: $embedding (the query vector) and $k (number of hits requested).
# It queries the 'KG_Enhanced_QnA_Biomedical' vector index and returns each
# hit's `text` property as `result`, highest score first.
vector_search = """
WITH $embedding AS e
CALL db.index.vector.queryNodes('KG_Enhanced_QnA_Biomedical', $k, e) YIELD node, score
RETURN node.text AS result
ORDER BY score DESC
LIMIT $k
"""

# Define the custom Neo4jVectorChain
class Neo4jVectorChainCustom(Chain):
    """
    Question-answering chain backed by the Neo4j vector index.

    For each question it embeds the text, runs the module-level
    `vector_search` Cypher query through `graph`, and passes the retrieved
    passages to `qa_chain` as context.
    """
    graph: Neo4jGraph
    embeddings: OpenAIEmbeddings
    qa_chain: LLMChain
    input_key: str = "query"
    output_key: str = "result"
    # Number of passages requested from the vector index
    k: int = 3

    @property
    def input_keys(self) -> List[str]:
        """Single input: the question, stored under `input_key`."""
        return [self.input_key]

    @property
    def output_keys(self) -> List[str]:
        """Single output: the answer, stored under `output_key`."""
        return [self.output_key]

    def _call(self, inputs: Dict[str, str], run_manager=None) -> Dict[str, Any]:
        """
        Answer one question.

        Args:
            inputs: Mapping that holds the question under `input_key`.
            run_manager: Optional callback manager supplied by the Chain
                machinery; its child manager is passed on to `qa_chain`.

        Returns:
            A dict with the answer text under `output_key`.
        """
        question = inputs[self.input_key]
        embedding = self.embeddings.embed_query(question)

        rows = self.graph.query(
            vector_search,
            {'embedding': embedding, 'k': self.k}
        )
        # Join the passages into one text block. Passing the raw list would
        # put its Python repr (brackets, quotes, escaped newlines) into the
        # prompt. Rows without text are skipped.
        passages = [str(row['result']) for row in rows if row['result']]
        context = "\n\n".join(passages)

        callbacks = run_manager.get_child() if run_manager is not None else None
        result = self.qa_chain.invoke(
            {"question": question, "context": context},
            config={"callbacks": callbacks},
        )
        return {self.output_key: result[self.qa_chain.output_key]}

# Answer generation is delegated to an LLMChain built on CHAT_PROMPT
answer_chain = LLMChain(llm=llm, prompt=CHAT_PROMPT)
chain = Neo4jVectorChainCustom(
    graph=graph,
    embeddings=embeddings,
    qa_chain=answer_chain,
    verbose=True,
)

# Run the custom chain for QA
# NOTE(review): this question is about CRISPR while the indexed PDF is the
# Lyft annual report, so the recorded answer reports missing context —
# consider asking something the document covers.
question = "How can we enhance the specificity and efficiency of CRISPR/Cas9 gene-editing technology to minimize off-target effects and increase its potential for therapeutic applications?"
graph_result = chain.run(question)

print("QA Result:")
print(graph_result)

# =============================
# **15. Alternative: Using GraphCypherQAChain**
# =============================

# chain_alternative = GraphCypherQAChain.from_llm(
#     cypher_llm=llm,
#     qa_llm=llm,
#     graph=graph,
#     verbose=True,
#     return_intermediate_steps=True,
#     validate_cypher=True,
#     allow_dangerous_requests=True
# )

# graph_result_alternative = chain_alternative.run(question)

# print("Alternative QA Result:")
# print(graph_result_alternative)

# =============================
# **16. Final Clean-Up**
# =============================

# Close Neo4j driver if not already closed
# (Assuming Neo4jGraph manages the driver internally)
# If not, ensure to close it appropriately.
# graph.driver.close()  # Uncomment if necessary




[1m> Entering new Neo4jVectorChainCustom chain...[0m

[1m> Finished chain.[0m
QA Result:
I'm sorry, but the provided context does not contain information on how to enhance the specificity and efficiency of CRISPR/Cas9 gene-editing technology to minimize off-target effects and increase its potential for therapeutic applications.


In [None]:
!neo4j --version

5.24.0
