### Extract Text

In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    """Read a PDF with PyMuPDF and return the stripped text of every page.

    Args:
        file_path: Path of the PDF to open; passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order:
        ``{"page_number": <1-based page index>, "text": <stripped page text>}``.
    """
    doc = fitz.open(file_path)
    try:
        return [
            {
                # Pages are loaded by 0-based index but reported 1-based.
                "page_number": page_index + 1,
                "text": doc.load_page(page_index).get_text().strip(),
            }
            for page_index in range(len(doc))
        ]
    finally:
        # Release the document even if a page fails to load or parse.
        doc.close()

# Extract the per-page text of the source PDF; the result is kept in `pages`
# (nothing is printed by this cell).
pdf_path = "./metformin1.pdf"
pages = extract_text_from_pdf(pdf_path)

### Extract Entities

In [2]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; `extract_entities` reuses this
# module-level `nlp` object for every call.
nlp = spacy.load("en_core_web_sm")


In [3]:
# Function to extract entities from text
def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and list the entities it reports.

    Returns:
        One ``{"text": ..., "label": ...}`` dict per span in ``doc.ents``,
        in the order the pipeline yields them.
    """
    return [
        {"text": span.text, "label": span.label_}
        for span in nlp(text).ents
    ]

In [91]:
# One record per page: the page number plus the entities found in its text.
all_entities = [
    {
        "page_number": page["page_number"],
        "entities": extract_entities(page["text"]),
    }
    for page in pages
]
len(all_entities)

35

### Building and storing your FAISS index

In [5]:
# from sentence_transformers import SentenceTransformer
# import faiss
# import numpy as np
# import json
# import pickle

# # 1. Load the embedding model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # 2. Flatten entities and prepare metadata
# unique_entities = {}
# metadata = []
# embedding_inputs = []

# for item in all_entities:
#     page = item["page_number"]
#     for ent in item["entities"]:
#         text = ent["text"].strip()
#         key = text.lower()

#         if key not in unique_entities:
#             unique_entities[key] = {
#                 "text": text,
#                 "label": ent["label"],
#                 "pages": [page]
#             }
#             embedding_inputs.append(text)
#         else:
#             unique_entities[key]["pages"].append(page)

# # 3. Compute embeddings
# embeddings = model.encode(embedding_inputs, convert_to_numpy=True)

# # 4. Create FAISS index
# dim = embeddings.shape[1]
# index = faiss.IndexFlatL2(dim)
# index.add(embeddings)

# # 5. Save metadata alongside index
# metadata = list(unique_entities.values())

# # Save FAISS index
# faiss.write_index(index, "entity_index.faiss")

# # Save metadata to disk
# with open("entity_metadata.pkl", "wb") as f:
#     pickle.dump(metadata, f)

# print(f"Stored {len(metadata)} unique entities into FAISS.")

### Ollama Function

In [37]:
import requests

def ask_ollama(
    prompt,
    model="llama3",
    url="http://home-pc.tail4924f5.ts.net:11434/api/generate",
    timeout=300,
):
    """Send ``prompt`` to an Ollama generate endpoint and return the reply text.

    Args:
        prompt: Prompt text forwarded to the model.
        model: Name of the model to query.
        url: Full URL of the generate endpoint.
        timeout: Seconds to wait for the server. ``requests`` waits
            indefinitely when no timeout is given, so an unreachable host
            would otherwise hang the notebook cell.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the JSON reply has no ``"response"`` field.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }

    # `json=` serialises the payload and sets the JSON Content-Type header,
    # so no explicit headers dict is needed.
    response = requests.post(url, json=payload, timeout=timeout)
    response.raise_for_status()

    return response.json()["response"]

In [38]:
# Smoke test: send a trivial prompt to confirm the Ollama endpoint responds.
ask_ollama("Hello")

"Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"

## Building Cypher queries for Neo4j

In [None]:
# --- Neo4j Aura connection ---
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so credentials never have
# to be typed into (and saved with) the notebook. An unset variable falls back
# to the empty string.
NEO4J_URI = os.environ.get("NEO4J_URI", "")
NEO4J_USER = os.environ.get("NEO4J_USER", "")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Shared driver used by every Neo4j helper defined in this notebook.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))


In [40]:
def test_connection():
    """Run a trivial query on the shared ``driver`` and print the outcome.

    Prints the message returned by the server on success. Any exception is
    caught and reported on stdout instead of being raised.
    """
    try:
        with driver.session() as neo4j_session:
            record = neo4j_session.run(
                "RETURN 'Neo4j connection successful!' AS message"
            ).single()
            print(record["message"])
    except Exception as error:
        print("❌ Failed to connect to Neo4j:", error)

In [41]:
def run_cypher_query(query, parameters=None):
    """Execute one Cypher statement on the shared Neo4j ``driver``.

    Args:
        query: Cypher text to run.
        parameters: Optional mapping of query parameters forwarded to
            ``session.run``. Defaults to ``None`` (no parameters).

    Returns:
        None. The result is drained and discarded.

    Raises:
        Any exception the driver raises while running the statement.
    """
    with driver.session() as session:
        # Drain the result inside the `with` block so a failure during
        # execution is raised here, rather than depending on how the driver
        # treats an unconsumed result when the session closes.
        session.run(query, parameters).consume()

In [90]:
import re

# --- Sanitization function ---
# Matches either a single-line quoted string literal (captured, so it can be
# kept verbatim) or a `//` / `#` comment running to the end of its line.
_STRING_OR_COMMENT = re.compile(
    r"""("(?:\\.|[^"\\\n])*"|'(?:\\.|[^'\\\n])*')|//[^\n]*|#[^\n]*"""
)


def _strip_cypher_comments(query: str) -> str:
    """Drop `//` and `#` line comments while leaving quoted strings untouched."""
    return _STRING_OR_COMMENT.sub(lambda match: match.group(1) or "", query)


def sanitize_cypher(raw_response: str) -> str:
    """Clean an LLM reply so that only the Cypher text remains.

    Steps, in order:
      1. Remove Markdown code fences (three backticks, optionally followed by
         "cypher", case-insensitive).
      2. Remove `//` and `#` line comments. Quoted string literals are
         skipped, so values such as "http://host/#top" or "C#" survive.
      3. Collapse an empty property map on a relationship, turning
         `[:TYPE {}]` into `[:TYPE]`.
      4. Remove any remaining empty `{}`.
      5. Rewrite `CREATE (x:Label {...}), (x` so the second mention of `x`
         starts a `MATCH` clause on a new line.

    Args:
        raw_response: Text returned by the model.

    Returns:
        The cleaned query with surrounding whitespace stripped. This may be an
        empty string when the reply held nothing but fences or comments.
    """
    query = re.sub(r"```(?:cypher)?", "", raw_response, flags=re.IGNORECASE).strip()
    query = _strip_cypher_comments(query)
    # The relationship rule must run before the generic `{}` removal;
    # otherwise the braces are already gone and this rule can never match.
    query = re.sub(r"\[\s*:(\w+)\s*\{\s*\}\s*\]", r"[:\1]", query)
    query = re.sub(r"\{\s*\}", "", query)  # Remove empty {}
    # NOTE(review): Cypher normally needs a WITH between CREATE and MATCH;
    # confirm that this rewrite produces runnable queries.
    query = re.sub(r"CREATE\s+\((\w+):(\w+)\s*\{(.*?)\}\),\s*\(\1",
                   r"CREATE (\1:\2 {\3})\nMATCH (\1", query)  # Avoid duplicate creates
    return query.strip()

# --- Base prompt ---
# Instruction template sent to the LLM for every entity pair. The literal
# markers <<entity1>> and <<entity2>> are placeholders: the entity loop swaps
# them for the entity texts with str.replace before calling ask_ollama.
base_prompt = """
You are an expert at writing Neo4j Cypher queries.
I want to build out my knowledge graph.
Whenever I ask you to create relationships, assume the nodes may NOT already exist — you must create them along with the relationship.

STRICT RULES:
1. Output MUST be a single valid Cypher `CREATE` query **and nothing else** — no explanations, no comments, no blank lines before or after, no extra text, no code fences.
   - The query must end immediately after the final semicolon (or end of query if no semicolon is used).
   - Never output acknowledgements like "Sure" or "Let me know if you need any further assistance".
2. Use exactly ONE CREATE statement that includes all nodes and the relationship in a single query.
   - If creating multiple nodes, list them in the SAME CREATE clause separated by commas and linked by the relationship.
3. NEVER produce more than one Cypher statement per query.
4. Always use correct Cypher node syntax:
   (variableName:Label {propertyKey: propertyValue, ...})
5. Variable names:
   - Must be derived from the entity name by:
       * Replacing spaces with underscores
       * Removing all characters except letters, numbers, and underscores
       * If it starts with a number or any non-letter character, prefix with 'n'
       * If it becomes empty after sanitization, use 'entity'
   - Must reflect the sanitized entity name — do not use generic n1/n2 unless the entity is literally numeric.
   - Examples: "750" → n750, "750 Entity" → n750_Entity, "2mg Dose" → n2mg_Dose
6. Node labels:
   - Must come immediately after the colon, before the `{}`.
   - If entity type is unknown, use label `Entity`.
7. Property values:
   - Must be valid strings, numbers, or booleans.
   - Must always include `text: "<original entity text exactly as given>"`.
   - Never leave them blank. If value unknown, omit property entirely.
   - NEVER use parameter placeholders like `$id`, `$name`, `$value` — always hardcode the actual value or omit the property.
8. Relationship properties:
   - Must have valid values.
   - NEVER produce `{}`, `{-}`, or `{   }` in relationships.
   - If no valid property exists, omit the `{}` entirely (e.g., `-[:RELATES_TO]->`).
9. Always ensure relationship direction makes semantic sense from <<entity1>> to <<entity2>>.
10. All Cypher identifiers (variable names, labels, relationship types, property keys) must:
    - Start with a letter or underscore (NEVER a number)
    - Contain only letters, numbers, and underscores
    - No spaces or special characters

Example of valid syntax:
CREATE (daily:Entity {text: "daily"})-[:HAS_VALUE]->(n2550:Entity {text: "2550"});

Invalid syntax examples to AVOID:
❌ CREATE (daily:Entity {text: "daily"}),(2550:Entity {text: "2550"})  # Variable starts with a number
❌ CREATE (750_Entity:Entity {text: "750"})                            # Must be n750_Entity
❌ CREATE (a:Label) CREATE (b:Label)                                  # Multiple CREATE statements
❌ (CARDINAL: {text: "2"})                                            # Missing label after colon
❌ -[:RELATION {prop: }]->                                            # Empty property value
❌ CREATE (a:Label {id: $id})                                         # Parameter placeholders not allowed
❌ CREATE (...) ... <extra text>                                      # No trailing explanations allowed

---

Task:
Give me the Cypher query to create both nodes and the relationship between <<entity1>> and <<entity2>> following the above rules.
Output ONLY the Cypher query exactly, with nothing else before or after.
"""

# --- Loop over entities ---
# For each page, ask the LLM for a Cypher statement linking the page's first
# two entities, sanitize the reply, and run it against Neo4j.
for item in all_entities:
    entities = item["entities"]
    if len(entities) < 2:
        # A relationship needs two endpoints; nothing to link on this page.
        continue

    entity1 = entities[0]["text"]
    entity2 = entities[1]["text"]

    # Prepare prompt for Ollama
    prompt = base_prompt.replace("<<entity1>>", entity1).replace("<<entity2>>", entity2)

    # Get raw Cypher from Ollama
    raw_cypher_query = ask_ollama(prompt)

    # Sanitize
    cypher_query = sanitize_cypher(raw_cypher_query)

    print(f"Raw from Ollama: {raw_cypher_query}")
    print(f"Sanitized Cypher: {cypher_query}")

    if not cypher_query:
        # Sanitizing can reduce a reply to nothing (e.g. only fences or
        # comments); there is no statement to send to the database.
        print(f"Skipping page {item['page_number']}: nothing left to execute after sanitizing.")
        continue

    print(f"Executing Cypher: {cypher_query}")
    run_cypher_query(cypher_query)

# The shared `driver` is deliberately NOT closed here: closing it in this cell
# made every re-run of the cell operate on an already-closed driver. Call
# `driver.close()` once, when the notebook session is finished.


Raw from Ollama: CREATE (n2:Entity {text: "2"})-[:RELATES_TO]->(Metformin:Entity {text: "Metformin"});
Sanitized Cypher: CREATE (n2:Entity {text: "2"})-[:RELATES_TO]->(Metformin:Entity {text: "Metformin"});
Executing Cypher: CREATE (n2:Entity {text: "2"})-[:RELATES_TO]->(Metformin:Entity {text: "Metformin"});


  with driver.session() as session:


Raw from Ollama: CREATE (n750_Entity:Entity {text: "750"})-[:RELATES_TO]->(System_Components:Entity {text: "System Components"});
Sanitized Cypher: CREATE (n750_Entity:Entity {text: "750"})-[:RELATES_TO]->(System_Components:Entity {text: "System Components"});
Executing Cypher: CREATE (n750_Entity:Entity {text: "750"})-[:RELATES_TO]->(System_Components:Entity {text: "System Components"});
Raw from Ollama: CREATE (nCmax_Entity:Entity {text: "Cmax"})-[:RELATES_TO]->(n7hours_Entity:Entity {text: "7 hours"});
Sanitized Cypher: CREATE (nCmax_Entity:Entity {text: "Cmax"})-[:RELATES_TO]->(n7hours_Entity:Entity {text: "7 hours"});
Executing Cypher: CREATE (nCmax_Entity:Entity {text: "Cmax"})-[:RELATES_TO]->(n7hours_Entity:Entity {text: "7 hours"});
Raw from Ollama: CREATE (nTable_1:Entity {text: "Table 1"})-[:RELATES_TO]->(n3_5:Entity {text: "3.5"});
Sanitized Cypher: CREATE (nTable_1:Entity {text: "Table 1"})-[:RELATES_TO]->(n3_5:Entity {text: "3.5"});
Executing Cypher: CREATE (nTable_1:Entit