### Extract Text


In [3]:
import fitz  # PyMuPDF


def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    pages = []
    for page_number in range(len(doc)):
        page = doc.load_page(page_number)
        text = page.get_text().strip()
        pages.append({"page_number": page_number + 1, "text": text})
    doc.close()
    return pages


# Extract and print text
pdf_path = "./metformin1.pdf"
pages = extract_text_from_pdf(pdf_path)

### Extract Entities


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [5]:
# Function to extract entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append({"text": ent.text, "label": ent.label_})
    return entities

In [6]:
all_entities = []

for page in pages:
    entities = extract_entities(page["text"])
    all_entities.append({"page_number": page["page_number"], "entities": entities})

print(all_entities[0])

{'page_number': 1, 'entities': [{'text': '2', 'label': 'CARDINAL'}, {'text': 'Metformin', 'label': 'PERSON'}, {'text': 'C4H11N5 •', 'label': 'ORG'}, {'text': '165.63', 'label': 'CARDINAL'}, {'text': '12.4', 'label': 'CARDINAL'}, {'text': '1%', 'label': 'PERCENT'}, {'text': '6.68', 'label': 'CARDINAL'}, {'text': '500', 'label': 'CARDINAL'}, {'text': '850', 'label': 'CARDINAL'}, {'text': '1000', 'label': 'CARDINAL'}, {'text': '500', 'label': 'CARDINAL'}, {'text': '850', 'label': 'CARDINAL'}, {'text': '1000', 'label': 'CARDINAL'}, {'text': '500', 'label': 'CARDINAL'}, {'text': '750', 'label': 'CARDINAL'}, {'text': '500', 'label': 'CARDINAL'}, {'text': '4079189', 'label': 'CARDINAL'}, {'text': 'FDA', 'label': 'ORG'}, {'text': 'https://www.fda.gov/drugsatfda', 'label': 'ORG'}]}


In [7]:
import json


with open("entities.json", "w", encoding="utf-8") as f:
    json.dump(all_entities, f, ensure_ascii=False, indent=2)

### Unique labels


In [8]:
import json

# Load the JSON back in
with open("entities.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect all unique labels
unique_labels = {entity["label"] for page in data for entity in page["entities"]}

# Turn into a sorted list if you want them ordered
unique_labels = sorted(unique_labels)

print(unique_labels)

['CARDINAL', 'DATE', 'GPE', 'LAW', 'LOC', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']


### Building and storing your FAISS index


### Ollama Function


In [42]:
import requests


def ask_ollama(prompt):
    url = "http://home-pc.tail4924f5.ts.net:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    payload = {"model": "gpt-oss:20b", "prompt": prompt, "stream": False}

    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()

    return response.json()["response"]

In [10]:
ask_ollama("Hello who are you")

"Nice to meet you! I am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I'm not a human, but a computer program designed to simulate conversation, answer questions, and even generate text on a given topic or topic area. I'm constantly learning and improving my responses based on the interactions I have with users like you! What brings you here today?"

In [11]:
import json

# Load the uploaded entities.json file
with open("entities.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Create a set of unique (label, text) pairs
unique_entities = {
    (ent["label"], ent["text"]) for page in data for ent in page["entities"]
}

# Count unique entities
unique_count = len(unique_entities)
unique_count

569

### Dumping nodes in Neo4J


In [59]:
# --- 2. Neo4j Aura connection --
from neo4j import GraphDatabase

NEO4J_URI = "neo4j+s://033a4c34.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "yQ02-iiANgP8fMDa_Ndj6mq4DioXPbbqGdP2_JBqrLg"
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [12]:
def test_connection():
    try:
        with driver.session() as session:
            result = session.run("RETURN 'Neo4j connection successful!' AS message")
            print(result.single()["message"])
    except Exception as e:
        print("❌ Failed to connect to Neo4j:", e)


test_connection()

❌ Failed to connect to Neo4j: name 'driver' is not defined


In [126]:
import json

# --- 1. Load entities.json ---
with open("entities.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# --- 3. Collect unique (label, text) pairs ---
unique_entities = set()
for page in data:
    for ent in page["entities"]:
        label = ent["label"].strip()
        text = ent["text"].strip()
        unique_entities.add((label, text))


# --- 4. Create nodes in Neo4j ---
def create_nodes(tx, entities):
    for label, text in entities:
        # MERGE avoids duplicates if you re-run the script
        query = f"""
        MERGE (n:`{label}` {{text: $text}})
        """
        tx.run(query, text=text)


with driver.session() as session:
    session.write_transaction(create_nodes, unique_entities)

print(f"Inserted {len(unique_entities)} unique nodes.")

driver.close()

  session.write_transaction(create_nodes, unique_entities)


Inserted 568 unique nodes.


## Semantic Chunking


In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_pages(pages, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_chunks = []
    for page in pages:
        chunks = splitter.split_text(page["text"])
        all_chunks.extend(chunks)
    return all_chunks

In [14]:
chunks = chunk_pages(pages)
for i, chunk in enumerate(chunks, 1):
    print(f"--- Chunk {i} ---")
    print(len(chunk), chunk)
    print()

--- Chunk 1 ---
926 GLUCOPHAGE®  
(metformin hydrochloride) Tablets  
GLUCOPHAGE® XR  
(metformin hydrochloride) Extended-Release Tablets  
DESCRIPTION 
GLUCOPHAGE® (metformin hydrochloride) Tablets and GLUCOPHAGE® XR (metformin 
hydrochloride) Extended-Release Tablets are oral antihyperglycemic drugs used in the 
management of type 2 diabetes. Metformin hydrochloride (N,N-dimethylimidodicarbonimidic 
diamide hydrochloride) is not chemically or pharmacologically related to any other classes of 
oral antihyperglycemic agents. The structural formula is as shown: structural formula
Metformin hydrochloride is a white to off-white crystalline compound with a molecular formula 
of C4H11N5 • HCl and a molecular weight of 165.63. Metformin hydrochloride is freely soluble 
in water and is practically insoluble in acetone, ether, and chloroform. The pKa of metformin is 
12.4. The pH of a 1% aqueous solution of metformin hydrochloride is 6.68.

--- Chunk 2 ---
916 in water and is practically inso

### Making the embeddings of the the labels


In [15]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# --- 1. Load entities.json ---
with open("entities.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# --- 2. Extract unique labels ---
labels = sorted({ent["label"] for page in data for ent in page["entities"]})
print("Unique labels:", labels)

# --- 3. Load embedding model ---
model = SentenceTransformer("all-MiniLM-L6-v2")  # Small, fast model

# --- 4. Create embeddings ---
embeddings = model.encode(labels, convert_to_numpy=True, normalize_embeddings=True)

# --- 5. Create FAISS index ---
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# --- 6. Save FAISS index ---
faiss.write_index(index, "labels.index")

# --- 7. Also save mapping from index to labels ---
with open("labels_mapping.pkl", "wb") as f:
    pickle.dump(labels, f)

print("FAISS index and label mapping saved.")

  from .autonotebook import tqdm as notebook_tqdm


Unique labels: ['CARDINAL', 'DATE', 'GPE', 'LAW', 'LOC', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
FAISS index and label mapping saved.


In [16]:
import json
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# --- Load entities.json ---
with open("entities.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# --- Extract unique text values ---
unique_texts = sorted(
    {ent["text"].strip() for page in data for ent in page["entities"]}
)

print(f"Total unique entity texts: {len(unique_texts)}")

# --- Load embedding model ---
model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Create normalized embeddings ---
embeddings = model.encode(
    unique_texts, convert_to_numpy=True, normalize_embeddings=True
)

# --- Create FAISS cosine similarity index ---
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # Cosine similarity with normalized vectors
index.add(embeddings)

# --- Save index and mapping ---
faiss.write_index(index, "entity_texts_cosine.index")
with open("entity_texts_mapping.pkl", "wb") as f:
    pickle.dump(unique_texts, f)

print("FAISS index for entity texts saved.")

Total unique entity texts: 563
FAISS index for entity texts saved.


### Searching the Faiss


In [17]:
# --- Load index and mapping ---
index = faiss.read_index("entity_texts_cosine.index")
with open("entity_texts_mapping.pkl", "rb") as f:
    unique_texts = pickle.load(f)

# --- Load embedding model ---
model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Query ---
query = """916 in water and is practically insoluble in acetone, ether, and chloroform. The pKa of metformin is 
12.4. The pH of a 1% aqueous solution of metformin hydrochloride is 6.68. 
GLUCOPHAGE tablets contain 500 mg, 850 mg, or 1000 mg of metformin hydrochloride. Each 
tablet contains the inactive ingredients povidone and magnesium stearate. In addition, the coating 
for the 500 mg and 850 mg tablets contains hypromellose and the coating for the 1000 mg tablet 
contains hypromellose and polyethylene glycol. 
GLUCOPHAGE XR contains 500 mg or 750 mg of metformin hydrochloride as the active 
ingredient."""
embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)

# --- Search ---
similarities, indices = index.search(embedding, k=30)
results = [
    (unique_texts[idx], float(sim)) for idx, sim in zip(indices[0], similarities[0])
]

# --- Sort descending (already sorted by FAISS) ---
for text, sim in results:
    print(f"{text} -> {sim:.4f}")

Twice Daily 
GLUCOPHAGE XR 
1000 -> 0.5015
358 L. Metformin -> 0.4670
600 mg/kg -> 0.4543
45 mL/min/1.73 m2 -> 0.4325
Metformin -> 0.4306
mL. Metformin -> 0.4291
GLUCOPHAGE -> 0.4197
60 mL -> 0.3668
Gender 
Metformin -> 0.3663
Total Triglycerides -> 0.3614
Dosage -> 0.3463
DOSAGE -> 0.3463
30 mL -> 0.3352
50 grams -> 0.3273
mcg/mL -> 0.3174
Plasma Glucose -> 0.3143
Serum Lipid 
Variables -> 0.3094
https://www.fda.gov/drugsatfda -> 0.2962
HDL-Cholesterol -> 0.2759
Insulin Dose -> 0.2659
GLU -> 0.2565
Hemoglobin A1c -> 0.2547
LDL-Cholesterol -> 0.2527
Drug Interactions -> 0.2470
serum creatinine -> 0.2396
CLINICAL PHARMACOLOGY -> 0.2247
9.6 
2.6 
Nausea/Vomiting 
6.5 
1.5 -> 0.2213
Once Daily 
Hemoglobin A1c -> 0.2159
Placebo/Insulin Summary -> 0.1946
Bristol-Myers Squibb Company -> 0.1914


### Building Relationships


In [18]:
import requests
from pydantic import BaseModel
from typing import List, Type, TypeVar

T = TypeVar("T", bound=BaseModel)


class Relationship(BaseModel):
    source: str
    target: str
    relation: str


class Relationships(BaseModel):
    relationships: List[Relationship]


ollama_api_endpoint = "http://localhost:11434/api/chat"


def ask_ollama_structured(endpoint, model, prompt: str, model_class: Type[T]) -> T:
    """
    Sends a prompt to Ollama API and parses the result into the given Pydantic model.

    Args:
        prompt (str): The user's input prompt.
        model_class (Type[T]): The Pydantic model class to parse the response.

    Returns:
        An instance of the provided Pydantic model class.
    """
    # Convert Pydantic model schema to JSON Schema
    schema = model_class.model_json_schema()
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "format": schema,  # Ollama expects the format in JSON schema form
    }

    resp = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    resp.raise_for_status()

    # The API returns a dict with 'message' -> 'content' containing the JSON output
    raw_json = resp.json()["message"]["content"]

    # Parse into the Pydantic model
    return model_class.model_validate_json(raw_json)

In [116]:
result = ask_ollama_structured(
    ollama_api_endpoint,
    "llama3",
    "Extract out relationships from the provided text. Do not use spaces between words in relationships "
    "Text: "
    "The second largest country  in the world by land area, Canada, known for its vast and diverse landscapes ranging from the Rocky Mountains and endless prairies to dense forests and Arctic tundra. It is a highly developed nation with a strong economy, multicultural society, and a reputation for politeness and inclusivity. Canada has two official languages, English and French, reflecting its colonial history under Britain and France. Its cities like Toronto, Vancouver, and Montreal are global hubs for culture, business, and education, while its national parks and wilderness areas make it a haven for outdoor enthusiasts. Governed as a parliamentary democracy and constitutional monarchy, Canada places a strong emphasis on human rights, healthcare, and quality of life."
    ""
    "Source and Target entities can be any entity relevant from the text. Source and Target both should not only be the same entity every time. Try to extract out all meaningful information from the text. Relevant Entities separated by commas are given below. Make sure to only define the relationship in a maximum of 3 words."
    "Britain, Rocky Mountains, prairies, forests, Arctic tundra, economy, multicultural society, politeness, inclusivity, English, French, Britain, France, Toronto, Vancouver, Montreal, culture, Canada, business, education, national parks, wilderness areas, outdoor enthusiasts, parliamentary democracy, constitutional monarchy, human rights, healthcare, quality of life.",
    Relationships,
)
print(result)

relationships=[Relationship(source='Canada', target='Britain', relation='Colonized'), Relationship(source='Canada', target='France', relation='Colonized'), Relationship(source='Canada', target='Rocky Mountains', relation='Has Landscapes'), Relationship(source='Canada', target='prairies', relation='Has Landscapes'), Relationship(source='Canada', target='forests', relation='Has Landscapes'), Relationship(source='Canada', target='Arctic tundra', relation='Has Landscapes'), Relationship(source='Canada', target='economy', relation='Has'), Relationship(source='Canada', target='multicultural society', relation='Has'), Relationship(source='Canada', target='politeness', relation='Is Known For'), Relationship(source='Canada', target='inclusivity', relation='Is Known For'), Relationship(source='Canada', target='English', relation='Has Language'), Relationship(source='Canada', target='French', relation='Has Language'), Relationship(source='Toronto', target='culture', relation='Is Hub For'), Relati

In [117]:
result.relationships

[Relationship(source='Canada', target='Britain', relation='Colonized'),
 Relationship(source='Canada', target='France', relation='Colonized'),
 Relationship(source='Canada', target='Rocky Mountains', relation='Has Landscapes'),
 Relationship(source='Canada', target='prairies', relation='Has Landscapes'),
 Relationship(source='Canada', target='forests', relation='Has Landscapes'),
 Relationship(source='Canada', target='Arctic tundra', relation='Has Landscapes'),
 Relationship(source='Canada', target='economy', relation='Has'),
 Relationship(source='Canada', target='multicultural society', relation='Has'),
 Relationship(source='Canada', target='politeness', relation='Is Known For'),
 Relationship(source='Canada', target='inclusivity', relation='Is Known For'),
 Relationship(source='Canada', target='English', relation='Has Language'),
 Relationship(source='Canada', target='French', relation='Has Language'),
 Relationship(source='Toronto', target='culture', relation='Is Hub For'),
 Relatio

In [115]:
# print result in a formatted pretty way
# result is a pydantic class

print(result.model_dump_json(indent=2))

{
  "relationships": [
    {
      "source": "Canada",
      "target": "Rocky Mountains",
      "relation": "located in"
    },
    {
      "source": "Canada",
      "target": "prairies",
      "relation": "has"
    },
    {
      "source": "Canada",
      "target": "forests",
      "relation": "has"
    },
    {
      "source": "Canada",
      "target": "Arctic tundra",
      "relation": "has"
    },
    {
      "source": "Canada",
      "target": "economy",
      "relation": "strong in"
    },
    {
      "source": "Canada",
      "target": "multicultural society",
      "relation": "is a part of"
    },
    {
      "source": "Canada",
      "target": "politeness",
      "relation": "reputation for"
    },
    {
      "source": "Canada",
      "target": "inclusivity",
      "relation": "reputation for"
    },
    {
      "source": "Canada",
      "target": "English",
      "relation": "has official language"
    },
    {
      "source": "Canada",
      "target": "French",
      "rela

In [78]:
type(result.model_dump_json(indent=2))

str

In [85]:
all_results = []
for rel in result.relationships:
    all_results.append(rel.model_dump())
json.dumps(all_results)

'[{"source": "Canada", "target": "Rocky Mountains", "relation": "has"}, {"source": "Canada", "target": "prairies", "relation": "has"}, {"source": "Canada", "target": "forests", "relation": "has"}, {"source": "Canada", "target": "Arctic tundra", "relation": "has"}, {"source": "Canada", "target": "economy", "relation": "has"}, {"source": "Canada", "target": "multicultural society", "relation": "is"}, {"source": "Canada", "target": "politeness", "relation": "reputation"}, {"source": "Canada", "target": "inclusivity", "relation": "reputation"}, {"source": "Canada", "target": "English", "relation": "has official language"}, {"source": "Canada", "target": "French", "relation": "has official language"}, {"source": "Canada", "target": "Britain", "relation": "colonial history under"}, {"source": "Canada", "target": "France", "relation": "colonial history under"}, {"source": "Canada", "target": "Toronto", "relation": "city"}, {"source": "Canada", "target": "Vancouver", "relation": "city"}, {"sou

In [19]:
import faiss
import pickle
import json
from sentence_transformers import SentenceTransformer


def build_relationships_for_chunks(chunks, k=30, save_path="relationships.json"):
    # --- Load FAISS index and mapping ---
    index = faiss.read_index("entity_texts_cosine.index")
    with open("entity_texts_mapping.pkl", "rb") as f:
        unique_texts = pickle.load(f)

    # --- Load embedding model ---
    model = SentenceTransformer("all-MiniLM-L6-v2")

    all_relationships = []

    for _, chunk in enumerate(chunks, 1):
        print(f"Starting new chunk processing... for {_}")
        # Get relevant entities from FAISS
        embedding = model.encode(
            [chunk], convert_to_numpy=True, normalize_embeddings=True
        )
        similarities, indices = index.search(embedding, k=k)
        results = [unique_texts[idx] for idx in indices[0]]

        # Prepare the prompt
        prompt = f"""
Extract out relationships from the provided text. Do not use spaces between words in relationships
\"\"\"{chunk}\"\"\"

Source and Target entities can be any entity relevant from the text. Source and Target both should not only be the same entity every time. Try to extract out all meaningful information from the text. Relevant Entities separated by commas are given below. Make sure to only define the relationship in a maximum of 3 words.
Relevant entities:
{results}
"""

        llm_output = ask_ollama_structured(
            ollama_api_endpoint, "llama3", prompt, Relationships
        )

        for rel in llm_output.relationships:
            all_relationships.append(rel.model_dump())

        # Save progress after each chunk
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(all_relationships, f, ensure_ascii=False, indent=2)

    return all_relationships

In [122]:
build_relationships_for_chunks(chunks)

Starting new chunk processing... for 1
Starting new chunk processing... for 2
Starting new chunk processing... for 3
Starting new chunk processing... for 4
Starting new chunk processing... for 5
Starting new chunk processing... for 6
Starting new chunk processing... for 7
Starting new chunk processing... for 8
Starting new chunk processing... for 9
Starting new chunk processing... for 10
Starting new chunk processing... for 11
Starting new chunk processing... for 12
Starting new chunk processing... for 13
Starting new chunk processing... for 14
Starting new chunk processing... for 15
Starting new chunk processing... for 16
Starting new chunk processing... for 17
Starting new chunk processing... for 18
Starting new chunk processing... for 19
Starting new chunk processing... for 20
Starting new chunk processing... for 21
Starting new chunk processing... for 22
Starting new chunk processing... for 23
Starting new chunk processing... for 24
Starting new chunk processing... for 25
Starting 

[{'source': 'Metformin',
  'target': 'Chemical Composition',
  'relation': 'related to'},
 {'source': 'Metformin', 'target': 'GLUCOPHAGE', 'relation': 'is'},
 {'source': 'Metformin',
  'target': 'GLUCOPHAGE XR',
  'relation': 'is extended-release form of'},
 {'source': 'Metformin hydrochloride',
  'target': 'C4H11N5 • HCl',
  'relation': 'molecular formula is'},
 {'source': 'Metformin hydrochloride',
  'target': '165.63',
  'relation': 'molecular weight is'},
 {'source': 'Metformin hydrochloride',
  'target': 'water',
  'relation': 'freely soluble in'},
 {'source': 'Metformin hydrochloride',
  'target': 'acetone, ether, and chloroform',
  'relation': 'practically insoluble in'},
 {'source': 'metformin', 'target': 'water', 'relation': 'insoluble'},
 {'source': 'metformin', 'target': 'acetone', 'relation': 'insoluble'},
 {'source': 'metformin', 'target': 'ether', 'relation': 'insoluble'},
 {'source': 'metformin', 'target': 'chloroform', 'relation': 'insoluble'},
 {'source': 'GLUCOPHAGE t

In [20]:
import re
import json
from neo4j import GraphDatabase

NEO4J_URI = "neo4j+s://033a4c34.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "yQ02-iiANgP8fMDa_Ndj6mq4DioXPbbqGdP2_JBqrLg"
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))


def sanitize_relationship(rel):
    """
    Sanitizes a relationship name to be Neo4j-compatible:
    - Uppercases everything
    - Keeps only A–Z, 0–9, and underscores
    - Ensures it starts with an uppercase letter (adds 'R_' if not)
    - Defaults to 'RELATED_TO' if empty
    """
    # Uppercase and remove invalid characters
    clean = re.sub(r"[^A-Z0-9_]", "_", str(rel).upper())

    # Remove leading/trailing underscores from cleanup
    clean = clean.strip("_")

    # If starts with a number or is empty, prefix with 'R_'
    if not clean or clean[0].isdigit():
        clean = f"R_{clean}"

    return clean


def create_relationships_from_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        relationships = json.load(f)

    with driver.session() as session:
        for rel in relationships:
            source = rel.get("source")
            relationship = sanitize_relationship(rel.get("relation", "RELATED_TO"))
            target = rel.get("target")

            if not source or not target:
                print(f"⚠️ Skipping invalid relationship: {rel}")
                continue

            query = f"""
            MATCH (a {{text: $source}}), (b {{text: $target}})
            MERGE (a)-[r:{relationship}]->(b)
            """
            print(query, source, target)
            session.run(query, source=source, target=target)

    print(f"✅ Created {len(relationships)} relationships from {file_path}")

In [21]:
def delete_all_relationships():
    """
    Deletes all relationships in the Neo4j database.
    Nodes remain untouched.
    """
    with driver.session() as session:
        session.run("MATCH ()-[r]->() DELETE r")
    print("🗑️ All relationships deleted.")

In [22]:
# delete_all_relationships()

In [159]:
create_relationships_from_json("relationships.json")


            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:RELATED_TO]->(b)
             Metformin Chemical Composition

            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:IS]->(b)
             Metformin GLUCOPHAGE

            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:IS_EXTENDED_RELEASE_FORM_OF]->(b)
             Metformin GLUCOPHAGE XR

            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:MOLECULAR_FORMULA_IS]->(b)
             Metformin hydrochloride C4H11N5 • HCl

            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:MOLECULAR_WEIGHT_IS]->(b)
             Metformin hydrochloride 165.63

            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:FREELY_SOLUBLE_IN]->(b)
             Metformin hydrochloride water

            MATCH (a {text: $source}), (b {text: $target})
            MERGE (a)-[r:PRACTICALLY_INS

In [23]:
new_nodes = set()  # Global set to avoid duplicates


def create_nodes_and_relationships(file_path):
    global new_nodes

    with open(file_path, "r", encoding="utf-8") as f:
        relationships = json.load(f)

    with driver.session() as session:
        for rel in relationships:
            source = rel.get("source")
            relationship = sanitize_relationship(rel.get("relation", "RELATED_TO"))
            target = rel.get("target")

            if not source or not target:
                print(f"⚠️ Skipping invalid relationship: {rel}")
                continue

            # Ensure source node exists
            src_exists = session.run(
                "MATCH (n {text: $text}) RETURN count(n) AS c", text=source
            ).single()["c"]
            if src_exists == 0:
                session.run("CREATE (n {text: $text})", text=source)
                new_nodes.add(source)

            # Ensure target node exists
            tgt_exists = session.run(
                "MATCH (n {text: $text}) RETURN count(n) AS c", text=target
            ).single()["c"]
            if tgt_exists == 0:
                session.run("CREATE (n {text: $text})", text=target)
                new_nodes.add(target)

            # Create the relationship
            session.run(
                f"""
                MATCH (a {{text: $source}}), (b {{text: $target}})
                MERGE (a)-[r:{relationship}]->(b)
                """,
                source=source,
                target=target,
            )

            # Verify relationship
            rel_count = session.run(
                f"""
                MATCH (a {{text: $source}})-[r:{relationship}]->(b {{text: $target}})
                RETURN count(r) AS c
                """,
                source=source,
                target=target,
            ).single()["c"]

            if rel_count > 0:
                print(f"✅ {source} -[{relationship}]-> {target} created/exists")
            else:
                print(f"❌ {source} -[{relationship}]-> {target} NOT found")

    return list(new_nodes)

In [None]:
new_nodes = create_nodes_and_relationships("relationships.json")

✅ Metformin -[RELATED_TO]-> Chemical Composition created/exists
✅ Metformin -[IS]-> GLUCOPHAGE created/exists
✅ Metformin -[IS_EXTENDED_RELEASE_FORM_OF]-> GLUCOPHAGE XR created/exists
✅ Metformin hydrochloride -[MOLECULAR_FORMULA_IS]-> C4H11N5 • HCl created/exists
✅ Metformin hydrochloride -[MOLECULAR_WEIGHT_IS]-> 165.63 created/exists
✅ Metformin hydrochloride -[FREELY_SOLUBLE_IN]-> water created/exists
✅ Metformin hydrochloride -[PRACTICALLY_INSOLUBLE_IN]-> acetone, ether, and chloroform created/exists
✅ metformin -[INSOLUBLE]-> water created/exists
✅ metformin -[INSOLUBLE]-> acetone created/exists
✅ metformin -[INSOLUBLE]-> ether created/exists
✅ metformin -[INSOLUBLE]-> chloroform created/exists
✅ GLUCOPHAGE tablets -[CONTAINS]-> metformin hydrochloride created/exists
✅ GLUCOPHAGE tablets -[INGREDIENT]-> povidone created/exists
✅ GLUCOPHAGE tablets -[INGREDIENT]-> magnesium stearate created/exists
✅ 500 mg and 850 mg tablets -[COATING]-> hypromellose created/exists
✅ 1000 mg tablet

['plasma',
 'Renal function',
 'Phenytoin',
 'patients not responding adequately',
 'GLUCOPHAGE 850 mg',
 'plasma glucose',
 'Distributed by: Bristol-Myers Squibb Company',
 'glucose levels',
 'advanced age',
 'Regular exercise program',
 'Surgery',
 'GLUCOPHAGE XR 500 mg tablets',
 'parent',
 'administered',
 'alcohol',
 'Regimen',
 'amputations',
 'lower blood sugar',
 'Feces',
 'Plasma Proteins',
 'bioavailability',
 'low end of dosing range',
 'heart attack',
 'same active ingredient',
 'Furosemid',
 'cold hands/feet',
 'Isoniazid',
 'dosing_recommendations',
 '<0.001',
 '20°–25° C (68°–77° F)',
 'children',
 'body',
 'effects',
 'Discontinue',
 'younger patients',
 'Dietary instructions',
 'GLUCOPHAGE 1000 mg',
 'Additional Data',
 'Nicotinic Acid',
 'concomitant disease or other drug therapy',
 'Initial Dose Titration',
 'normal subjects',
 'Labeling Information',
 'Compartment of distribution',
 'deficiency',
 'Malnourished patients',
 'Carbonic Anhydrase',
 'patients',
 'Abdomi

FAISS index and label mapping saved.


In [24]:
def delete_abandoned_nodes():
    """
    Deletes all nodes in the Neo4j database that have no relationships.
    Returns a list of their `text` property values.
    """
    with driver.session() as session:
        result = session.run(
            """
            MATCH (n)
            WHERE NOT (n)--()
            WITH collect(n.text) AS removed_texts, collect(n) AS nodes
            FOREACH (node IN nodes | DELETE node)
            RETURN removed_texts
        """
        )

        removed_nodes = result.single()["removed_texts"]

    if removed_nodes:
        print(f"🗑️ Removed abandoned nodes: {removed_nodes}")
    else:
        print("✅ No abandoned nodes found.")

    return removed_nodes

In [176]:
delete_abandoned_nodes()



['6.3',
 '83',
 '±0.42',
 '0.9',
 '12-16 years of age',
 '2500/20',
 'mL. Metformin',
 'Obtain',
 '21.0',
 '64.3',
 '8.4',
 'n=26',
 '±1.10',
 '4 weeks',
 '12%',
 '15.1%',
 'approximately 20%',
 '215.0',
 '8%',
 '0.2%',
 '0.14',
 '241.5',
 'About 3',
 '5%',
 '≥1.0%',
 '412',
 '32',
 '11.5',
 '181.6',
 '654',
 '2.0',
 'Determination of fetal',
 '1.4',
 '94.64',
 '3.11',
 '0.27',
 '3',
 '24 to 48 hours',
 '18.6',
 'Twice Daily \n1000',
 'Cmax and Tmax',
 'n=24',
 '7.1',
 '19%',
 'Once Daily \nTotal Cholesterol',
 '0.04',
 '1.7%',
 '41.7',
 'more than a few weeks',
 '7.06',
 '2.64',
 '0.15',
 '16-week',
 '21',
 '08543',
 '25.5',
 '16%',
 '4.4',
 '12.4',
 '14.9%',
 '9',
 '95%',
 '2500',
 '18',
 '8.3',
 '2.71',
 '1.48',
 '23-59 years',
 'approximately 50% to 60%',
 '12',
 '179.0',
 'Table 11',
 '61-90',
 '3-year',
 '0.02',
 '7.02',
 '0.28',
 '6071',
 'half',
 '40.8',
 'Renal',
 '0.08',
 '1500/20',
 '74)d',
 '210.3',
 '126.2',
 'Pediatric',
 '1000/20',
 'Geriatrics \nLimited',
 '0.1%',
 '008

In [25]:
def verify_relationships_from_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        relationships = json.load(f)

    with driver.session() as session:
        for rel in relationships:
            source = rel.get("source")
            relationship = sanitize_relationship(rel.get("relation", "RELATED_TO"))
            target = rel.get("target")

            if not source or not target:
                print(f"⚠️ Skipping invalid relationship: {rel}")
                continue

            query = f"""
            MATCH (a {{text: $source}})-[r:{relationship}]->(b {{text: $target}})
            RETURN count(r) AS rel_count
            """
            result = session.run(query, source=source, target=target).single()

            if result["rel_count"] > 0:
                print(f"✅ {source} -[{relationship}]-> {target} exists")
            else:
                print(f"❌ {source} -[{relationship}]-> {target} NOT found")

In [170]:
verify_relationships_from_json("relationships.json")

❌ Metformin -[RELATED_TO]-> Chemical Composition NOT found
✅ Metformin -[IS]-> GLUCOPHAGE exists




❌ Metformin -[IS_EXTENDED_RELEASE_FORM_OF]-> GLUCOPHAGE XR NOT found
❌ Metformin hydrochloride -[MOLECULAR_FORMULA_IS]-> C4H11N5 • HCl NOT found




❌ Metformin hydrochloride -[MOLECULAR_WEIGHT_IS]-> 165.63 NOT found
❌ Metformin hydrochloride -[FREELY_SOLUBLE_IN]-> water NOT found




❌ Metformin hydrochloride -[PRACTICALLY_INSOLUBLE_IN]-> acetone, ether, and chloroform NOT found
❌ metformin -[INSOLUBLE]-> water NOT found
❌ metformin -[INSOLUBLE]-> acetone NOT found




❌ metformin -[INSOLUBLE]-> ether NOT found
❌ metformin -[INSOLUBLE]-> chloroform NOT found
❌ GLUCOPHAGE tablets -[CONTAINS]-> metformin hydrochloride NOT found




❌ GLUCOPHAGE tablets -[INGREDIENT]-> povidone NOT found
❌ GLUCOPHAGE tablets -[INGREDIENT]-> magnesium stearate NOT found




❌ 500 mg and 850 mg tablets -[COATING]-> hypromellose NOT found
❌ 1000 mg tablet -[COATING]-> polyethylene glycol NOT found
❌ GLUCOPHAGE XR -[CONTAINS]-> metformin hydrochloride NOT found




❌ GLUCOPHAGE XR 500 mg tablets -[INGREDIENT]-> sodium carboxymethyl cellulose NOT found
❌ GLUCOPHAGE XR 500 mg tablets -[INGREDIENT]-> hypromellose NOT found
❌ GLUCOPHAGE XR 500 mg tablets -[INGREDIENT]-> microcrystalline cellulose NOT found




❌ GLUCOPHAGE XR 500 mg tablets -[INGREDIENT]-> magnesium stearate NOT found
❌ GLUCOPHAGE XR -[CONTAINS]-> Metformin NOT found
❌ GLUCOPHAGE XR -[CONTAINS]-> sodium carboxymethyl cellulose NOT found
❌ GLUCOPHAGE XR -[CONTAINS]-> hypromellose NOT found
❌ GLUCOPHAGE XR -[CONTAINS]-> magnesium stearate NOT found
❌ System Components and Performance -[IS]-> dual hydrophilic polymer matrix system NOT found




❌ Metformin hydrochloride -[FORMS]-> inner phase NOT found
❌ polymer -[INCORPORATES]-> external phase NOT found
❌ fluid from the gastrointestinal tract -[ENTERS]-> tablet NOT found




❌ polymers -[CAUSE]-> hydrate and swell NOT found
❌ GI tract -[IS_EXPECTED_TO_BE_BROKEN_UP_BY]-> break up NOT found
❌ biologically inert components -[ELIMINATED_IN]-> feces NOT found
❌ CLINICAL PHARMACOLOGY -[RELATED_TO]-> GLUCOPHAGE XR NOT found
✅ Metformin -[LOWERS]-> Plasma Glucose exists
❌ Metformin -[INCREASES]-> Insulin Sensitivity NOT found
❌ Metformin -[DECREASES]-> Hepatic Glucose Production NOT found
❌ Metformin -[DECREASES]-> Intestinal Absorption of Glucose NOT found
❌ Metformin -[LOWERS]-> Basal Plasma Glucose NOT found
❌ Metformin -[LOWERS]-> Postprandial Plasma Glucose NOT found
❌ Metformin -[DECREASES]-> Fasting Insulin Levels NOT found
❌ Metformin -[DECREASES]-> Day-Long Plasma Insulin Response NOT found




❌ Metformin -[REMAINS_UNCHANGED]-> Insulin Secretion NOT found
❌ metformin -[HAS]-> bioavailability NOT found
❌ GLUCOPHAGE 500 mg tablet -[HAS]-> bioavailability NOT found
❌ metformin -[DECREASES]-> absorption NOT found
❌ food -[LOWERS]-> absorption NOT found




❌ metformin -[AFFECTS]-> Cmax NOT found
❌ metformin -[AFFECTS]-> AUC NOT found


BufferError: Existing exports of data: object cannot be re-sized

### Querying the KG

qs: ingestion of what medicine causes diarrhea


In [39]:
def query_ent_ext(query):
    embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)

    retrieved_ent = []
    # --- Search ---
    similarities, indices = index.search(embedding, k=5)
    results = [
        (unique_texts[idx], float(sim)) for idx, sim in zip(indices[0], similarities[0])
    ]

    # --- Sort descending (already sorted by FAISS) ---
    for text, sim in results:
        retrieved_ent.append(text)
    return retrieved_ent


answer = query_ent_ext(query="what treats type 2 diabetes")
answer

['Insulin \nTherapy',
 'Placebo/Insulin Summary',
 'Plasma Glucose',
 'Insulin Dose',
 '2.2']

In [40]:
def get_entities_with_relationships(driver, entity_texts):
    query = """
    WITH $entityList AS entityList
    MATCH (n)
    WHERE n.text IN entityList
    OPTIONAL MATCH (n)-[r]-(m)
    RETURN {
        node: {
            label: labels(n),
            text: n.text
        },
        relationship: CASE WHEN r IS NOT NULL THEN {
            type: type(r),
            properties: properties(r)
        } ELSE null END,
        connectedNode: CASE WHEN m IS NOT NULL THEN {
            label: labels(m),
            text: m.text
        } ELSE null END
    } AS result
    """

    with driver.session() as session:
        results = session.run(query, entityList=entity_texts)
        structured_output = [record["result"] for record in results]

    return structured_output


answer = get_entities_with_relationships(
    driver, query_ent_ext("what treats type 2 diabetes")
)

print(answer)

[{'relationship': {'properties': {}, 'type': 'INCREASED'}, 'node': {'text': 'Insulin Dose', 'label': ['PERSON']}, 'connectedNode': {'text': 'Placebo', 'label': ['PRODUCT']}}, {'relationship': {'properties': {}, 'type': 'REDUCED'}, 'node': {'text': 'Insulin Dose', 'label': ['PERSON']}, 'connectedNode': {'text': 'GLUCOPHAGE', 'label': ['ORG']}}, {'relationship': {'properties': {}, 'type': 'REDUCE'}, 'node': {'text': 'Insulin Dose', 'label': ['PERSON']}, 'connectedNode': {'text': 'GLUCOPHAGE', 'label': ['ORG']}}, {'relationship': {'properties': {}, 'type': 'ADMINISTERED_AS'}, 'node': {'text': 'Insulin Dose', 'label': ['PERSON']}, 'connectedNode': {'text': '600 mg/kg', 'label': ['QUANTITY']}}, {'relationship': {'properties': {}, 'type': 'AFFECTED_BY'}, 'node': {'text': 'Insulin Dose', 'label': ['PERSON']}, 'connectedNode': {'text': 'Plasma Glucose', 'label': ['PERSON']}}, {'relationship': {'properties': {}, 'type': 'MEASUREMENT'}, 'node': {'text': 'Insulin Dose', 'label': ['PERSON']}, 'con

In [28]:
def compress_relationships(data):
    """
    Compresses relationship data into minimal unique triples:
    SOURCE | RELATIONSHIP | TARGET

    Args:
        data (list): Either a list of relationship dicts
                     (with node/relationship/connectedNode)
                     or already-flat strings.

    Returns:
        str: All relationships as '\n'-separated triples, deduplicated & sorted.
    """
    triples = set()

    for entry in data:
        if isinstance(entry, dict):
            # Extract fields from nested JSON-like structure
            source = entry.get("node", {}).get("text", "").strip()
            rel = entry.get("relationship", {}).get("type", "").strip()
            target = entry.get("connectedNode", {}).get("text", "").strip()
            if source and rel and target:
                triples.add(f"{source} | {rel} | {target}")
        elif isinstance(entry, str):
            # Already flattened triple string
            triples.add(entry.strip())

    # Sort for consistent ordering (optional)
    sorted_triples = sorted(triples)

    return sorted_triples

In [None]:
question = "can you tell me if glucophage is related to estrogen?"
entity_list = query_ent_ext(question)
graph_data = get_entities_with_relationships(driver, entity_list)
compress_relationships(graph_data)

'GLU | COMPARISON | Comb\nGLU | MONOTHERAPY | GLUCOPHAGE\nGLU | RELATED | serum creatinine\nGLUCOPHAGE | ADJUSTED | dosage\nGLUCOPHAGE | ADMINISTERED | dose\nGLUCOPHAGE | AFFECTS | Serum Lipid Variables\nGLUCOPHAGE | AFFECTS | other drugs\nGLUCOPHAGE | ASSESSES | renal function\nGLUCOPHAGE | ASSOCIATED_WITH | Lactic acidosis\nGLUCOPHAGE | ASSOCIATED_WITH | liver problems\nGLUCOPHAGE | AS_ADJUNCT | diet therapy\nGLUCOPHAGE | BUILDS_UP | blood sugar\nGLUCOPHAGE | CAUSES | diarrhea\nGLUCOPHAGE | CAUSES | metallic taste\nGLUCOPHAGE | CHANGED | Body Weight\nGLUCOPHAGE | COMBINATION | Comb\nGLUCOPHAGE | COMBINATION | Insulin\nGLUCOPHAGE | COMBINED | glyburide\nGLUCOPHAGE | COMBINED_WITH | glyburide\nGLUCOPHAGE | COMPARED_TO | Placebo\nGLUCOPHAGE | COMPARED_TO | placebo\nGLUCOPHAGE | CONCOMITANT | insulin\nGLUCOPHAGE | CONCOMITANT_THERAPY | sulfonylurea\nGLUCOPHAGE | CONTAINED | combination group\nGLUCOPHAGE | CONTAINS | Metformin\nGLUCOPHAGE | CONTAINS | Tablets\nGLUCOPHAGE | CONTAINS | metf

In [30]:
import json


def answer_question_with_graph(driver, question):
    # Step 1: Fetch graph data
    entity_list = query_ent_ext(question)
    graph_data = get_entities_with_relationships(driver, entity_list)

    # Step 2: Build a prompt for the LLM
    prompt = f"""
You are a reasoning assistant working with a knowledge graph.

The graph is given below as JSON. 
It contains:
- "node" = the starting node requested
- "relationship" = the exact relationship (type + properties) between the node and another node
- "connectedNode" = the node at the other end of the relationship

STRICT INSTRUCTIONS:
1. Only use relationships that are explicitly shown in the JSON. 
2. Do NOT make any guesses, assumptions, or use outside knowledge.
3. If the answer cannot be found in the relationships, say: "No direct answer can be found from the provided graph."
4. Always show the exact chain of relationships from the graph that led to your answer.
5. If multiple paths exist, show each path separately.
6. If no relationship connects the queried entities, say so clearly.

Graph Data: (in the form Source | Relationship | Target)
{compress_relationships(graph_data)}

Question: {question}

Your response should be in the following format:

Answer: <Direct answer based only on relationships in the graph>
Relationships used:
1. <Node A> -[RELATIONSHIP_TYPE]-> <Node B>
2. ...

In the end gather whatever you founf out and give a paragraph answer.
"""

    print(compress_relationships(graph_data))

    # Step 3: Send to Ollama
    return ask_ollama(prompt)

In [43]:
# Example usage
# question = "ingestion of what medicine causes diarrhea"
question = "what treats type 2 diabetes"
response = answer_question_with_graph(driver, question)

print(response)

['Insulin Dose | ADMINISTERED_AS | 600 mg/kg', 'Insulin Dose | AFFECTED_BY | Plasma Glucose', 'Insulin Dose | INCREASED | Placebo', 'Insulin Dose | MEASURED_IN | mg/kg', 'Insulin Dose | MEASUREMENT | Daily Insulin Dose', 'Insulin Dose | REDUCE | GLUCOPHAGE', 'Insulin Dose | REDUCED | GLUCOPHAGE', 'Placebo/Insulin Summary | INCLUDED | Summary', 'Plasma Glucose | AFFECTED_BY | Insulin Dose', 'Plasma Glucose | BINDS | Metformin', 'Plasma Glucose | DECLINED | GLUCOPHAGE', 'Plasma Glucose | DOES_NOT_LOWER | Placebo', 'Plasma Glucose | INCREASED | Placebo', 'Plasma Glucose | LOWERS | GLUCOPHAGE', 'Plasma Glucose | LOWERS | Metformin', 'Plasma Glucose | REDUCED | GLUCOPHAGE', 'Plasma Glucose | RELATED | Hemoglobin A1c', 'Plasma Glucose | TESTED_AT | 182.7 mg/dL']
Answer: No direct answer can be found from the provided graph.
