In [3]:
import os, time, asyncio, glob, csv
from concurrent.futures import ThreadPoolExecutor

import numpy as np
from dotenv import load_dotenv
from neo4j import GraphDatabase
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.generation.prompts import ERExtractionTemplate
from neo4j_graphrag.llm import OpenAILLM

In [9]:
# Populate the process environment from a local .env file, if one exists.
# Variables that are already set in the environment are left untouched.
load_dotenv()

# Connection settings for the Neo4j instance.
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = 'neo4j'
# NOTE(review): NEO4J_AUTH is used verbatim as the password. Docker-style
# values have the form "user/password" — confirm which format is expected.
NEO4J_PASSWORD = os.getenv('NEO4J_AUTH')

# API key handed to both the OpenAI LLM and the OpenAI embedder.
openai_api_key = os.getenv('OPENAI_API_KEY')

In [8]:
# Shared Neo4j driver: handed to the KG pipeline and to the vector-index
# creation call further down in the notebook.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD),
)

In [13]:
# Chat model passed to the KG pipeline as its `llm`.
llm = OpenAILLM(
    model_name='gpt-4o',
    api_key=openai_api_key,
)

# Embedding client passed to the KG pipeline as its `embedder`.
embedder = OpenAIEmbeddings(api_key=openai_api_key)

# Embedding vector width.
# NOTE(review): presumably the output size of the embedder's default model —
# confirm; the same number is passed to create_vector_index later on.
dimensions = 1536

In [15]:
# Node types for the extraction schema. Every type carries a single STRING
# property called `name`; a fresh property list is built for each entry.
entities = [
    {'label': entity_label, 'properties': [{'name': 'name', 'type': 'STRING'}]}
    for entity_label in (
        'Executive',
        'Product',
        'FinancialMetric',
        'RiskFactor',
        'StockType',
        'Transaction',
        'TimePeriod',
        'Company',
    )
]

# Relationship types for the extraction schema. All of them start at a
# Company node and differ only in label and target node type.
relations = [
    {'label': relation_label, 'source': 'Company', 'target': target_label}
    for relation_label, target_label in (
        ('HAS_METRIC', 'FinancialMetric'),
        ('FACES_RISK', 'RiskFactor'),
        ('ISSUED_STOCK', 'StockType'),
        ('MENTIONS', 'Product'),
    )
]

In [None]:
# Assemble the knowledge-graph builder from the driver, the LLM, the embedder
# and the entity/relation schema lists defined in the cells above.
pipeline = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    embedder=embedder,
    entities=entities,
    relations=relations,
)

In [None]:
async def run_pipeline_on_file(file_path, pipeline):
    """Run `pipeline` on a single PDF and wait for it to finish.

    Args:
        file_path: Forwarded to the pipeline as its ``pdf_path`` argument.
        pipeline: Object exposing an awaitable ``run_async(pdf_path=...)``.

    Returns:
        None. Whatever ``run_async`` produces is discarded.
    """
    pending_run = pipeline.run_async(pdf_path=file_path)
    await pending_run

<neo4j_graphrag.experimental.pipeline.kg_builder.SimpleKGPipeline at 0x2c4bf7fb3e0>

In [None]:
async def _ingest_pdfs(paths, kg_pipeline):
    """Feed each PDF in `paths` to `kg_pipeline`, one after another.

    All files are processed inside a single event loop instead of starting a
    new loop per file.
    """
    for pdf_path in paths:
        await run_pipeline_on_file(pdf_path, kg_pipeline)


def _run_to_completion(coro):
    """Block until `coro` finishes, whether or not a loop is already running.

    `asyncio.run()` raises RuntimeError when called from a thread that already
    has a running event loop, which is the case inside a Jupyter kernel. In
    that situation the coroutine is driven by a fresh loop on a worker thread.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread (plain script execution): run directly.
        return asyncio.run(coro)
    with ThreadPoolExecutor(max_workers=1) as worker:
        # .result() re-raises any exception from the pipeline in this thread.
        return worker.submit(asyncio.run, coro).result()


# NOTE(review): `pdf_files` is not defined in any visible cell. It must be a
# sequence of PDF paths (e.g. built with glob.glob) before this cell runs.
_run_to_completion(_ingest_pdfs(pdf_files, pipeline))

In [None]:
from neo4j_graphrag.indexes import create_vector_index

# Create a cosine-similarity vector index named "chunkEmbeddings" over the
# `embedding` property of `Chunk` nodes.
# String literals use plain ASCII quotes; typographic quotes are a SyntaxError.
# NOTE(review): 1536 must match the embedder's output size — confirm it agrees
# with the `dimensions` value set next to the embedder.
create_vector_index(
    driver,
    name="chunkEmbeddings",
    label="Chunk",
    embedding_property="embedding",
    dimensions=1536,
    similarity_fn="cosine",
)

### Use GDS in Python instead of Cypher

In [None]:
from graphdatascience import GraphDataScience

# Connect to Neo4j with the notebook's configured URI and credentials rather
# than a hard-coded localhost URI and password.
gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# The GDS Python client operates on Graph objects, not on graph-name strings.
# NOTE(review): "myGraph" and the feature properties below look like
# placeholders; no cell in this notebook projects such a graph — confirm.
G = gds.graph.get("myGraph")

# Train a GraphSAGE model. The client returns a (model, training result) pair
# and forwards configuration keys to GDS under their camelCase names.
model, train_result = gds.beta.graphSage.train(
    G,
    modelName="sageModel",
    featureProperties=["age", "income", "score"],
    embeddingDimension=128,
    epochs=10,
)

# Stream embeddings into Python as a DataFrame with an "embedding" column.
embeddings = model.predict_stream(G)

embeddings.head()


In [None]:
# Stack the values of the "embedding" column into a single 2-D array,
# one row per DataFrame row. Relies on the notebook-level `numpy as np` import.
X = np.vstack(embeddings["embedding"].to_numpy())