In [None]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
import neo4j
from neo4j import GraphDatabase
import os
from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
from neo4j_graphrag.embeddings.ollama import OllamaEmbeddings
from neo4j_graphrag.experimental.components.types import TextChunks, TextChunk


## Original code to load the graph data

In [2]:
df=pd.read_csv("Data/ADV_GRAPH_20240119.csv")
ls_text=[]
ls_index=[]
ls_graph=[]
# (start_node, edge, end_node)
ls_start_node=[]
ls_edge=[]
ls_end_node=[]
ls_len=[]
for index, row in df.iterrows():
    # Original string
    s = row["Graph"]
    if s.startswith("ERROR:"):
        continue 
    # Use regex to find all phrases in parentheses
    matches = re.findall(r'\((.*?)\)', s)
    # Format each match by removing commas and stripping whitespace
    ls_index+=[index for match in matches]
    ls_text+=[' '.join(match.split(',')) for match in matches]
    ls_graph+=matches
    ls_len+=[len(match.split(',')) for match in matches]
    for match in matches:
        temp=match.split(',')
        if len(temp)==3:
            ls_start_node.append(temp[0])
            ls_edge.append(temp[1])
            ls_end_node.append(temp[2])
        else:
            ls_start_node.append("")
            ls_edge.append("")
            ls_end_node.append("")

In [3]:
df_map=pd.DataFrame({
    "original_index":ls_index,
    "graph":ls_graph,
    "text":ls_text,
    "len":ls_len,
    "start_node":ls_start_node,
    "edge":ls_edge,
    "end_node":ls_end_node
})
df_map=df_map[df_map.len==3].drop(columns="len")

In [4]:
df_map["start_node"]=df_map["start_node"].apply(lambda x: ' '.join(str(x. lower()).split()) if isinstance(x, str) else x)
df_map["edge"]=df_map["edge"].apply(lambda x: ' '.join(str(x. lower()).split()) if isinstance(x, str) else x)
df_map["end_node"]=df_map["end_node"].apply(lambda x: ' '.join(str(x. lower()).split()) if isinstance(x, str) else x)
df_map["text"]=df_map["text"].apply(lambda x: ' '.join(str(x).split()) if isinstance(x, str) else x)


In [5]:
minorities=df_map["start_node"].unique()
adj_nodes={minority: 
            set(df_map[(df_map["start_node"]==minority) & (df_map["end_node"].isin(minorities))]["end_node"]) 
            | 
            set(df_map[(df_map["end_node"]==minority) & (df_map["start_node"].isin(minorities))]["start_node"])
            for minority in tqdm(minorities)}

  0%|          | 0/3015 [00:00<?, ?it/s]

In [6]:
df_map.head(2)

Unnamed: 0,original_index,graph,text,start_node,edge,end_node
0,0,"black folks, are, well endowed",black folks are well endowed,black folks,are,well endowed
1,1,"good blacks, belong to, black people",good blacks belong to black people,good blacks,belong to,black people


In [37]:
# find empty nodes
df_map[(df_map["start_node"]=="") | (df_map["edge"]=="") | (df_map["end_node"]=="")]

Unnamed: 0,original_index,graph,text,start_node,edge,end_node
161,101,"black people, overreact,",black people overreact,black people,overreact,
208,135,"black folks, don't fit in,",black folks don't fit in,black folks,don't fit in,
592,401,"black folks, haven't evolved fully,",black folks haven't evolved fully,black folks,haven't evolved fully,
963,651,"black folks, sleep around,",black folks sleep around,black folks,sleep around,
1003,679,"black people, shoot,",black people shoot,black people,shoot,
...,...,...,...,...,...,...
48506,24495,"Nigerian folks, don't live long,",Nigerian folks don't live long,nigerian folks,don't live long,
48856,24658,"emo kids, cry,",emo kids cry,emo kids,cry,
49588,24955,"bisexual women, don't exist,",bisexual women don't exist,bisexual women,don't exist,
49591,24955,"bisexual men, don't exist,",bisexual men don't exist,bisexual men,don't exist,


In [38]:
def get_embeddings(start_node, end_node, edge):
    ollama_embedder = OllamaEmbeddings(model="nomic-embed-text")
    
    start_embedding = ollama_embedder.embed_query(start_node)
    if end_node == "":
        end_embedding = np.zeros(len(start_embedding))
    else:
        end_embedding = ollama_embedder.embed_query(end_node)
    edge_embedding = ollama_embedder.embed_query(edge)

    return start_embedding, end_embedding, edge_embedding

In [None]:
# vector_index = Neo4jVector.from_existing_graph(
#     OpenAIEmbeddings(),
#     search_type="hybrid",
#     node_label="Document",
#     text_node_properties=["text"],
#     embedding_node_property="embedding"
# )


## Upload to Neo4j

In [13]:
# neo4j_uri = os.getenv("NEO4J_URI")
# neo4j_user = os.getenv("NEO4J_USER")
# neo4j_password = os.getenv("NEO4J_PASSWORD")
neo4j_uri = "bolt://localhost:7687"
neo4j_user= "neo4j"
neo4j_password = "biasgraph"

In [14]:
def run_query(driver, query, parameters=None):
        """
        Executes a Cypher query against the Neo4j database.

        Args:
            query (str): The Cypher query to execute.

        Returns:
            list: A list of query results, where each result is a dictionary.
        """
        with driver.session() as session:
            result = session.run(query, parameters)
            return [record.data() for record in result]

In [39]:
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

for row in df_map.iterrows():
    #print every 1000 rows
    if row[0] % 100 == 0:
        print(row[0])
    start_node=row[1]["start_node"]
    end_node=row[1]["end_node"]
    edge=row[1]["edge"]
    start_embedding, end_embedding, edge_embedding = get_embeddings(start_node, end_node, edge)
    query="""MERGE (s:StartNode {text: $start_node})
                MERGE (e:EndNode {text: $end_node}) 
                MERGE (s)-[r:RELATIONSHIP {text: $edge}]->(e)
                SET s.embedding = $start_embedding, e.embedding = $end_embedding, r.embedding = $edge_embedding
                RETURN s, e, r
        """
    parameters = {"start_node": start_node,
                "end_node": end_node,
                "edge": edge,
                "start_embedding": start_embedding,
                "end_embedding": end_embedding,
                "edge_embedding": edge_embedding}
    # Run the query
    run_query(neo4j_driver, query, parameters)

query_start_index = """
        CREATE VECTOR INDEX startIndex IF NOT EXISTS FOR (s:StartNode) ON s.embedding OPTIONS {indexConfig: {`vector.similarity_function`: 'cosine'} }
        """
query_end_index = """
        CREATE VECTOR INDEX endIndex IF NOT EXISTS FOR (e:EndNode) ON e.embedding OPTIONS {indexConfig: {`vector.similarity_function`: 'cosine'} } 
        """
query_edge_index = """
        CREATE VECTOR INDEX edgeIndex IF NOT EXISTS FOR (r:RELATIONSHIP) ON r.embedding OPTIONS {indexConfig: {`vector.similarity_function`: 'cosine'} }
        """
# Run the queries
run_query(neo4j_driver, query_start_index)
run_query(neo4j_driver, query_end_index)
run_query(neo4j_driver, query_edge_index)

neo4j_driver.close()

0
100
200
300


KeyboardInterrupt: 

In [53]:
start_node=df_map.iloc[1]["start_node"]
end_node=df_map.iloc[1]["end_node"]
edge=df_map.iloc[1]["edge"]
print(start_node, end_node, edge)

good blacks black people belong to


In [54]:
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
query = """MERGE (s:StartNode {text: $start_node})
            MERGE (e:Endnode {text: $end_node}) 
            WITH s, e
            CALL apoc.create.relationship(s, $edge, {}, e) YIELD rel
            RETURN s, e, rel"""
parameters = {"start_node": start_node,
                "end_node": end_node,
                "edge": edge}
results = run_query(neo4j_driver, query, parameters)
print(results)

[{'s': {'text': 'good blacks'}, 'e': {'text': 'black people'}, 'rel': ({'text': 'good blacks'}, 'belong to', {'text': 'black people'})}]
