## Using graph connection information to enrich LLM

In [2]:
from neo4j import GraphDatabase
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph
import getpass
import os
import os


# Neo4j connection settings. Credentials must never be committed to the
# notebook: they are read from the environment, and the password is prompted
# for interactively when the NEO4J_PASSWORD variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://54.205.44.248:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Enter the Neo4j password")

# LangChain wrapper around the graph database; handed to the QA chain below.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

# Make the OpenAI key available as an environment variable. The prompt is
# skipped when the variable is already set so the notebook can be re-run
# without retyping the key.
name = 'OPENAI_API_KEY'
secret = os.environ.get(name) or getpass.getpass("Enter the OpenAI API Key")
os.environ[name] = secret
print(f"Have successfully set up {name} as an environment variable.")

# Initialize a direct driver connection to the same Neo4j database, reusing the
# settings above instead of repeating the literals.
uri = NEO4J_URI
auth = (NEO4J_USERNAME, NEO4J_PASSWORD)
driver = GraphDatabase.driver(uri, auth=auth)

# Setting up ChatGPT integration with Graph Database.
# verbose=True makes the chain print its intermediate steps; a later cell
# captures that printed text to recover the query results.
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0), graph=graph, verbose=True,
)

Have successfully set up OPENAI_API_KEY as an environment variable.


In [3]:
# Use ChatGPT to generate a Cypher query that describes every ROAD connection
# in the graph, then recover the query results from the chain's verbose output.

import contextlib
import io
import sys
import re

# Capture everything the chain prints. redirect_stdout restores sys.stdout even
# if chain.run raises, so a failed call cannot leave the notebook silenced.
stdout_buffer = io.StringIO()
with contextlib.redirect_stdout(stdout_buffer):
    # Run the function
    chain.run("""
Generate a Cypher query that:
1. Matches nodes of type 'Loc' that are connected via a 'ROAD' relationship.
2. For each matched node and relationship, construct a context string that describes the connection. Use the '+' operator for string concatenation. The context string should include the names of the nodes and the cost of the ROAD.
""")

captured_output = stdout_buffer.getvalue()

# Everything after the "Full Context:" marker is treated as the query result.
match = re.search(r"Full Context:(.*)", captured_output, re.DOTALL)
if match is None:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "Could not find 'Full Context:' in the chain's verbose output; "
        "cannot extract the query results."
    )
content = match.group(1).strip()

# Remove ANSI escape codes
cleaned_content = re.sub(r'\x1b\[.*?m', '', content)

In [4]:
import ast

# Work on the ANSI-free text captured from the chain.
input_string = cleaned_content

# Only the text before the first blank line is parsed as a Python literal;
# it is expected to be a list of dicts that each carry a 'context_string' key.
data_list = ast.literal_eval(input_string.partition('\n\n')[0])

# Pull out the sentence describing each connection and join them on one line.
result_list = [record['context_string'] for record in data_list]
result_string = ", ".join(result_list)

In [5]:
# Put one connection per line. Joining the individual entries directly (rather
# than replacing ", " inside the already-joined string) keeps any entry that
# itself contains ", " intact.
formatted_string = ",\n".join(result_list)
print(formatted_string)

A is connected to D with a cost of 100,
A is connected to C with a cost of 50,
A is connected to B with a cost of 50,
B is connected to D with a cost of 40,
C is connected to E with a cost of 80,
C is connected to D with a cost of 40,
D is connected to D with a cost of 69,
D is connected to F with a cost of 80,
D is connected to E with a cost of 30,
E is connected to F with a cost of 40


In [6]:
# System-role instructions sent with each request: the model is told to phrase
# a human-readable answer using only the information in the latest prompt.
system_prompt = """
You are an assistant that helps to generate text to form nice and human understandable answers based.
The latest prompt contains the information, and you need to generate a human readable response based on the given information.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Do not add any additional information that is not explicitly provided in the latest prompt.
I repeat, do not add any information that is not explicitly given.
"""
# Hardcoded snapshot of the context, kept for reference only (not used):
# context = 'A is connected to D with a cost of 100, A is connected to C with a cost of 50, A is connected to B with a cost of 50, B is connected to D with a cost of 40, C is connected to E with a cost of 80, C is connected to D with a cost of 40, D is connected to A with a cost of 69, D is connected to F with a cost of 80, D is connected to E with a cost of 30, E is connected to F with a cost of 40'
# The context actually used is the string built from the graph query results.
context = formatted_string

def generate_user_prompt(question, context):
    """Build the user-role message that pairs a question with its context.

    Args:
        question: The question to put to the model.
        context: The information the model should answer from.

    Returns:
        str: The prompt text with the question and context filled in.
    """
    template = """
   The question is {question}
   Answer the question by using the provided information:
   {context}
   """
    return template.format(question=question, context=context)

In [7]:
import openai

def get_gpt3_response(system_prompt, prompt, context, api_key, model="gpt-3.5-turbo"):
    """
    Send a request to the OpenAI Chat API and get a response from the model.

    Args:
        system_prompt (str): Instructions sent as the system-role message.
        prompt (str): The user's question.
        context (str): Contextual information to be passed to the model.
        api_key (str): Your OpenAI API key; forwarded with this request.
        model (str): The model version to use, default is "gpt-3.5-turbo".

    Returns:
        str: The model's response.
    """

    # Two messages: the system instructions, then the user message that
    # combines the question with the context.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": generate_user_prompt(prompt, context)}
    ]

    # Use the OpenAI Python client to send the request. The api_key argument
    # was previously accepted but ignored; it is now passed along so the
    # caller-supplied key is the one actually used.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        api_key=api_key,
    )

    # Extract the response text and return
    return response.choices[0].message['content']

# You will need to install the openai Python package to use this function and also have a valid API key.


In [8]:
# Define your API key (keep it secret!)
# SECURITY: never paste the key into the notebook. It is read from the
# OPENAI_API_KEY environment variable, with an interactive prompt as fallback.
# Any key that was previously committed in this cell should be revoked.
API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter the OpenAI API Key")
openai.api_key = API_KEY

In [9]:
# Example usage: ask for the node count, answering from the graph context.
question = "How many nodes does the graph have"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

Based on the provided information, the graph has a total of 6 nodes.


In [10]:
# Edge-count question. Note: the answer to this one is extremely volatile.
question = "How many edges does the graph have"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

Based on the provided information, the graph has a total of 9 edges.


In [11]:
# Combined node and edge question. Extremely volatile: if the model is fed the
# context again, the answer may change.
question = "How many nodes does the graph have? What about edges"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

Based on the provided information, the graph has a total of 6 nodes: A, B, C, D, E, and F. 

There are a total of 9 edges in the graph: A-D, A-C, A-B, B-D, C-E, C-D, D-D, D-F, and D-E.


In [12]:
# Degree of a single node (A).
question = "What is the degree of node A in the graph?"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To determine the degree of node A in the graph, we need to count the number of edges that are connected to node A. Based on the provided information, we can see that A is connected to D with a cost of 100, A is connected to C with a cost of 50, and A is connected to B with a cost of 50. 

Therefore, node A has a degree of 3 in the graph.


In [13]:
# Degree of every node.
question = "What is the degree of all nodes in the graph?"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

The degree of a node in a graph refers to the number of edges that are connected to that node. Based on the provided information, let's determine the degree of each node in the graph:

- A is connected to D, C, and B, so the degree of A is 3.
- B is connected to A and D, so the degree of B is 2.
- C is connected to A, D, and E, so the degree of C is 3.
- D is connected to A, B, C, F, and E, and it is also connected to itself (D-D edge), so the degree of D is 6.
- E is connected to C, D, and F, so the degree of E is 3.
- F is connected to D and E, so the degree of F is 2.

Therefore, the degrees of all nodes in the graph are: A - 3, B - 2, C - 3, D - 6, E - 3, F - 2.


In [14]:
# In-degree centrality of node A.
question = "Find the in degree centrality of node A."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

The in-degree centrality of node A can be calculated by summing up the weights of all the incoming edges to node A. 

According to the provided information, A is connected to D with a cost of 100, A is connected to C with a cost of 50, and A is connected to B with a cost of 50. 

Therefore, the in-degree centrality of node A is 100 + 50 + 50 = 200.


In [15]:
# Incoming-link count for node A. The model's answer is incorrect, most likely
# because of how the context is defined: directions were not considered.
question = "Find the number of incoming links to node A."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To find the number of incoming links to node A, we need to look at the connections where A is the destination. Based on the given information, A is connected to D, C, and B. Therefore, A has three incoming links.


In [19]:
# In-degree centrality of every node.
question = "Find the indegree centrality of all nodes."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

The indegree centrality of a node refers to the number of incoming edges to that node. Based on the given information, we can calculate the indegree centrality of all nodes.

For node A, there are no incoming edges, so the indegree centrality is 0.

For node B, there is one incoming edge from node A, so the indegree centrality is 1.

For node C, there is one incoming edge from node A, so the indegree centrality is 1.

For node D, there are three incoming edges from nodes A, B, and C, so the indegree centrality is 3.

For node E, there are two incoming edges from nodes C and D, so the indegree centrality is 2.

For node F, there is one incoming edge from node D, so the indegree centrality is 1.

To summarize, the indegree centrality of all nodes is:
A: 0
B: 1
C: 1
D: 3
E: 2
F: 1


In [20]:
# Betweenness centrality of node A. The model considers path weights here.
question = "Find the betweenness centrality of node A."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To find the betweenness centrality of node A, we need to determine the number of shortest paths that pass through A compared to the total number of shortest paths between all pairs of nodes.
From the given information, we know that:
- A is connected to D with a cost of 100.
- A is connected to C with a cost of 50.
- A is connected to B with a cost of 50.
To calculate the betweenness centrality, we need to consider the shortest paths between all pairs of nodes. Based on the provided information, it is not possible to directly determine all the shortest paths and their frequencies. We require the complete connectivity information among all nodes in the network to perform this calculation accurately.


In [21]:
# Betweenness centrality of all nodes. The model fails to consider path weights
# unless the question explicitly asks for them, hence the second sentence.
question = "Find the betweenness centrality of all nodes. Take the cost of each path into account"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To find the betweenness centrality of all nodes, we need to calculate the number of shortest paths that pass through each node and take the cost of each path into account. Based on the provided information, here are the betweenness centrality measures for each node:

- For node A: A has connections with D, C, and B. The cost of the path from A to D (AD) is 100, from A to C (AC) is 50, and from A to B (AB) is also 50. So, the total cost of shortest paths passing through A is 100 for AD, 50 for AC, and 50 for AB. Therefore, the betweenness centrality of A is 200.

- For node B: B is connected to D with a cost of 40. So, only one shortest path passes through B, which is BD. The cost of this path is 40. Therefore, the betweenness centrality of B is 40.

- For node C: C has connections with E and D. The cost of the path from C to E (CE) is 80, and from C to D (CD) is 40. So, the total cost of shortest paths passing through C is 80 for CE and 40 for CD. Therefore, the betweenness centrality 

In [22]:
# Ask for graphs in which every pair of nodes is joined by an edge.
question = "Find all graphs in which each pair of nodes is connected by an edge."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

Based on the given information, we can determine the following graph where each pair of nodes is connected by an edge:

A -- B
|    |
50   50
|    |
C -- D -- E
      |   /
     100 / 
      |   /
      F

In this graph, Node A is connected to Node B with a cost of 50, Node A is also connected to Node C with a cost of 50. Node A is connected to Node D with a cost of 100. Node B is connected to Node D with a cost of 40. Node C is connected to Node D with a cost of 40, and Node C is connected to Node E with a cost of 80. Node D is connected to itself with a cost of 69, Node D is connected to Node F with a cost of 80, and Node D is also connected to Node E with a cost of 30. Node E is connected to Node F with a cost of 40.

This graph satisfies the condition where each pair of nodes is connected by an edge.


In [23]:
# In a clique, every pair of vertices is adjacent.
# Caveat: the type of graph (DAG, undirected, etc.) must be defined first,
# because cliques have a looser definition for directed graphs.
question = "Find all cliques in the graph."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

Based on the provided information, there are several cliques in the given graph. One clique consists of nodes A, B, and C, as they are all connected to each other with costs of 50. Another clique consists of nodes D, E, and F, as they are all connected to each other with various costs.


In [24]:
# Ask for the largest subset of nodes that are all directly connected.
question = "Find the largest subset of nodes that are directly and completely connected to the rest of the set."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

Based on the provided information, the largest subset of nodes that are directly and completely connected to the rest of the set would include nodes A, B, C, and D. These nodes form a complete and direct connection with each other. Node A is connected to nodes B and C with a cost of 50, and it is also connected to node D with a cost of 100. Node B is connected to node D with a cost of 40. Node C is connected to nodes D and E with costs of 40 and 80 respectively. Lastly, node D is connected to nodes D, E, and F with costs of 69, 30, and 80 respectively.


In [25]:
# Graph-density question. The model's answer is incorrect: there are clearly
# 10 edges in the context.
question = "Calculate the density of this graph"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To calculate the density of the graph, we need to determine the total number of edges and the total number of vertices in the graph.

From the given information, we can see that the graph has 6 vertices: A, B, C, D, E, and F.

Next, let's count the number of edges:
- There is an edge between A and D, so we have one edge.
- There is an edge between A and C, so we have one more edge.
- There is an edge between A and B, adding one more edge.
- There is an edge between B and D, increasing the total number of edges to four.
- There is an edge between C and E, adding one more edge.
- There is an edge between C and D, increasing the total number of edges to six.
- There is an edge between D and D, but this is not a valid edge as a vertex cannot be connected to itself, so we still have six edges.
- There is an edge between D and F, adding one more edge.
- There is an edge between D and E, increasing the total number of edges to eight.
- There is an edge between E and F, adding one more edge.



In [27]:
# Reciprocity question. The model does not calculate the coefficient.
question = "What is the reciprocity of this graph."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

The reciprocity of a graph refers to the equality between the cost of going from one node to another in a certain direction and the cost of going from the second node to the first node in the opposite direction. In the provided information, there are several connections between nodes and their associated costs. To determine the reciprocity, we need to check if the costs are equal in both directions for each connection.

Let's analyze the connections:

- A is connected to D with a cost of 100.
- A is connected to C with a cost of 50.
- A is connected to B with a cost of 50.
- B is connected to D with a cost of 40.
- C is connected to E with a cost of 80.
- C is connected to D with a cost of 40.
- D is connected to D with a cost of 69.
- D is connected to F with a cost of 80.
- D is connected to E with a cost of 30.
- E is connected to F with a cost of 40.

Based on this information, we can see that the connections between A and B, A and C, and C and D have equal costs in both directions

In [28]:
# Average clustering coefficient question. The model gives lots of context as
# the answer.
question = "What is the Average Clustering Coefficient of this graph."
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To calculate the average clustering coefficient of this graph, we need to determine the clustering coefficient for each individual node and then take the average.

The clustering coefficient of a node measures the extent to which its neighbors are interconnected. It is calculated by dividing the number of edges between the node's neighbors by the total possible number of edges between them. A value of 0 means there are no connections between the node's neighbors, while a value of 1 means that all of the node's neighbors are connected to each other.

Let's calculate the clustering coefficient for each node:

- For node A, it has two neighbors (B and C), and there is no connection between them. Hence, its clustering coefficient is 0.

- For node B, it has two neighbors (A and D), and there is no connection between them. Hence, its clustering coefficient is 0.

- For node C, it has three neighbors (A, D, and E), and there is one connection between D and E. Hence, its clustering coefficien

In [29]:
# Cost of the shortest path from A to C.
question = "Find the cost of the shortest path between A to C"
context_info = context
response = get_gpt3_response(
    system_prompt=system_prompt,
    prompt=question,
    context=context_info,
    api_key=API_KEY,
    model="gpt-3.5-turbo",
)
print(response)

To find the cost of the shortest path between A to C, we need to consider the costs of all possible paths connecting A and C. 

One possible path is A -> C directly, which has a cost of 50. Another possible path is A -> B -> D -> C, which has a total cost of 50 + 40 + 40 = 130.

Comparing the two paths, we can see that the shorter path is A -> C with a cost of 50. Therefore, the cost of the shortest path between A and C is 50.
