In [2]:
import json
import networkx as nx
import openai
from groq import Groq
from neo4j import GraphDatabase
import json


In [3]:
# Neo4j connection details
uri = "bolt://localhost:7687"  
user = "neo4j"                 
password = "123456789"          

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(user, password))

# Function to run a query and return the results
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        return [record.data() for record in result]

In [4]:
#Function to extract nodes and relationships with labels and properties
def extract_graph_data():
    nodes_query = """
    MATCH (n) 
    RETURN id(n) as id, labels(n) as labels, properties(n) as properties
    """
    relationships_query = """
    MATCH ()-[r]->() 
    RETURN id(r) as id, type(r) as type, properties(r) as properties, 
           id(startNode(r)) as start_id, id(endNode(r)) as end_id
    """

    nodes = run_query(nodes_query)
    relationships = run_query(relationships_query)
    
    return nodes, relationships

In [6]:
# Function to save data to JSON file
def save_to_json(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# Main function to extract data and save it as JSON
def main():
    nodes, relationships = extract_graph_data()
    graph_data = {
        "nodes": nodes,
        "relationships": relationships
    }
    save_to_json(graph_data, 'entity_resolution.json')
    print("Data extraction complete. JSON file saved as graph_data.json")


In [12]:
main()

Data extraction complete. JSON file saved as graph_data.json


In [7]:
nodes, relationships = extract_graph_data()
graph_data = {
    "nodes": nodes,
    "relationships": relationships
}

### Checking data

In [8]:
for item in graph_data["relationships"]: 
    print(item)

{'id': 1146, 'type': 'WATCHED', 'properties': {'watchCount': 1}, 'start_id': 0, 'end_id': 1095}
{'id': 848, 'type': 'WATCHED', 'properties': {'watchCount': 3}, 'start_id': 0, 'end_id': 1131}
{'id': 0, 'type': 'USES', 'properties': {}, 'start_id': 0, 'end_id': 128}
{'id': 1486, 'type': 'WATCHED', 'properties': {'watchCount': 2}, 'start_id': 1, 'end_id': 910}
{'id': 1059, 'type': 'WATCHED', 'properties': {'watchCount': 1}, 'start_id': 1, 'end_id': 1127}
{'id': 1678, 'type': 'WATCHED', 'properties': {'watchCount': 3}, 'start_id': 1, 'end_id': 943}
{'id': 1, 'type': 'USES', 'properties': {}, 'start_id': 1, 'end_id': 129}
{'id': 1604, 'type': 'WATCHED', 'properties': {'watchCount': 1}, 'start_id': 2, 'end_id': 945}
{'id': 1495, 'type': 'WATCHED', 'properties': {'watchCount': 1}, 'start_id': 2, 'end_id': 1184}
{'id': 1622, 'type': 'WATCHED', 'properties': {'watchCount': 1}, 'start_id': 2, 'end_id': 987}
{'id': 1497, 'type': 'WATCHED', 'properties': {'watchCount': 1}, 'start_id': 2, 'end_id':

In [10]:
for item in graph_data["nodes"]: 
    print(item)

{'id': 0, 'labels': ['User'], 'properties': {'lastName': 'Burbidge', 'country': 'US', 'firstName': 'Dorette', 'gender': 'Male', 'phone': '834-424-8856', 'state': 'Ohio', 'userId': 1, 'email': 'dburbidge0@japanpost.jp'}}
{'id': 1, 'labels': ['User'], 'properties': {'lastName': 'Mongenot', 'country': 'US', 'firstName': 'Farley', 'gender': 'Female', 'phone': '586-213-7447', 'state': 'Virginia', 'userId': 2, 'email': 'fmulryan1@adobe.com'}}
{'id': 2, 'labels': ['User'], 'properties': {'lastName': 'Diggens', 'country': 'US', 'firstName': 'Aileen', 'gender': 'Male', 'phone': '414-618-9307', 'state': 'Ohio', 'userId': 3, 'email': 'adiggens2@chron.com'}}
{'id': 3, 'labels': ['User'], 'properties': {'lastName': 'Lyokhin', 'country': 'US', 'firstName': 'Milissent', 'gender': 'Female', 'phone': '289-923-3928', 'state': 'Virginia', 'userId': 4, 'email': 'mmacairt3@fastcompany.com'}}
{'id': 4, 'labels': ['User'], 'properties': {'country': 'US', 'lastName': 'Stanway', 'firstName': 'Kienan', 'gender'

In [11]:
# Tạo một dictionary để lưu các kết nối của mỗi node
connections = {}

# Khởi tạo các keys trong dictionary với mỗi node
for node in nodes:
    connections[node['id']] = []

# Thêm các kết nối vào dictionary
for rel in relationships:
    start = rel['start_id']
    end = rel['end_id']
    if start in connections:
        connections[start].append(end)
    else:
        connections[start] = [end]

# Tạo chuỗi mô tả đồ thị
graph_description = "G describes a graph among {}. In this graph:\n".format(
    ", ".join(str(node['id']) for node in nodes)
)

for node_id, connected_nodes in connections.items():
    if connected_nodes:
        graph_description += "Node {} is connected to nodes {}.\n".format(
            node_id, ", ".join(str(n) for n in connected_nodes)
        )
    else:
        graph_description += "Node {} is not connected to any nodes.\n".format(node_id)

print(graph_description)

G describes a graph among 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216,

In [13]:
# Function to format properties
def format_properties(properties):
    return ', '.join(f'{key}: {value}' for key, value in properties.items())

# Tạo một dictionary để lưu các kết nối của mỗi node
connections = {}

# Khởi tạo các keys trong dictionary với mỗi node
for node in nodes:
    connections[node['id']] = []

# Thêm các kết nối vào dictionary
for rel in relationships:
    start = rel['start_id']
    end = rel['end_id']
    connections[start].append((end, rel))

# Tạo chuỗi mô tả đồ thị
graph_description = "Encoded graph:\nG describes a graph among nodes: "
node_descriptions = []
for node in nodes:
    node_descriptions.append("{labels} {id} ({properties})".format(
        labels=' '.join(node['labels']),
        id=node['id'],
        properties=format_properties(node['properties'])
    ))
graph_description += "; ".join(node_descriptions) + ".\n"

graph_description += "In this graph:\n"
for node_id, connected_nodes in connections.items():
    for end_id, rel in connected_nodes:
        graph_description += "Node {labels} {start} is connected to node {end} with edge {type} ({properties}).\n".format(
            labels=' '.join([node['labels'][0] for node in nodes if node['id'] == node_id]),
            start=node_id,
            end=end_id,
            type=rel['type'],
            properties=format_properties(rel['properties'])
        )

print(graph_description)


Encoded graph:
G describes a graph among nodes: User 0 (lastName: Burbidge, country: US, firstName: Dorette, gender: Male, phone: 834-424-8856, state: Ohio, userId: 1, email: dburbidge0@japanpost.jp); User 1 (lastName: Mongenot, country: US, firstName: Farley, gender: Female, phone: 586-213-7447, state: Virginia, userId: 2, email: fmulryan1@adobe.com); User 2 (lastName: Diggens, country: US, firstName: Aileen, gender: Male, phone: 414-618-9307, state: Ohio, userId: 3, email: adiggens2@chron.com); User 3 (lastName: Lyokhin, country: US, firstName: Milissent, gender: Female, phone: 289-923-3928, state: Virginia, userId: 4, email: mmacairt3@fastcompany.com); User 4 (country: US, lastName: Stanway, firstName: Kienan, gender: Male, phone: 185-726-4318, state: Kentucky, userId: 5, email: kstanway4@lycos.com); User 5 (country: US, lastName: Schimonek, firstName: Brandie, gender: Male, phone: 574-326-7989, state: Kentucky, userId: 6, email: bscollick5@macromedia.com); User 6 (lastName: Peasema

In [14]:
len(graph_description)

238147

In [15]:
client = Groq(
    api_key="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",
)

In [29]:
def generator(encoded_graph, query, model, size):
    # Function to divide the text into smaller chunks
    def chunk_text(text, max_length):
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            if current_length + len(word) + 1 > max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(word)
            current_length += len(word) + 1

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    # Function to process each chunk
    def process_chunk(chunk):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": chunk,
                }
            ],
            model= model,
        )

        return chat_completion.choices[0].message.content

    # Split encoded_graph into chunks
    max_chunk_length = size  # Adjust this length based on your needs and model limits
    graph_chunks = chunk_text(encoded_graph, max_chunk_length)

    # Process each chunk of encoded_graph
    processed_graph_parts = [process_chunk(chunk) for chunk in graph_chunks]

    # Combine processed parts back into a single string
    processed_graph = ' '.join(processed_graph_parts)

    # Append query to the processed graph and process the final combined context
    context = processed_graph + "\n" + query
    final_result = process_chunk(context)

    return final_result

In [17]:
query = "What are the possible inconsistencies (nodes and egdes) that could be detected in the relationships and interactions within this graph?"

In [21]:
result_1 = generator(graph_description[0:100000], query)

In [22]:
print(result_1)

Based on the information you've provided, I assume you have a list of user details and a list of movie details, but no explicit relationship data between users and movies. However, I can still provide possible ways to detect inconsistencies in relationships and interactions using the data you provided and making some assumptions.

1. Detecting users with multiple emails: Some users might have more than one email address in the dataset. This could be considered a potential inconsistency. You can find such users by searching for duplicate email addresses associated with different user IDs. If you find any, you should double-check the accuracy of the data.

2. Movies with duplicate titles: You can check if there are any duplicate movie titles in the dataset. Though it's not very likely, there could still be cases where the same movie has different movie IDs or release years. If you find any, look for possible errors or inconsistencies in the data.

3. Movies with no associated users: You 

In [23]:
result_2 = generator(graph_description[100000:], query)

In [24]:
print(result_2)

Based on the provided information, here are some possible inconsistencies (nodes and edges) that could be detected in the relationships and interactions within this graph:

1. Orphan nodes: Nodes that do not have any connections to other nodes. In this case, these would be user nodes or movie nodes with no outgoing or incoming edges.
2. Self-loops: Edges that connect a node to itself. Although self-loops are possible, they may not be meaningful in this context.
3. Unidirectional "USES" edges: If the "USES" relationship is intended to be bidirectional, unidirectional edges would indicate an inconsistency, as a user would be using a node but the node is not being used by the user.
4. Duplicate edges: Multiple edges with the same edge type (WATCHED or USES) between the same pair of nodes. If the "watchCount" attribute is used consistently, duplicate edges could be resolved by merging them into a single edge with the sum of the "watchCount" values.
5. Mismatched edge labels and attributes:

In [26]:
result_mixtral = result_1 + result_2

prompt_clean_mixtral = "Clean theses rules: \n" + result_mixtral

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt_clean_mixtral,
        }
    ],
    model="mixtral-8x7b-32768",
)

print(chat_completion.choices[0].message.content)

Here are the cleaned and formatted rules for detecting inconsistencies in user-movie relationships:

1. Orphan nodes: Check for nodes (user or movie) without any connections. These nodes may indicate missing relationships or data inconsistencies.

2. Self-loops: Verify there are no self-loops in the graph, as they may not be meaningful in this context.

3. Unidirectional "USES" edges: If the "USES" relationship is intended to be bidirectional, ensure all edges are bidirectional. Unidirectional edges might indicate an inconsistency.

4. Duplicate edges: Identify and remove duplicate edges with the same edge type between the same pair of nodes. Merge their attributes if necessary.

5. Mismatched edge labels and attributes: Ensure consistent use of edge labels (WATCHED or USES) and the presence or absence of "watchCount" attributes.

6. Unconnected components: Check for separate groups of nodes not connected to each other. This might indicate missing relationships or data inconsistencies.

### LLAMA3

In [30]:
result_1 = generator(graph_description[:100000], query, "llama3-70b-8192", 8000)

In [31]:
print(result_1)

Given the structure of the graph, where each user is a node and the edges represent connections between users, some possible inconsistencies that could be detected in the relationships and interactions within this graph are:

**Node inconsistencies:**

1. ** Duplicate users**: Multiple nodes with the same user ID, indicating duplicate user entries.
2. **Inconsistent user information**: Discrepancies in user attributes, such as conflicting last names, first names, or email addresses for the same user ID.
3. **Invalid user data**: Nodes with missing or invalid information, like phone numbers or states that don't match the expected format.

**Edge inconsistencies:**

1. **Self-loops**: Edges that connect a user to themselves, which might indicate an error in the connection data.
2. **Duplicate edges**: Multiple edges between the same pair of users, which could suggest incorrect or redundant connection data.
3. **Unconnected nodes**: Nodes without any edges, indicating users with no connec

In [35]:
query_test = """
You are provided with a dataset representing a graph of nodes and relationships in Entity Resolution (ER) is the process 
of disambiguating data to determine if multiple digital records represent the same real-world entity such as a person, organization, place, or other type of object.

Your task is to analyze the structure and properties of the nodes in this dataset and 
identify a set of consistency rules (nodes and edges) that should be enforced to maintain data integrity and quality. 
These rules should cover aspects such as:

1. Uniqueness constraints
2. Valid property values and data types
3. Mandatory properties for specific node types
4. Allowed node labels or types
5. Any other domain-specific rules based on the data's context
6. Functional dependencies
7. Logical rules
8. Graph constraints, etc

For each consistency rule you identify, provide a clear description of the rule and 
the corresponding Cypher query that can be used to check for violations of that rule in the dataset.

Present your findings in a structured format, using markdown for headings, code blocks, and lists. 
Clearly separate and label each consistency rule and its associated Cypher query.

Additionally, provide a brief explanation of your approach to identifying these rules, 
considering the specific characteristics and requirements of the given graph data.

Your response should focus solely on generating consistency rules for the nodes and edge in the graph, find all rules. 
"""

In [38]:
result_test = generator(graph_description[:100000], query_test, "llama3-70b-8192", 8000)

In [34]:
print(result_test)

**Consistency Rules for Node Integrity**

### Uniqueness Constraints
#### 1. Unique User IDs

* Rule Description: Each user node must have a unique `userId` property.
* Cypher Query: `MATCH (u:User) RETURN COUNT(DISTINCT u.userId) AS count WHERE count < size((u))`

#### 2. Unique Email Addresses

* Rule Description: Each user node must have a unique `email` property.
* Cypher Query: `MATCH (u:User) RETURN COUNT(DISTINCT u.email) AS count WHERE count < size((u))`

### Valid Property Values and Data Types
#### 3. Gender Values

* Rule Description: The `gender` property of a user node must be either "Male" or "Female".
* Cypher Query: `MATCH (u:User) WHERE u.gender NOT IN ["Male", "Female"] RETURN u`

#### 4. Phone Number Format

* Rule Description: The `phone` property of a user node must be in the format `xxx-xxx-xxxx`.
* Cypher Query: `MATCH (u:User) WHERE NOT u.phone =~ "^\d{3}-\d{3}-\d{4}$" RETURN u`

#### 5. State Values

* Rule Description: The `state` property of a user node must 

In [39]:
print(result_test)

**Consistency Rules for Node and Edge Integrity**

### Uniqueness Constraints

#### Rule 1: Unique User IDs

Description: Each user node should have a unique `userId` property.

Cypher Query:
```cypher
MATCH (u:User)
WHERE u.userId IS NOT NULL
WITH u.userId AS userId
GROUP BY userId
HAVING COUNT(DISTINCT u) > 1
RETURN "Duplicate user IDs detected" AS message;
```

#### Rule 2: Unique Email Addresses

Description: Each user node should have a unique `email` property.

Cypher Query:
```cypher
MATCH (u:User)
WHERE u.email IS NOT NULL
WITH u.email AS email
GROUP BY email
HAVING COUNT(DISTINCT u) > 1
RETURN "Duplicate email addresses detected" AS message;
```

#### Rule 3: Unique Phone Numbers

Description: Each user node should have a unique `phone` property.

Cypher Query:
```cypher
MATCH (u:User)
WHERE u.phone IS NOT NULL
WITH u.phone AS phone
GROUP BY phone
HAVING COUNT(DISTINCT u) > 1
RETURN "Duplicate phone numbers detected" AS message;
```

### Valid Property Values and Data Types

#

In [40]:
import py2neo
from py2neo import Graph


def calculate_support(rule):
    # Execute the Cypher query to get the count of facts that satisfy the rule
    cypher_query = rule["cypher_query"]
    result = graph.run(cypher_query).data()
    return result[0]["count"]

def calculate_coverage(rule, support):
    # Extract the node label and property from the Cypher query
    node_label = None
    node_property = None
    for token in rule["cypher_query"].split():
        if token.startswith("MATCH"):
            node_label = token.split(":")[1]
        elif token.startswith("RETURN"):
            node_property = token.split(".")[1].split(" ")[0]
    
    # Calculate the coverage based on the specific rule
    if node_label and node_property:
        cypher_query = f"MATCH ({node_label}:{node_label}) RETURN COUNT(DISTINCT {node_label}.{node_property}) AS count"
    else:
        cypher_query = f"MATCH ({node_label}) RETURN COUNT(*) AS count"
    result = graph.run(cypher_query).data()
    return support / result[0]["count"]

def calculate_confidence(rule, support):
    # Extract the node label from the Cypher query
    node_label = None
    for token in rule["cypher_query"].split():
        if token.startswith("MATCH"):
            node_label = token.split(":")[1]
    
    # Calculate the confidence based on the specific rule
    if node_label:
        cypher_query = f"MATCH ({node_label}:{node_label}) RETURN COUNT(*) AS count"
    else:
        cypher_query = "MATCH (n) RETURN COUNT(*) AS count"
    result = graph.run(cypher_query).data()
    return support / result[0]["count"]

def rank_rules(rules):
    ranked_rules = []
    for rule in rules:
        support = calculate_support(rule)
        coverage = calculate_coverage(rule, support)
        confidence = calculate_confidence(rule, support)
        ranked_rules.append({
            "rule": rule["description"],
            "support": support,
            "coverage": coverage,
            "confidence": confidence
        })
    return ranked_rules

# Define the rules
rules = [
    {"description": "Unique User IDs", "cypher_query": "MATCH (u:User) RETURN COUNT(DISTINCT u.userId) AS count WHERE count < size((u))"},
    {"description": "Unique Email Addresses", "cypher_query": "MATCH (u:User) RETURN COUNT(DISTINCT u.email) AS count WHERE count < size((u))"},
    {"description": "Gender Values", "cypher_query": "MATCH (u:User) WHERE u.gender NOT IN ['Male', 'Female'] RETURN u"},
    {"description": "Phone Number Format", "cypher_query": "MATCH (u:User) WHERE NOT u.phone =~ '^\d{3}-\d{3}-\d{4}$' RETURN u"},
    {"description": "State Values", "cypher_query": "MATCH (u:User) WHERE NOT u.state IN ['AL', 'AK',..., 'WY'] RETURN u"},
    {"description": "User Node Properties", "cypher_query": "MATCH (u:User) WHERE NOT EXISTS(u.lastName) OR NOT EXISTS(u.firstName) OR NOT EXISTS(u.gender) OR NOT EXISTS(u.phone) OR NOT EXISTS(u.state) OR NOT EXISTS(u.email) RETURN u"},
    {"description": "Only User Nodes Allowed", "cypher_query": "MATCH (n) WHERE NOT n:User RETURN n"},
    {"description": "Valid Country Values", "cypher_query": "MATCH (u:User) WHERE u.country <> 'US' RETURN u"},
    {"description": "Email and Phone Uniqueness", "cypher_query": "MATCH (u1:User), (u2:User) WHERE u1.email = u2.email AND u1.phone <> u2.phone RETURN u1, u2"},
    {"description": "Consistent User Information", "cypher_query": "MATCH (u:User) WHERE NOT (u.lastName IS NOT NULL AND u.firstName IS NOT NULL AND u.gender IS NOT NULL AND u.phone IS NOT NULL AND u.state IS NOT NULL AND u.email IS NOT NULL AND u.country"}
]

# Rank the rules
ranked_rules = rank_rules(rules)

# Print the ranked rules
for rule in ranked_rules:
    print(f"Rule: {rule['rule']}, Support: {rule['support']}, Coverage: {rule['coverage']:.2f}, Confidence: {rule['confidence']:.2f}")

NameError: name 'graph' is not defined

In [47]:
import networkx as nx

# Tạo đồ thị
G = nx.DiGraph()  # Sử dụng đồ thị có hướng

# Thêm các cạnh (mối quan hệ)
G.add_edge('A', 'B')
G.add_edge('B', 'C')
G.add_edge('C', 'D')
G.add_edge('A', 'E')
G.add_edge('E', 'F')
G.add_edge('E', 'H')
G.add_edge('E', 'A')

# Hàm để tìm tất cả các kết nối từ một node sử dụng DFS
def find_all_connections(graph, start_node):
    visited = set()
    connections = []

    def dfs(node):
        for neighbor in graph.neighbors(node):
            if neighbor not in visited:
                visited.add(neighbor)
                connections.append((node, neighbor))
                dfs(neighbor)

    visited.add(start_node)
    dfs(start_node)
    return connections

# Tìm tất cả các kết nối từ node 'A'
all_connections = find_all_connections(G, 'E')

# Hiển thị các kết nối
for start, end in all_connections:
    print(f"Node {start} is connected to node {end}")


Node E is connected to node F
Node E is connected to node H
Node E is connected to node A
Node A is connected to node B
Node B is connected to node C
Node C is connected to node D


In [48]:
import networkx as nx

# Tạo đồ thị có hướng
G = nx.DiGraph()

# Thêm các nút với thuộc tính
G.add_node('A', labels=['Person'], properties={'name': 'Alice', 'age': 30})
G.add_node('B', labels=['Person'], properties={'name': 'Bob', 'age': 25})
G.add_node('C', labels=['Person'], properties={'name': 'Carol', 'age': 28})
G.add_node('D', labels=['Person'], properties={'name': 'Dave', 'age': 35})
G.add_node('E', labels=['Person'], properties={'name': 'Eve', 'age': 22})
G.add_node('F', labels=['Person'], properties={'name': 'Frank', 'age': 40})

# Thêm các cạnh với thuộc tính
G.add_edge('A', 'B', type='KNOWS', properties={'since': 2010})
G.add_edge('B', 'C', type='KNOWS', properties={'since': 2012})
G.add_edge('C', 'D', type='KNOWS', properties={'since': 2015})
G.add_edge('A', 'E', type='KNOWS', properties={'since': 2013})
G.add_edge('E', 'F', type='KNOWS', properties={'since': 2018})

# Hàm để tìm tất cả các đường đi từ một node sử dụng DFS và lưu trữ đường đi dưới dạng danh sách các câu văn
def find_all_paths(graph, start_node):
    paths = []

    def dfs(node, path):
        for neighbor in graph.neighbors(node):
            if neighbor not in path:  # Tránh chu trình
                new_path = path + [neighbor]
                paths.append(new_path)
                dfs(neighbor, new_path)

    dfs(start_node, [start_node])
    return paths

# Tìm tất cả các đường đi từ node 'A'
all_paths = find_all_paths(G, 'A')

# Chuyển đổi các đường đi thành các câu văn
sentences = []
for path in all_paths:
    for i in range(len(path) - 1):
        start = path[i]
        end = path[i + 1]
        edge_data = G.get_edge_data(start, end)
        start_data = G.nodes[start]
        end_data = G.nodes[end]
        start_info = f"Node {start} ({', '.join(f'{k}: {v}' for k, v in start_data['properties'].items())})"
        end_info = f"Node {end} ({', '.join(f'{k}: {v}' for k, v in end_data['properties'].items())})"
        edge_info = f"{edge_data['type']} ({', '.join(f'{k}: {v}' for k, v in edge_data['properties'].items())})"
        sentences.append(f"{start_info} is connected to {end_info} with edge {edge_info}")

# Hiển thị các câu văn
for sentence in sentences:
    print(sentence)


Node A (name: Alice, age: 30) is connected to Node B (name: Bob, age: 25) with edge KNOWS (since: 2010)
Node A (name: Alice, age: 30) is connected to Node B (name: Bob, age: 25) with edge KNOWS (since: 2010)
Node B (name: Bob, age: 25) is connected to Node C (name: Carol, age: 28) with edge KNOWS (since: 2012)
Node A (name: Alice, age: 30) is connected to Node B (name: Bob, age: 25) with edge KNOWS (since: 2010)
Node B (name: Bob, age: 25) is connected to Node C (name: Carol, age: 28) with edge KNOWS (since: 2012)
Node C (name: Carol, age: 28) is connected to Node D (name: Dave, age: 35) with edge KNOWS (since: 2015)
Node A (name: Alice, age: 30) is connected to Node E (name: Eve, age: 22) with edge KNOWS (since: 2013)
Node A (name: Alice, age: 30) is connected to Node E (name: Eve, age: 22) with edge KNOWS (since: 2013)
Node E (name: Eve, age: 22) is connected to Node F (name: Frank, age: 40) with edge KNOWS (since: 2018)


In [49]:
import networkx as nx

# Tạo đồ thị có hướng
G = nx.DiGraph()

# Thêm các nút với thuộc tính và nhãn
G.add_node('A', labels=['Person'], properties={'name': 'Alice', 'age': 30})
G.add_node('B', labels=['Person'], properties={'name': 'Bob', 'age': 25})
G.add_node('C', labels=['Person'], properties={'name': 'Carol', 'age': 28})
G.add_node('D', labels=['Person'], properties={'name': 'Dave', 'age': 35})
G.add_node('E', labels=['Company'], properties={'name': 'Acme Corp', 'industry': 'Tech'})
G.add_node('F', labels=['Company'], properties={'name': 'ABC Corp', 'industry': 'Media'})

# Thêm các cạnh với thuộc tính
G.add_edge('A', 'B', type='KNOWS', properties={'since': 2010})
G.add_edge('B', 'C', type='KNOWS', properties={'since': 2012})
G.add_edge('C', 'D', type='KNOWS', properties={'since': 2015})
G.add_edge('A', 'E', type='WORKS_AT', properties={'role': 'Engineer'})
G.add_edge('E', 'F', type='PARTNER', properties={'since': 2018})

# Hàm để tìm tất cả các đường đi từ một node sử dụng DFS và lọc theo nhãn
def find_all_paths_by_label(graph, start_node, target_label):
    paths = []

    def dfs(node, path):
        for neighbor in graph.neighbors(node):
            if neighbor not in path:  # Tránh chu trình
                if target_label in graph.nodes[neighbor]['labels']:
                    new_path = path + [neighbor]
                    paths.append(new_path)
                dfs(neighbor, path + [neighbor])

    dfs(start_node, [start_node])
    return paths

# Tìm tất cả các đường đi từ node 'A' đến các node có nhãn 'Person'
all_paths = find_all_paths_by_label(G, 'A', 'Person')

# Chuyển đổi các đường đi thành các câu văn
sentences = []
for path in all_paths:
    for i in range(len(path) - 1):
        start = path[i]
        end = path[i + 1]
        edge_data = G.get_edge_data(start, end)
        start_data = G.nodes[start]
        end_data = G.nodes[end]
        start_info = f"Node {start} ({', '.join(f'{k}: {v}' for k, v in start_data['properties'].items())})"
        end_info = f"Node {end} ({', '.join(f'{k}: {v}' for k, v in end_data['properties'].items())})"
        edge_info = f"{edge_data['type']} ({', '.join(f'{k}: {v}' for k, v in edge_data['properties'].items())})"
        sentences.append(f"{start_info} is connected to {end_info} with edge {edge_info}")

# Hiển thị các câu văn
for sentence in sentences:
    print(sentence)


Node A (name: Alice, age: 30) is connected to Node B (name: Bob, age: 25) with edge KNOWS (since: 2010)
Node A (name: Alice, age: 30) is connected to Node B (name: Bob, age: 25) with edge KNOWS (since: 2010)
Node B (name: Bob, age: 25) is connected to Node C (name: Carol, age: 28) with edge KNOWS (since: 2012)
Node A (name: Alice, age: 30) is connected to Node B (name: Bob, age: 25) with edge KNOWS (since: 2010)
Node B (name: Bob, age: 25) is connected to Node C (name: Carol, age: 28) with edge KNOWS (since: 2012)
Node C (name: Carol, age: 28) is connected to Node D (name: Dave, age: 35) with edge KNOWS (since: 2015)


In [55]:
from neo4j import GraphDatabase

# Hàm để kết nối đến cơ sở dữ liệu Neo4j
def connect_to_neo4j(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    return driver

# Hàm để truy vấn các đường đi từ một node dựa trên nhãn
def find_all_paths_by_label(driver, start_node_id, target_label):
    query = """
    MATCH p=(start)-[*]->(end)
    WHERE id(start) = $start_node_id AND $target_label IN labels(end)
    RETURN p
    """
    paths = []
    with driver.session() as session:
        result = session.run(query, start_node_id=start_node_id, target_label=target_label)
        for record in result:
            paths.append(record["p"])
    return paths

# Hàm để chuyển đổi các đường đi thành các câu văn
def format_paths(paths):
    print("Hello")
    sentences = []
    for path in paths:
        for i in range(len(path.nodes) - 1):
            start = path.nodes[i]
            end = path.nodes[i + 1]
            relationship = path.relationships[i]

            start_info = f"Node {start.id} ({', '.join(f'{k}: {v}' for k, v in start.items())})"
            end_info = f"Node {end.id} ({', '.join(f'{k}: {v}' for k, v in end.items())})"
            edge_info = f"{relationship.type} ({', '.join(f'{k}: {v}' for k, v in relationship.items())})"
            sentences.append(f"{start_info} is connected to {end_info} with edge {edge_info}")
    return sentences

# Kết nối đến Neo4j
uri = "bolt://localhost:7687"
user = "neo4j"
password = "123456789"

driver = connect_to_neo4j(uri, user, password)

# Tìm tất cả các đường đi từ node có ID 0 đến các node có nhãn 'Person'
start_node_id = 0
target_label = 'Movie'
paths = find_all_paths_by_label(driver, start_node_id, target_label)

# Định dạng các đường đi thành các câu văn
sentences = format_paths(paths)

# Hiển thị các câu văn
for sentence in sentences:
    print(sentence)

# Đóng kết nối
driver.close()


Hello
Node 0 (lastName: Burbidge, country: US, firstName: Dorette, gender: Male, phone: 834-424-8856, state: Ohio, userId: 1, email: dburbidge0@japanpost.jp) is connected to Node 1095 (year: 2021, movieId: 182, title: Gosuto masuta) with edge WATCHED (watchCount: 1)
Node 0 (lastName: Burbidge, country: US, firstName: Dorette, gender: Male, phone: 834-424-8856, state: Ohio, userId: 1, email: dburbidge0@japanpost.jp) is connected to Node 1131 (year: 2021, movieId: 218, title: The Nowhere Inn) with edge WATCHED (watchCount: 3)


  start_info = f"Node {start.id} ({', '.join(f'{k}: {v}' for k, v in start.items())})"
  end_info = f"Node {end.id} ({', '.join(f'{k}: {v}' for k, v in end.items())})"
