In [1]:
from neo4j import GraphDatabase
import json
import networkx as nx

# Neo4j connection details
uri = "bolt://localhost:7687"  
user = "neo4j"                 
password = "123456789"          

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(user, password))

# Function to run a query and return the results
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        return [record.data() for record in result]


# Function to extract nodes and relationships with labels and properties
def extract_graph_data():
    nodes_query = """
    MATCH (n) 
    RETURN id(n) as id, labels(n) as labels, properties(n) as properties
    """
    relationships_query = """
    MATCH ()-[r]->() 
    RETURN id(r) as id, type(r) as type, properties(r) as properties, 
           id(startNode(r)) as start_id, id(endNode(r)) as end_id
    """

    nodes = run_query(nodes_query)
    relationships = run_query(relationships_query)
    
    return nodes, relationships

# Function to save data to JSON file
def save_to_json(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# Main function to extract data and save it as JSON
def main():
    nodes, relationships = extract_graph_data()
    graph_data = {
        "nodes": nodes,
        "relationships": relationships
    }
    #save_to_json(graph_data, 'graph_data_1.json')
    print("Data extraction complete. JSON file saved as graph_data.json")

In [2]:
nodes, relationships = extract_graph_data()
graph_data = {
    "nodes": nodes,
    "relationships": relationships
}

In [3]:
for item in graph_data["relationships"]: 
    print(item)

{'id': 3339, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 211}
{'id': 3761, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 633}
{'id': 3828, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 700}
{'id': 3961, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 833}
{'id': 3842, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 714}
{'id': 3785, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 657}
{'id': 3750, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 622}
{'id': 3882, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 754}
{'id': 3754, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 626}
{'id': 3927, 'type': 'GENERIC_ALL', 'properties': {'isacl': True}, 'start_id': 0, 'end_id': 799}
{'id': 3942, 'type': 'GENERIC_

In [4]:
class Neo4jDatabase:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def get_node_info(self, label):
        with self.driver.session() as session:
            result = session.read_transaction(self._get_node_info, label)
            return result

    @staticmethod
    def _get_node_info(tx, label):
        query = f"""
        MATCH (d:{label})-[r]-(n)
        RETURN d, r, n
        """
        result = tx.run(query)
        nodes = []
        for record in result:
            node_info = {
                'domain_node': record['d'],
                'relationship': record['r'],
                'connected_node': record['n']
            }
            nodes.append(node_info)
        return nodes


### Get information related to nodes with a specific label to process¶

In [13]:
def get_nodes_with_label(label):
    nodes, relationships = extract_graph_data()
    result = []

    # Find all nodes with the given label
    nodes_with_label = [node for node in nodes if label in node['labels']]
    node_dict = {node['id']: node for node in nodes}  # Dictionary for fast node lookup by ID

    for node in nodes_with_label:
        node_id = node['id']
        node_label = node['labels'][0]
        node_props = node['properties']
        
        # Format node properties
        node_info = f"Node [{node_label}] {node_id}:\n"
        for prop, value in node_props.items():
            node_info += f" - {prop}: {value}\n"
        
        # Filter relationships involving this node
        related_relationships = [rel for rel in relationships if rel['start_id'] == node_id or rel['end_id'] == node_id]
        for rel in related_relationships:
            connected_node_id = rel['end_id'] if rel['start_id'] == node_id else rel['start_id']
            connected_node = node_dict.get(connected_node_id)
            if connected_node:
                connected_node_label = connected_node['labels'][0]
                roles = rel['properties']
                node_info += f"Node [{node_label}] {node_id} is connected to node  [{connected_node_label}] {connected_node_id}  with edge {rel['type']} (roles: {roles}).\n"
        
        result.append(node_info.strip())
    
    return "\n".join(result)


### Split the data into chunks with a length limit

In [15]:
# Function to split the data into chunks with a length limit
def split_data_into_chunks(details, max_length, label_node):
    nodes = details.split(f"Node [{label_node}]")
    nodes = [f"Node [{label_node}]" + node for node in nodes if node]  # Reconstruct each node detail

    chunks = []
    current_chunk = ""
    
    for node in nodes:
        if len(current_chunk) + len(node) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += node + "\n"
    
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


#### Generate rules

In [17]:
import os

from groq import Groq

client = Groq(
    api_key= "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",
)

def generator(query, context, model): 
    chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": context + "\n" + query,
        }
    ],
    model= model,
    )

    return chat_completion.choices[0].message.content

### Perform 

In [27]:
def _perform_function(max_length, label_node, model):
    contents = get_nodes_with_label(label_node)
    chunks = split_data_into_chunks(contents, max_length, label_node)
    rules = []
    for chunk in chunks: 
        prompt =  f"""From this data, learn patterns and infer consistency rules related node {label_node}. 
        The goal is to ensure that the graph data is accurate, coherent, and follows the expected schema. 
        For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query that can be used to check for violations of that rule in the dataset """
        rule = generator(chunk, prompt, model)
        rules.append(rule)
    return rules

### Domain

In [28]:
rules_domain = _perform_function(8100, "Domain", "llama3-70b-8192")

In [29]:
for rule in rules_domain: 
    print(rule)

Based on the provided data, I identified the following patterns and consistency rules related to the `Domain` node:

**Rule 1: Domain name and domain property consistency**
The `name` property of a `Domain` node should match the `domain` property.

Cypher query to check for violations:
```
MATCH (d:Domain) WHERE d.name <> d.domain RETURN d
```
**Rule 2: High-value domains have specific relationships**
A `Domain` node with `highvalue` set to `True` should have relationships with `Group` nodes with specific edge types (e.g., `WRITE_OWNER`, `WRITE_DACL`, `DC_SYNC`, etc.).

Cypher query to check for violations:
```
MATCH (d:Domain {highvalue: true}) WHERE NOT (d)-[:WRITE_OWNER|WRITE_DACL|DC_SYNC]->(:Group) RETURN d
```
**Rule 3: Domain-Group relationships have consistent roles**
The roles associated with the relationships between a `Domain` node and a `Group` node should be consistent (e.g., `isacl` is always `True`).

Cypher query to check for violations:
```
MATCH (d:Domain)-[r]->(g:Grou

### GPO

In [31]:
rules_gpo = _perform_function(8100, "GPO", "llama3-70b-8192")

In [32]:
for rule in rules_gpo: 
    print(rule)

Based on the provided data, I've identified some patterns and consistency rules related to node GPO. Here are the rules and the corresponding Cypher queries to check for violations:

**Rule 1: Unique `objectid` property**

* Description: Each GPO node should have a unique `objectid` property.
* Cypher query: `MATCH (gpo:GPO) WITH gpo.objectid as oid MATCH (gpo2:GPO) WHERE gpo2.objectid = oid AND ID(gpo) <> ID(gpo2) RETURN COUNT(*)`

**Rule 2: Consistent `domain` property**

* Description: All GPO nodes should have the same `domain` property value, which is `TestCompany.Local` in this case.
* Cypher query: `MATCH (gpo:GPO) WHERE gpo.domain <> "TestCompany.Local" RETURN COUNT(*)`

**Rule 3: Correct `name` property format**

* Description: The `name` property of each GPO node should be in the format `GPO_<number>@TestCompany.Local`.
* Cypher query: `MATCH (gpo:GPO) WHERE NOT gpo.name =~ "GPO_[0-9]+@TestCompany.Local" RETURN COUNT(*)`

**Rule 4: GP_LINK edge consistency**

* Description: G

### OU

In [33]:
rules_ou = _perform_function(8100, "OU", "llama3-70b-8192")

In [34]:
for rule in rules_ou: 
    print(rule)

After analyzing the provided data, I've identified some patterns and inferred consistency rules related to node OU. Here are the rules I've found, along with their corresponding Cypher queries to check for violations:

**Rule 1: An OU node must have a `domain` property.**

Cypher query:
```
MATCH (ou:OU) WHERE NOT exists(ou.domain) RETURN ou
```
**Rule 2: An OU node must have a `name` property.**

Cypher query:
```
MATCH (ou:OU) WHERE NOT exists(ou.name) RETURN ou
```
**Rule 3: An OU node must have an `objectId` property.**

Cypher query:
```
MATCH (ou:OU) WHERE NOT exists(ou.objectId) RETURN ou
```
**Rule 4: An OU node must be connected to a Domain node with a CONTAINS edge.**

Cypher query:
```
MATCH (ou:OU) WHERE NOT exists(ou)-[:CONTAINS]-(d:Domain) RETURN ou
```
**Rule 5: An OU node can be connected to multiple Computer nodes with CONTAINS edges.**

This rule is already evident from the data, as multiple Computer nodes are connected to each OU node.

**Rule 6: An OU node can be co

### Group

In [35]:
rules_group = _perform_function(8100, "Group", "llama3-70b-8192")

In [36]:
for rule in rules_group: 
    print(rule)

Based on the provided data, I've identified some patterns and consistency rules related to the `Group` node:

**Rule 1: Existence of `highvalue` property**

* Description: Every `Group` node should have a `highvalue` property.
* Cypher query to check for violations:
```cypher
MATCH (g:Group) WHERE NOT EXISTS(g.highvalue) RETURN g
```

**Rule 2: Uniqueness of `neo4jImportId`**

* Description: Every `Group` node should have a unique `neo4jImportId` property.
* Cypher query to check for violations:
```cypher
MATCH (g1:Group), (g2:Group) WHERE g1.neo4jImportId = g2.neo4jImportId AND g1 <> g2 RETURN g1, g2
```

**Rule 3: Consistency of `domain` property**

* Description: Every `Group` node should have a `domain` property with a specific format (e.g., `TestCompany.Local`).
* Cypher query to check for violations:
```cypher
MATCH (g:Group) WHERE NOT g.domain =~ '.*\\..*' RETURN g
```

**Rule 4: Consistency of `name` property**

* Description: Every `Group` node should have a `name` property wi

### Computer

In [37]:
rules_computer = _perform_function(8100, "Computer", "llama3-70b-8192")

In [38]:
for rule in rules_computer: 
    print(rule)


After analyzing the provided data, I've identified some patterns and inferred consistency rules related to the node `Computer`. For each rule, I'll provide a clear description and a corresponding Cypher query to check for violations.

**Rule 1: All Computers have a `domain` property**
Description: Every `Computer` node should have a `domain` property.
Cypher query:
```
MATCH (c:Computer) WHERE NOT EXISTS(c.domain) RETURN c
```
**Rule 2: All Computers have a `operatingsystem` property**
Description: Every `Computer` node should have an `operatingsystem` property.
Cypher query:
```
MATCH (c:Computer) WHERE NOT EXISTS(c.operatingsystem) RETURN c
```
**Rule 3: All Computers have a `name` property**
Description: Every `Computer` node should have a `name` property.
Cypher query:
```
MATCH (c:Computer) WHERE NOT EXISTS(c.name) RETURN c
```
**Rule 4: All Computers have an `objectid` property**
Description: Every `Computer` node should have an `objectid` property.
Cypher query:
```
MATCH (c:Com

### User

In [39]:
rules_user = _perform_function(8100, "User", "llama3-70b-8192")

In [40]:
for rule in rules_user: 
    print(rule)

Based on the provided data, I have identified the following patterns and consistency rules related to the `User` node:

**Consistency Rule 1:** Each `User` node has a unique `neo4jImportId` property.

Cypher Query to check for violations:
```
MATCH (u:User) WHERE u.neo4jImportId IS NOT UNIQUE
RETURN COUNT(*) AS num_violations
```
**Consistency Rule 2:** Each `User` node has a `displayname` property that is not empty.

Cypher Query to check for violations:
```
MATCH (u:User) WHERE u.displayname IS NULL OR u.displayname = ''
RETURN COUNT(*) AS num_violations
```
**Consistency Rule 3:** Each `User` node has a `domain` property with a value of "TestCompany.Local".

Cypher Query to check for violations:
```
MATCH (u:User) WHERE u.domain <> 'TestCompany.Local'
RETURN COUNT(*) AS num_violations
```
**Consistency Rule 4:** Each `User` node has a `name` property that follows the format "FirstnameLastnameXX@TestCompany.Local", where "XX" is a numeric suffix.

Cypher Query to check for violations

In [52]:
rule_prompt = ""
list_rules = [rules_user, rules_computer, rules_ou, rules_group, rules_gpo, rules_domain]
for rules in list_rules: 
    for rule in rules: 
        rule_prompt += " " + rule

In [53]:
def generator_clean(encoded_graph, query, model, size):
    # Function to divide the text into smaller chunks
    def chunk_text(text, max_length):
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            if current_length + len(word) + 1 > max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(word)
            current_length += len(word) + 1

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    # Function to process each chunk
    def process_chunk(chunk):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": chunk,
                }
            ],
            model= model,
        )

        return chat_completion.choices[0].message.content

    # Split encoded_graph into chunks
    max_chunk_length = size  # Adjust this length based on your needs and model limits
    graph_chunks = chunk_text(encoded_graph, max_chunk_length)

    # Process each chunk of encoded_graph
    processed_graph_parts = [process_chunk(chunk) for chunk in graph_chunks]

    # Combine processed parts back into a single string
    processed_graph = ' '.join(processed_graph_parts)

    # Append query to the processed graph and process the final combined context
    context = processed_graph + "\n" + query
    final_result = process_chunk(context)

    return final_result

In [54]:
query = "Combine and clean these rules"

In [56]:
result_1 = generator_clean(rule_prompt[:100000], query, "llama3-70b-8192", 8129)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01hyvd04xpe3vtmpx59tbpw3cf` on tokens per minute (TPM): Limit 6000, Used 6104, Requested 8161. Please try again in 1m22.654s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [57]:
result_2 = generator_clean(rule_prompt[100000:200000], query, "llama3-70b-8192", 8129)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01hyvd04xpe3vtmpx59tbpw3cf` on tokens per minute (TPM): Limit 6000, Used 6202, Requested 6672. Please try again in 1m8.749s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [None]:
result_3 = generator_clean(rule_prompt[200000:], query, "llama3-70b-8192", 8129)

In [None]:
rs_llama = result_1 + "\n" + result_1 + "\n" + result_1
rs_llama_cleaned = generator_clean(rs_llama, query, "llama3-70b-8192", 8129)