In [1]:
!pip install groq
!pip install faiss-cpu
!pip install gpt4all
!pip install langchain_community
!pip install langchain_groq
!pip install langchain
!pip install sentence-transformers
!pip install langchain_openai
!pip install langchain-community

Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading groq-0.9.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)


In [2]:
import json
import networkx as nx
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.embeddings import GPT4AllEmbeddings
import openai
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
import time
# embeddings using langchain
from langchain.embeddings import SentenceTransformerEmbeddings

## Reading graph database

In [3]:
file_path = 'cybersecurity-json-data.json'
with open(file_path, 'r') as file:
    data = [json.loads(line) for line in file]

## Encoding graph

In [4]:
# Create a graph using NetworkX
graph = nx.Graph()

# Adding nodes and edges into graph
for item in data:
    if item['type'] == 'node':
        node_id = item['id']
        node_labels = item['labels']
        node_properties = item['properties']
        graph.add_node(node_id, labels=node_labels, **node_properties)
    elif item['type'] == 'relationship':
        source = item['start']['id']
        target = item['end']['id']
        edge_properties = item.get('properties', {})
        graph.add_edge(source, target, id=item['id'], label=item['label'], **edge_properties)

In [5]:
# The function creates a string describing the node and its attributes
def create_node_string(graph):
    node_descriptions = []
    for node, props in graph.nodes(data=True):
        labels = props.pop('labels', [])
        labels_str = ', '.join(labels)
        prop_desc = ', '.join([f"{key}: {value}" for key, value in props.items()])
        node_descriptions.append(f"{node} [{labels_str}] ({prop_desc})")
    return ', '.join(node_descriptions)


In [6]:
# Encoding graph function
def encode_graph_incident(graph):
    nodes_string = create_node_string(graph)
    output = "G describes a graph among nodes: \n%s.\n" % nodes_string
    if graph.edges():
        output += "In this graph:\n"
    for source_node in graph.nodes():
        target_nodes = list(graph.neighbors(source_node))
        target_nodes_str = ""
        nedges = 0
        for target_node in target_nodes:
            edge_props = graph.get_edge_data(source_node, target_node)
            edge_props_str = ', '.join([f"{key}: {value}" for key, value in edge_props.items()])
            target_nodes_str += f"{target_node} ({edge_props_str}), "
            nedges += 1
        if nedges > 1:
            output += "Node %s is connected to nodes %s.\n" % (
                source_node,
                target_nodes_str[:-2],
            )
        elif nedges == 1:
            output += "Node %s is connected to node %s.\n" % (
                source_node,
                target_nodes_str[:-2],
            )
    return output

# Encoding graph function
encoded_graph_incident = encode_graph_incident(graph)

In [7]:
print(encoded_graph_incident)

G describes a graph among nodes: 
0 [Group, HighValue] (highvalue: True, neo4jImportId: 0, domain: TestCompany.Local, name: DOMAIN ADMINS@TestCompany.Local, objectid: S-1-5-21-883232822-274137685-4173207997-512), 1 [Group] (neo4jImportId: 1, domain: TestCompany.Local, name: DOMAIN COMPUTERS@TestCompany.Local, objectid: S-1-5-21-883232822-274137685-4173207997-515), 2 [Group] (neo4jImportId: 2, domain: TestCompany.Local, name: DOMAIN USERS@TestCompany.Local, objectid: S-1-5-21-883232822-274137685-4173207997-513), 3 [Group, HighValue] (highvalue: True, neo4jImportId: 3, domain: TestCompany.Local, name: DOMAIN CONTROLLERS@TestCompany.Local, objectid: S-1-5-21-883232822-274137685-4173207997-516), 4 [Group, HighValue] (highvalue: True, neo4jImportId: 4, domain: TestCompany.Local, name: ENTERPRISE DOMAIN CONTROLLERS@TestCompany.Local, objectid: TestCompany.Local-S-1-5-9), 5 [Group] (neo4jImportId: 5, domain: TestCompany.Local, name: ENTERPRISE READ-ONLY DOMAIN CONTROLLERS@TestCompany.Local, o

## RAG

### Embedding

In [8]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
embedding_model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
  model_name = embedding_model_name,
  gpt4all_kwargs = gpt4all_kwargs
)

Downloading: 100%|██████████| 45.9M/45.9M [00:00<00:00, 66.3MiB/s]
Verifying: 100%|██████████| 45.9M/45.9M [00:00<00:00, 368MiB/s]


### Vector db

In [10]:
vector_db_path = "vectorstores/db_faiss"

def create_db_from_graph(encoded_graph):

    # Split graph data into smail text
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=8192,
        chunk_overlap=100,
        length_function=len

    )

    chunks = text_splitter.split_text(encoded_graph)


    #  Faiss Vector DB
    db = FAISS.from_texts(texts=chunks, embedding=embeddings)
    db.save_local(vector_db_path)
    return db


In [11]:
db_test = create_db_from_graph(encoded_graph_incident)



### LLMs

In [12]:
vector_db_path = "vectorstores/db_faiss"

# Load LLM
def load_llm(model_file):
    llm = CTransformers(
        model=model_file,
        model_type="llama",
        max_new_tokens=8192,
        temperature=0.01
    )
    return llm



### Create prompt template

In [13]:
def creat_prompt(template):
    prompt = PromptTemplate(template = template, input_variables=["context", "question"])
    return prompt

### Create chain

In [14]:
# Create chain
def create_chain(prompt, llm, db):
    llm_chain = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type= "stuff",
        retriever = db.as_retriever(search_kwargs = {"k":3}, max_tokens_limit=8192),
        return_source_documents = False,
        chain_type_kwargs= {'prompt': prompt}

    )
    return llm_chain

### Reading vector db

In [15]:
def read_vectors_db():
    # Embeding
    db = FAISS.load_local(vector_db_path,
                          embedding_model_name,
                          allow_dangerous_deserialization=True)
    return db

## Start

In [16]:
# start
db = read_vectors_db()

GROQ_API_KEY = "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"
llm_llama = ChatGroq(groq_api_key=GROQ_API_KEY, model="llama3-70b-8192")
llm_mixtral = ChatGroq(groq_api_key=GROQ_API_KEY, model="mixtral-8x7b-32768")



### Prompting template

In [17]:
template = """<|im_start|>system\nUse the following information
                          to answer the question. Please exactly
                          \n
    {context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""
prompt = creat_prompt(template)


### LLM chain

In [18]:
llm_chain_llama  = create_chain(prompt, llm_llama, db_test)
llm_chain_mixtral = create_chain(prompt, llm_mixtral, db_test)

## Zero-shot promting

In [19]:
question_zero =  """
Based on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).
Consider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent
and accurate data.

For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking
"""

### LLAMA 3

In [25]:
responses_llama_zero = llm_chain_llama.invoke({"query": question_zero})
for response in str(responses_llama_zero).split("\n"):
  print(response)

{'query': '\nBased on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).\nConsider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent \nand accurate data.\n\nFor each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking\n', 'result': "Based on the provided graph structure and node information, I've identified several consistency rules to maintain accurate and consistent data. Here are the rules, along with their descriptions and corresponding Cypher queries:\n\n**Rule 1: Unique GENERIC_ALL relationships for each node**\n\nDescription: Each node should have at most one GENERIC_ALL relationship with node 0.\n\nCypher Query: `MATCH (n) WHERE NOT (n)-[:GENERIC_ALL]->() OR (n)-[:GENERIC_ALL]->(:Node {id: 0}) RETURN COUNT(DISTINCT n)`\n\n**Rule 2: MEMBER_OF relationships

### Rule 1: Unique GENERIC_ALL relationships for each node
**Description: Each node should have at most one GENERIC_ALL relationship with node**

- **Support: 1**

      MATCH (n)-[:GENERIC_ALL]->(m)
      WITH n, COUNT(m) AS count
      WHERE count = 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 1/901**
      // Count the number of valid entities that satisfy the rule
      MATCH (n)-[:GENERIC_ALL]->(m)
      WITH n, COUNT(m) AS count
      WHERE count = 1
      WITH COUNT(DISTINCT n) AS validFacts

      // Count the number of relationship GENERIC_ALL
      MATCH ()-[:GENERIC_ALL]->()
      WITH validFacts, COUNT(*) AS totalFacts

      // Compute coverage
      RETURN validFacts, totalFacts, (validFacts * 1.0 / totalFacts) AS coverage

  - **Confidence: 1/2**
      // Count the number of valid entities that satisfy the rule
      MATCH (n)-[:GENERIC_ALL]->(m)
      WITH n, COUNT(m) AS count
      WHERE count <= 1
      WITH COUNT(DISTINCT n) AS validFacts

      // Count the number of relationship GENERIC_ALL
      MATCH (n)-[:GENERIC_ALL]->()
      WITH validFacts, COUNT(DISTINCT n) AS totalFacts

      // Compute confidence
      RETURN validFacts, totalFacts, (validFacts * 1.0 / totalFacts) AS confidence



### Rule 2: MEMBER_OF relationships are symmetric
**Description: If node A has a MEMBER_OF relationship with node B, then node B should also have a MEMBER_OF relationship with node A.**

- **Support: 2**

      MATCH (a)-[:MEMBER_OF]->(b)
      WHERE (b)-[:MEMBER_OF]->(a)
      WITH COUNT(DISTINCT a) AS support
      RETURN support



- **Coverage: 2/1361**

      // Tính tổng số mối quan hệ MEMBER_OF
      MATCH ()-[:MEMBER_OF]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ MEMBER_OF mà quy tắc đối xứng được áp dụng
      MATCH (a)-[:MEMBER_OF]->(b)
      WHERE (b)-[:MEMBER_OF]->(a)
      WITH totalFacts, COUNT(*) AS validFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage

- **Confidence: 2/1361**
      // Tính tổng số mối quan hệ MEMBER_OF
      MATCH ()-[:MEMBER_OF]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ MEMBER_OF mà quy tắc đối xứng được áp dụng
      MATCH (a)-[:MEMBER_OF]->(b)
      WHERE (b)-[:MEMBER_OF]->(a)
      WITH totalFacts, COUNT(*) AS validFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage


### Rule 3: ADMIN_TO relationships are transitive
**Description: If node A has an ADMIN_TO relationship with node B, and node B has an ADMIN_TO relationship with node C, then node A should also have an ADMIN_TO relationship with node C**

- **Support: 0**

      // Tính số lượng các nút vi phạm quy tắc chuyển giao của mối quan hệ ADMIN_TO
      MATCH (a)-[:ADMIN_TO]->(b)-[:ADMIN_TO]->(c)
      WHERE (a)-[:ADMIN_TO]->(c)
      WITH COUNT(DISTINCT a) AS support
      RETURN support

### Rule 4: HAS_SESSION relationships are unique
**Description: Each node should have at most one HAS_SESSION relationship with another node.**

- **Support: 114**

      MATCH (n)-[:HAS_SESSION]->(m)
      WITH n, COUNT(m) AS sessionCount
      WHERE sessionCount = 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 114/264**

      // Tính tổng số mối quan hệ HAS_SESSION
      MATCH ()-[:HAS_SESSION]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ HAS_SESSION mà quy tắc duy nhất được áp dụng
      MATCH (n)-[:HAS_SESSION]->(m)
      WITH n, COUNT(m) AS sessionCount
      WHERE sessionCount = 1
      WITH totalFacts, COUNT(*) AS validFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage

- **Confidence: 100%**

      // Tính tổng số mối quan hệ HAS_SESSION
      MATCH ()-[:HAS_SESSION]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng các nút có nhiều hơn một mối quan hệ HAS_SESSION
      MATCH (n1)-[:HAS_SESSION]->()
      WITH COUNT(DISTINCT n1) AS support

      // Tính số lượng mối quan hệ HAS_SESSION mà quy tắc duy nhất được áp dụng
      MATCH (n)-[:HAS_SESSION]->()
      WITH n, COUNT(*) AS sessionCount
      WHERE sessionCount = 1
      WITH totalFacts, support, COUNT(*) AS validFacts

      // Tính toán độ tin cậy
      RETURN toFloat(validFacts) / totalFacts AS confidence


### Rule 5: CAN_RDP relationships are unique
**Description: Each node should have at most one CAN_RDP relationship with another node.**

- **Support: 12**

      MATCH (n)-[:CAN_RDP]->(m)
      WITH n, COUNT(m) AS rdpCount
      WHERE rdpCount <= 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 12/60**

      // Tính tổng số mối quan hệ CAN_RDP
      MATCH ()-[:CAN_RDP]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ CAN_RDP mà quy tắc duy nhất được áp dụng
      MATCH (n)-[:CAN_RDP]->(m)
      WITH n, COUNT(m) AS rdpCount
      WHERE rdpCount <= 1
      WITH totalFacts, COUNT(*) AS validFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage

  - **Confidence: 12/60**

      // Tính tổng số mối quan hệ CAN_RDP
      MATCH ()-[:CAN_RDP]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ CAN_RDP mà quy tắc duy nhất được áp dụng
      MATCH (n)-[:CAN_RDP]->(m)
      WITH n, COUNT(m) AS rdpCount
      WHERE rdpCount <= 1
      WITH totalFacts, COUNT(*) AS validFacts

      // Tính toán độ tin cậy
      RETURN toFloat(validFacts) / totalFacts AS confidence




### Rule 6: EXECUTE_DCOM relationships are unique
**Description: Each node should have at most one EXECUTE_DCOM relationship with another node.**

- **Support: 19**


      MATCH (n)-[:EXECUTE_DCOM]->(m)
      WITH n, COUNT(m) AS dcomCount
      WHERE dcomCount <= 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 19/60**

      // Tính tổng số mối quan hệ EXECUTE_DCOM
      MATCH ()-[:EXECUTE_DCOM]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ EXECUTE_DCOM mà quy tắc duy nhất được áp dụng
      MATCH (n)-[:EXECUTE_DCOM]->(m)
      WITH n, COUNT(m) AS dcomCount
      WHERE dcomCount <= 1
      WITH totalFacts, COUNT(*) AS validFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage

- **Confidence: 19/60**



### Rule 7: ALLOWED_TO_DELEGATE relationships are unique
**Description: Each node should have at most one ALLOWED_TO_DELEGATE relationship with another node.**

- **Support: 14**

      MATCH (n)-[:ALLOWED_TO_DELEGATE]->(m)
      WITH n, COUNT(m) AS delegateCount
      WHERE delegateCount <= 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 30**

      MATCH ()-[:ALLOWED_TO_DELEGATE]->()
      WITH COUNT(*) AS totalFacts

      MATCH (n)-[:ALLOWED_TO_DELEGATE]->(m)
      WITH n, COUNT(m) AS delegateCount
      WHERE delegateCount <= 1
      WITH totalFacts, COUNT(*) AS validFacts

      RETURN toFloat(validFacts) / totalFacts AS coverage

- **Confidence: 0.67**

      // Tính số lượng các nút mà quy tắc duy nhất được áp dụng
      MATCH (n)-[:ALLOWED_TO_DELEGATE]->(m)
      WITH n, COUNT(m) AS delegateCount
      WHERE delegateCount <= 1
      WITH COUNT(DISTINCT n) AS validFacts

      // Tính số lượng thực thể có ít nhất một mối quan hệ ALLOWED_TO_DELEGATE
      MATCH (n)-[:ALLOWED_TO_DELEGATE]->()
      WITH validFacts, COUNT(DISTINCT n) AS totalFacts

      // Tính toán độ tin cậy
      RETURN toFloat(validFacts) / totalFacts AS confidence




### Mixtral

In [21]:
responses_mixtral_zero = llm_chain_mixtral.invoke({"query": question_zero})
for response in str(responses_mixtral_zero).split("\n"):
  print(response)

{'query': '\nBased on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).\nConsider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent \nand accurate data.\n\nFor each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking\n', 'result': "Sure, based on the graph properties provided, here are some consistency rules that can be applied to maintain consistent and accurate data:\n\n1. **Rule 1: Each node can have only one GENERIC_ALL node**\n   Description: A node can be connected to the GENERIC_ALL node only once. This ensures that a node does not inherit properties from the GENERIC_ALL node multiple times.\n   Cypher Query:\n   ```\n   MATCH (n:Node)-[r]->(g:Node {label:'GENERIC_ALL'})\n   WITH n, count(r) as count\n   WHERE count > 1\n   RETURN n, count\n   ```\n\n2. *

### Rule 1: Each node can have only one GENERIC_ALL node
**Description: A node can be connected to the GENERIC_ALL node only once. This ensures that a node does not inherit properties from the GENERIC_ALL node multiple times.**

- **Support: 1**

      MATCH (n)-[:GENERIC_ALL]->(m)
      WITH n, COUNT(m) AS count
      WHERE count = 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 1/901**
      // Count the number of valid entities that satisfy the rule
      MATCH (n)-[:GENERIC_ALL]->(m)
      WITH n, COUNT(m) AS count
      WHERE count = 1
      WITH COUNT(DISTINCT n) AS validFacts

      // Count the number of relationship GENERIC_ALL
      MATCH ()-[:GENERIC_ALL]->()
      WITH validFacts, COUNT(*) AS totalFacts

      // Compute coverage
      RETURN validFacts, totalFacts, (validFacts * 1.0 / totalFacts) AS coverage

  - **Confidence: 1/2**
      // Count the number of valid entities that satisfy the rule
      MATCH (n)-[:GENERIC_ALL]->(m)
      WITH n, COUNT(m) AS count
      WHERE count <= 1
      WITH COUNT(DISTINCT n) AS validFacts

      // Count the number of relationship GENERIC_ALL
      MATCH (n)-[:GENERIC_ALL]->()
      WITH validFacts, COUNT(DISTINCT n) AS totalFacts

      // Compute confidence
      RETURN validFacts, totalFacts, (validFacts * 1.0 / totalFacts) AS confidence


### Rule 2: MEMBER_OF nodes should not have multiple outgoing relationships
**Description: A MEMBER_OF node should not have more than one outgoing relationship. This ensures that a node is not a member of multiple entities.**

- **Support: 340**

      // Tính số lượng các nút MEMBER_OF mà có không quá một mối quan hệ đi ra ngoài
      MATCH (n)-[:MEMBER_OF]->()
      WITH n, COUNT(*) AS outgoingCount
      WHERE outgoingCount <= 1
      RETURN COUNT(DISTINCT n) AS support


- **Coverage: 100%**

      // Tính tổng số mối quan hệ MEMBER_OF
      MATCH ()-[:MEMBER_OF]->()
      WITH COUNT(*) AS totalFacts

      // Tính số lượng mối quan hệ MEMBER_OF mà quy tắc được áp dụng
      MATCH (n)-[:MEMBER_OF]->()
      WITH COUNT(n) AS validFacts, totalFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage


- **Confidence: 0.59**

      MATCH (n)-[:MEMBER_OF]->()
      WITH n, COUNT(*) AS outgoingCount
      WHERE outgoingCount <= 1
      WITH COUNT(DISTINCT n) AS validFacts

      
      MATCH (n)-[:MEMBER_OF]->()
      WITH COUNT(DISTINCT n) AS totalFacts
   
      RETURN toFloat(validFacts) / totalFacts AS confidence


### Rule 3: Nodes connected to GENERIC_ALL should not have any other relationships
**Description: A node that is connected to the GENERIC_ALL node should not have any other relationships. This ensures that a node inherits properties from the GENERIC_ALL node only**.

- **Support: 1**

      MATCH (n)-[:GENERIC_ALL]->()
      WHERE size((n)--()) = 1
      RETURN COUNT(DISTINCT n) AS support

- **Coverage: 0.5**

      // Tính tổng số nút có mối quan hệ GENERIC_ALL
      MATCH (n)-[:GENERIC_ALL]->()
      WITH COUNT(DISTINCT n) AS totalFacts

      // Tính số lượng nút chỉ có một mối quan hệ với GENERIC_ALL và không có mối quan hệ nào khác
      MATCH (n)-[:GENERIC_ALL]->()
      WHERE size((n)--()) = 1
      WITH COUNT(DISTINCT n) AS validFacts, totalFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage

- **Confidence: 0.5**

        // Tính số lượng nút chỉ có một mối quan hệ với GENERIC_ALL và không có mối quan hệ nào khác
        MATCH (n)-[:GENERIC_ALL]->()
        WHERE size((n)--()) = 1
        WITH COUNT(DISTINCT n) AS validFacts

        // Tính tổng số nút có mối quan hệ GENERIC_ALL
        MATCH (n)-[:GENERIC_ALL]->()
        WITH COUNT(DISTINCT n) AS totalFacts, validFacts

        // Tính toán độ tin cậy
        RETURN toFloat(validFacts) / totalFacts AS confidence



### Rule 4: Nodes with ADMIN_TO, ALLOWED_TO_DELEGATE, EXECUTE_DCOM, CAN_RDP, HAS_SESSION labels should not connect to GENERIC_ALL
**Description: Nodes with specific labels should not connect to the GENERIC_ALL node. This ensures that these nodes do not inherit properties from the GENERIC_ALL node.**

- **Support: 0**

      // Tính tổng số các nút có nhãn cụ thể
      MATCH (n)
      WHERE n:ADMIN_TO OR n:ALLOWED_TO_DELEGATE OR n:EXECUTE_DCOM OR n:CAN_RDP OR n:HAS_SESSION
      WITH COUNT(DISTINCT n) AS totalFacts

      // Tính số lượng các nút có nhãn cụ thể không có kết nối với GENERIC_ALL
      MATCH (n)
      WHERE (n:ADMIN_TO OR n:ALLOWED_TO_DELEGATE OR n:EXECUTE_DCOM OR n:CAN_RDP OR n:HAS_SESSION)
      AND NOT (n)-[:GENERIC_ALL]->()
      WITH COUNT(DISTINCT n) AS validFacts, totalFacts

      // Tính toán bao phủ
      RETURN toFloat(validFacts) / totalFacts AS coverage


### Rule 5: Nodes with CONTAINS label should not have incoming relationships
**Description: A node with the CONTAINS label should not have any incoming relationships. This ensures that a node does not inherit properties from other nodes.**

- **Support: 0**

      MATCH (n:CONTAINS)
      WHERE NOT ()-[]->(n)
      RETURN COUNT(DISTINCT n) AS support

### Rule 6: Nodes with MEMBER_OF label should not have incoming relationships
**Description: A node with the MEMBER_OF label should not have any incoming relationships. This ensures that a node does not inherit properties from other nodes.**

- **Support: 0**

      MATCH (n:MEMBER_OF)
      WHERE NOT ()-[]->(n)
      RETURN COUNT(DISTINCT n) AS support


## Few-shot prompting

In [19]:
question_few_shot = """
    Examples of consistency Rules:

    1. Unique `neo4jImportId` per node: Each node should have a unique `neo4jImportId` property
    2. Unique `objectid` per node: Each node should have a unique `objectid` property.
    3. Only allowed node labels are `User`, `Group`, `Domain`, `OU`, `GPO`, and `Computer`

    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships.
    For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking

    Requirements:

    Ensure data consistency across all nodes and relationships.
    Avoid rules that apply to only one type of node or relationship broader context.
"""

### LLAMA

In [20]:
responses_llama_few = llm_chain_llama.invoke({"query": question_few_shot})
for response in str(responses_llama_few).split("\n"):
  print(response)

{'query': '\n    Examples of consistency Rules:\n    \n    1. Unique `neo4jImportId` per node: Each node should have a unique `neo4jImportId` property\n    2. Unique `objectid` per node: Each node should have a unique `objectid` property.\n    3. Only allowed node labels are `User`, `Group`, `Domain`, `OU`, `GPO`, and `Computer`\n\n    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships. \n    For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking\n    \n    Requirements:\n    \n    Ensure data consistency across all nodes and relationships.\n    Avoid rules that apply to only one type of node or relationship broader context.\n', 'result': "Based on the provided graph database, I've identified some consistency rules to ensure data consistency and accuracy. Here are the rules, along with their descriptions and corresponding Cypher queries:\

### Rule 1: Unique `id` per node
**Each node should have a unique `id` property**

- **Support: 953**

      MATCH (n)
      WITH n.neo4jImportId AS id, COUNT(n) AS countNodes
      WHERE id IS NOT NULL AND countNodes = 1
      RETURN COUNT(id) AS support

- **Coverage: 100**

      MATCH (n)
      WITH COUNT(n) AS totalNodes
      MATCH (n)
      WITH n.neo4jImportId AS id, COUNT(n) AS countNodes, totalNodes
      WHERE id IS NOT NULL AND countNodes = 1
      RETURN toFloat(COUNT(id)) / totalNodes AS coverage

- **Confidence: 100**


### Rule 2: Consistent labeling
**Each node should have a label that is one of the allowed labels: `User`, `Group`, `Domain`, `OU`, `GPO`, or `Computer`**

- **support - 953**
      MATCH (n)
      WHERE  (n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer)
      RETURN COUNT(n) AS support

- **coverage - 100%**
      MATCH (n)
      WITH COUNT(n) AS totalNodes
      MATCH (n)
      WHERE n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer
      RETURN toFloat(COUNT(n)) / totalNodes AS coverage


- **confidence - 100%**
      MATCH (n)
      WHERE NOT (n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer)
      WITH COUNT(n) AS nonConformingNodes
      MATCH (n)
      WHERE n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer
      RETURN toFloat(COUNT(n)) / (COUNT(n) + nonConformingNodes) AS confidence

### Rule 3: MEMBER_OF relationships have correct direction
**MEMBER_OF` relationships should always point from a `User` or `Group` node to a `Group` node.**

- **Support: 1043**
     
      MATCH (u:User)-[r:MEMBER_OF]->(g:Group)
      RETURN COUNT(r) AS support

- **Coverage: 77%**

      MATCH ()-[r:MEMBER_OF]->()
      WITH COUNT(r) AS totalRelationships

      MATCH (u:User)-[r:MEMBER_OF]->(g:Group)
      RETURN toFloat(COUNT(r)) / totalRelationships AS coverage

- **Confidence: 0.98**

        MATCH (u:User)-[r:MEMBER_OF]->(g:Group)
        WITH COUNT(r) AS validRelationships

        
        MATCH (n)-[r:MEMBER_OF]->()
        WHERE (n:User OR n:Group) AND (n)-[:MEMBER_OF]->()
        WITH validRelationships, COUNT(r) AS totalRelationshipsWithValidNodes

    
        RETURN toFloat(validRelationships) / totalRelationshipsWithValidNodes AS confidence



### Rule 4: ADMIN_TO relationships have correct direction
**ADMIN_TO` relationships should always point from a `User` or `Group` node to a `Computer` node.**

- **Support: 1347**

      MATCH (g:Group)-[r:ADMIN_TO]->(c:Computer)
      RETURN COUNT(r) AS support

- **Coverage: 100%**

      MATCH ()-[r:ADMIN_TO]->()
      WITH COUNT(r) AS totalRelationships

      MATCH (g:Group)-[r:ADMIN_TO]->(c:Computer)
      RETURN toFloat(COUNT(r)) / totalRelationships AS coverage


- **Confidence: 100%**

        MATCH (g:Group)-[r:ADMIN_TO]->(c:Computer)
        WITH COUNT(r) AS validRelationships

        MATCH (n)-[r:ADMIN_TO]->()
        WHERE (n:Group AND (n)-[:ADMIN_TO]->() OR n:Computer)
        WITH validRelationships, COUNT(r) AS totalRelationshipsWithValidNodes

        //  confidence
        RETURN toFloat(validRelationships) / totalRelationshipsWithValidNodes AS confidence


### Rule 5: HAS_SESSION relationships have correct direction
**HAS_SESSION` relationships should always point from a `User` node to a `Computer` node.**

- **Support: 264**

      MATCH (c:Computer)-[r:HAS_SESSION]->(u:User)
      RETURN COUNT(r) AS support


- **Coverage: 100%**

      MATCH ()-[r:HAS_SESSION]->()
      WITH COUNT(r) AS totalRelationships

      MATCH (c:Computer)-[r:HAS_SESSION]->(u:User)
      RETURN toFloat(COUNT(r)) / totalRelationships AS coverage


- **Confidence: 100%**

      MATCH (c:Computer)-[r:HAS_SESSION]->(u:User)
      WITH COUNT(r) AS validRelationships

      MATCH (n)-[r:HAS_SESSION]->()
      WHERE (n:Computer OR n:User)
      WITH validRelationships, COUNT(r) AS totalRelationshipsWithValidNodes

      // Tính confidence
      RETURN toFloat(validRelationships) / totalRelationshipsWithValidNodes AS confidence


### Rule 6: CAN_RDP relationships have correct direction
**’CAN_RDP` relationships should always point from a `User` node to a `Computer` node.**

- **Support: 30**

      MATCH (u:User)-[r:CAN_RDP]->(c:Computer)
      RETURN COUNT(r) AS support

- **Coverage: 50%**

      MATCH ()-[r:CAN_RDP]->()
      WITH COUNT(r) AS totalRelationships

      MATCH (u:User)-[r:CAN_RDP]->(c:Computer)
      RETURN toFloat(COUNT(r)) / totalRelationships AS coverage


- **Confidence: 100%**

      MATCH (u:User)-[r:CAN_RDP]->(c:Computer)
      WITH COUNT(r) AS validRelationships

      MATCH (n)-[r:CAN_RDP]->()
      WHERE (n:User OR n:Computer)
      WITH validRelationships, COUNT(r) AS totalRelationshipsWithValidNodes

      RETURN toFloat(validRelationships) / totalRelationshipsWithValidNodes AS confidence



### Rule 7: ALLOWED_TO_DELEGATE relationships have correct direction**’ALLOWED_TO_DELEGATE` relationships should always point from a `User` node to a `Computer`**

- **Support: 30**

      MATCH (u:User)-[r:ALLOWED_TO_DELEGATE]->(c:Computer)
      RETURN COUNT(r) AS support

- **Coverage: 100%**

      MATCH ()-[r:ALLOWED_TO_DELEGATE]->()
      WITH COUNT(r) AS totalRelationships

      MATCH (u:User)-[r:ALLOWED_TO_DELEGATE]->(c:Computer)
      RETURN toFloat(COUNT(r)) / totalRelationships AS coverage


- **Confidence: 100%**

      MATCH (u:User)-[r:ALLOWED_TO_DELEGATE]->(c:Computer)
      WITH COUNT(r) AS validRelationships

      MATCH (n)-[r:ALLOWED_TO_DELEGATE]->()
      WHERE (n:User OR n:Computer)
      WITH validRelationships, COUNT(r) AS totalRelationshipsWithValidNodes

      RETURN toFloat(validRelationships) / totalRelationshipsWithValidNodes AS confidence



### Mixtral

In [22]:
responses_mixtral_few = llm_chain_mixtral.invoke({"query": question_few_shot})
for response in str(responses_mixtral_few).split("\n"):
  print(response)

{'query': '\n    Examples of consistency Rules:\n    \n    1. Unique `neo4jImportId` per node: Each node should have a unique `neo4jImportId` property\n    2. Unique `objectid` per node: Each node should have a unique `objectid` property.\n    3. Only allowed node labels are `User`, `Group`, `Domain`, `OU`, `GPO`, and `Computer`\n\n    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships. \n    For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking\n    \n    Requirements:\n    \n    Ensure data consistency across all nodes and relationships.\n    Avoid rules that apply to only one type of node or relationship broader context.\n', 'result': "Sure, I can help you generate some consistency rules for your graph database. Here are a few suggestions:\n\n1. **Unique `relationshipType` per relationship**: Each relationship should have a unique `re

### 1. Unique `relationshipType` per relationship
**Each relationship should have a unique `relationshipType` property. This ensures that each relationship is distinct and can be easily identified.**

- **Support: 4698**

      MATCH ()-[r]->()
      WITH r, type(r) AS type, COUNT(*) AS count
      WHERE count = 1
      RETURN r

- **Coverage: 100%**

      MATCH ()-[r]->()
      WITH COUNT(r) AS totalRelationships

      MATCH ()-[r]->()
      WITH r, type(r) AS type, COUNT(*) AS count, totalRelationships
      WHERE count = 1
      RETURN toFloat(COUNT(r)) / totalRelationships AS coverage



- **Confidence: 100%**

      MATCH ()-[r]->()
      WITH r, type(r) AS type, COUNT(*) AS count
      WHERE count = 1
      WITH COUNT(count) AS uniqueTypeCount
      MATCH ()-[r]->()
      RETURN toFloat(uniqueTypeCount) / COUNT(r) AS confidence

### 2. Consistent `isacl` property for `GENERIC_ALL` nodes
**The `isacl` property for `GENERIC_ALL` nodes should be consistently set to `True`. This ensures that `GENERIC_ALL` nodes are correctly identified as Access Control List (ACL) entries.**

- **Support: 901**

      MATCH ()-[r:GENERIC_ALL]->(m)
      WHERE r.isacl = True
      RETURN COUNT(r) AS support

- **Coverage: 100%**

      MATCH ()-[r:GENERIC_ALL]->()
      WITH COUNT(r) AS totalNodes

      MATCH ()-[r:GENERIC_ALL]->(m)
      WHERE r.isacl = True
      RETURN toFloat(COUNT(r)) / totalNodes AS coverage

- **Confidence:100%**

      MATCH ()-[r:GENERIC_ALL]->(m)
      WHERE r.isacl = True
      WITH COUNT(r) AS trueIsaclCount

      // Tính tổng số nút GENERIC_ALL có thuộc tính isacl
      MATCH ()-[r:GENERIC_ALL]->()
      WHERE r.isacl IS NOT NULL
      RETURN toFloat(trueIsaclCount) / COUNT(r) AS confidence

### 3. Valid `label` property for all nodes
**The `label` property for all nodes should be one of `User`, `Group`, `Domain`, `OU`, `GPO`, `Computer`, or `GENERIC_ALL`. This ensures that all nodes are correctly identified and categorized.**

- **support - 953**
      MATCH (n)
      WHERE  (n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer)
      RETURN COUNT(n) AS support

- **coverage - 100%**
      MATCH (n)
      WITH COUNT(n) AS totalNodes
      MATCH (n)
      WHERE n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer
      RETURN toFloat(COUNT(n)) / totalNodes AS coverage


- **confidence - 100%**
      MATCH (n)
      WHERE NOT (n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer)
      WITH COUNT(n) AS nonConformingNodes
      MATCH (n)
      WHERE n:User OR n:Group OR n:Domain OR n:OU OR n:GPO OR n:Computer
      RETURN toFloat(COUNT(n)) / (COUNT(n) + nonConformingNodes) AS confidence

### 4. No isolated nodes
**All nodes should be part of the graph and not isolated. This ensures that all nodes are connected and contribute to the overall graph structure.**

- **Support: 952**

      MATCH (n)
      WHERE EXISTS((n)--())  
      RETURN COUNT(n) AS support


- ***Coverage: 99.8%*

      MATCH (n)
      WITH COUNT(n) AS totalNodes

      MATCH (n)
      WHERE EXISTS((n)--())
      RETURN toFloat(COUNT(n)) / totalNodes AS coverage

- **Confidence: 100%**

      MATCH (n)
      WHERE EXISTS((n)--())
      WITH COUNT(n) AS nonIsolatedNodes

      MATCH (n)-[r]->()
      RETURN toFloat(nonIsolatedNodes) / COUNT(DISTINCT n) AS confidence

