## Import required libraries

In [58]:
import json
import networkx as nx
import openai
from groq import Groq
from neo4j import GraphDatabase
import json
import json
import networkx as nx
import time

## NEO4J Connection

In [59]:
# Neo4j connection details
uri = "bolt://localhost:7687"  
user = "neo4j"                 
password = "123456789"          

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(user, password))

# Function to run a query and return the results
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        return [record.data() for record in result]

## Get All data from Neo4J

In [60]:
# Get all nodes
def get_all_nodes(tx):
    query = """
    CALL db.labels() YIELD label
    CALL {
      WITH label
      MATCH (n)
      WHERE label IN labels(n)
      RETURN n
      LIMIT 100
    }
    RETURN labels(n) AS labels, n
    """
    result = tx.run(query)
    nodes = []
    for record in result:
        labels = record["labels"]
        node = record["n"]
        nodes.append({
            "labels": labels,
            "properties": dict(node)
        })
    return nodes

# Perform query
with driver.session() as session:
    all_nodes = session.read_transaction(get_all_nodes)

# Close connection driver
driver.close()

# Result
for node in all_nodes[:100]:
    print(f"Labels: {node['labels']}, Properties: {node['properties']}")

  all_nodes = session.read_transaction(get_all_nodes)


Labels: ['User', 'Me'], Properties: {'followers': 34507, 'screen_name': 'neo4j', 'following': 10124, 'name': 'Neo4j', 'profile_image_url': 'http://pbs.twimg.com/profile_images/1183755273753767936/vSV_IPTs_normal.jpg', 'location': 'Graphs Are Everywhere', 'url': 'https://t.co/JSIaZFJEaE'}
Labels: ['User'], Properties: {'followers': 2617355, 'screen_name': 'NASAPersevere', 'following': 45, 'name': "NASA's Perseverance Mars Rover", 'statuses': 512, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1235636280165203974/6YjG6kTC_normal.jpg', 'location': 'Jezero Crater, Mars', 'url': 'https://t.co/GpTOmL7zGl'}
Labels: ['User'], Properties: {'followers': 0, 'screen_name': 'pranitahakim1', 'following': 0, 'name': 'pranitahakim', 'location': '', 'profile_image_url': 'http://pbs.twimg.com/profile_images/1370306070845911046/VQcQPkp3_normal.jpg'}
Labels: ['User'], Properties: {'followers': 35, 'screen_name': 'galeister', 'following': 35, 'name': 'Galeister', 'profile_image_url': 'http://pbs

## Graph Encoding

In [61]:
def encode_data(data):
    encoded_str = ""
    for idx, node in enumerate(data):
        labels_str = ', '.join(node['labels'])
        properties_str = ', '.join([f"{key}: {value}" for key, value in node['properties'].items()])
        encoded_str += f"Node {idx} has labels [{labels_str}] and ({properties_str}), "
    return encoded_str[:-2]  # Loại bỏ dấu phẩy cuối cùng

# Encode data and print results
encoded_node_data = encode_data(all_nodes)
print(encoded_node_data)

Node 0 has labels [User, Me] and (followers: 34507, screen_name: neo4j, following: 10124, name: Neo4j, profile_image_url: http://pbs.twimg.com/profile_images/1183755273753767936/vSV_IPTs_normal.jpg, location: Graphs Are Everywhere, url: https://t.co/JSIaZFJEaE), Node 1 has labels [User] and (followers: 2617355, screen_name: NASAPersevere, following: 45, name: NASA's Perseverance Mars Rover, statuses: 512, profile_image_url: http://pbs.twimg.com/profile_images/1235636280165203974/6YjG6kTC_normal.jpg, location: Jezero Crater, Mars, url: https://t.co/GpTOmL7zGl), Node 2 has labels [User] and (followers: 0, screen_name: pranitahakim1, following: 0, name: pranitahakim, location: , profile_image_url: http://pbs.twimg.com/profile_images/1370306070845911046/VQcQPkp3_normal.jpg), Node 3 has labels [User] and (followers: 35, screen_name: galeister, following: 35, name: Galeister, profile_image_url: http://pbs.twimg.com/profile_images/2566078294/rkk9kwf3ubbt4bn2zcn2_normal.jpeg, location: ), Node

In [62]:
len(encoded_node_data)

67663

### Get relationship and properties

In [63]:
def get_nodes_and_relationships(tx):
    query = (
        "MATCH (a)-[r]->(b) "
        "RETURN DISTINCT labels(a) AS StartLabels, type(r) AS RelationshipType, labels(b) AS EndLabels, properties(r) AS RelationshipProperties "
        "ORDER BY StartLabels, RelationshipType, EndLabels"
    )
    result = tx.run(query)
    nodes_and_relationships = []
    for record in result:
        nodes_and_relationships.append({
            "StartLabels": record["StartLabels"],
            "RelationshipType": record["RelationshipType"],
            "EndLabels": record["EndLabels"],
            "RelationshipProperties": dict(record["RelationshipProperties"])  # Convert properties to dictionary
        })
    return nodes_and_relationships


with driver.session() as session:
    nodes_and_relationships = session.read_transaction(get_nodes_and_relationships)

# Close connection to driver
driver.close()

  with driver.session() as session:
  nodes_and_relationships = session.read_transaction(get_nodes_and_relationships)


In [68]:
nodes_and_relationships

[{'StartLabels': ['Tweet'],
  'RelationshipType': 'CONTAINS',
  'EndLabels': ['Link'],
  'RelationshipProperties': {}},
 {'StartLabels': ['Tweet'],
  'RelationshipType': 'MENTIONS',
  'EndLabels': ['User'],
  'RelationshipProperties': {}},
 {'StartLabels': ['Tweet'],
  'RelationshipType': 'MENTIONS',
  'EndLabels': ['User', 'Me'],
  'RelationshipProperties': {}},
 {'StartLabels': ['Tweet'],
  'RelationshipType': 'REPLY_TO',
  'EndLabels': ['Tweet'],
  'RelationshipProperties': {}},
 {'StartLabels': ['Tweet'],
  'RelationshipType': 'RETWEETS',
  'EndLabels': ['Tweet'],
  'RelationshipProperties': {}},
 {'StartLabels': ['Tweet'],
  'RelationshipType': 'TAGS',
  'EndLabels': ['Hashtag'],
  'RelationshipProperties': {}},
 {'StartLabels': ['Tweet'],
  'RelationshipType': 'USING',
  'EndLabels': ['Source'],
  'RelationshipProperties': {}},
 {'StartLabels': ['User'],
  'RelationshipType': 'FOLLOWS',
  'EndLabels': ['User', 'Me'],
  'RelationshipProperties': {}},
 {'StartLabels': ['User'],
  '

### Edges encoding

In [64]:
def encode_relationship(relationship):
    start_labels = ', '.join(relationship['StartLabels'])
    end_labels = ', '.join(relationship['EndLabels'])
    relationship_type = relationship['RelationshipType']
    properties = relationship['RelationshipProperties']

    properties_str = ', '.join([f"{key}: {value}" for key, value in properties.items()]) if properties else "No properties"

    return f"Node [{start_labels}] connect to node  [{end_labels}] with Relationship Type {relationship_type} and Properties {{{properties_str}}}"

encoded_relationship =  ""
# Print
for relationship in nodes_and_relationships:
    encoded_relationship += encode_relationship(relationship) + ". "

In [66]:
encoded_relationship

'Node [Tweet] connect to node  [Link] with Relationship Type CONTAINS and Properties {No properties}. Node [Tweet] connect to node  [User] with Relationship Type MENTIONS and Properties {No properties}. Node [Tweet] connect to node  [User, Me] with Relationship Type MENTIONS and Properties {No properties}. Node [Tweet] connect to node  [Tweet] with Relationship Type REPLY_TO and Properties {No properties}. Node [Tweet] connect to node  [Tweet] with Relationship Type RETWEETS and Properties {No properties}. Node [Tweet] connect to node  [Hashtag] with Relationship Type TAGS and Properties {No properties}. Node [Tweet] connect to node  [Source] with Relationship Type USING and Properties {No properties}. Node [User] connect to node  [User, Me] with Relationship Type FOLLOWS and Properties {No properties}. Node [User] connect to node  [Tweet] with Relationship Type POSTS and Properties {No properties}. Node [User, Me] connect to node  [User] with Relationship Type FOLLOWS and Properties {

### Data preparing

In [67]:
encoded_data = encoded_node_data +  "\nIn this graph: \n" + encoded_relationship 

In [68]:
encoded_data

'Node 0 has labels [User, Me] and (followers: 34507, screen_name: neo4j, following: 10124, name: Neo4j, profile_image_url: http://pbs.twimg.com/profile_images/1183755273753767936/vSV_IPTs_normal.jpg, location: Graphs Are Everywhere, url: https://t.co/JSIaZFJEaE), Node 1 has labels [User] and (followers: 2617355, screen_name: NASAPersevere, following: 45, name: NASA\'s Perseverance Mars Rover, statuses: 512, profile_image_url: http://pbs.twimg.com/profile_images/1235636280165203974/6YjG6kTC_normal.jpg, location: Jezero Crater, Mars, url: https://t.co/GpTOmL7zGl), Node 2 has labels [User] and (followers: 0, screen_name: pranitahakim1, following: 0, name: pranitahakim, location: , profile_image_url: http://pbs.twimg.com/profile_images/1370306070845911046/VQcQPkp3_normal.jpg), Node 3 has labels [User] and (followers: 35, screen_name: galeister, following: 35, name: Galeister, profile_image_url: http://pbs.twimg.com/profile_images/2566078294/rkk9kwf3ubbt4bn2zcn2_normal.jpeg, location: ), No

#### Rule Generation

In [69]:
client = Groq(
    api_key="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",
)

In [70]:
def generator(encoded_graph, query, model, size, overlap):
    # Function to divide the text into smaller chunks
    def chunk_text(text, max_length, overlap):    
        words = text.split()  # Split the text into a list of words.
        chunks = []           # Initialize a list to store the resulting chunks
        current_chunk = []    # Initialize an empty list to build the current chunk
        current_length = 0    # Initialize a variable to track the length of the current chunk.
        start_idx = 0         # Start index for the current chunk in the word list
    
        while start_idx < len(words):
            # Create a chunk starting from start_idx, ensuring it does not exceed max_length
            current_chunk = []  # Reset the current chunk for the new one
            current_length = 0  # Reset the current length of the chunk
            
            # Add words to the chunk until the total length exceeds max_length
            while start_idx < len(words) and current_length + len(words[start_idx]) + 1 <= max_length:
                current_chunk.append(words[start_idx])  # Add the current word to the chunk
                current_length += len(words[start_idx]) + 1  # Update the chunk length
                start_idx += 1  # Move to the next word.
            
            # If the current chunk has words, append it to the chunks list
            if current_chunk:
                chunks.append(' '.join(current_chunk))  # Join the words in the chunk and add to chunks list
            
            # For overlap: move back by the overlap number of words (if there are more words)
            if start_idx < len(words):
                start_idx -= overlap  # Move back by 'overlap' words to create overlap between chunks
    
        return chunks  # Return the list of chunks

    # Function to process each chunk
    def process_chunk(chunk):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": chunk,
                }
            ],
            model= model,
        )

        return chat_completion.choices[0].message.content

    # Split encoded_graph into chunks
    max_chunk_length = size  
    overlap = overlap
    graph_chunks = chunk_text(encoded_graph, max_chunk_length, overlap)

    # Process each chunk of encoded_graph
    processed_graph_parts = [process_chunk(chunk) for chunk in graph_chunks]

    # Combine processed parts back into a single string
    processed_graph = ' '.join(processed_graph_parts)

    # Append query to the processed graph and process the final combined context
    context = processed_graph + "\n" + query
    final_result = process_chunk(context)

    return final_result

### checking window

In [71]:
def chunk_text(text, max_length, overlap):    
    words = text.split()  # Split the text into a list of words
    chunks = []           # Initialize a list to store the resulting chunks
    start_idx = 0         # Start index for the current chunk in the word list

    while start_idx < len(words):
        current_chunk = []  # Reset the current chunk for the new one
        current_length = 0  # Reset the current length of the chunk

        # Add words to the chunk until the total length exceeds max_length
        while start_idx < len(words) and current_length + len(words[start_idx]) + 1 <= max_length:
            current_chunk.append(words[start_idx])  # Add the current word to the chunk
            current_length += len(words[start_idx]) + 1  # Update the chunk length
            start_idx += 1  # Move to the next word
        
        # Append the current chunk to the chunks list
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        
        # For overlap: move back by the overlap number of words (if there are more words)
        if start_idx < len(words):
            start_idx -= overlap

    return chunks  # Return the list of chunks

max_chunk_length = 8000
overlap = 0

# Split the text into overlapping chunks
graph_chunks = chunk_text(encoded_data, max_chunk_length, overlap)

# Print out the resulting chunks
print("Generated overlapping windows:")
for i, chunk in enumerate(graph_chunks):
    print(f"\nWindow {i+1}: {chunk}")


Generated overlapping windows:

Window 1: Node 0 has labels [User, Me] and (followers: 34507, screen_name: neo4j, following: 10124, name: Neo4j, profile_image_url: http://pbs.twimg.com/profile_images/1183755273753767936/vSV_IPTs_normal.jpg, location: Graphs Are Everywhere, url: https://t.co/JSIaZFJEaE), Node 1 has labels [User] and (followers: 2617355, screen_name: NASAPersevere, following: 45, name: NASA's Perseverance Mars Rover, statuses: 512, profile_image_url: http://pbs.twimg.com/profile_images/1235636280165203974/6YjG6kTC_normal.jpg, location: Jezero Crater, Mars, url: https://t.co/GpTOmL7zGl), Node 2 has labels [User] and (followers: 0, screen_name: pranitahakim1, following: 0, name: pranitahakim, location: , profile_image_url: http://pbs.twimg.com/profile_images/1370306070845911046/VQcQPkp3_normal.jpg), Node 3 has labels [User] and (followers: 35, screen_name: galeister, following: 35, name: Galeister, profile_image_url: http://pbs.twimg.com/profile_images/2566078294/rkk9kwf3u

### Zero-shot Prompt

In [22]:
zero_prompt = """
Based on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).
Consider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent 
and accurate data.

For each consistency rule you identify, provide a clear description of the rule and generated the corresponding Cypher query to check the 
number of nodes or relationships that satisfy the rule.
Your query should return the count of entities (nodes or relationships) that match the rule described. 
Provide the query in the format of valid Cypher syntax, simple and ready for execution in a Neo4j databaseBelow is the input data:
Graph Information:

- Node: ["User", "Me"],  ["User"],  ["Tweet"],  ["Source"],  ["Hashtag"], ["Link"]. 
- Relationship: FOLLOWS, POSTS, MENTIONS,  TAGS, CONTAINS, USING,  RETWEETS,  REPLY_TO.
- Properties: ["User", "Me"].profile_image_url, ["User", "Me"].following, ["User", "Me"].url,  ["User", "Me"].location, ["User", "Me"].followers, ["User", "Me"].screen_name,  ["User", "Me"].name,  ["User"].profile_image_url, ["User"].url,  ["User"].location, ["User"].statuses, ["User"].name,  ["User"].screen_name, ["User"].followers, ["User"].following, ["Tweet"].created_at,  ["Tweet"].id, ["Tweet"].id_str, ["Tweet"].text, ["Tweet"].favorites,  ["Tweet"].import_method, ["Source"].name, ["Hashtag"].name,  ["Link"].url.
"""

### LLAMA3

In [40]:
import time
time_start = time.time()
rules_llama = generator(encoded_data, zero_prompt, "llama3-70b-8192", 8000, 300)
time_end = time.time()
execution_time = time_end-time_start
print(f"Execution time: {execution_time}")

Execution time: 525.2886788845062


In [20]:
print(rules_llama)

What a delightfully complex dataset!

After analyzing the graph structure, node properties, and relationships, I've identified several consistency rules to maintain accurate and consistent data. Here are the rules, along with their corresponding Cypher queries to check the number of nodes or relationships that satisfy each rule:

**Rule 1: Each user has a unique profile image URL**
Description: Ensure that each user node has a unique `profile_image_url` property.
Cypher Query: `MATCH (u:User) RETURN COUNT(DISTINCT u.profile_image_url)`

**Rule 2: Each tweet has a unique ID**
Description: Ensure that each tweet node has a unique `id` property.
Cypher Query: `MATCH (t:Tweet) RETURN COUNT(DISTINCT t.id)`

**Rule 3: Each hashtag has a unique name**
Description: Ensure that each hashtag node has a unique `name` property.
Cypher Query: `MATCH (h:Hashtag) RETURN COUNT(DISTINCT h.name)`

**Rule 4: Each link has a unique URL**
Description: Ensure that each link node has a unique `url` property.

**Rule 1: Unique Tweet IDs**
Description: Each tweet node should have a unique `id` property.


- Support: 2407
            MATCH (t:Tweet)
            WITH t.id AS tweetId, COUNT(t) AS tweetCount
            WHERE tweetCount = 1
            RETURN SUM(tweetCount) AS support_count

- Coverage: 100%

            MATCH (t:Tweet)
            WITH COUNT(t) AS total_tweets
            MATCH (t:Tweet)
            WITH t.id AS tweetId, COUNT(t) AS tweetCount, total_tweets
            WHERE tweetCount > 1
            RETURN COUNT(DISTINCT tweetId) * 1.0 / total_tweets AS coverage


- Confidence: 100%

**Rule 2: Valid URLs in Link nodes**
Description: All `url` properties in Link nodes should be valid URLs.

- Support: 1561

            MATCH (l:Link)
            WHERE l.url STARTS WITH 'http://' OR l.url STARTS WITH 'https://'
            RETURN COUNT(l) AS invalid_urls


- Converage: 100%

- Confidence: 100%

            MATCH (l:Link)
            WITH COUNT(l) AS total_links
            MATCH (l:Link)
            WHERE  l.url STARTS WITH 'http://' OR l.url STARTS WITH 'https://'
            RETURN COUNT(l) * 1.0 / total_links AS confidence


**Rule 3: User nodes should have a name**
Description: All User nodes should have a non-empty `name` property.

- Support: 38986

            MATCH (u:User)
            WHERE u.screen_name IS NOT NULL AND u.screen_name <> ''
            RETURN COUNT(u) AS missing_screen_names


- Coverage: 100

            MATCH (u:User)
            WITH COUNT(u) AS total_users
            MATCH (u:User)
            WHERE u.screen_name IS NOT NULL and u.screen_name <> ''
            RETURN COUNT(u) * 1.0 / total_users AS coverage

- Confidence: 100%

**Rule 4: Hashtag nodes should have a non-empty name**
Description: All Hashtag nodes should have a non-empty `name` property.

- Support: 344

            MATCH (h:Hashtag)
            WHERE h.name IS NOT NULL AND h.name <> ''
            RETURN COUNT(h) AS valid_names

- Coverage: 100%

            MATCH (u:User)
            WITH COUNT(u) AS total_users
            MATCH (u:User)
            WHERE u.screen_name IS NOT NULL AND u.screen_name <> ''
            RETURN COUNT(u) * 1.0 / total_users AS coverage

- Confidence: 100%



**Rule 5: Single Source per Tweet**
Description: Each Tweet node should have at most one Source node connected to it through the `USING` relationship.

- Support: 2146

            MATCH (t:Tweet)-[r:USING]->(s:Source)
            WITH t, COUNT(s) AS source_count
            WHERE source_count > 1
            RETURN COUNT(t) AS invalid_tweets

- Coverage: 89.15
            
            MATCH (t:Tweet)
            WITH COUNT(t) AS total_tweets
            MATCH (t:Tweet)-[r:USING]->(s:Source)
            WITH total_tweets, COUNT(DISTINCT t) AS tweets_with_sources
            RETURN tweets_with_sources * 1.0 / total_tweets AS coverage


- Confidence: 89.15


**Rule 6: Valid relationships between nodes**
Description: Ensure that the relationships between nodes are valid and consistent. For example, a Tweet node can only be connected to a User node through the `POSTS` relationship.

- Support: 2146

          MATCH (t:Tweet)<-[r:POSTS]-(u:User)
          WITH t, COUNT(u) AS userCount
          WHERE userCount = 1
          RETURN COUNT(t) AS support

- Coverage: 89%
  
          MATCH (t:Tweet)<-[r:POSTS]-(u:User)
          WITH t, COUNT(u) AS userCount
          WITH COUNT(CASE WHEN userCount = 1 THEN 1 ELSE NULL END) AS validTweets
          MATCH (t:Tweet) 
          RETURN validTweets * 1.0 / COUNT(t) AS coverage

- Confidence: 100%
  
          MATCH (t:Tweet)<-[r:POSTS]-(u:User)
          WITH t, COUNT(u) AS userCount
          RETURN COUNT(CASE WHEN userCount = 1 THEN 1 ELSE NULL END) * 1.0 / COUNT(t) AS confidence


**Rule 7: Unique User IDs**
Description: Each User node should have a unique `id` property.

- Support: 0

- Coverage: 0

- Confidence: 0

**Rule 8: Valid Followers and Following relationships**
Description: Ensure that the `FOLLOWS` relationships between User nodes are valid and consistent. A User node can only follow another User node, and the `FOLLOWS` relationship should not form cycles.

- Support: 44630
  
        MATCH (u1:User)-[f:FOLLOWS]->(u2:User)
        WHERE u1<>u2
        RETURN COUNT(f) AS support

- Covẻage: 100%

- Confidence: 100%


### Mixtral 

In [25]:
time_start = time.time()
rules_mixtral = generator(encoded_data, zero_prompt, "mixtral-8x7b-32768", 8000, 0)
time_end = time.time()
execution_time = time_end-time_start
print(f"Execution time: {execution_time}")

Execution time: 517.3231461048126


In [44]:
print(rules_mixtral)

Sure, based on the input data provided, here are some consistency rules and corresponding Cypher queries:

1. **Rule:** For each `User` node, the number of `followers` should be equal to the number of `FOLLOWS` relationships going out from the `User` node with label ["User", "Me"].

	Cypher query:
	```
	MATCH (u:User)-[:FOLLOWS]->(uMe:User [label: "Me"])
	RETURN DISTINCT u, count(*) AS num_followers, size((uMe)-[:FOLLOWS]->()) AS num_following
	```
2. **Rule:** For each `Tweet` node, the number of `favorites` should be equal to the number of `CONTAINS` relationships going out from the `Tweet` node to `Link` nodes.

	Cypher query:
	```
	MATCH (t:Tweet)-[:CONTAINS]->(l:Link)
	RETURN DISTINCT t, count(*) AS num_favorites
	```
3. **Rule:** For each `Hashtag` node, the number of `TAGS` relationships should be equal to the number of `Tweet` nodes that it is connected to.

	Cypher query:
	```
	MATCH (h:Hashtag)<-[:TAGS]-(t:Tweet)
	RETURN DISTINCT h, count(*) AS num_tweets
	```
4. **Rule:** Fo

**Consistency Rule 1**
Description: A `User` node must have a unique `screen_name` property.

- Support: 38986

        MATCH (u:User)
        WITH u.screen_name AS screen_name, COUNT(u) AS count
        WHERE count = 1
        RETURN COUNT(screen_name) AS support

- Coverage: 100%

      MATCH (u:User)
      WITH u.screen_name AS screen_name, COUNT(u) AS count, u
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(u) AS coverage

- Confidence: 100%

      MATCH (u:User)
      WHERE EXISTS(u.screen_name)
      WITH u.screen_name AS screen_name, COUNT(u) AS count
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(screen_name) AS confidence

**Consistency Rule 2:**
Description: A `Tweet` node should not have more than one `created_at` property.

- Support: 2407

- Coverage: 100%

- Confidence: 100%

**Consistency Rule 3:**
Description: A `User` node should not be connected to a `Tweet` node by both `MENTIONS` and `MENTIONS, Me` relationships.
Cypher Query:
```cypher
MATCH (u:User)-[m1:MENTIONS]->(t:Tweet)<-[:MENTIONS, Me]-(me:User, Me)
RETURN DISTINCT u.screen_name, t.id
```

- Support: 0

- Coverage: 0

- Confidence: 0


**Consistency Rule 4:**
Description: A `User` node linked with the `FOLLOWS` relationship should have a `name` property.

- Support: 34507

            MATCH (u:User)-[:FOLLOWS]->()
            RETURN COUNT(DISTINCT u) AS support

- Coverage: 100%

            MATCH (u:User)-[:FOLLOWS]->()
            WITH COUNT(DISTINCT u) AS total_follows
            MATCH (u:User)-[:FOLLOWS]->()
            WHERE u.screen_name IS NOT NULL AND u.screen_name <> ''
            RETURN COUNT(DISTINCT u) * 1.0 / total_follows AS coverage


- Confidence: 100%



**Consistency Rule 5:**
Description: A `Hashtag` node should not be connected to a `Tweet` node by both `TAGS` and `TAGS, Me` relationships.

- Support: 1439
            MATCH (t:Tweet)-[:TAGS|TAGS_ME]->(h:Hashtag)
            RETURN COUNT(h) AS support


- Coverage:100%

- Confidence: 100%


**Consistency Rule 6:**
Description: A `Tweet` node should have a `text` property with a maximum length of 280 characters.

- Support: 2146

            MATCH (t:Tweet)
            WHERE t.properties.text IS NOT NULL AND NOT t.properties.text = ""
            RETURN COUNT(t) AS support

- Coverage: 100%

- Confidence: 100%

**Consistency Rule 7:**
Description: A `Tweet` node should have a unique `id` property.


- Support: 2407
            MATCH (t:Tweet)
            WITH t.id AS tweetId, COUNT(t) AS tweetCount
            WHERE tweetCount = 1
            RETURN SUM(tweetCount) AS support_count

- Coverage: 100%

            MATCH (t:Tweet)
            WITH COUNT(t) AS total_tweets
            MATCH (t:Tweet)
            WITH t.id AS tweetId, COUNT(t) AS tweetCount, total_tweets
            WHERE tweetCount > 1
            RETURN COUNT(DISTINCT tweetId) * 1.0 / total_tweets AS coverage


- Confidence: 100%


**Consistency Rule 8:**
Description: A `Link` node should have a unique `url` property.

- Support: 1561

        MATCH (l:Link)
        WHERE l.url IS NOT NULL
        RETURN COUNT(DISTINCT l.url) AS support

- Coverage: 100%

- Confidence: 100%

**Consistency Rule 9:**
Description: A `Tweet` node should have zero or more `RETWEETS` relationships.

- Support: 286
- Coverage: 11.89
- Confidence: 11.89

**Consistency Rule 10:**
Description: A `Tweet` node should have zero or one `created_by` relationship.

- Support: 2407

            MATCH (t:Tweet)
            WHERE SIZE((t)-[:created_by]->()) <= 1
            RETURN COUNT(t) AS Support

- Coverage: 100%

- Confidence: 100%


### Few-shot prompting

In [26]:
few_shot_prompt = """
    Examples of consistency Rules:
    
    1. Each `Tweet` node should have a unique `id_str` property to ensure that no duplicate tweets exist.
    2. Nodes should only have one of the allowed labels: `User`, `Tweet`, `Hashtag`, `Link`, or `Source`. This ensures that no unexpected labels are introduced into the graph.
    3. Every node labeled `User` must have a `screen_name` property to ensure that users are correctly identified.

    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships. 
    For each consistency rule you identify, provide a clear description of the rule and generated the corresponding Cypher query to check the 
    number of nodes or relationships that satisfy the rule.
    Your query should return the count of entities (nodes or relationships) that match the rule described. 
    Provide the query in the format of valid Cypher syntax, simple and ready for execution in a Neo4j databaseBelow is the input data:
    Graph Information:
    
    - Node: ["User", "Me"],  ["User"],  ["Tweet"],  ["Source"],  ["Hashtag"], ["Link"]. 
    - Relationship: FOLLOWS, POSTS, MENTIONS,  TAGS, CONTAINS, USING,  RETWEETS,  REPLY_TO.
    - Properties: ["User", "Me"].profile_image_url, ["User", "Me"].following, ["User", "Me"].url,  ["User", "Me"].location, ["User", "Me"].followers, ["User", "Me"].screen_name,  ["User", "Me"].name,  ["User"].profile_image_url, ["User"].url,  ["User"].location, ["User"].statuses, ["User"].name,  ["User"].screen_name, ["User"].followers, ["User"].following, ["Tweet"].created_at,  ["Tweet"].id, ["Tweet"].id_str, ["Tweet"].text, ["Tweet"].favorites,  ["Tweet"].import_method, ["Source"].name, ["Hashtag"].name,  ["Link"].url.

    """

### LLAMA - 3

In [38]:
time_start = time.time()
rules_fs_llama = generator(encoded_data, few_shot_prompt, "llama3-70b-8192", 8000, 200)
time_end = time.time()
execution_time = time_end-time_start
print(f"Execution time: {execution_time}")

Execution time: 410.20038294792175


In [39]:
print(rules_fs_llama)

Based on the provided graph data, I've identified some consistency rules to ensure accuracy and consistency in the graph database. Here are the rules, along with their corresponding Cypher queries:

**Rule 1: Each User node should have a unique screen_name property**

Description: Ensure that no two User nodes have the same screen_name.

Cypher Query:
```
MATCH (u:User) WITH u.screen_name AS screen_name, COUNT(u) AS count
WHERE count > 1
RETURN count
```
This query returns the count of duplicate screen_name values among User nodes.

**Rule 2: Each Tweet node should have a unique id_str property**

Description: Ensure that no two Tweet nodes have the same id_str.

Cypher Query:
```
MATCH (t:Tweet) WITH t.id_str AS id_str, COUNT(t) AS count
WHERE count > 1
RETURN count
```
This query returns the count of duplicate id_str values among Tweet nodes.

**Rule 3: Each node should have only one allowed label**

Description: Ensure that nodes are labeled correctly and do not have unexpected labe

1. **Unique User Identities**: Each user node should have a unique `screen_name` property to ensure that no duplicate users exist.

- Support: 38986

        MATCH (u:User)
        WITH u.screen_name AS screen_name, COUNT(u) AS count
        WHERE count = 1
        RETURN COUNT(screen_name) AS support

- Coverage: 100%

      MATCH (u:User)
      WITH u.screen_name AS screen_name, COUNT(u) AS count, u
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(u) AS coverage

- Confidence: 100%

      MATCH (u:User)
      WHERE EXISTS(u.screen_name)
      WITH u.screen_name AS screen_name, COUNT(u) AS count
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(screen_name) AS confidence


2. **Valid Follower Counts**: The `followers` property should be a non-negative integer.

- Support: 38690
        
        MATCH (n)
        WHERE n.followers IS NOT NULL AND n.followers >= 0
        RETURN COUNT(n) AS Support

- Coverage: 100%

- Confidence: 100%


3. **Valid Following Counts**: The `following` property should be a non-negative integer.

- Support: 38690
        
        MATCH (n)
        WHERE n.following IS NOT NULL AND n.followers >= 0
        RETURN COUNT(n) AS Support

- Coverage: 100%

- Confidence: 100%

4. **Consistent Profile Image URLs**: The `profile_image_url` property should be a valid URL.

- Support: 38960

            MATCH (n)
            WHERE n.profile_image_url IS NOT NULL 
            AND n.profile_image_url =~ '^https?:\/\/[\w\-]+(\.[\w\-]+)+[/#?]?.*$'
            RETURN COUNT(n) AS Support

- Coverage: 100%

- Confidence: 100%


5. **Valid Location Formats**: The `location` property should follow a consistent format (e.g., city, state, country).

- Support: 32

        MATCH (u:User)
        WHERE u.location =~ "[A-Za-z]+, [A-Za-z]{2} ([A-Za-z]+)"
        RETURN COUNT(*) AS Support


- Coverage: 0.082
- Confidence: 0.082



6. **Valid URL Formats**: The `url` property should be a valid URL.

- Support: 16997
            MATCH (n)
            WHERE n.url IS NOT NULL 
            AND n.url =~ '^https?:\/\/[\w\-]+(\.[\w\-]+)+[/#?]?.*$'
            RETURN COUNT(n) AS Support

- Coverage: 100%

- Confidence: 100%

7. **Consistent Statuses Counts**: The `statuses` property should be a non-negative integer.

- Support: 4052

            MATCH (n)
            WHERE n.statuses IS NOT NULL 
            AND n.statuses >= 0
            RETURN COUNT(n) AS Support

- Coverage: 100%

- Confidence: 100%

#### Mixtral

In [43]:
time_start = time.time()
rules_fs_mixtral = generator(encoded_data, few_shot_prompt, "mixtral-8x7b-32768", 8000, 0)
time_end = time.time()
execution_time = time_end-time_start
print(f"Execution time: {execution_time}")

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `mixtral-8x7b-32768` in organization `org_01hyvd04xpe3vtmpx59tbpw3cf` on tokens per minute (TPM): Limit 5000, Used 7763, Requested 3780. Please try again in 1m18.522s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [48]:
print(rules_fs_mixtral)

Consistency Rules:

1. Each `Tweet` node should have a unique `id_str` property.

    Cypher Query:

    ```sql
    MATCH (t:Tweet)
    WITH t.id_str AS id_str
    RETURN COUNT(DISTINCT id_str) AS unique_tweet_count
    ```

2. Nodes should only have one of the allowed labels: `User`, `Tweet`, `Hashtag`, `Link`, or `Source`.

    Cypher Query:

    ```sql
    MATCH (n)
    WITH n, labels(n) AS labels
    WHERE SIZE(labels) > 1
    RETURN COUNT(n) AS nodes_with_multiple_labels
    ```

3. Every node labeled `User` must have a `screen_name` property.

    Cypher Query:

    ```sql
    MATCH (u:User)
    WHERE NOT EXISTS(u.screen_name)
    RETURN COUNT(u) AS users_without_screen_name
    ```

4. Check for nodes labeled `Tweet` that have a `created_at` DateTime format.

    Cypher Query:

    ```sql
    MATCH (t:Tweet)
    WHERE NOT (t.created_at IS NOT NULL AND t.created_at =~ '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z$')
    RETURN COUNT(t) AS tweets_with_incorrect_created_at_fo

**1. Unique Constraint on id\_str for Tweet nodes:**

Description:
Ensure that every tweet has a unique id\_str property.

- Support: 2146

        MATCH (t:Tweet)
        WITH t.id_str AS id_str, COUNT(t) AS count
        WHERE count = 1
        RETURN COUNT(id_str) AS support

- Coverage: 100%

      MATCH (u:Tweet)
      WITH u.id_str AS id_str, COUNT(u) AS count, u
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(u) AS coverage

- Confidence: 100%

      MATCH (u:Tweet)
      WHERE EXISTS(u.id_str)
      WITH u.id_str AS id_str, COUNT(u) AS count
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(id_str) AS confidence

**2. Unique combination of name and screen\_name for User nodes:**

Description:
Ensure that no two users have the same name and screen\_name combination.

- Support: 38986

        MATCH (u:User)
        WITH u.screen_name AS screen_name, COUNT(u) AS count
        WHERE count = 1
        RETURN COUNT(screen_name) AS support

- Coverage: 100%

      MATCH (u:User)
      WITH u.screen_name AS screen_name, COUNT(u) AS count, u
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(u) AS coverage

- Confidence: 100%

      MATCH (u:User)
      WHERE EXISTS(u.screen_name)
      WITH u.screen_name AS screen_name, COUNT(u) AS count
      RETURN SUM(CASE WHEN count = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(screen_name) AS confidence
**3. Hashtags should only be connected to Tweet nodes:**

Description:
Ensure that Hashtag nodes are only connected to Tweet nodes.

- Support: 1493:

        MATCH (n)-[r]->(h:Hashtag)
        WHERE (n:Tweet)
        RETURN h, r, n;

- Coverage: 100%

- COnfidence: 100%
- 
**4. Users can only follow other Users:**

Description:
Ensure that relationships going out from User nodes have the label "FOLLOWS" and connect to a User node.

- Support: 44630
  
        MATCH (u1:User)-[f:FOLLOWS]->(u2:User)
        WHERE u1<>u2
        RETURN COUNT(f) AS support

- Covẻage: 100%

- Confidence: 100%


5. Each user has at most one home location:

Description:
Ensure that a user does not have multiple nodes with the same location as their home\_location.

- Support: 7808

            MATCH (u:User)
            WITH u.location AS location, COUNT(u) AS userCount
            WHERE userCount = 1
            RETURN COUNT(location) AS Support

- Coverage: 79.33%

            MATCH (u:User)
            WHERE u.home_location IS NOT NULL
            RETURN COUNT(DISTINCT u.home_location) AS TotalUniqueLocations

- Confidence: 79.33%

6. User and Me nodes should only have Tweet and Follows relationships:

Description:
Ensure that User, Me nodes only have Tweet and Follows relationships.

- Support: 2146

        MATCH (um:User)-[r]->(n)
        WHERE NOT (type(r) = 'TWEET' OR type(r) = 'FOLLOWS')
        RETURN r, um, n;

- Coverage: 2146/46776 (4.59)

- Confidence: 4.59

**7. Each Tweet node should have at most one Label and Link relationship:**

Description:
Ensure that a tweet does not have multiple nodes with different labels and urls.

- Support: 129

            MATCH (t:Tweet)-[:CONTAINS]->(l:Link)
            WITH t, COUNT(DISTINCT l) AS link_count
            WHERE link_count > 1
            RETURN t, link_count;

- Coverage: 67.64%

        MATCH (t:Tweet)
        WITH COUNT(t) AS total_tweets
        MATCH (t:Tweet)-[:CONTAINS]->(l:Link)
        WITH total_tweets, COUNT(DISTINCT t) AS tweets_with_links
        RETURN tweets_with_links * 1.0 / total_tweets AS coverage;


- Confidence: 79.24%
  
        MATCH (t:Tweet)-[:CONTAINS]->(l:Link)
        WITH t, COUNT(DISTINCT l) AS link_count
        MATCH (t:Tweet)
        WITH COUNT(DISTINCT t) AS total_tweets, 
             COUNT(CASE WHEN link_count > 1 THEN 1 END) AS invalid_tweets
        RETURN invalid_tweets * 1.0 / total_tweets AS confidence;


