# Import required libraries

In [None]:
!pip install groq
!pip install faiss-cpu
!pip install gpt4all
!pip install langchain_community
!pip install langchain_groq
!pip install langchain
!pip install sentence-transformers
!pip install langchain_openai
!pip install langchain-community

Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading groq-0.9.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)


In [None]:
import json
import networkx as nx
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.embeddings import GPT4AllEmbeddings
import openai
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
import time
# embeddings using langchain
from langchain.embeddings import SentenceTransformerEmbeddings

## Reading graph database

In [None]:
file_path = 'wc_full.json'
with open(file_path, 'r',  encoding='utf-8-sig') as file:
    data = json.load(file)

In [None]:
data

[{'n': {'identity': 8,
   'labels': ['Team'],
   'properties': {'name': 'Equatorial Guinea', 'id': '1888631'},
   'elementId': '8'},
  'r': {'identity': 4161,
   'start': 8,
   'end': 2367,
   'type': 'PLAYED_IN',
   'properties': {'score': 0},
   'elementId': '4161',
   'startNodeElementId': '8',
   'endNodeElementId': '2367'},
  'm': {'identity': 2367,
   'labels': ['Match'],
   'properties': {'date': '2011-06-29',
    'stage': 'First Stage',
    'id': '300144441'},
   'elementId': '2367'}},
 {'n': {'identity': 8,
   'labels': ['Team'],
   'properties': {'name': 'Equatorial Guinea', 'id': '1888631'},
   'elementId': '8'},
  'r': {'identity': 4004,
   'start': 8,
   'end': 2210,
   'type': 'PLAYED_IN',
   'properties': {'score': 2},
   'elementId': '4004',
   'startNodeElementId': '8',
   'endNodeElementId': '2210'},
  'm': {'identity': 2210,
   'labels': ['Match'],
   'properties': {'date': '2011-07-03',
    'stage': 'First Stage',
    'id': '300144450'},
   'elementId': '2210'}},
 {

## Encoding graph using incident encoder

In [None]:
# Tạo đồ thị sử dụng NetworkX
graph = nx.Graph()

# Thêm các node và cạnh vào đồ thị
for item in data:
    # Thêm node "n"
    node_n = item['n']
    graph.add_node(node_n['identity'], labels=node_n['labels'], **node_n['properties'])

    # Thêm node "m"
    node_m = item['m']
    graph.add_node(node_m['identity'], labels=node_m['labels'], **node_m['properties'])

    # Thêm cạnh "r"
    edge_r = item['r']
    graph.add_edge(edge_r['start'], edge_r['end'], id=edge_r['identity'], label=edge_r['type'], **edge_r['properties'])

# Hàm tạo chuỗi mô tả node và các thuộc tính của nó
def create_node_string(graph):
    node_descriptions = []
    for node, props in graph.nodes(data=True):
        labels = props.pop('labels', [])
        labels_str = ', '.join(labels)
        prop_desc = ', '.join([f"{key}: {value}" for key, value in props.items()])
        node_descriptions.append(f"{node} [{labels_str}] ({prop_desc})")
    return ', '.join(node_descriptions)

# Hàm mã hóa đồ thị
def encode_graph(graph):
    nodes_string = create_node_string(graph)
    output = "G describes a graph among nodes: \n%s.\n" % nodes_string
    if graph.edges():
        output += "In this graph:\n"
    for source_node in graph.nodes():
        target_nodes = list(graph.neighbors(source_node))
        target_nodes_str = ""
        nedges = 0
        for target_node in target_nodes:
            edge_props = graph.get_edge_data(source_node, target_node)
            edge_props_str = ', '.join([f"{key}: {value}" for key, value in edge_props.items()])
            target_nodes_str += f"{target_node} ({edge_props_str}), "
            nedges += 1
        if nedges > 1:
            output += "Node %s is connected to nodes %s.\n" % (
                source_node,
                target_nodes_str[:-2],
            )
        elif nedges == 1:
            output += "Node %s is connected to node %s.\n" % (
                source_node,
                target_nodes_str[:-2],
            )
    return output

# Mã hóa đồ thị
encoded_graph = encode_graph(graph)
print(encoded_graph)


G describes a graph among nodes: 
8 [Team] (name: Equatorial Guinea, id: 1888631), 2367 [Match] (date: 2011-06-29, stage: First Stage, id: 300144441), 2210 [Match] (date: 2011-07-03, stage: First Stage, id: 300144450), 2268 [Match] (date: 2011-07-06, stage: First Stage, id: 300144454), 1074 [Squad] (id: Equatorial Guinea in 2011), 0 [Tournament] (year: 2011, name: Germany 2011, id: 251475, shortName: germany2011), 9 [Team] (name: Nigeria, id: 1882893), 7 [Tournament] (year: 2015, name: Canada 2015, id: 266030, shortName: canada2015), 2 [Tournament] (year: 1991, name: China PR 1991, id: 3373, shortName: chinapr1991), 5 [Tournament] (year: 1999, name: USA 1999, id: 4644, shortName: usa1999), 1 [Tournament] (year: 2019, name: France 2019, id: 278513, shortName: france2019), 3 [Tournament] (year: 2003, name: USA 2003, id: 6929, shortName: usa2003), 6 [Tournament] (year: 2007, name: China 2007, id: 10232, shortName: china2007), 4 [Tournament] (year: 1995, name: Sweden 1995, id: 4654, shortN

## RAG
### Embedding

In [None]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
embedding_model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
  model_name = embedding_model_name,
  gpt4all_kwargs = gpt4all_kwargs
)

Downloading: 100%|██████████| 45.9M/45.9M [00:00<00:00, 84.4MiB/s]
Verifying: 100%|██████████| 45.9M/45.9M [00:00<00:00, 498MiB/s]


### Vector database

In [None]:

vector_db_path = "vectorstores/db_faiss"

def create_db_from_graph(encoded_graph):

    # Split graph data into smail text
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=8192,
        chunk_overlap=100,
        length_function=len

    )

    chunks = text_splitter.split_text(encoded_graph)


    #  Faiss Vector DB
    db = FAISS.from_texts(texts=chunks, embedding=embeddings)
    db.save_local(vector_db_path)
    return db


In [None]:
db_test = create_db_from_graph(encoded_graph)



## LLMs

In [None]:
vector_db_path = "vectorstores/db_faiss"

# Load LLM
def load_llm(model_file):
    llm = CTransformers(
        model=model_file,
        model_type="llama",
        max_new_tokens=8192,
        temperature=0.01
    )
    return llm



### Create prompt template

In [None]:
def creat_prompt(template):
    prompt = PromptTemplate(template = template, input_variables=["context", "question"])
    return prompt

### Create chain

In [None]:
# Create chain
def create_chain(prompt, llm, db):
    llm_chain = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type= "stuff",
        retriever = db.as_retriever(search_kwargs = {"k":3}, max_tokens_limit=8192),
        return_source_documents = False,
        chain_type_kwargs= {'prompt': prompt}

    )
    return llm_chain

### Reading vector db

In [None]:
def read_vectors_db():
    # Embeding
    db = FAISS.load_local(vector_db_path,
                          embedding_model_name,
                          allow_dangerous_deserialization=True)
    return db

## Start

In [None]:
# start
db = read_vectors_db()

GROQ_API_KEY = "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"
llm_llama = ChatGroq(groq_api_key=GROQ_API_KEY, model="llama3-70b-8192")
llm_mixtral = ChatGroq(groq_api_key=GROQ_API_KEY, model="mixtral-8x7b-32768")



### Prompt template

In [None]:
template = """<|im_start|>system\nUse the following information
                          to answer the question. Please exactly
                          \n
    {context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""
prompt = creat_prompt(template)


### LLMs chain

In [None]:
llm_chain_llama  = create_chain(prompt, llm_llama, db_test)
llm_chain_mixtral = create_chain(prompt, llm_mixtral, db_test)

## Zero-shot prompting


In [23]:
question_zero =  """
Based on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).
Consider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent
and accurate data.

For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query

"""

### LLAMA3

In [25]:
responses_llama_zero = llm_chain_llama.invoke({"query": question_zero})
for response in str(responses_llama_zero).split("\n"):
  print(response)

{'query': '\nBased on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).\nConsider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent \nand accurate data.\n\nFor each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query\n\n', 'result': "Based on the provided graph data, I've identified several consistency rules to maintain accurate and consistent data. Here are the rules, along with their descriptions and corresponding Cypher queries:\n\n**Rule 1: Unique Node Representation**\n\nEach node should have a unique representation (id) within its respective category (e.g., player, team, coach).\n\nCypher Query:\n```\nCREATE CONSTRAINT ON (n) ASSERT n.id IS UNIQUE;\n```\n**Rule 2: Consistent Node Labels**\n\nEach node should have a consistent label based on its category (e.g., player, 

### Mixtral

In [27]:
responses_mixtral_zero = llm_chain_mixtral.invoke({"query": question_zero})
for response in str(responses_mixtral_zero).split("\n"):
  print(response)

{'query': '\nBased on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).\nConsider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent \nand accurate data.\n\nFor each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query\n\n', 'result': "Sure, I can help you generate some consistency rules based on the graph properties you've provided. Here are some potential rules:\n\n1. **Unique Player Representation**: Each player should be uniquely represented in the graph. This means that there should not be two nodes with the same `id` under the `REPRESENTS` relationship.\n\n   Cypher Query:\n   ```\n   MATCH (n:Node) WHERE n.label = 'REPRESENTS' WITH n.id AS id, COUNT(*) AS count WHERE count > 1 RETURN id\n   ```\n\n2. **Unique Squad Membership**: A player cannot be in two different squad

## Few-shot prompting

In [28]:
question_few_shot = """
    Examples of consistency Rules:

    1. Unique Person ID: Each Person node should have a unique id.
    2. Person Node Properties: Each Person node should have a name and dob.
    3. Ensure that no two matches have the same date, stage, and tournament. This helps avoid duplicate matches within the same tournament.

    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships.
    For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking

    Requirements:

    Ensure data consistency across all nodes and relationships.
    Avoid rules that apply to only one type of node or relationship broader context.
"""

### LLAMA 3

In [29]:
responses_llama_few = llm_chain_llama.invoke({"query": question_few_shot})
for response in str(responses_llama_few).split("\n"):
  print(response)

{'query': '\n    Examples of consistency Rules:\n\n    1. Unique Person ID: Each Person node should have a unique id.\n    2. Person Node Properties: Each Person node should have a name and dob.\n    3. Ensure that no two matches have the same date, stage, and tournament. This helps avoid duplicate matches within the same tournament.\n\n    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships.\n    For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking\n\n    Requirements:\n\n    Ensure data consistency across all nodes and relationships.\n    Avoid rules that apply to only one type of node or relationship broader context.\n', 'result': 'Here are some additional consistency rules to ensure accuracy and consistency in the graph database, considering all node types and relationships:\n\n**Rule 1: Unique Representation ID**\n\nEach REPRESENTS 

### Mixtral

In [32]:
responses_mixtral_few = llm_chain_mixtral.invoke({"query": question_few_shot})
for response in str(responses_mixtral_few).split("\n"):
  print(response)

{'query': '\n    Examples of consistency Rules:\n\n    1. Unique Person ID: Each Person node should have a unique id.\n    2. Person Node Properties: Each Person node should have a name and dob.\n    3. Ensure that no two matches have the same date, stage, and tournament. This helps avoid duplicate matches within the same tournament.\n\n    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships.\n    For each consistency rule you identify, provide a clear description of the rule and the corresponding Cypher query for checking\n\n    Requirements:\n\n    Ensure data consistency across all nodes and relationships.\n    Avoid rules that apply to only one type of node or relationship broader context.\n', 'result': "Sure, here are some consistency rules that can be applied to the given graph database:\n\n1. **Unique Relationship Endpoints**: Each relationship should have unique nodes at its ends. This means that a rela