## Import required libraries

In [None]:
!pip install groq
!pip install faiss-cpu
!pip install gpt4all
!pip install langchain_community
!pip install langchain_groq
!pip install langchain
!pip install sentence-transformers
!pip install langchain_openai
!pip install langchain-community

In [None]:
import json
import os
import time

import networkx as nx
import openai
from groq import Groq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.llms import CTransformers
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
# embeddings using langchain
from langchain.embeddings import SentenceTransformerEmbeddings
# NOTE: the second GPT4AllEmbeddings import rebinds the name; original order is kept.
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.embeddings import GPT4AllEmbeddings

## Reading graph database

In [None]:
# Location of the JSON export to load; the utf-8-sig codec tolerates a leading BOM.
file_path = 'wc_full.json'
with open(file_path, mode='r', encoding='utf-8-sig') as file:
    # Parse the whole file content into Python objects.
    data = json.loads(file.read())

In [None]:
# Show only the first few records; displaying the whole export floods the notebook output.
data[:3]

## Encoding graph using incident encoder

In [None]:
# Build an in-memory NetworkX graph from the loaded records.
# NOTE(review): nx.Graph is undirected and stores a single edge per node pair, so
# relationship direction is not kept and a later relationship between the same two
# nodes overwrites matching attributes (e.g. id, label) of an earlier one — confirm
# this is acceptable for the dataset.
graph = nx.Graph()

# Every record holds two endpoint nodes ("n" and "m") and one relationship ("r").
for item in data:
    # Register both endpoints, keyed by their identity, with labels and properties as attributes.
    for endpoint_key in ('n', 'm'):
        endpoint = item[endpoint_key]
        graph.add_node(
            endpoint['identity'],
            labels=endpoint['labels'],
            **endpoint['properties'],
        )

    # Connect the relationship's start and end, keeping its id, type and properties.
    edge_r = item['r']
    edge_attrs = dict(id=edge_r['identity'], label=edge_r['type'])
    graph.add_edge(edge_r['start'], edge_r['end'], **edge_attrs, **edge_r['properties'])

# Function to generate a string describing the node and its properties
def create_node_string(graph):
    """Describe every node of ``graph`` as ``id [labels] (key: value, ...)``.

    Args:
        graph: Object exposing ``nodes(data=True)`` that yields
            ``(node, attribute_dict)`` pairs (e.g. a NetworkX graph).

    Returns:
        A single string with all node descriptions joined by ``", "``.
        The ``labels`` attribute is rendered inside the square brackets and
        is left out of the property list.
    """
    node_descriptions = []
    for node, props in graph.nodes(data=True):
        # Read the labels without pop(): `props` is the graph's own attribute
        # dict, so popping would strip the labels from the graph and make a
        # second call produce a different description.
        labels_str = ', '.join(props.get('labels', []))
        prop_desc = ', '.join(
            f"{key}: {value}" for key, value in props.items() if key != 'labels'
        )
        node_descriptions.append(f"{node} [{labels_str}] ({prop_desc})")
    return ', '.join(node_descriptions)

# encoding function
def encode_graph(graph):
    """Render ``graph`` as text: a node list followed by one line per connected node.

    The text starts with the node descriptions from ``create_node_string``.
    If the graph has edges, an "In this graph:" line follows, and every node
    with at least one neighbour gets a sentence listing its neighbours together
    with the attributes of the connecting edge. Nodes without neighbours are
    not mentioned in that part.

    Args:
        graph: Graph exposing ``nodes()``, ``edges()``, ``neighbors()`` and
            ``get_edge_data()``.

    Returns:
        The full textual encoding as one string.
    """
    parts = ["G describes a graph among nodes: \n%s.\n" % create_node_string(graph)]
    if graph.edges():
        parts.append("In this graph:\n")

    for source_node in graph.nodes():
        neighbour_texts = []
        for target_node in graph.neighbors(source_node):
            attrs = graph.get_edge_data(source_node, target_node)
            attrs_text = ', '.join(f"{key}: {value}" for key, value in attrs.items())
            neighbour_texts.append(f"{target_node} ({attrs_text})")

        # Isolated nodes contribute no sentence.
        if not neighbour_texts:
            continue

        # Singular/plural wording depends on how many neighbours were found.
        noun = "nodes" if len(neighbour_texts) > 1 else "node"
        parts.append(
            "Node %s is connected to %s %s.\n"
            % (source_node, noun, ', '.join(neighbour_texts))
        )

    return ''.join(parts)

# graph encoding
encoded_graph = encode_graph(graph)

# Print only the beginning of the encoding: the full text covers every node and
# edge and would flood the notebook output. `encoded_graph` itself is unchanged.
PREVIEW_CHARS = 2000
print(encoded_graph[:PREVIEW_CHARS])
print(f"... ({len(encoded_graph)} characters in total)")


## RAG
### Embedding

In [None]:
# NOTE(review): the next cell reassigns `embeddings` to a GPT4AllEmbeddings instance,
# so this SentenceTransformer model is only used if that cell is skipped — confirm
# which embedding backend is intended and drop the other.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# GPT4All embedding model used to embed the graph chunks.
embedding_model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
# `allow_download` is a boolean flag; the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled downloads as well).
gpt4all_kwargs = {'allow_download': True}
embeddings = GPT4AllEmbeddings(
  model_name = embedding_model_name,
  gpt4all_kwargs = gpt4all_kwargs
)

In [None]:
# Folder where the FAISS index is saved.
vector_db_path = "vectorstores/db_faiss"

def create_db_from_graph(encoded_graph):
    """Index the encoded graph text in FAISS and save the index to disk.

    The text is split on newlines into chunks (chunk size 8192, overlap 100,
    measured with ``len``), embedded with the module-level ``embeddings``
    object and saved under ``vector_db_path``.

    Args:
        encoded_graph: Textual graph encoding to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    splitter_options = dict(
        separator="\n",
        chunk_size=8192,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = CharacterTextSplitter(**splitter_options).split_text(encoded_graph)

    # Build the FAISS store from the chunks and save it for later reuse.
    db = FAISS.from_texts(texts=chunks, embedding=embeddings)
    db.save_local(vector_db_path)
    return db


In [None]:
# Build the FAISS store from the encoded graph; create_db_from_graph also saves it to disk.
db_test = create_db_from_graph(encoded_graph)

### LLMs

In [None]:
# Folder of the saved FAISS index.
vector_db_path = "vectorstores/db_faiss"

# Load LLM
def load_llm(model_file):
    """Create a CTransformers LLM of type "llama" from a local model file.

    Args:
        model_file: Value passed as the ``model`` argument of ``CTransformers``.

    Returns:
        The configured ``CTransformers`` instance (max_new_tokens=8192,
        temperature=0.01).
    """
    return CTransformers(
        model=model_file,
        model_type="llama",
        max_new_tokens=8192,
        temperature=0.01,
    )

#### Create prompt template

In [None]:
def creat_prompt(template):
    """Wrap ``template`` in a PromptTemplate with "context" and "question" variables.

    Args:
        template: Prompt text containing ``{context}`` and ``{question}`` placeholders.

    Returns:
        The resulting ``PromptTemplate``.
    """
    return PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )

#### create chain

In [None]:
# Create chain
def create_chain(prompt, llm, db):
    """Build a "stuff" RetrievalQA chain over a vector store.

    Args:
        prompt: Prompt template passed to the chain via ``chain_type_kwargs``.
        llm: Language model that answers the question.
        db: Vector store; its retriever returns the 2 most similar chunks.

    Returns:
        A ``RetrievalQA`` chain that does not return source documents.
    """
    # `max_tokens_limit` was previously passed to as_retriever(); it is not a
    # retriever option and had no effect there, so it is omitted to avoid
    # suggesting that a token limit is enforced.
    retriever = db.as_retriever(search_kwargs={"k": 2})
    llm_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={'prompt': prompt},
    )
    return llm_chain

#### Reading vector db

In [None]:
def read_vectors_db():
    """Load the FAISS index saved under ``vector_db_path``.

    Returns:
        The FAISS vector store, bound to the module-level ``embeddings`` object
        so that queries are embedded with the same model used to build the index.
    """
    # FAISS.load_local expects the embeddings object as its second argument,
    # not the model name string; a string cannot embed queries at search time.
    # allow_dangerous_deserialization=True unpickles the stored index — only
    # load index folders created by this notebook or another trusted source.
    db = FAISS.load_local(vector_db_path,
                          embeddings,
                          allow_dangerous_deserialization=True)
    return db

#### start

In [None]:
# start
db = read_vectors_db()

# Read the Groq API key from the environment instead of embedding it in the notebook.
# The key that used to be hardcoded here has been exposed and should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this cell.")

# Two Groq-hosted chat models that are compared in the experiments below.
llm_llama = ChatGroq(groq_api_key=GROQ_API_KEY, model="llama3-70b-8192")
llm_mixtral = ChatGroq(groq_api_key=GROQ_API_KEY, model="mixtral-8x7b-32768")

#### Prompt template

In [None]:
# Prompt text with {context} and {question} placeholders. Everything inside the
# triple-quoted string, including the indentation whitespace, is part of the prompt.
template = """<|im_start|>system\nUse the following information
                          to answer the question. Please exactly
                          \n
    {context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""
# Wrap the text in a PromptTemplate via the helper defined earlier.
prompt = creat_prompt(template)


#### LLM chains

In [None]:
# One retrieval chain per model, both sharing the same prompt and the freshly built index.
llm_chain_llama, llm_chain_mixtral = (
    create_chain(prompt, model, db_test) for model in (llm_llama, llm_mixtral)
)

## Zero-shot prompt

In [None]:
# Zero-shot task description sent to the chains. Fixed prompt typos that reached the
# model: "databaseBelow" ran two sentences together, and "Person.dobm Match.id" gave
# a wrong property name with a missing separator.
question_zero =  """
Based on the following graph properties, generate detailed consistency rules (graph functional dependency and graph entity dependency).
Consider the structure, node information and relationships in the graph, and provide a set of rules that can be applied to maintain consistent
and accurate data.

For each consistency rule you identify, provide a clear description of the rule and generate the corresponding Cypher query to check the
number of nodes or relationships that satisfy the rule.
Your query should return the count of entities (nodes or relationships) that match the rule described.
Provide the query in the format of valid Cypher syntax, simple and ready for execution in a Neo4j database.
Below is the input data:
Graph Information:

- Nodes: Tournament, Team, Squad, Person, Match.
- Relationships: PLAYED_IN, NAMED, PARTICIPATED_IN, FOR, REPRESENTS, IN_SQUAD, SCORED_GOAL, COACH_FOR, IN_TOURNAMENT.
- Node properties: Tournament.name, Tournament.id, Tournament.shortName, Tournament.year, Team.name, Team.id, Squad.id, Person.id, Person.name, Person.dob, Match.id, Match.stage, Match.date

"""

#### LLAMA 3

In [None]:
# Time one zero-shot request against the Llama 3 chain (`time` is imported at the top).
time_start = time.perf_counter()
responses_llama_zero = llm_chain_llama.invoke({"query": question_zero})
time_end = time.perf_counter()
execution_time = time_end - time_start
print(f"execution_time: {execution_time}")
# The chain returns a dict; print the generated answer itself so its line breaks
# render. str() of the whole dict escapes newlines, so splitting that text on "\n"
# printed everything as a single line.
print(responses_llama_zero["result"])

#### Mixtral

In [None]:
# Time one zero-shot request against the Mixtral chain (`time` is imported at the top).
time_start_zr_mixtral = time.perf_counter()
responses_mixtral_zero = llm_chain_mixtral.invoke({"query": question_zero})
time_end_zr_mixtral = time.perf_counter()
execution_time_zr_mixtral = time_end_zr_mixtral - time_start_zr_mixtral
print(f"execution_time: {execution_time_zr_mixtral}")
# The chain returns a dict; print the generated answer itself so its line breaks
# render. str() of the whole dict escapes newlines, so splitting that text on "\n"
# printed everything as a single line.
print(responses_mixtral_zero["result"])

## Few-shot prompt

In [None]:
# Few-shot task description: three example rules followed by the task. Fixed prompt
# typos that reached the model: "databaseBelow" ran two sentences together, and
# "Person.dobm Match.id" gave a wrong property name with a missing separator.
question_few_shot = """
    Examples of consistency Rules:

    1. Unique Person ID: Each Person node should have a unique id.
    2. Person Node Properties: Each Person node should have a name and dob.
    3. Ensure that no two matches have the same date, stage, and tournament. This helps avoid duplicate matches within the same tournament.

    Task: Generate new rules to ensure consistency and accuracy in the graph database, considering all node types and relationships.

    For each consistency rule you identify, provide a clear description of the rule and generate the corresponding Cypher query to check the
    number of nodes or relationships that satisfy the rule.
    Your query should return the count of entities (nodes or relationships) that match the rule described.
    Provide the query in the format of valid Cypher syntax, simple and ready for execution in a Neo4j database.
    Below is the input data:
    Graph Information:

    - Nodes: Tournament, Team, Squad, Person, Match.
    - Relationships: PLAYED_IN, NAMED, PARTICIPATED_IN, FOR, REPRESENTS, IN_SQUAD, SCORED_GOAL, COACH_FOR, IN_TOURNAMENT.
    - Node properties: Tournament.name, Tournament.id, Tournament.shortName, Tournament.year, Team.name, Team.id, Squad.id, Person.id, Person.name, Person.dob, Match.id, Match.stage, Match.date

"""

### LLAMA

In [None]:
# Time one few-shot request against the Llama 3 chain.
time_start_fs_llama = time.perf_counter()
responses_llama_few = llm_chain_llama.invoke({"query": question_few_shot})
time_end_fs_llama = time.perf_counter()
# Previously referenced the undefined name `time_end_ls_llama`, which raised NameError.
execution_time_fs_llama = time_end_fs_llama - time_start_fs_llama
print(f"execution_time: {execution_time_fs_llama}")
# The chain returns a dict; print the generated answer itself so its line breaks
# render. str() of the whole dict escapes newlines, so splitting that text on "\n"
# printed everything as a single line.
print(responses_llama_few["result"])

### Mixtral

In [None]:
# Time one few-shot request against the Mixtral chain.
time_start_fs_mixtral = time.perf_counter()
responses_mixtral_few = llm_chain_mixtral.invoke({"query": question_few_shot})
time_end_fs_mixtral = time.perf_counter()
# Use this cell's own timestamps; `time_end - time_start` belonged to the zero-shot
# Llama run and reported that run's duration instead.
execution_time_fs_mixtral = time_end_fs_mixtral - time_start_fs_mixtral
print(f"execution_time: {execution_time_fs_mixtral}")
# The chain returns a dict; print the generated answer itself so its line breaks
# render. str() of the whole dict escapes newlines, so splitting that text on "\n"
# printed everything as a single line.
print(responses_mixtral_few["result"])