In [1]:
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage 
from dotenv import load_dotenv
from datetime import datetime
from os.path import exists
import tiktoken
from firecrawl_scraping import *
from utility import *
import re
import os
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional, Literal
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.schema import StrOutputParser
import json
from neo4j import GraphDatabase
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from neomodel import config, StructuredNode, StringProperty, FloatProperty, BooleanProperty, ArrayProperty, RelationshipTo

    
# Load variables from a local .env file, if one exists. With its default
# arguments load_dotenv() does not override variables already set in the
# environment.
load_dotenv()

_neo4j_password = os.getenv('NEO4J_PASSWORD')
if not _neo4j_password:
    # Without this check the f-string below would embed the literal text
    # "None" as the password and the failure would only surface later.
    raise RuntimeError("NEO4J_PASSWORD is not set; export it or add it to a .env file.")

# Connection URL used by neomodel (user "neo4j", local Bolt port 7687).
config.DATABASE_URL = f"bolt://neo4j:{_neo4j_password}@localhost:7687"


  from .autonotebook import tqdm as notebook_tqdm


## Introduction

This notebook explores a basic graph-RAG-based Q&A system.

In [2]:
def cypher_generator(schema, query, model_name="gpt-4o"):
    """Translate a natural-language question into a Cypher statement with an OpenAI chat model.

    The prompt is built from a task description, the graph schema, six
    few-shot question/Cypher examples, the question itself and a short rule
    list. The API key is read from the ``OPENAI_KEY`` environment variable.

    Args:
        schema: Text description of the graph schema; inserted into the prompt
            so the model is told which nodes, relationships and properties
            it may use.
        query: The natural-language question to translate.
        model_name: Name of the OpenAI chat model to call.

    Returns:
        The model's reply as a string. If the reply contains a Markdown code
        fence, only the (stripped) content of the first fenced block is
        returned; otherwise the reply is returned unchanged.
    """
    system_message = """
    Your task is to generate Cypher statement to query a graph database (Neo4j), based on the input natural language query.
    The output response should contain only Cypher query in text format, with no additional commentary, explanations, or extraneous information.
    """

    few_shot_examples = """
    Examples of natural language to Cypher statement translation are given below:
    
    ## Examples
    
    ## Example 1
    - Question: What is the URL and description of the company called "360factors"?
    - Cypher: 
    MATCH (c:Company) 
    WHERE c.name = "360factors"
    RETURN c.url AS URL, c.description AS Description
    
    ## Example 2
    - Question: What products are served to company called "BDO"?
    - Cypher:
    MATCH (p:Product)-[:SERVES]->(c:Company)
    WHERE c.name = 'BDO'
    RETURN DISTINCT p.name AS Product
    
    ## Example 3
    - Question: Who are the clients of the company called "BDO"?
    - Cypher:
    MATCH (company:Company)-[:PROVIDES]->(p:Product)-[:SERVES]->(client:Company)
    WHERE company.name = 'BDO'
    RETURN DISTINCT client.name AS Client
    
    ## Example 4
    - Question: Who are the clients of clients of the company called "Landytech"
    - Cypher:
    MATCH (company:Company)-[:PROVIDES]->(layer_1_p:Product)-[:SERVES]->(layer_1_client:Company)-[:PROVIDES]->(layer_2_p:Product)-[:SERVES]->(layer_2_client:Company)
    WHERE company.name = 'Landytech'
    RETURN DISTINCT layer_2_client.name AS Client
    
    ## Example 5
    - Question: What are the top 5 companies with most number of service provider? Return their names, number of clients, urls and descriptions.
    - Cypher:
    MATCH (provider:Company)-[:PROVIDES]->(product:Product)-[:SERVES]->(client:Company)
    WITH client, count(DISTINCT provider) AS providerCount
    ORDER BY providerCount DESC
    LIMIT 5
    RETURN client.name AS companyName, providerCount, client.url, client.description;
    
    ## Example 6
    - Question: List the products offered by companies located in the UK.
    - Cypher: 
    MATCH (company:Company)-[:PROVIDES]->(product:Product)
    WHERE company.hq_country_territory_region = 'United Kingdom'
    RETURN company.name, product.name;
    """
    # NOTE: every message below is a template, so literal curly braces added
    # to the static text (e.g. Cypher property maps in the examples) would
    # have to be doubled. Values substituted for {schema}/{query} are not
    # re-parsed and may contain braces.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("system", """Use only the provided nodes, relationship and properties from the schema provided below:
                        {schema}
                        Do not use any other relationship types or properties that are not provided.
                        """),
            ("system", few_shot_examples),
            ("human", "Here is the natural language question that you need to translate to Cypher query: {query}"),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'schema': schema, 'query': query})

    # Chat models sometimes wrap code in a Markdown fence (```cypher ... ```)
    # despite the instructions; the fence markers are not valid Cypher, so
    # unwrap the first fenced block when one is present.
    fenced = re.search(r"```[\w-]*[ \t]*\r?\n(.*?)```", response, flags=re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    return response


In [6]:
def augmented_generation(query, database_output, model_name="gpt-4o"):
    """Generate a natural-language answer to a question from database query output.

    The API key is read from the ``OPENAI_KEY`` environment variable.

    Args:
        query: The user's question that the answer should address.
        database_output: The database query result, inserted into the prompt
            as text.
        model_name: Name of the OpenAI chat model to call.

    Returns:
        The model's reply as a string.
    """
    system_message = """
    Your task is to generate natural language response based on user's query, backed by database query result.
    Given input user query, you need to interpret the query and generate suitable response based on the database output.
    Use solely the information from the database with no additional commentary or extraneous information.
    """

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            # This message previously asked the model to "translate to Cypher
            # query" (copied from the Cypher-generation prompt), which
            # contradicted the system message above.
            ("human", "Here is the user's natural language question that you need to answer: {query}"),
            ("human", """Here is the database query output: {database_output}. 
                         Now you need to organise the information obtained from the database and generate response that answer the user's query."""),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If the database output is empty or NaN, just reply the information is not found from the database.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'database_output': database_output, 'query': query})
    
    return response


In [3]:
from neomodel import db

def query_databse(query):
    """Run a Cypher statement through neomodel's ``db`` and return its result.

    Args:
        query: The Cypher statement to execute.

    Returns:
        The first element of the pair returned by ``db.cypher_query``, or
        ``None`` if any exception was raised (the error is printed, not
        re-raised).
    """
    try:
        rows, _meta = db.cypher_query(query)
    except Exception as err:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Failed to query the data: {err}")
        return None
    else:
        return rows

In [4]:
# Connect to the local Neo4j instance and print the graph schema, which is
# later passed to the Cypher-generation prompt.
graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username="neo4j",
    password=os.getenv('NEO4J_PASSWORD'),
)
print(graph.schema)

Node properties:
Company {communityId: INTEGER, processed_name: STRING, name: STRING, url: STRING, cluster_name_detail: STRING, hq_location: STRING, hq_city: STRING, last_known_valuation_date: STRING, verticals: STRING, valuation: FLOAT, primary_industry_sector: STRING, primary_industry_group: STRING, last_known_valuation_deal_type: STRING, description: STRING, year_founded: STRING, total_raised: FLOAT, hq_country_territory_region: STRING}
Product {name: STRING, communityId: INTEGER, summary_product: BOOLEAN, name_embedding: LIST, description_embedding: LIST, description: STRING, company_url: STRING, product_key: STRING}
Relationship properties:

The relationships:
(:Company)-[:PROVIDES]->(:Product)
(:Product)-[:SERVES]->(:Company)


In [7]:
# End-to-end demo: question -> Cypher -> database rows -> natural-language answer.
user_question = "Who provide services to company Grant Thornton and what services do they provide to Grant Thornton."
query = cypher_generator(graph.schema, user_question)
print("Step 0: User's query\n---------------------------------------")
print(user_question)
print(' ')
print('Step 1: Generate Cypher query\n---------------------------------------')
print(query)
print(' ')
print('Step 2: Query database\n---------------------------------------')
database_result = query_databse(query)
print(database_result)

print(' ')
print('Step 3: Generate response\n---------------------------------------')
# Pass the user's original question rather than the generated Cypher:
# augmented_generation is written to answer the user's query from the rows.
response = augmented_generation(user_question, str(database_result))
print(response)


Step 0: User's query
---------------------------------------
Who provide services to company Grant Thornton and what services do they provide to Grant Thornton.
 
Step 1: Generate Cypher query
---------------------------------------
MATCH (provider:Company)-[:PROVIDES]->(p:Product)-[:SERVES]->(client:Company)
WHERE client.name = 'Grant Thornton'
RETURN DISTINCT provider.name AS Provider, p.name AS Service
 
Step 2: Query database
---------------------------------------
[['Thomson Reuters', 'Professional Solutions'], ['RoyaltyRange', 'Professional databases and analytical tools'], ['Anybill', 'Tax Payment Platform']]
 
Step 3: Generate response
---------------------------------------
The companies providing services to Grant Thornton are:

1. Thomson Reuters - Professional Solutions
2. RoyaltyRange - Professional databases and analytical tools
3. Anybill - Tax Payment Platform
