In [8]:
from typing import List
from tqdm import tqdm
import os
import wikipediaapi
import re

In [9]:
from neo4j import GraphDatabase
import google.generativeai as genai

In [117]:
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores.neo4j_vector import Neo4jVector, remove_lucene_chars
from langchain_google_genai import (GoogleGenerativeAI, GoogleGenerativeAIEmbeddings,
                                    ChatGoogleGenerativeAI)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_core.runnables import (RunnableParallel, RunnablePassthrough,
                                      RunnableLambda)
from langchain_core.output_parsers.string import StrOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import (GraphCypherQAChain,
                              RetrievalQAWithSourcesChain)
from pydantic import BaseModel, Field, validator

In [4]:
from google.colab import userdata
from dotenv import load_dotenv

# Setup

In [11]:
DOC_DIR = "./documents"

In [12]:
# I run both commands as sometimes colab doesn't behave as expectedly
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

In [13]:
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_KEY')

In [14]:
# I need to create embeddings in multiple parts
# for homogenity of the model, declaration is up
EMBEDDING_MODEL = "models/text-embedding-004"
EMBEDDINGS = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

# Documents loading

In [15]:
if not os.path.exists(DOC_DIR):
  os.mkdir(DOC_DIR)

**Please, either add your documents (.txt format) to the created folder, or run the scraping for my examples.**

If adding own documents, the current default graph building might not work so well, you might need to customize "Nodes" and "Relationships"

In [16]:
def get_articles(keywords: list):
  """
  Fetches and stores articles from wikipedia
  Using keywords as title
  """
  wiki_wiki = wikipediaapi.Wikipedia('GraphRAG', 'en')

  for keyword in keywords:
    try:
      page_py = wiki_wiki.page(keyword)
    except:
      print(f"Page with title {keyword} not found")
      continue
    with open(os.path.join(DOC_DIR, f"{keyword}.txt"), "w") as f:
      f.write(page_py.text)

In [17]:
keywords = ["Retrieval-augmented_generation", "Large_language_model", "Natural_language_processing",
            "LaMDA", "Groq", "Hugging_Face",
            "ChatGPT"]

In [18]:
get_articles(keywords)

# Data preprocessing

Optional function used in earlier development

In [19]:
def get_paragraphs(dir_path: str, par_idx:int=0):
  """
  Returns a dictionary of par_idx'th paragraph (value)
                          for each filename (key) in dir_path
  Assumes files are in .txt format
  Assumes parragraphs are separated by a newline
  """

  assert os.path.exists(dir_path), "Directory does not exist"
  assert par_idx >= 0, "par_idx must be non-negative"

  paragraphs = {}

  for filename in os.listdir(dir_path):
    if not filename.endswith(".txt"):
      print(f"{filename} is not a .txt, skippnig")
      continue
    with open(os.path.join(dir_path, filename), "r") as f:
      title = filename.split('.')[0]
      text = f.read()

      paragraphs_arr = text.split("\n")
      if len(paragraphs_arr) <= par_idx:
        print(f"{title} document does not have {par_idx}th pargraph, skipping")
        continue

      paragraphs[title] = paragraphs_arr[par_idx].strip()

  return paragraphs

In [45]:
def clean_text(text: str):
  res = re.sub(r'\s+', ' ', text)
  res = re.sub('\n\n', '\n', res)
  return res


def get_documents_with_metadata(dir_path: str):
  """
  Returns an array of dictionries containing
  text - document content
  source - document title
  id - document title + index
  embedding - document embedding

  Uses genai embedding model to create embeddings.
  """
  assert os.path.exists(dir_path), "Directory does not exist"

  documents_with_metadata = []
  vectors = {}

  for i,filename in enumerate(os.listdir(dir_path)):
    if not filename.endswith(".txt"):
      print(f"{filename} is not a .txt, skipping")
      continue
    with open(os.path.join(dir_path, filename), "r") as f:
      text = f.read().strip()

      # text = clean_text(text)

      first_paragraph = text.split("\n")[0].strip()
      title = filename.split('.')[0].strip()

      documents_with_metadata.append({
        'text': text,
        'first_paragraph': first_paragraph,
        'source': title,
        'document_id': f'{title}-{i}',
      })

  return documents_with_metadata

In [21]:
first_paragraphs = get_paragraphs(DOC_DIR)

In [22]:
for key, value in first_paragraphs.items():
  print(f"{key} first paragraph has {len(value)} characters")

Natural_language_processing first paragraph has 512 characters
ChatGPT first paragraph has 702 characters
Groq first paragraph has 267 characters
LaMDA first paragraph has 314 characters
Retrieval-augmented_generation first paragraph has 446 characters
Hugging_Face first paragraph has 411 characters
Large_language_model first paragraph has 304 characters


All of the first paragraphs are small, I will use them without chunking for entity and relationship generation. Full text I will chunk up and store to index over.

In [46]:
documents_with_metadata = get_documents_with_metadata(DOC_DIR)

# NEO4J

NEO4J is a widely used graph database management system. It provides multiple functinalities for graph management, querying and analysis. LangChain provides integraions with the syste.

I used a free inctance of Aura DB for this exercise.

In [24]:
# Neo4j provides a free instance on Aura which is more than enough for this HA
# I don't see issue providing my credientials for this instance
NEO4J_URL = "neo4j+s://dd48e548.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = userdata.get('NEO_PASSWORD')

In [25]:
graph = Neo4jGraph(
    url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD
)

In [26]:
node_count = graph.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)
assert node_count==0, "Graph is not empty, chances are it's already correctly\
                         populated. You can skip the following parts."

AssertionError: Graph is not empty, chances are it's already correctly                         populated. You can skip the following parts.

If you want to proceed with the full pipeline, but the graph is already populated, run the following two cells to reset it to blank.

In [27]:
# to remove all the nodes and their relationships
graph.query("""
            MATCH (n)
            DETACH DELETE n
            """)

[]

In [29]:
# to check that there are no entried in the graph anymore
graph.query("""
            CALL apoc.schema.assert({},{},true) YIELD label, key RETURN *
            """)

[]

In [30]:
# refreshing schema to view what's currently in the graph
graph.refresh_schema()
print(graph.schema)

Node properties:

Relationship properties:

The relationships:



## Creating graph nodes using the first paragraphs

In [31]:
def add_paragraphs_to_graph(documents_with_metadata: list):
  """
  Adds pargraphs to the graph one by one
  Return number of the processed documents (nodes created)
  """

  # the query for creating a node
  merge_paragraph_node_query = """
            MERGE(mergedParagraph:Paragraph {paragraphId: $paragraphParam.document_id})
                ON CREATE SET
                    mergedParagraph.source = $paragraphParam.source
            RETURN mergedParagraph
            """

  # ensure paragraph is in unique
  graph.query("""
              CREATE CONSTRAINT unique_paragraph IF NOT EXISTS
                  FOR (p:Paragraph) REQUIRE p.id IS UNIQUE
              """)

  # adding nodes one by one
  node_count = 0
  for paragraph in documents_with_metadata:
    graph.query(merge_paragraph_node_query,
            params={
                'paragraphParam': paragraph
            })
    node_count += 1

  print(f"Created {node_count} nodes")

  return node_count

In [58]:
created_nodes = add_paragraphs_to_graph(documents_with_metadata)

Created 7 nodes


In [59]:
node_count = graph.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)[0]['nodeCount']
assert node_count==created_nodes, "Number of created nodes does not equal\
                                  number of nodes in the graph"

In [60]:
# refresh schema to refer to the most recent one
graph.refresh_schema()
print(graph.schema)

Node properties:
Paragraph {paragraphId: STRING, source: STRING}
Relationship properties:

The relationships:



## LLM for entity extraction, adding those entities to the graph.

Note, that I limit the nodes and relationships using domain observations. I do that after graph inspection in Neo4j Aura.
For other documents, this requires either customization or
entity dismiguation in a more dedicated solution.

In [35]:
def to_documents(documents_with_metadata: list):
  """
  Converts documents_with_metadata to langchain documents
  """
  return [Document(page_content=paragraph['first_paragraph'], metadata={'source': paragraph['source']}) for paragraph in documents_with_metadata]

In [36]:
def transform_to_graph_documents(documents: list, allowed_nodes:list=[],
                                allowed_relationships:list=[]):
  """
  Transforms documents to graph documents using gemini-pro
  """

  # I use Google models as they are free
  # as opposed to the widely used OpenAI
  llm = GoogleGenerativeAI(model="gemini-pro")

  llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=allowed_nodes,
    allowed_relationships=allowed_relationships,
  )
  graph_documents = llm_transformer.convert_to_graph_documents(documents)

  # externally adding the source property for easier reference
  # to the full document
  for graph_document in graph_documents:
    source = graph_document.source.metadata['source']
    for node in graph_document.nodes:
      node.properties = {"source": source}

  return graph_documents

In [47]:
documents = to_documents(documents_with_metadata)

In [38]:
allowed_nodes = ["Technology", "Process", "Concept", "Field"]
allowed_relationships = ["USED_FOR", "NOTABLE_FOR", "CAN"]

In [49]:
graph_documents = transform_to_graph_documents(documents, allowed_nodes,
                                               allowed_relationships)
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='Natural language processing', type='Technology', properties={'source': 'Natural_language_processing'}), Node(id='text corpora', type='Concept', properties={'source': 'Natural_language_processing'}), Node(id='information retrieval', type='Process', properties={'source': 'Natural_language_processing'}), Node(id='computational linguistics', type='Process', properties={'source': 'Natural_language_processing'}), Node(id='process data encoded in natural language', type='Process', properties={'source': 'Natural_language_processing'}), Node(id='artificial intelligence', type='Field', properties={'source': 'Natural_language_processing'}), Node(id='machine learning', type='Process', properties={'source': 'Natural_language_processing'}), Node(id='knowledge representation', type='Process', properties={'source': 'Natural_language_processing'}), Node(id='computer science', type='Field', properties={'source': 'Natural_language_processing'}), Node(id='data', type='Concept', properties=

In [61]:
graph.add_graph_documents(graph_documents,
                          include_source=True,
                          baseEntityLabel=True)

In [62]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Paragraph {paragraphId: STRING, source: STRING}
Document {id: STRING, source: STRING, text: STRING}
Concept {id: STRING, source: STRING}
Process {id: STRING, source: STRING}
Technology {id: STRING, source: STRING}
Field {id: STRING, source: STRING}
Relationship properties:

The relationships:
(:Document)-[:MENTIONS]->(:Technology)
(:Document)-[:MENTIONS]->(:Concept)
(:Document)-[:MENTIONS]->(:Process)
(:Document)-[:MENTIONS]->(:Field)
(:Concept)-[:USED_FOR]->(:Concept)
(:Concept)-[:USED_FOR]->(:Process)
(:Process)-[:NOTABLE_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Process)
(:Technology)-[:USED_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Technology)
(:Technology)-[:USED_FOR]->(:Concept)
(:Technology)-[:CAN]->(:Process)
(:Technology)-[:CAN]->(:Concept)
(:Technology)-[:NOTABLE_FOR]->(:Technology)
(:Technology)-[:NOTABLE_FOR]->(:Process)


## Creating embeddings for the entities

In [63]:
def update_with_embeddings(graph_documents: list):
  """
  Updates the graph with embeddings for entities
  using genai text embedding model
  """

  graph.refresh_schema()

  for graph_document in tqdm(graph_documents):
    source = graph_document.source.metadata['source']
    vectors = {}

    for node in graph_document.nodes:
      vectors[node.id] = genai.embed_content(
            model=EMBEDDING_MODEL,
            content=node.id,
            task_type="retrieval_document")['embedding']

    graph.query("""
      MATCH (e:__Entity__) WHERE e.embedding IS NULL AND e.source = $source
      SET e.embedding = $vectors[e.id]
      """,
          params={"vectors": vectors, "source": source})

    graph.refresh_schema()
  print(graph.schema)

In [64]:
update_with_embeddings(graph_documents)

100%|██████████| 7/7 [00:16<00:00,  2.30s/it]

Node properties:
Paragraph {paragraphId: STRING, source: STRING}
Document {id: STRING, source: STRING, text: STRING}
Concept {id: STRING, source: STRING, embedding: LIST}
Process {id: STRING, source: STRING, embedding: LIST}
Technology {id: STRING, source: STRING, embedding: LIST}
Field {id: STRING, source: STRING, embedding: LIST}
Relationship properties:

The relationships:
(:Document)-[:MENTIONS]->(:Technology)
(:Document)-[:MENTIONS]->(:Concept)
(:Document)-[:MENTIONS]->(:Process)
(:Document)-[:MENTIONS]->(:Field)
(:Concept)-[:USED_FOR]->(:Concept)
(:Concept)-[:USED_FOR]->(:Process)
(:Process)-[:NOTABLE_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Process)
(:Technology)-[:USED_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Technology)
(:Technology)-[:USED_FOR]->(:Concept)
(:Technology)-[:CAN]->(:Process)
(:Technology)-[:CAN]->(:Concept)
(:Technology)-[:NOTABLE_FOR]->(:Technology)
(:Technology)-[:NOTABLE_FOR]->(:Process)





## Creating explicit relationships between entities and source paragraphs.

For some reason 1 document was dropped by add_graph_documents.

This is a workaround of sorts to disambiguate nodes with no connections. Such is the case with Groq exracted entities, for example.

In [65]:
def add_pargraph_enetity_relationship():
  """
  Adds explicit relationships between entities and source paragraphs
  """
  graph.refresh_schema()

  rel_query = """
              MATCH (e:__Entity__), (p:Paragraph) WHERE e.source = p.source
              MERGE (e)<-[:HAS_ENTITY]-(p)
              """
  graph.query(rel_query)

  graph.refresh_schema()

In [66]:
def add_pagraph_document_relationship():
  """
  Adds explicit relationships between paragraphs and documents
  """
  graph.refresh_schema()

  rel_query = """
            MATCH (p:Paragraph), (d:Document) WHERE p.source = d.source
            MERGE (p)<-[:FIRST_CHUNK]-(d)
            """
  graph.query(rel_query)

  graph.refresh_schema()

In [67]:
add_pargraph_enetity_relationship()

In [68]:
add_pagraph_document_relationship()
print(graph.schema)

Node properties:
Paragraph {paragraphId: STRING, source: STRING}
Document {id: STRING, source: STRING, text: STRING}
Concept {id: STRING, source: STRING, embedding: LIST}
Process {id: STRING, source: STRING, embedding: LIST}
Technology {id: STRING, source: STRING, embedding: LIST}
Field {id: STRING, source: STRING, embedding: LIST}
Relationship properties:

The relationships:
(:Paragraph)-[:HAS_ENTITY]->(:Technology)
(:Paragraph)-[:HAS_ENTITY]->(:Concept)
(:Paragraph)-[:HAS_ENTITY]->(:Process)
(:Paragraph)-[:HAS_ENTITY]->(:Field)
(:Document)-[:MENTIONS]->(:Technology)
(:Document)-[:MENTIONS]->(:Concept)
(:Document)-[:MENTIONS]->(:Process)
(:Document)-[:MENTIONS]->(:Field)
(:Document)-[:FIRST_CHUNK]->(:Paragraph)
(:Concept)-[:USED_FOR]->(:Concept)
(:Concept)-[:USED_FOR]->(:Process)
(:Process)-[:NOTABLE_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Process)
(:Technology)-[:USED_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Technology)
(:Technology)-[:USED_FOR]->(:Concept)
(:Technology)-[:CAN

At this point I believe I have a working graph comparable with what is expected.

# Storing the full documents for later use

I will store them in Neo4j too, but mostly for convenience. They are not part of the graph.

In [74]:
def get_chunk_with_metadata(documents_with_metadata: List, chunk_size:int=512, chunk_overlap:int=24):
  """
  Chunks up the full text and retains the metadata
  """

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = chunk_size,
      chunk_overlap  = chunk_overlap,
      length_function = len,
      is_separator_regex = False,
  )

  chunks_with_metadata = []

  for document_with_metadata in documents_with_metadata:
    chunks = text_splitter.split_text(document_with_metadata['text'])
    source = document_with_metadata['source']
    for i,chunk in enumerate(chunks):
      chunks_with_metadata.append({
                                  'content': chunk,
                                  'source': source,
                                  'chunk_id': f'{source}-chunk-{i}'
                                  })

  return chunks_with_metadata

In [75]:
def add_chunks_to_neo(chunks: List):
  """
  Stores chunks in Neo4j
  """

  graph.refresh_schema()

  # the query for creating a node
  merge_chunk_node_query = """
            MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunk_id})
                ON CREATE SET
                    mergedChunk.source = $chunkParam.source,
                    mergedChunk.content = $chunkParam.content
            RETURN mergedChunk
            """

  # ensure paragraph is in unique
  graph.query("""
              CREATE CONSTRAINT unique_chunk IF NOT EXISTS
                  FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
              """)

  # adding nodes one by one
  node_count = 0
  for chunk in tqdm(chunks_with_metadata):
    graph.query(merge_chunk_node_query,
            params={
                'chunkParam': chunk
            })
    node_count += 1

  print(f"Created {node_count} nodes")

  return node_count

In [76]:
chunks_with_metadata = get_chunk_with_metadata(documents_with_metadata, 1000, 100)

In [77]:
add_chunks_to_neo(chunks_with_metadata)

100%|██████████| 239/239 [00:41<00:00,  5.74it/s]

Created 239 nodes





239

In [78]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Paragraph {paragraphId: STRING, source: STRING}
Document {id: STRING, source: STRING, text: STRING}
Concept {id: STRING, source: STRING, embedding: LIST}
Process {id: STRING, source: STRING, embedding: LIST}
Technology {id: STRING, source: STRING, embedding: LIST}
Field {id: STRING, source: STRING, embedding: LIST}
Chunk {source: STRING, chunkId: STRING, content: STRING}
Relationship properties:

The relationships:
(:Paragraph)-[:HAS_ENTITY]->(:Technology)
(:Paragraph)-[:HAS_ENTITY]->(:Concept)
(:Paragraph)-[:HAS_ENTITY]->(:Process)
(:Paragraph)-[:HAS_ENTITY]->(:Field)
(:Document)-[:MENTIONS]->(:Technology)
(:Document)-[:MENTIONS]->(:Concept)
(:Document)-[:MENTIONS]->(:Process)
(:Document)-[:MENTIONS]->(:Field)
(:Document)-[:FIRST_CHUNK]->(:Paragraph)
(:Concept)-[:USED_FOR]->(:Concept)
(:Concept)-[:USED_FOR]->(:Process)
(:Process)-[:NOTABLE_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Process)
(:Technology)-[:USED_FOR]->(:Field)
(:Technology)-[:USED_FOR]->(:Technology)


# Initial search

Plan:

1. Use the discovered entities and relationships to determine which documents are relevant (based on the 'source' linking). The details on user query processing for this are below
2. Collect the relevant chunks from the full documents into context
3. Pass the model both (1) and (2) for it to have a view of existing concepts and relationships as well as a full context

## Creating retrievers for structured (nodes, entities and their relationships).

Langchain internal capabilties, while very helpful, failed to produce good Cypher queries for the chain.

The selected texts are unstructured data consisting mainly of varied concepts (while on the same topic). For some data queries are predictable, so the prompt engineering is feasible while still intensive. For example, one might expect question like the following in a Hospital review dataset.
  
  Question: "How many patients attended St. Mungus in September?"

  Query:
  
  MACTCH (p:Patient) WHERE p.hospital=St.Mungus AND p.month=Septermber
  RETURN COUNT(p)

Such query is successfully produced by llm for GraphCypherQAChain.

However, I found that asking questions on concepts and ideas does not produce similar result. It seems that the chain as is insufficient for the chosen data.

There was little to no leeway for the question spelling that would determine the success of the query. As such, questions like "What are some nlp tasks?" failed due to the fact that there wasn't necessarily an enitity labeled 'nlp', rather 'Natural Language Processing', while the llm failed to create a meaningful query than matching a label 1-1. After some attempts in prompting, I decided to employ a different strategy including embeddings.

1. The model extracts appropriate entities from the user's query
2. The graph is indexed over entities' short descriptions' embeddings
3. The user's extracted entities are queried one by one over this index.
4. For each node (within a limit), I output ANY relationship between the node and its neighbor
5. The nodes' are used to filter chunks from which to query the full context, based on the shared source (original document).
6. At the same time, the extracted relationships are passed as 'Structured data' to the chain for it to use the information and decide which type of relationships are most relevant.



The following code is produced with the help of this blog:

https://medium.com/neo4j/enhancing-the-accuracy-of-rag-applications-with-knowledge-graphs-ad5e2ffab663

In [114]:
graph.query(
    """CREATE FULLTEXT INDEX entity IF NOT EXISTS
    FOR (e:__Entity__) ON EACH [e.id]""")

graph.query(
    """CREATE VECTOR INDEX `entity_vector` IF NOT EXISTS
    FOR (e:__Entity__) ON (e.embedding)
    OPTIONS { indexConfig: {
            `vector.dimensions`: 768,
            `vector.similarity_function`: 'cosine'
         }
         }
    """
)

[]

In [80]:
class Entities(BaseModel):
    """
    Identifying information about entities.
    """

    names: List[str] = Field(
        ...,
        description="All the concept, field, process and technology entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting concept, field, process and technology entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

llm = ChatGroq(model="mixtral-8x7b-32768", temperature=0,)
entity_chain = prompt | llm.with_structured_output(Entities)

In [121]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

def generate_embedded_query(input: str) -> list:
    """
    Generate an embedding for input
    """
    return genai.embed_content(
            model=EMBEDDING_MODEL,
            content=input,
            task_type="retrieval_query")['embedding']

In [205]:
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    sources = []
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.vector.queryNodes('entity_vector', 10, $query)
              YIELD node, score
              WITH node, score
              CALL (){
                MATCH (node)-[r:!MENTIONS]->(neighbor)
                RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
                UNION
                MATCH (node)<-[r:!MENTIONS]-(neighbor)
                RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
              }
              RETURN node.source AS source, output
            """,
            {"query": generate_embedded_query(entity)},
        )
        result += "\n".join([el['output'] for el in response[:30] if el['output']])
        sources.extend([el['source'] for el in response if el['source']])
    return result, list(set(sources))

 ## Creating retrievers for unstructured (index over text embeddings) data

In [83]:
chunk_db = Neo4jVector.from_existing_graph(
    EMBEDDINGS,
    url=NEO4J_URL,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="document_index",
    node_label="Chunk",
    text_node_properties=["content"],
    embedding_node_property="embedding"
)

In [207]:
def chunk_retriever(question: str, sources: List, k:int=3):
  """
  Retrieves k most similar chunks filtered on the found relevant sources
  """
  # Note that structured retriever can return an empty set
  if len(sources) == 0:
    documents = chunk_db.similarity_search(
    question,
    k=k
  )
  else:
    documents = chunk_db.similarity_search(
    question,
    filter={"source": {"$in": sources}},
    k=k
  )
  return documents

In [208]:
def retriever(question: str):
    # print(f"Search query: {question}")
    structured_data, sources = structured_retriever(question)
    unstructured_data = [el.page_content if isinstance(el.page_content, (str, bytes)) else str(el.page_content)
        for el in chunk_retriever(question, sources, 10)]
    # unstructured_data = [el if isinstance(el, (str, bytes)) else str(el)
    #     for el in neo4j_vector_search(question, 'vector')]
    final_data = f"""Structured data:
                    {structured_data}
                    Unstructured data:
                    {"#Document ". join(unstructured_data)}"""
    # final_data = "#Document ". join(unstructured_data)

    return final_data

In [209]:
# template makes sure we answer only based on the provided files
# and not hallucinate from the general llm capabilities
template = """Answer the question based only on the provided context.
              Provide details.
              If you are unsure, "answer is not available in the context",
              don't provide the wrong answer.

              Note, that context includes Structured and Unstrctured data:
              Structured: contains relevant entities and their relationships
              Unstructured: contains chunks of relevant text

              Context: {context}

              Question: {question}
              """

prompt = ChatPromptTemplate.from_template(template)
# llm = ChatGroq(model="mixtral-8x7b-32768", temperature=0.3,)
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
chain = (
    RunnableParallel(
        {
            "context": RunnableLambda(lambda x : x["question"])|retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [210]:
chain.invoke({"question": "So what is RAG?"})

'Retrieval augmented generation (RAG) is a type of generative artificial intelligence that has information retrieval capabilities. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information in preference to information drawn from its own vast, static training data.'

# Pipeline evaluation

I want to create a system that receives the same question as my GraphRAG, and the answer it gave, and evalutes wether the answer seems good.

Addiitonally to that, I create a model that creates question.

I then collect the scores for a simple glance of performance.

In [211]:
class Evaluation(BaseModel):
    """
    Evaluating the quality of answer
    """

    score: str = Field(
        ...,
        description="Whether the answer was consiered good (ok or not ok)"
    )
    answer: str = Field(
        ...,
        description="The generated answer without changes"
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are evaluating the quality of answer (ok or not ok) from the given answer to a question.",
        ),
        (
            "human",
            "Use the given format to evalate the following answer to the question "
            "question: {question}, answer: {answer}",
        ),
    ]
)

llm = ChatGroq(model="mixtral-8x7b-32768", temperature=0.3,)
eval_chain = (
    RunnableParallel(
        {
            "answer": RunnableLambda(lambda x : x) | chain,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm.with_structured_output(Evaluation)
)

In [212]:
class Questions(BaseModel):
    """
    Evaluating the quality of answer
    """

    questions: List[str] = Field(
        ...,
        description="A list of all the questions in the text"
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Generate {number} relatively simple questions (one sentence, no commas)"
            "on the following topics: {topics}",
        ),
    ]
)
q_model = ChatGroq(model="mixtral-8x7b-32768", temperature=0.3,)
q_chain = prompt | q_model.with_structured_output(Questions)

In [214]:
questions = q_chain.invoke({'number': "30", "topics": "nlp, llm, chatgpt, groq, hugging face, lamda and rag"}).questions

This is a lengthy process, keep in mind.

In [215]:
positive_answers = 0
total_answers = 0
for question in tqdm(questions):
  score = eval_chain.invoke({"question": question}).score
  positive_answers += score=='ok'
  total_answers += 1

100%|██████████| 28/28 [14:16<00:00, 30.60s/it]


In [216]:
print(positive_answers/total_answers)

0.4642857142857143


Good and bad example of the produced result.

In [217]:
questions

['What is NLP?',
 'What does LLM stand for?',
 'What is ChatGPT?',
 'What is Groq?',
 'What is Hugging Face?',
 'What is Lambda?',
 'What is RAG?',
 'How does NLP work?',
 'What are the applications of LLM?',
 'How is ChatGPT used?',
 'What are the features of Groq?',
 'What are the benefits of Hugging Face?',
 'What are the use cases of Lambda?',
 'What are the components of RAG?',
 'What are the limitations of NLP?',
 'What are the challenges of LLM?',
 'What are the ethical considerations of ChatGPT?',
 'How does Groq improve performance?',
 'What are the unique features of Hugging Face?',
 'What are the drawbacks of Lambda?',
 'How does RAG enhance language understanding?',
 'What are the future directions of NLP?',
 'What are the research topics in LLM?',
 'How is ChatGPT being developed?',
 'What are the trends in Groq?',
 'What are the applications of Hugging Face?',
 'What are the innovations in Lambda?',
 'What are the improvements in RAG?']

In [218]:
eval_chain.invoke({"question": 'What are the components of RAG?'})

Evaluation(score='ok', answer='The RAG process is made up of four key stages. First, all the data must be prepared and indexed for use by the LLM. Thereafter, each query consists of a retrieval, augmentation and a generation phase.')

In [219]:
eval_chain.invoke({"question": 'What are the innovations in Lambda?'})

Evaluation(score='not ok', answer='Answer is not available in the context')

The ratio could be better.

There may be several factors at play:

Data:
1. The documents I have selected may be not sharing the same topic from the perspective of llms and graph creation as much as it seemed to me.

Design:
1. Chunk size for the full text has not been experimented with extensively
2. Number of k best chunks - similarly
3. When searching for entities, there are 2 parameters that can be examined:  
  * k - number of most similar nodes
  * LIMIT <> - by how many connections to limit
4. There are many missing connections in the graph between entities. I did not create "SIMILAR" connections nor did I create community reports like they do in the original GraphRAG. Providing such reports could possible improve the context

# Unused code

In [None]:
# Index the processed documents with FAISS for efficient retrieval
db = FAISS.from_texts(
    chunks,
    EMBEDDINGS
)

In [None]:
def get_conversational_chain():
    # Define a prompt template for asking questions based on a given context
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details,
    if the answer is not in the provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """

    # Initialize a ChatGoogleGenerativeAI model for conversational AI
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

    # Create a prompt template with input variables "context" and "question"
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    # Load a question-answering chain with the specified model and prompt
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    return chain

In [None]:
question = "Describe llm?"

In [None]:
docs = db.similarity_search(question)

# Obtain a conversational question-answering chain
chain = get_conversational_chain()

# Use the conversational chain to get a response based on the user question and retrieved documents
response = chain(
    {"input_documents": docs, "question": question}, return_only_outputs=True
)

# Print the response to the console
print(response)

{'output_text': 'A large language model (LLM) is a computational model capable of language generation or other natural language processing tasks. As language models, LLMs acquire these abilities by learning statistical relationships from vast amounts of text during a self-supervised and semi-supervised training process.'}


In [None]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()