In [50]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))

from src.pipelines.data_ingestion_pipeline import DocumentLoader

In [51]:
#load env variables
from dotenv import load_dotenv
load_dotenv()

from config.config import GPT4O_MINI_CONFIG, NEO4J_CONFIG

OPENAI_API_KEY_4o_MINI = GPT4O_MINI_CONFIG['OPENAI_API_KEY_4o_MINI']
NEO4J_USERNAME = NEO4J_CONFIG['NEO4J_USERNAME']
NEO4J_PASSWORD = NEO4J_CONFIG['NEO4J_PASSWORD']

In [52]:
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
import os
from neo4j import  Driver
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [53]:
embedding_model = OpenAIEmbeddings(openai_api_key=GPT4O_MINI_CONFIG['OPENAI_API_KEY_4o_MINI'], model= "text-embedding-3-small")

In [54]:
try:
    graph = Neo4jGraph(
        url='neo4j+s://2358b74c.databases.neo4j.io', 
        username=NEO4J_USERNAME, 
        password=NEO4J_PASSWORD
    )
    print("Connection to Neo4j was successful!")
except Exception as e:
    print(f"Failed to connect to Neo4j: {e}")

Connection to Neo4j was successful!


In [55]:
file = '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf'

loader = DocumentLoader(file)

file_read = loader.get_loader()

documents = file_read.load()
print(documents)

Loading PDF document loader for /workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf
[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-04-15T20:37:40+01:00', 'source': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'file_path': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'total_pages': 118, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-15T20:37:52+01:00', 'trapped': '', 'modDate': "D:20240415203752+01'00'", 'creationDate': "D:20240415203740+01'00'", 'page': 0}, page_content=''), Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-04-15T20:37:40+01:00', 'source': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'file_path': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINA

In [56]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=40)
docs = text_splitter.split_documents(documents)
print(docs) 

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-04-15T20:37:40+01:00', 'source': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'file_path': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'total_pages': 118, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-15T20:37:52+01:00', 'trapped': '', 'modDate': "D:20240415203752+01'00'", 'creationDate': "D:20240415203740+01'00'", 'page': 1}, page_content='THE BIG BOOK OF GENERATIVE AI\nCONTENTS\nIntroduction .............................................................................................................................................................................................................3\nThe Path to Deploying Production-Quality GenAI Applications..............................................................................................5\nS

In [57]:
for i, chunk in enumerate(docs[:5]):  # Print first 5 chunks
    print(f"Chunk {i+1}:\n{chunk.page_content}\n{'='*50}")

Chunk 1:
THE BIG BOOK OF GENERATIVE AI
CONTENTS
Introduction .............................................................................................................................................................................................................3
The Path to Deploying Production-Quality GenAI Applications..............................................................................................5
Stage 0: Foundation Models.................................................................................................................................................................................................................................................................5
Chunk 2:
Use Case: Introducing DBRX: A New State-of-the-Art Open LLM.......................................................................................................................................................................5
Stage 1: Prompt Engineering............................

In [58]:
 # Clear the graph database
cypher = """
        MATCH (n)
        DETACH DELETE n;
        """
graph.query(cypher)

[]

In [59]:
import json
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.schema import HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from config.config import GPT4O_MINI_CONFIG

# Initialize LLM
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, api_key=GPT4O_MINI_CONFIG['OPENAI_API_KEY_4o_MINI'])

# Define response format
response_schemas = [
    ResponseSchema(name="nodes", description="A list of entity nodes."),
    ResponseSchema(
        name="relationships",
        description="A list of relationships, each containing 'source', 'relation', and 'target' keys."
    ),
]

# Create output parser
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Define structured prompt
prompt_template = """
You are an AI that extracts **knowledge graph entities and relationships** from text.

**Input Text:**
{text}

**Instructions:**
- Extract the key **concepts (nodes)**.
- Identify relationships between the nodes in a structured format.
- **Relationships must follow this JSON format:**  
  ```json
  {{"source": "Entity 1", "relation": "Relation Type", "target": "Entity 2"}}" 
    ```
- **Format Instructions:**" 
{format_instructions}
"""
# Initialize storage for extracted nodes and relationships
allowed_nodes = set()
allowed_relationships = set()

# Process each chunk using LLM
for chunk in docs[:40]:  # Process first 5 chunks for testing
    text = chunk.page_content  # Extract chunk text
    prompt = prompt_template.format(text=text, format_instructions=format_instructions)

    try:
        # Invoke LLM
        response = llm.invoke([HumanMessage(content=prompt)]).content

        if not response.strip():
            print("Warning: LLM returned an empty response.")
            continue

        # Parse response using LangChain's output parser
        extracted_data = output_parser.parse(response)

        nodes = extracted_data.get("nodes", [])
        relationships = extracted_data.get("relationships", [])

        allowed_nodes.update(nodes)
        allowed_relationships.update((rel["source"], rel["relation"], rel["target"]) for rel in relationships)

    except Exception as e:
        print(f"Error parsing LLM response: {e}")
        print(f"LLM Raw Response: {response}")  # Debugging output

# Convert sets to lists for final storage
allowed_nodes = list(allowed_nodes)
allowed_relationships = list(allowed_relationships)

# Output results
print("\nExtracted Nodes:", allowed_nodes)
print("\nExtracted Relationships:", allowed_relationships)


Extracted Nodes: ['enterprise', 'HellaSwag 10-shot', 'AI Technologies', 'open models', 'Investment in AI', 'MPT-7B', 'GPT-3.5 Turbo', 'Big Book of Generative AI', 'Chat', 'MosaicML', 'scaling experiments', 'Large Language Models', 'GenAI-Powered Applications', 'DBRX Instruct', 'MISTRAL MEDIUM', 'world knowledge', 'Wikipedia corpus', '12T tokens', 'DBRX MoE-A', 'LLM pretraining pipeline', 'Instruction Following', 'Grok-1', 'Unity Catalog', '132B non-MoE model', 'Data Preparation', 'proprietary models', 'language understanding', 'LLAMA2-70 B CHAT', 'language understanding (MMLU)', 'GPT-48', 'Data Infrastructure', 'github', 'KV-Pairs Benchmark', 'MIXTRAL BASE', 'Programming (HumanEval)', 'Cost-effective Training', 'Gemini 1.0 Pro', 'LLaMA2-70B', 'Retrieval Augmented Generation (RAG)', 'Custom LLM', '7.7B total parameters', 'GenAI Models', '2.2B active parameters', 'TruthfulQA', 'HellaSwag', 'inference efficiency', 'Businesses', 'HumanEval', 'CodeLLaMA blog', 'footnotes', 'Chatbot', 'FLOP

In [63]:
# Ensure relationships are in list format
allowed_relationships = [
    (source, relation, target)
    for source, relation, target in allowed_relationships
    if source in allowed_nodes and target in allowed_nodes  # 
]

# Now pass it into LLMGraphTransformer
transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=list(allowed_nodes),  
    allowed_relationships=allowed_relationships,  
    node_properties=False,
    relationship_properties=False
)

In [65]:
graph_documents = transformer.convert_to_graph_documents(docs)

In [66]:
graph.add_graph_documents(graph_documents, include_source=True)

In [67]:
# Use the stored connection parameters
index = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url='neo4j+s://2358b74c.databases.neo4j.io',
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database="neo4j",
    node_label="Document",  # Adjust node_label as needed
    text_node_properties=["text"], 
    embedding_node_property="embedding",
    search_type="hybrid" 
)

In [68]:
vector_retriever = index.as_retriever()

In [83]:
from langchain.graphs import Neo4jGraph

# Establish direct connection to Neo4j database (separate from vector index)
graph_db = Neo4jGraph(
    url="neo4j+s://2358b74c.databases.neo4j.io",
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database="neo4j"
)

# Retrieve the graph schema correctly
schema = graph_db.get_schema  # ✅ This works
print("Graph Schema:\n", schema)

Graph Schema:
 Node properties:
Document {id: STRING, text: STRING, file_path: STRING, creator: STRING, creationdate: STRING, modDate: STRING, keywords: STRING, trapped: STRING, author: STRING, subject: STRING, format: STRING, source: STRING, total_pages: INTEGER, title: STRING, creationDate: STRING, moddate: STRING, producer: STRING, page: INTEGER, embedding: LIST}
Big Book of Generative AI {id: STRING}
Use Case {id: STRING}
DBRX Instruct {id: STRING}
open models {id: STRING}
Large Language Models {id: STRING}
GenAI Applications {id: STRING}
Stage 4: Pretraining {id: STRING}
enterprise {id: STRING}
Evaluation {id: STRING}
Summary {id: STRING}
Custom Large Language Models (LLMs) {id: STRING}
Investment in AI {id: STRING}
GenAI-Powered Applications {id: STRING}
Generative AI {id: STRING}
Businesses {id: STRING}
CIOs {id: STRING}
Data Infrastructure {id: STRING}
MIT Tech Review {id: STRING}
AI Technologies {id: STRING}
Data Preparation {id: STRING}
Tools {id: STRING}
Retrieval Models {id

In [78]:
retrieved_data = vector_retriever.get_relevant_documents("What is Generative AI?")
print(f"Retrieved Documents: {retrieved_data}")


  retrieved_data = vector_retriever.get_relevant_documents("What is Generative AI?")


Retrieved Documents: [Document(metadata={'moddate': '2024-04-15T20:37:52+01:00', 'keywords': '', 'subject': '', 'file_path': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'format': 'PDF 1.4', 'creationdate': '2024-04-15T20:37:40+01:00', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationDate': "D:20240415203740+01'00'", 'author': '', 'title': '', 'trapped': '', 'source': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'modDate': "D:20240415203752+01'00'", 'page': 116, 'producer': 'Adobe PDF Library 17.0', 'total_pages': 118}, page_content='\ntext: THE BIG BOOK OF GENERATIVE AI\nSummary\nWhether you’re looking to disrupt traditional industries, enhance creative endeavors or solve complex problems \nin novel ways, the potential applications of generative AI are limited only by your imagination and willingness to \nexperiment. Remember, every significant advancement in this field began with a simple idea and the courage to \nexplor

In [89]:
from langchain.prompts import PromptTemplate
from langchain.chains.graph_qa.cypher import GraphCypherQAChain

# Define the prompt for generating Cypher queries
cypher_prompt_template = """
Task: Generate a Cypher statement to query the graph database.
Instructions:
- Use only relationship types and properties provided in schema.
- Do not use other relationship types or properties that are not provided.
- Enclose all node labels and relationship types in backticks (`) to prevent syntax errors.

Schema:
{schema}

Note: 
- Do not include explanations or apologies in your answers.
- Do not answer questions that ask anything other than creating Cypher statements.
- Do not include any text other than the generated Cypher statement.

Question: {question}""" 

cypher_prompt = PromptTemplate(
    template=cypher_prompt_template,
    input_variables=["schema", "question"]
)

In [91]:
qa_chain = GraphCypherQAChain.from_llm(
    llm=llm,  
    graph=graph_db,  
    cypher_prompt=cypher_prompt,
    verbose=True,
    return_intermediate_steps=True,
    allow_dangerous_requests=True  # ✅ Enables execution of generated Cypher queries
)

In [97]:
question = "What is RAG?"
response = qa_chain.invoke({"query": question})  

# Print Cypher query and response
print("Generated Cypher Query:\n", response["intermediate_steps"])
print("Answer:\n", response["result"])



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:`Document`)-[:MENTIONS]->(r:`Retrieval Augmented Generation (RAG)`)
RETURN d, r
[0m
Full Context:
[32;1m[1;3m[{'d': {'file_path': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-04-15T20:37:40+01:00', 'modDate': "D:20240415203752+01'00'", 'keywords': '', 'trapped': '', 'author': '', 'subject': '', 'format': 'PDF 1.4', 'source': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'total_pages': 118, 'title': '', 'creationDate': "D:20240415203740+01'00'", 'moddate': '2024-04-15T20:37:52+01:00', 'producer': 'Adobe PDF Library 17.0', 'id': '4466744a280c76a9bfd9c891e845510a', 'text': 'THE BIG BOOK OF GENERATIVE AI\nIn this eBook, you’ll learn: \n\t\n■How to plan a path from basic to advanced GenAI applications, leveraging your organization’s data\n\t\n■How to use retrieval au

In [95]:
question = "What are the key topics related to generative AI?"
response = qa_chain.invoke({"query": question})  

# Print Cypher query and response
print("Generated Cypher Query:\n", response["intermediate_steps"])
print("Answer:\n", response["result"])



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m
MATCH (d:`Document`)-[:MENTIONS]->(g:`Generative AI`)
RETURN g
[0m
Full Context:
[32;1m[1;3m[{'g': {'id': 'Generative AI'}}, {'g': {'id': 'Generative AI'}}, {'g': {'id': 'generative AI'}}, {'g': {'id': 'Our diffusion model'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'U-Net diffusion model'}}, {'g': {'id': 'Stable Diffusion 2'}}][0m

[1m> Finished chain.[0m
Generated Cypher Query:
 [{'query': '\nMATCH (d:`Document`)-[:MENTIONS]->(g:`Generative AI`)\nRETURN g\n'}, {'context': [{'g': {'id': 'Generative AI'}}, {'g': {'id': 'Generative AI'}}, {'g': {'id': 'generative AI'}}, {'g': {'id': 'Our diffusion model'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'Stable Diffusion'}}, {'g': {'id': 'U-Net diffusion model'}}, {'

In [102]:
question = "What is GenAI and how is it useful in industry?"
response = qa_chain.invoke({"query": question})  

# Print Cypher query and response
print("Generated Cypher Query:\n", response["intermediate_steps"])
print("Answer:\n", response["result"])



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:`Document`)-[:MENTIONS]->(g:`GenAI`)
RETURN d
[0m
Full Context:
[32;1m[1;3m[{'d': {'file_path': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-04-15T20:37:40+01:00', 'modDate': "D:20240415203752+01'00'", 'keywords': '', 'trapped': '', 'author': '', 'subject': '', 'format': 'PDF 1.4', 'source': '/workspaces/LLM_Rag_chatbot/data/Databricks Big Book Of GenAI FINAL.pdf', 'total_pages': 118, 'title': '', 'creationDate': "D:20240415203740+01'00'", 'moddate': '2024-04-15T20:37:52+01:00', 'producer': 'Adobe PDF Library 17.0', 'id': 'b747531f3a04d67f62a96e29446a1b89', 'text': 'the Quality of Your Data Matters\nBusinesses need to achieve production quality with their GenAI applications. Developers need rich tools for \nunderstanding the quality of their data and model outputs, along with an underlying 