In [None]:
import os
import warnings
from typing import TypedDict, Annotated

warnings.filterwarnings("ignore")

from IPython.display import Markdown

from langchain_core.messages import (
    BaseMessage,
    HumanMessage,
    AIMessage,
    SystemMessage
)
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableConfig

from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


from langchain_community.vectorstores.pgvector import PGVector

# from langchain_postgres import PGVector


from tqdm import tqdm


from typing_extensions import TypedDict
from langgraph.graph import StateGraph,add_messages,START,END
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from langchain.messages import RemoveMessage # to delete something from state permenantly
from langchain.messages import HumanMessage

from typing import TypedDict, Annotated

from langgraph.graph.message import add_messages


from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader

from dotenv import load_dotenv
# Load environment variables. A .env discovered by python-dotenv's default
# search is preferred so the notebook is not tied to one machine; the original
# machine-specific path is kept only as a fallback.
# NOTE(review): confirm the default search locates the intended .env when the
# notebook is started from the project directory.
if not load_dotenv():
    load_dotenv(dotenv_path=r"C:\Users\hasee\Desktop\Legal Chatbot\.env")

In [None]:
# Connection settings come from the process environment; each one falls back
# to an empty string when the variable is not set.
CONNECTION_STRING = os.getenv("CONNECTION_STRING", "")
# NOTE: the "SUPERBASE" spelling is kept on purpose because the client-creation
# cell refers to this exact name.
SUPERBASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY", "")
SUPABASE_URL = os.getenv("SUPABASE_URL", "")


In [None]:
from supabase import create_client

# Build the Supabase client from the URL and service-role key read earlier.
supabase_client = create_client(
    SUPABASE_URL,
    SUPERBASE_SERVICE_ROLE_KEY,
)
print("Succefully coonectd to Supabase client")

In [None]:
# Chat model: gpt-4o-mini at temperature 0 with streaming enabled.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    streaming=True,
)

# Embedding model (OpenAI text-embedding-3-small).
EMBEDDING = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [None]:
class AgentState(TypedDict):
    """Shared state passed between the nodes of the LangGraph workflow.

    Every node function in this notebook receives this mapping and returns
    either the mutated mapping or a partial update.
    """
    # Path to the PDF to ingest; set_doc_id and document_ingestion require it to be a file.
    documents_path:str
    # Declared for loaded documents; no node visible in this notebook writes it.
    documents:list[Document]
    # Declared for split chunks; no node visible in this notebook writes it.
    chunks:list[Document] 
    # Name of the PGVector collection used for ingestion and retrieval.
    collection_name:str
    # Documents returned by the retriever node for the current query.
    retrieved_docs:list[Document]
    # Text block assembled by context_builder from retrieved_docs.
    context: str 
    # Final answer text (agent_response; context_builder sets a fallback when nothing was retrieved).
    answer:str
    # Conversation messages; add_messages is the reducer used to merge updates.
    messages: Annotated[list[BaseMessage], add_messages]
    # SHA-256 hex digest of the PDF file, computed by set_doc_id.
    doc_id:str
    # Running conversation summary produced by summary_creation.
    summary:str
    # True once the PDF's chunks are known to be stored (check_pdf_already_uploaded / document_ingestion).
    vectorstore_uploaded:bool
    # Standalone version of the latest question produced by query_rewriter.
    rewritten_query:str




# Prompt for the answering step. It is filled with str.format, so the only
# placeholders are {context} and {question}; do not add other curly braces.
# The citation format mirrors the "[Source: <file> - Page <n>]" label that
# context_builder puts above every passage, so the model can actually cite it.
PROMPT_TEMPLATE = """
        You are an expert Legal AI Assistant for Pakistan. Your task is to answer legal questions based strictly on the provided context.

        Instructions:
        1. Source-Based Answering: Answer the question using ONLY the information provided in the Context below. Do not use outside knowledge.
        2. Specific Legal Citations: When making a statement, you must cite the specific legal authority found in the text (e.g., "Article 6 of the Constitution", "Section 302 of PPC", "Clause 3").
        3. Citation Format: Format citations as: [Legal Reference] (Source: file name, Page number), using the source label shown above each context passage.
            Example: "Every citizen has the right to a fair trial as per Article 10-A (Source: constitution.pdf, Page 12)."
        4. No Hallucinations: If the provided context does not contain the answer, state: "The provided context does not contain sufficient information to answer this question."

        Context:
        {context}

        Question:
        {question}

        Answer:
        """


# Document ID generator

In [None]:
import hashlib
def get_file_hash(file_path: str, chunk_size: int = 1024 * 1024) -> str:
    """Return the SHA-256 hex digest of a file's contents.

    The file is read in fixed-size chunks so that large PDFs are never held
    in memory all at once; the digest is identical to hashing the whole file
    in one call.

    Args:
        file_path: Path of the file to hash (opened in binary mode).
        chunk_size: Number of bytes read per iteration. Must be positive.

    Returns:
        The 64-character hexadecimal SHA-256 digest. Identical content always
        gives the same digest; any byte change gives a different one.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    hasher = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter(callable, sentinel) stops as soon as read() returns b"" (EOF).
        for block in iter(lambda: f.read(chunk_size), b""):
            hasher.update(block)
    return hasher.hexdigest()

# get_file_hash() reads the entire file content and produces a SHA-256 hash,
# which is used as the doc_id. SHA-256 guarantees:
#   - Same content → same hash → same doc_id. If a user uploads the same PDF
#     file again, the hash is identical, so the doc_id is identical too.
#   - Even a single-byte difference → a completely different hash. Changing one
#     word in the PDF therefore produces a different doc_id, because the file
#     content is no longer exactly the same.

In [None]:
        
def set_doc_id(state: AgentState):
    """Compute the document id (SHA-256 of the PDF) and store it in the state.

    Args:
        state: Workflow state. ``documents_path`` is read; ``doc_id`` is written.

    Returns:
        The same state mapping, with ``doc_id`` set.

    Raises:
        ValueError: If ``documents_path`` is missing and no ``doc_id`` is
            known yet, if the path is a directory, or if it is not an
            existing file.
    """
    raw_path = state.get("documents_path")
    if not raw_path:
        # A turn that does not carry the path can still proceed when the id
        # was already supplied or computed earlier.
        if state.get("doc_id"):
            return state
        raise ValueError("documents_path is required to compute doc_id")

    path = os.path.abspath(raw_path)

    # Distinguish "is a directory" from "does not exist": the previous single
    # check reported both as an unsupported directory upload.
    if os.path.isdir(path):
        raise ValueError("Directory upload is not supported with hashing yet")
    if not os.path.isfile(path):
        raise ValueError(f"Invalid documents_path: {path}")

    state["doc_id"] = get_file_hash(path)
    return state


In [None]:
def check_pdf_already_uploaded(state: AgentState):
    """Check whether the PDF identified by ``doc_id`` already exists in Supabase.

    Args:
        state: Workflow state. ``doc_id`` is read; ``vectorstore_uploaded`` is
            read and written.

    Returns:
        The same state mapping with ``vectorstore_uploaded`` set to True when
        at least one row in the ``documents`` table has this ``doc_id``, and
        False otherwise.
    """
    # Already known to be uploaded: skip the database round trip.
    if state.get("vectorstore_uploaded"):
        return state

    # Uses the module-level client; this is a plain function, so there is no
    # ``self`` to read it from.
    response = (
        supabase_client.table("documents")
        .select("doc_id")
        .eq("doc_id", state["doc_id"])
        .limit(1)
        .execute()
    )
    if response.data:
        print("Pdf already exist in supbase skipping documnet ingesion...")
        state["vectorstore_uploaded"] = True
    else:
        state["vectorstore_uploaded"] = False
    return state

In [None]:
def document_ingestion(state: AgentState):
    """Load the PDF, chunk it, embed the chunks into PGVector and record them in Supabase.

    Steps:
        1. Return immediately when ``vectorstore_uploaded`` is already truthy.
        2. Load the PDF with PyPDFLoader and split it into 1000-character
           chunks with a 200-character overlap.
        3. Tag each chunk with ``doc_id``, ``chunk_index``, ``file_name`` and ``page``.
        4. Upload the chunks to the PGVector collection in batches of 50.
        5. Insert one row per chunk into the Supabase ``documents`` table.

    Args:
        state: Workflow state. Reads ``documents_path``, ``doc_id`` and
            ``collection_name``; writes ``vectorstore_uploaded``.

    Returns:
        The same state mapping with ``vectorstore_uploaded`` set to True.

    Raises:
        ValueError: If ``documents_path`` does not point to an existing file.
    """
    if state.get("vectorstore_uploaded"):
        print("Skipping vectoingestion - PDF already exist")
        state["vectorstore_uploaded"] = True
        return state

    path = os.path.abspath(state["documents_path"])  # ensure absolute
    if not os.path.isfile(path):
        raise ValueError(f"Invalid documents_path: {path}")

    documents = PyPDFLoader(path).load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)

    # Enrich the metadata carried by every chunk; this is what gets stored
    # alongside the embedding and what the retriever later filters on.
    for chunk_index, chunk in enumerate(chunks):
        source_path = chunk.metadata.get("source", "")
        file_name = os.path.basename(source_path) if source_path else "unknow.pdf"
        chunk.metadata.update(
            {
                "doc_id": state["doc_id"],
                "chunk_index": chunk_index,
                "file_name": file_name,
                "page": chunk.metadata.get("page"),
            }
        )

    # NOTE(review): NullPool is not imported in any cell shown in this
    # notebook (it normally comes from sqlalchemy.pool) — add the import.
    # NOTE(review): these keyword names look like the langchain_postgres
    # PGVector signature, while the active import is the langchain_community
    # one — confirm which class is intended.
    vectorstore = PGVector(
        connection=CONNECTION_STRING,
        collection_name=state["collection_name"],
        embeddings=EMBEDDING,  # module-level embedding model (no ``self`` here)
        use_jsonb=True,
        engine_args={"poolclass": NullPool},  # disable pooling
    )

    # Upload embeddings in small batches so progress is visible.
    batch_size = 50
    for start in tqdm(range(0, len(chunks), batch_size), desc="Uploading chunks"):
        vectorstore.add_documents(chunks[start:start + batch_size])

    # Mirror the chunk metadata and text into the Supabase table that
    # check_pdf_already_uploaded queries.
    rows = [
        {
            "doc_id": state["doc_id"],
            "chunk_index": chunk_index,
            "file_name": chunk.metadata["file_name"],
            "page": chunk.metadata.get("page"),
            "content": chunk.page_content,
        }
        for chunk_index, chunk in enumerate(chunks)
    ]
    if rows:
        supabase_client.table("documents").insert(rows).execute()

    print(f"Uploaded {len(chunks)} chunks")

    state["vectorstore_uploaded"] = True
    return state


In [None]:
def query_rewriter(state: AgentState):
    """Rewrite a follow-up question into a standalone one using the conversation.

    When the state holds more than one message, the chat model is asked to
    rewrite the latest human question using either the stored summary or, if
    there is none, a transcript of all earlier messages. With a single
    message the question is used unchanged.

    Args:
        state: Workflow state. Reads ``messages`` and ``summary``; writes
            ``rewritten_query``.

    Returns:
        The same state mapping with ``rewritten_query`` set.

    Raises:
        ValueError: If the state contains no human message.
    """
    messages = state.get("messages", [])
    human_messages = [m for m in messages if isinstance(m, HumanMessage)]
    if not human_messages:
        # Same guard as agent_response, instead of an opaque IndexError.
        raise ValueError("No human message found in state for query rewriting")
    current_query = human_messages[-1].content

    # First turn: nothing to resolve against, keep the question as it is.
    if len(messages) <= 1:
        state["rewritten_query"] = current_query
        return state

    # Prefer the running summary; otherwise build a transcript of the earlier
    # messages (the last message is the current question and is excluded).
    memory_text = state.get("summary") or ""
    if not memory_text:
        memory_text = "\n".join(
            f"{'User' if isinstance(m, HumanMessage) else 'Assistant'}: {m.content}"
            for m in messages[:-1]
        )

    rewrite_prompt = f"""Given this conversation history:
            {memory_text}

            Rewrite the following question to be standalone (include necessary context from history):
            Question: {current_query}

            Standalone question:"""

    # Uses the module-level chat model; there is no ``self`` in this function.
    response = llm.invoke([HumanMessage(content=rewrite_prompt)])
    rewritten_query = response.content.strip()

    print(f"Original: {current_query}")
    print(f"Rewritten: {rewritten_query}")

    # Stored for the retriever node.
    state["rewritten_query"] = rewritten_query
    return state







In [None]:
# Retrieval is restricted to the current PDF through a doc_id metadata filter; additional metadata filters can be added here.
def retriever(state: AgentState):
    """Fetch the five chunks of the current PDF most similar to the query.

    The function is registered directly as a graph node, so it takes the
    state as its only argument (the previous leading ``self`` parameter was a
    leftover with no class around it and is removed).

    Args:
        state: Workflow state. Reads ``collection_name``, ``doc_id``,
            ``rewritten_query`` and, as a fallback, the last entry of
            ``messages``; writes ``retrieved_docs``.

    Returns:
        The same state mapping with ``retrieved_docs`` set to the retriever's result.
    """
    # NOTE(review): NullPool is not imported in any cell shown in this
    # notebook (it normally comes from sqlalchemy.pool) — add the import.
    vectorstore = PGVector(
        connection=CONNECTION_STRING,
        collection_name=state["collection_name"],
        embeddings=EMBEDDING,  # module-level embedding model
        use_jsonb=True,
        engine_args={"poolclass": NullPool},  # disable pooling
    )

    # Similarity search limited, via metadata filter, to this specific PDF.
    doc_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 5,
            "filter": {"doc_id": state["doc_id"]},  # only search this pdf
        },
    )

    # Prefer the rewritten query. The fallback is evaluated only when needed,
    # so an empty ``messages`` list no longer fails when a rewritten query exists.
    query = state.get("rewritten_query") or state["messages"][-1].content

    state["retrieved_docs"] = doc_retriever.invoke(query)
    return state

In [None]:
# file_name is added during text splitting
# page exists → PyPDFLoader adds this automatically
def context_builder(state: AgentState):
    """Turn the retrieved documents into a single context string for the prompt.

    Each document becomes a block headed by ``[Source: <file_name> - Page <page>]``
    followed by its text; blocks are separated by a blank line.

    Args:
        state: Workflow state. Reads ``retrieved_docs``; writes ``context``
            and, when nothing was retrieved, a fallback ``answer``.

    Returns:
        The same state mapping with ``context`` (and possibly ``answer``) set.
    """
    # A missing key is treated like an empty result instead of raising KeyError.
    retrieved_docs = state.get("retrieved_docs") or []

    if not retrieved_docs:
        state["context"] = ""
        state["answer"] = "I could not find relevant information in the provided document."
        return state

    state["context"] = "\n\n".join(
        f"[Source: {doc.metadata.get('file_name', 'Unknown')} "
        f"- Page {doc.metadata.get('page', 'N/A')}]\n"
        f"{doc.page_content}"
        for doc in retrieved_docs
    )
    return state


In [None]:
def summary_creation(state: AgentState):
    """Summarize the conversation and drop all but the two newest messages.

    If a summary already exists the model is asked to extend it; otherwise it
    is asked to summarize the conversation from scratch.

    Args:
        state: Workflow state. Reads ``summary`` (optional) and ``messages``.

    Returns:
        A partial state update: ``summary`` holds the model's response text
        and ``messages`` holds one ``RemoveMessage`` per message to delete
        (every message except the last two).
    """
    # ``summary`` may be absent on a fresh state; treat that as "no summary yet".
    existing_summary = state.get("summary", "")

    if existing_summary:
        prompt = (
            f"Existing summary:\n{existing_summary}\n\n"
            "Extend the summary using new conversation above"
        )
    else:
        prompt = "summarize the conversation above"

    messages = state["messages"]
    message_for_summary = messages + [HumanMessage(content=prompt)]

    print("Callin summary LLM")  # debugging
    # Uses the module-level chat model; there is no ``self`` in this function.
    response = llm.invoke(message_for_summary)

    # Everything except the two most recent messages is now covered by the
    # summary; slicing already yields [] when there are two or fewer.
    message_to_delete = messages[:-2]

    return {
        "summary": response.content,
        "messages": [RemoveMessage(id=m.id) for m in message_to_delete],
    }

In [None]:
def should_summzarizer(state: AgentState):
    """Return True when the state holds more than six messages, else False."""
    message_count = len(state["messages"])
    return message_count > 6

In [None]:
# cat node with memory
def agent_response(state: AgentState):
    """Generate the answer for the latest question using memory and RAG context.

    The model receives two messages: a system message with the conversation
    memory (the stored summary, or a transcript of the earlier messages when
    there is no summary) and a human message built from ``PROMPT_TEMPLATE``
    with the retrieved context and the question.

    Args:
        state: Workflow state. Reads ``context``, ``messages`` and
            ``summary``; appends to ``messages`` and writes ``answer``.

    Returns:
        The same state mapping, with the AI reply appended to ``messages``
        and its text stored in ``answer``.

    Raises:
        ValueError: If the state contains no human message.
    """
    context = state.get("context", "")
    messages = state.get("messages", [])

    human_messages = [m for m in messages if isinstance(m, HumanMessage)]
    if not human_messages:
        raise ValueError("No human message found in state for retrieval")

    current_message = human_messages[-1]
    query = current_message.content

    # Memory: use the summary if it exists, otherwise the previous messages.
    # The current question is left out of the transcript — it is already sent
    # in the prompt below, and including it made the "No previous
    # conversation." fallback unreachable.
    memory_text = state.get("summary", "")
    if not memory_text:
        history_lines = [
            f"{'User' if isinstance(m, HumanMessage) else 'Assistant'}: {m.content}"
            for m in messages
            if m is not current_message
        ]
        memory_text = "\n".join(history_lines) if history_lines else "No previous conversation."

    prompt_messages = [
        SystemMessage(content=f"Conversation Memory:\n{memory_text}"),
        HumanMessage(content=PROMPT_TEMPLATE.format(context=context, question=query)),
    ]

    print("Calling Agent Response LLM")  # debugging
    # Uses the module-level chat model; there is no ``self`` in this function.
    response = llm.invoke(prompt_messages)

    # Save the AI response in the state.
    state["messages"].append(AIMessage(content=response.content))
    state["answer"] = response.content

    return state

In [None]:
def conditional(state: AgentState):
    """Pick the node that follows the upload check.

    Returns "query_rewriter" when ``vectorstore_uploaded`` is truthy (the PDF
    is already stored) and "document_ingestion" otherwise.
    """
    already_indexed = state.get("vectorstore_uploaded", False)
    return "query_rewriter" if already_indexed else "document_ingestion"

In [None]:
# from langgraph.checkpoint.postgres import PostgresSaver
# import psycopg

# conn = psycopg.connect(
#     CONNECTION_STRING,
#     autocommit=True
# )
# # PostgresSaver --> LangGraph will automatically create its own tables in your Supabase Postgres database the first time it runs.
# checkpointer = PostgresSaver(conn) 
# # It creates the internal tables in Supabase:
# # checkpoints
# # checkpoint_writes
# # These tables store:
# # AgentState snapshots
# # Messages
# # Node execution progress
# # Thread state
# checkpointer.setup()


In [None]:
# Assemble the LangGraph workflow over the shared AgentState.
workflow = StateGraph(AgentState)

# nodes
workflow.add_node("set_doc_id", set_doc_id)
workflow.add_node("check_pdf", check_pdf_already_uploaded)
workflow.add_node("document_ingestion", document_ingestion)
workflow.add_node("query_rewriter", query_rewriter)
workflow.add_node("retriever", retriever)
workflow.add_node("context_builder", context_builder)
workflow.add_node("agent_response", agent_response)
workflow.add_node("summarize", summary_creation)

# edges
workflow.add_edge(START, "set_doc_id")
workflow.add_edge("set_doc_id", "check_pdf")

# After the upload check: ingest a new PDF, or go straight to querying.
# The routing functions are plain module-level functions in this notebook
# (there is no ``nodes`` object), so they are referenced directly.
workflow.add_conditional_edges(
    "check_pdf",
    conditional,
    {
        "document_ingestion": "document_ingestion",
        "query_rewriter": "query_rewriter",
    },
)

# if new vector store path
workflow.add_edge("document_ingestion", "query_rewriter")

workflow.add_edge("query_rewriter", "retriever")
workflow.add_edge("retriever", "context_builder")
workflow.add_edge("context_builder", "agent_response")

# Summarize once the conversation has grown long enough, otherwise finish.
workflow.add_conditional_edges(
    "agent_response",
    should_summzarizer,
    {
        True: "summarize",
        False: END,
    },
)
workflow.add_edge("summarize", END)

# No checkpointer object exists at this point in the notebook (the setup cell
# above is commented out and there is no ``self``), so the graph is compiled
# without one here; pass ``checkpointer=...`` to ``workflow.compile`` where a
# saver is available to persist conversation state.
app = workflow.compile()
app

In [None]:
# Project root
# Project root: two levels above the directory that contains this file.
# A notebook kernel does not define __file__, so in that case the current
# working directory stands in for "the directory of this file".
# NOTE(review): assumes the notebook is run from a folder two levels below
# the project root — confirm against the repository layout.
try:
    _here = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _here = os.getcwd()
PROJECT_ROOT = os.path.dirname(os.path.dirname(_here))

file_path = os.path.join(PROJECT_ROOT, "data", "Constitution and law", "PAKISTAN PENAL CODE.pdf")
if not os.path.exists(file_path):
    # Include the path so a wrong root is immediately visible.
    raise FileNotFoundError(f"File is not found: {file_path}")

In [None]:
# Generate unique doc_id for PDF
# The SHA-256 of the PDF's content serves as its unique document id.
doc_id = get_file_hash(file_path)

# Collection name: the file name without its extension, lower-cased, with
# spaces replaced by underscores.
collection_name = os.path.splitext(os.path.basename(file_path))[0]
collection_name = collection_name.lower().replace(" ", "_")

# Thread id under which the chat history is persisted.
thread_id = "user-123"

In [None]:
# Connect to Supabase Postgres for checkpointing
# PostgresSaver is imported here because no earlier cell brings it into scope
# (the only other mention is in a commented-out cell).
from langgraph.checkpoint.postgres import PostgresSaver

# Connect to Supabase Postgres for checkpointing.
with PostgresSaver.from_conn_string(CONNECTION_STRING) as checkpointer:
    checkpointer.setup()  # run once

    # Compile the workflow defined above with the persistent checkpointer
    # (``GraphBuilder`` is not defined anywhere in this notebook).
    graph = workflow.compile(checkpointer=checkpointer)

    config = {"configurable": {"thread_id": thread_id}}

    # ===== FIRST MESSAGE =====
    result = graph.invoke(
        {
            "documents_path": file_path,
            "doc_id": doc_id,
            "collection_name": collection_name,
            "messages": [HumanMessage(content="What is punishment for making false claim in court?")],
            "summary": "",
        },
        config=config,
    )
    print("Answer 1:", result["answer"])

    # ===== SECOND MESSAGE =====
    # Follow-up on the same thread. documents_path and summary are not
    # passed again — presumably restored from the checkpoint for this
    # thread_id; verify against the checkpointer's behavior.
    initial_state = {
        "doc_id": doc_id,
        "collection_name": collection_name,  # Use existing vectorstore
        "messages": [HumanMessage(content="What is the penalty for that?")],  # Follow-up question
    }
    result = graph.invoke(initial_state, config=config)
    print("Answer 2:", result["answer"])
