In [48]:
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader,PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
load_dotenv()

True

## Document Loading

In [49]:
def doc_loader(path):
    """Load the PDF files of one directory through PyMuPDF.

    Args:
        path: Directory handed to ``DirectoryLoader``; files are selected
            with the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_directory_loader = DirectoryLoader(
        path=path,
        glob="*.pdf",
        loader_cls=PyMuPDFLoader,
    )
    return pdf_directory_loader.load()

# One source folder per corpus.
path_to_cases = "pdf_files/cases"
path_to_legal = "pdf_files/Constitution and law"
path_to_islamic = "pdf_files/Islamic law"

# Load each corpus (cases, then legal, then Islamic law).
cases_document = doc_loader(path=path_to_cases)
legal_document = doc_loader(path=path_to_legal)
islamic_law_document = doc_loader(path=path_to_islamic)

# Report how many documents each loader returned.
for message, loaded in (
    ("len of islamic document is :", islamic_law_document),
    ("len of cases document is :", cases_document),
    ("len of legal document is :", legal_document),
):
    print(message, len(loaded))

len of islamic document is : 571
len of cases document is : 379
len of legal document is : 822


## Text Chunking

In [50]:
# Step 2: Text splitting
def text_splitter(data, chunk_size=1000, chunk_overlap=250):
    """Split loaded documents into overlapping character chunks.

    Args:
        data: Documents to split; passed unchanged to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 1000,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            250, the value this notebook has always used.

    Returns:
        The chunks returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # The local is not called ``text_splitter`` so that it does not shadow
    # this function's own name inside its body.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

# Chunk each corpus (legal, then Islamic law, then cases).
legal_chunks = text_splitter(data=legal_document)
islmaic_chunks = text_splitter(data=islamic_law_document)
cases_chunks = text_splitter(data=cases_document)

# Report the number of chunks produced per corpus.
for message, produced in (
    ("Total Chunks legal:", legal_chunks),
    ("Total Chunks cases:", cases_chunks),
    ("Total Chunks islamic:", islmaic_chunks),
):
    print(message, len(produced))

Total Chunks legal: 2369
Total Chunks cases: 1038
Total Chunks islamic: 3408


## Create Vector Store

In [53]:
import os
from tqdm import tqdm

# Embedding model shared by every vector store in this notebook: it is read
# by create_vector_store and passed again when the stores are reloaded.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Number of chunks handed to FAISS per step inside create_vector_store.
batch_size = 50

def create_vector_store(chunks,vector_store_save_path,batch_size):
    """Embed ``chunks`` batch by batch into one FAISS index and save it.

    Args:
        chunks: Sequence of documents to index. Must not be empty.
        vector_store_save_path: Folder passed to ``save_local``.
        batch_size: Number of chunks embedded per step. Must be positive.

    Returns:
        The populated FAISS vector store, which has also been written to
        ``vector_store_save_path``.

    Raises:
        ValueError: If ``chunks`` is empty or ``batch_size`` is not positive.
    """
    # Validate first: with no batches the loop never runs, the store stays
    # None, and save_local would fail with an unhelpful AttributeError.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(chunks) == 0:
        raise ValueError("chunks is empty; there is nothing to index")

    vector_store = None
    for start in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[start:start + batch_size]
        if vector_store is None:
            # The first batch creates the index.
            vector_store = FAISS.from_documents(documents=batch, embedding=embedding_model)
        else:
            # Later batches are appended to the existing index rather than
            # building a temporary index per batch and merging it in.
            vector_store.add_documents(batch)

    vector_store.save_local(vector_store_save_path)
    return vector_store

# Make sure the three output folders exist before any index is written.
for store_dir in ("vector_store/Islamic", "vector_store/Cases", "vector_store/Legal"):
    os.makedirs(store_dir, exist_ok=True)

# Save locations, reused later when the stores are reloaded.
path_to_islamic_db = "vector_store/Islamic/"
path_to_cases_db = "vector_store/Cases/"
path_to_legal_db = "vector_store/Legal/"

# Build and persist one index per corpus (Islamic law, then legal, then cases).
islamic_vector_store = create_vector_store(
    chunks=islmaic_chunks,
    vector_store_save_path=path_to_islamic_db,
    batch_size=batch_size,
)
legal_vector_store = create_vector_store(
    chunks=legal_chunks,
    vector_store_save_path=path_to_legal_db,
    batch_size=batch_size,
)
cases_vector_store = create_vector_store(
    chunks=cases_chunks,
    vector_store_save_path=path_to_cases_db,
    batch_size=batch_size,
)

print("Vector stroe created successfully")

100%|██████████| 69/69 [04:34<00:00,  3.98s/it]
100%|██████████| 48/48 [03:16<00:00,  4.10s/it]
100%|██████████| 21/21 [01:06<00:00,  3.15s/it]

Vector stroe created successfully





## Load Vector Store

In [54]:
def load_vector_store(path,embedding_model):
    """Reopen a FAISS vector store from a local folder.

    Args:
        path: Folder passed to ``FAISS.load_local`` as ``folder_path``.
        embedding_model: Embeddings object passed as ``embeddings``.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    # NOTE(review): allow_dangerous_deserialization=True switches off the
    # library's safety guard; only point this at index folders this notebook
    # wrote itself, never at files from an untrusted source.
    restored_store = FAISS.load_local(
        folder_path=path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    return restored_store

# Reopen the three persisted indexes with the embedding model that built them.
cases_vector_store = load_vector_store(path=path_to_cases_db, embedding_model=embedding_model)
legal_vector_store = load_vector_store(path=path_to_legal_db, embedding_model=embedding_model)
islamic_vector_store = load_vector_store(path=path_to_islamic_db, embedding_model=embedding_model)

## Vector Store As Retriever

In [None]:
# One similarity retriever per corpus, each returning the 3 closest chunks.
cases_retriever = cases_vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
legal_retriever = legal_vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
islamic_retriever = islamic_vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [56]:
# Smoke test: query the legal retriever directly and show the returned documents.
legal_retriever.invoke("Constitution of pakistan")

[Document(id='b6045c4b-63ab-4b95-82f0-240b355772d3', metadata={'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2012-04-04T12:15:58+05:00', 'source': 'pdf_files\\Constitution and law\\constitution.pdf', 'file_path': 'pdf_files\\Constitution and law\\constitution.pdf', 'total_pages': 222, 'format': 'PDF 1.7', 'title': '', 'author': 'Naveed Anjum', 'subject': '', 'keywords': '', 'moddate': '2012-04-04T12:19:29+05:00', 'trapped': '', 'encryption': 'Standard V4 R4 128-bit AES', 'modDate': "D:20120404121929+05'00'", 'creationDate': "D:20120404121558+05'00'", 'page': 0}, page_content='THE \nCONSTITUTION \nOF THE \nISLAMIC REPUBLIC \nOF \nPAKISTAN \n \n \n \n \n \n \n[As modified upto the 28th February, 2012] \n \n \n \n \n \n \nNATIONAL ASSEMBLY OF PAKISTAN'),
 Document(id='fb7cfb73-088d-4318-989b-4125ee45eee9', metadata={'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2012-04-04T12:15:58+05:00', 'source': 'pdf_fi

## HuggingFace Reranker 

In [18]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.cross_encoders import HuggingFaceCrossEncoder


# Hugging Face cross-encoder model that the reranker below is built on.
hugging_face_reranker = HuggingFaceCrossEncoder(model_name = "cross-encoder/ms-marco-MiniLM-L6-v2")
# Reranker wrapping that model; no top_n is passed, so the library default applies.
reranker = CrossEncoderReranker(model=hugging_face_reranker)
# Compressor pipeline whose only stage is the reranker.
pipeline = DocumentCompressorPipeline(transformers=[reranker])

#pip install hf_xet

### Group Reranker + Retriever

In [39]:
# Retriever that runs the reranking pipeline over the results of the base retriever.
# NOTE(review): only legal_retriever is wired in here; cases_retriever and
# islamic_retriever are not used by this retriever, so questions are answered
# from the legal corpus alone. Confirm that this is intended.
contextual_compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline,
    base_retriever=legal_retriever
)

## Prompt + Parser

In [45]:
from langchain_core.output_parsers import StrOutputParser

# Prompt for the legal assistant. It has two placeholders: {context} (the
# retrieved text) and {question} (the user's query). The model is told to
# answer only from the context and to cite sources as [1], [2], ...
template = PromptTemplate(
    template=""" 
        You are a Legal Assistant specialized in Pakistani law. 
        Your task is to provide precise and legally accurate answers based on the provided context.
        Instructions:
        - Refer ONLY to the provided context to answer the user's legal question.
        - If the context lacks information to answer, respond with:
        "I don't know based on the given context."
        - Maintain a formal and professional tone at all times.
        - Provide relevant citations from the documents, using brackets like [1], [2], etc.
        
        CONTEXT:
        {context}
        QUESTION:
        {question}
        ANSWER:
        """,
    input_variables=["context", "question"],
    validate_template=True
)
# Output parser placed after the LLM as the last stage of the chain.
parser = StrOutputParser()

## Model

In [46]:
# Groq chat model used to generate answers, with max_tokens set to 512.
# NOTE(review): no temperature is passed, so the library default applies;
# confirm that is acceptable for legal answers.
llm = ChatGroq(model="llama3-70b-8192",max_tokens=512)

## Clean the Retriever Output Before Passing It to the Model

In [None]:
def cleaner(docs):
    """Format retrieved documents as a numbered context block for the prompt.

    The prompt asks the model to cite its sources as [1], [2], and so on.
    Numbering every chunk here, and showing its source file and page when the
    metadata carries them, gives those citations something concrete to refer
    to instead of leaving the model to invent references.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content`` and,
            optionally, a ``metadata`` mapping.

    Returns:
        A single string in which each chunk is prefixed by its 1-based index
        and chunks are separated by a blank line. Returns an empty string
        when ``docs`` is empty.
    """
    numbered_chunks = []
    for position, doc in enumerate(docs, start=1):
        # Tolerate documents without metadata (missing attribute, None or {}).
        metadata = getattr(doc, "metadata", None) or {}

        origin = []
        if metadata.get("source") is not None:
            origin.append(f"source: {metadata['source']}")
        # Compare against None explicitly so that page 0 is still reported.
        if metadata.get("page") is not None:
            origin.append(f"page {metadata['page']}")

        header = f"[{position}]"
        if origin:
            header += f" ({', '.join(origin)})"
        numbered_chunks.append(f"{header}\n{doc.page_content}")

    return "\n\n".join(numbered_chunks)

## Final Chain 

In [43]:
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda

# Feed the raw question to two branches at once: one retrieves, reranks and
# formats the context; the other forwards the question unchanged.
parallel_chain = RunnableParallel(
    context=contextual_compression_retriever | RunnableLambda(cleaner),
    question=RunnablePassthrough(),
)

# context + question -> prompt -> LLM -> plain string
final_chain = parallel_chain | template | llm | parser

result = final_chain.invoke("Constitution of pakistan")

In [47]:
from IPython.display import Markdown,display
# Render the chain's answer string as Markdown in the cell output.
display(Markdown(result))

The Constitution of Pakistan is the supreme law of the land. It was adopted on March 23, 1956, and it has undergone several amendments since then. The Constitution sets out the framework of the government, the relationship between the federal government and the provinces, and the fundamental rights of the people. 

The Constitution is divided into twelve parts, with 280 articles. It also includes several schedules. Article 1 of the Constitution declares that Pakistan shall be a Federal Republic, comprising four provinces: Balochistan, Khyber Pakhtunkhwa, Punjab, and Sindh [1]. 

The Constitution also establishes the supremacy of the Constitution, and declares that any law inconsistent with the Constitution shall be void [2]. 

[1] Article 1 of the Constitution of Pakistan 
[2] Article 8 of the Constitution of Pakistan