In [1]:
%pip install langchain_community tiktoken langchainhub langchain_huggingface chromadb langchain pypdf langchain-groq sentence-transformers;

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Environment Setup
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

True

In [3]:
# RAG Quickstart
import numpy as np
import tiktoken
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [5]:
# Initialize the embedding model
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={
        "device": "cpu",
    },
)

In [6]:
# Load the document
loader = PyPDFLoader("Documents/Porsche Case File.pdf")
pages = loader.load()


In [7]:
print(pages[0])

page_content='Pooja Gagan Jain vs State Of Maharashtra on 25 June, 2024
Author: Bharati Dangre
Bench: Bharati Dangre
 2024:BHC-AS:24726-DB
                                                                                       WP-2372-2024.doc
           Rajshree
                                        IN THE HIGH COURT OF JUDICATURE AT BOMBAY
                                                 CRIMINAL APPELLATE JURISDICTION
                                            CRIMINAL WRIT PETITION NO.2372 OF 2024
                          Pooja Gagan Jain                   ]    ..   Petitioner
                                           vs.
                          State of Maharashtra               ]    ..   Respondent
                          Mr. Aabad Ponda, Senior Advocate a/w Prashant Patil, Swapnil
                          Ambure, Pranav Patil, Avantika Sharma, Nida Khan, Swati
                          Pandey, Vinayak Patil, Anant Charkhe, Vishal Nevshe and
                          R.B

In [8]:
# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(pages)

In [9]:
print(splits[0])

page_content='Pooja Gagan Jain vs State Of Maharashtra on 25 June, 2024
Author: Bharati Dangre
Bench: Bharati Dangre
 2024:BHC-AS:24726-DB
                                                                                       WP-2372-2024.doc
           Rajshree
                                        IN THE HIGH COURT OF JUDICATURE AT BOMBAY
                                                 CRIMINAL APPELLATE JURISDICTION
                                            CRIMINAL WRIT PETITION NO.2372 OF 2024
                          Pooja Gagan Jain                   ]    ..   Petitioner
                                           vs.
                          State of Maharashtra               ]    ..   Respondent
                          Mr. Aabad Ponda, Senior Advocate a/w Prashant Patil, Swapnil
                          Ambure, Pranav Patil, Avantika Sharma, Nida Khan, Swati
                          Pandey, Vinayak Patil, Anant Charkhe, Vishal Nevshe and' metadata={'source': 'Documen

In [10]:
# Embedding the chunks
vectorstore = Chroma.from_documents(
    documents=splits, 
    embedding=embedding,
    persist_directory="./chroma_db"
)

In [11]:
retriever = vectorstore.as_retriever()
print(retriever)


tags=['Chroma', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000018D5E36C280> search_kwargs={}


In [12]:
# Prompt
prompt = hub.pull("rlm/rag-prompt")
print(prompt)


input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [28]:
# LLM
llm = ChatGroq(
    api_key=os.getenv("GROQ_API_KEY"),
    model="llama-3.3-70b-versatile",
    temperature=0.7,
    max_tokens= 12000,
)

In [29]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [30]:
# Chain
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [31]:
# Run the chain
chain.invoke("Give me a summary of the case")


'The case involves a child (CCL) who was involved in an incident after attending a late-night party at a pub with friends, where his parents allowed him to consume liquor and use a Porsche car. The CCL became a victim of mob lynching and there are concerns about his safety if released on bail. The board is considering transferring his custody to an Observation Home due to neglected parenting and potential dangers to his life.'

In [32]:
question = "What are the key legal issues in the case?"


In [33]:
# Run the chain
answer = chain.invoke(question)
print(answer)

The key legal issue in the case appears to be the application of the Juvenile Justice Act to the accused, who is under 18 years old. The court must consider the provisions of the law and the doctrine of equality, which requires equal protection of the law for all individuals. The court must balance the public outcry for justice with the requirement to apply the law equally, regardless of the accused's age or the severity of the crime.


In [35]:
question = "What are the steps that can be taken to resolve the case?"
answer = chain.invoke(question)
print(answer)


To resolve the case, an inquiry is directed to be launched for the appointment of a fit person or facility for the child. The child should be restored to the rehabilitation stage at the Observation Home Pune. An extension of the period of custody may be considered to facilitate the progress of investigation and to ensure the child's care, given that their parents and grandparents are in custody.


In [34]:
# Token Count
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

NameError: name 'tiktoken' is not defined

In [28]:
# Text Embedding
question_result = embedding.embed_query(question)  # Use existing 'embedding' instance
print(question_result)
answer_result = embedding.embed_query(answer)
print(answer_result)


[-0.012645701877772808, 0.12642905116081238, -0.0020536242518574, -0.07646709680557251, -0.030324073508381844, 0.036024756729602814, -0.03804846107959747, 0.003914241679012775, -0.07140165567398071, 0.026665525510907173, 0.1237354502081871, 0.08873916417360306, -0.010214148089289665, -0.01460725162178278, 0.0297860037535429, 0.008222957141697407, 0.047784123569726944, -0.039985112845897675, -0.06159700080752373, 0.05284808203577995, 0.0032699282746762037, -0.04844236746430397, -0.07155179232358932, 0.02414419874548912, -0.012172140181064606, -0.006971366237848997, 0.0423533134162426, 0.0013341255253180861, 0.01751040108501911, -0.04286939278244972, 0.04814557358622551, -0.004849589895457029, 0.017639348283410072, -0.018458697944879532, -0.01192481815814972, -0.017791040241718292, 0.013409692794084549, -0.048316825181245804, -0.029044577851891518, -0.015002185478806496, 0.057077087461948395, -0.06384072452783585, 0.027742523699998856, -0.012836615554988384, -0.0061742221005260944, -0.02

In [29]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(question_result, answer_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.5714159648160456
