In [1]:
# Need some pre-requisites

!pip install langchain
!pip install huggingface_hub
!pip install sentence_transformers
!pip install unstructured[local-inference] -q
!pip install -q faiss-cpu



In [29]:
# FAISS stores the vector representations of the PDF text loaded by UnstructuredPDFLoader.
# load_qa_chain builds the chain that answers a given prompt from the content of the PDF.

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain


In [30]:
# A Hugging Face API token is required to use Hugging Face hosted models.
# The token is taken from the HUGGINGFACEHUB_API_TOKEN environment variable if it
# is already set; otherwise it is asked for interactively (input is not echoed).
# Never paste the token into the notebook: anything stored in a cell is shared
# with everyone who can read the file. A token that was ever committed in a
# notebook should be treated as leaked and revoked.

import os
from getpass import getpass

if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [31]:
# Read the source PDF that the questions will be answered from.
# The path is relative, so it is resolved against the current working directory.

pdf_path = "caseDoc.pdf"
loader = UnstructuredPDFLoader(pdf_path)
document = loader.load()

In [32]:
# Split the loaded document into chunks of at most 500 characters (no overlap)
# so that each piece can be embedded and retrieved on its own.
# RecursiveCharacterTextSplitter tries the separators in the order given, so they
# are listed coarse-to-fine: line breaks first, then commas, spaces as the last
# resort. With " " listed first, virtually all text would be cut on spaces and
# the other separators would only apply to space-free runs longer than the chunk size.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=["\n", ",", " "],
)
docs = text_splitter.split_documents(document)

In [33]:
# HuggingFaceEmbeddings converts each text chunk into a numerical vector that
# captures its meaning; no model name is passed, so the library default is used.
# The vectors are stored in a FAISS index, which makes similarity search over the chunks efficient.

from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings()
db = FAISS.from_documents(documents=docs, embedding=embedding)

In [93]:
# Create the question-answering chain on top of an LLM hosted on the Hugging Face Hub.
# Access relies on the HUGGINGFACEHUB_API_TOKEN environment variable set earlier in the notebook.
# NOTE(review): the hosted text-generation API documents `max_new_tokens`; confirm that
# `max_length` is actually honored for this model.

from langchain import HuggingFaceHub

generation_kwargs = {"temperature": 0.2, "max_length": 256}
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    model_kwargs=generation_kwargs,
)
chain = load_qa_chain(llm, chain_type="stuff")


In [98]:
# Ask a question: the chunks most similar to the query are fetched from the
# FAISS index and handed to the QA chain, which produces the answer from them.
query = "What are the sections the accused were charged under?"

# The search results get their own name instead of overwriting `docs` (the full
# list of chunks the index is built from); otherwise re-running the indexing
# cell after a query would index only the few retrieved chunks.
relevant_docs = db.similarity_search(query)

ans = chain.run(input_documents=relevant_docs, question=query)

In [99]:
# The model output may echo the prompt; keep only what follows the
# "Helpful Answer:" marker that ends the prompt.
answer_marker = "Helpful Answer:"
full_text = ans

# Find the index of the marker (-1 when it is not present)
start_index = full_text.find(answer_marker)

if start_index == -1:
    # No marker: the output is already the bare answer. Slicing with -1 here
    # would silently drop the first characters of the answer.
    cropped_text = full_text
else:
    # Skip past the marker to get the text immediately after it
    cropped_text = full_text[start_index + len(answer_marker):]

print(cropped_text)


 The accused were charged under sections 302 and 34 of the Indian Penal Code (IPC). Section 302 relates to murder, while section 34 refers to common intention in committing an offense. In this case, the police had submitted a charge sheet against the accused under both sections, but the trial proceeded only against the first two accused as the third accused was shown as an absconder.
