In [None]:
# Install libraries
!pip install llama-index==0.9.34
!pip install pinecone-client>=3.0.0
!pip install arxiv==2.1.0
!pip install setuptools==-69.0.3  # (Optional)

In [2]:
# Set environment variables for API keys.
# SECURITY: a Pinecone API key used to be hardcoded in this cell. Treat that key
# as compromised and rotate it. The key is now taken from the environment and
# only prompted for (without echoing it) when the variable is missing or empty.
import os
from getpass import getpass

if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

api_key = os.environ["PINECONE_API_KEY"]

In [3]:
from pinecone import Pinecone

# Pinecone client, authenticated with `api_key`; a later cell uses it to open the index.
pc = Pinecone(api_key=api_key)

In [None]:
# Additional dependencies for the later cells (presumably PDF parsing, Hugging Face
# model loading/quantization, and sentence-transformers models).
# NOTE(review): none of these are version-pinned, unlike the first install cell;
# pin them to keep the notebook reproducible.
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install sentence_transformers

In [5]:
from llama_index import SimpleDirectoryReader

# Load the penal code PDF from the working directory into a list of Document objects.
# The first Document in the recorded output carries page_label '1', so this is
# presumably one Document per page — confirm.
documents = SimpleDirectoryReader(
    input_files=["./penal_code.pdf"]
).load_data()

In [6]:
# Number of Document objects returned by the reader (112 in the recorded run).
len(documents)

112

## Removing extra spaces and newlines

In [7]:
# Show the first raw Document; its `text` still contains "\n" line breaks and padding spaces.
documents[0]

Document(id_='472f3605-c098-47c1-8855-683822b0b47d', embedding=None, metadata={'page_label': '1', 'file_name': 'penal_code.pdf', 'file_path': 'penal_code.pdf', 'file_type': 'application/pdf', 'file_size': 1529218, 'creation_date': '2024-03-12', 'last_modified_date': '2024-03-12', 'last_accessed_date': '2024-03-12'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='aed2cc43163e7af30d3edb2e67fe7302813d3ada08cbbed5eaec5663f6d6584a', text='1 \n THE INDIAN PENAL CODE  \n___________  \nARRANGEMENT OF SECTIONS  \n__________  \nCHAPTER I  \nINTRODUCTION  \nPREAMBLE  \nSECTIONS  \n1. Title and extent of operation of the Code.  \n2. Punishment of offences committed within India.  \n3. Punishment of offences committed beyond, but which by law may be tried within, In

In [8]:
# Clean up our Documents' content
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    Steps, in order: rejoin words hyphenated across a line break, turn a run
    of newlines that is the only separator between two tokens into a space,
    strip the remaining newlines and known extraction artefacts, tighten
    spaced hyphens, and collapse every whitespace run to a single space.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # When only newlines separate two non-space characters, deleting them (as
    # the pattern list below does) would fuse the neighbours: "foo\nbar" would
    # become "foobar" and "Code.\n2." would become "Code.2.". Replace such a
    # run with a single space first so word and sentence boundaries survive.
    content = re.sub(r'(?<=\S)\n+(?=\S)', ' ', content)

    # Remove specific unwanted patterns and characters
    unwanted_patterns = [
        "\\n", "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

# Call function
# Each Document's `text` is overwritten in place, so `cleaned_docs` holds the very
# same objects as `documents`; the raw text is no longer available afterwards.
cleaned_docs = []
for d in documents:
    cleaned_text = clean_up_text(d.text)
    d.text = cleaned_text
    cleaned_docs.append(d)

In [9]:
# Inspect output: content of the first Document after cleaning
# (in the recorded run the line breaks are gone and whitespace is collapsed).
cleaned_docs[0].get_content()

'1 THE INDIAN PENAL CODE ___________ ARRANGEMENT OF SECTIONS __________ CHAPTER I INTRODUCTION PREAMBLE SECTIONS 1. Title and extent of operation of the Code. 2. Punishment of offences committed within India. 3. Punishment of offences committed beyond, but which by law may be tried within, India. 4. Extension of Code to extra-territorial offences. 5. Certain laws not to be affected by this Act. CHAPTER II GENERAL EXPLANATIONS 6. Definitions in the Code to be understood subject to exceptions. 7. Sense of expression once explained. 8. Gender. 9. Number. 10. “Man”. “Woman”. 11. “Person”. 12. “Public”. 13. [Omitted .]. 14. “Servant of Government”. 15. [Repealed. ]. 16. [Repealed .]. 17. “Government”. 18. “India”. 19. “Judge”. 20. “Court of Justice”. 21. “Public servant”. 22. “Moveable property”. 23. “Wrongful gain”. “Wrongful loss”. Gainin g wrongfully/ Losing wrongfully. 24. “Dishonestly”. 25. “Fraudulently”. 26. “Reason to believe”. 27. Property in posse ssion of wife, clerk or servant .

## Embedding Model

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext
from llama_index.embeddings import LangchainEmbedding

# Candidate embedding models: 1) BAAI/bge-small-en-v1.5 (vectorsize: 384) 2) sentence-transformers/multi-qa-MiniLM-L6-cos-v1 3) sentence-transformers/all-mpnet-base-v2 (vectorsize: 768)
# NOTE(review): in the cells visible here `embed_model` is never passed on; the
# index-building call passes the string "local:BAAI/bge-small-en-v1.5" instead.
embed_model=LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5"))


In [11]:
from pinecone import ServerlessSpec
from llama_index.vector_stores import PineconeVectorStore

# Open a handle to the Pinecone index by name.
# NOTE(review): no visible cell creates this index (ServerlessSpec is imported but
# unused), so it presumably has to exist already with a dimension matching the
# embedding model — confirm.
pinecone_index = pc.Index("penal-code-embeddings")

# Initialize VectorStore
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [12]:
from llama_index.storage.storage_context import StorageContext
# Storage context wrapping the Pinecone vector store; passed to the index builder later.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [13]:
from llama_index.prompts import Prompt
from llama_index.prompts.prompt_type import PromptType

from llama_index import PromptTemplate

# Question-answering prompt template. Its only placeholders are {context_str} and
# {query_str}; the text instructs the model to answer from the supplied context only.
qa_prompt_tmpl = (
    "Context information is below.\n"
    "---------------------\n"
    "<context>\n"
    "{context_str}\n"
    "</context>\n"
    "---------------------\n"
    "Instructions are given below.\n"
    "answer the query based on given Context. \n"
    "If you do not find the answer from Context then write response like below. \n"
    "Do not able to answer this question.\n"
    "Query is given below.\n"
    "<query>\n"
    "{query_str}\n"
    "</query>\n"
    "Take a deep breath and provide the answer from the context and not from the prior knowledge.\n"
)
# NOTE(review): `qa_prompt` is not referenced by the cells visible here; the query
# engine is built from the raw `qa_prompt_tmpl` string instead.
qa_prompt = PromptTemplate(qa_prompt_tmpl)

# The response synthesizer is created later, inside get_sentence_window_query_engine.



In [14]:
# System prompt passed to the LLM wrapper via its `system_prompt` argument.
system_prompt="""
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""
# Query wrapper in the default format supported by Llama 2, kept for reference only
# (SimpleInputPrompt is not imported in the visible cells):
# query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")



In [15]:
from contextvars import Token
import torch
from llama_index.llms import HuggingFaceLLM

# Local Hugging Face LLM (mistralai/Mistral-7B-Instruct-v0.2) that is later placed
# in the service context used for indexing and response synthesis.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Sampling is switched off (do_sample=False).
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    # query_wrapper_prompt=qa_prompt,
    # tokenizer_name="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    # model_name="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
     tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    # Request float16 weights with 8-bit loading to reduce memory usage.
    # NOTE(review): the recorded output warns that `load_in_8bit` is deprecated in
    # favour of passing a `BitsAndBytesConfig` object via `quantization_config`.
    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True},
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [21]:
# function
import os
from llama_index import VectorStoreIndex
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.response_synthesizers import ResponseMode
from llama_index.prompts.default_prompts import DEFAULT_REFINE_PROMPT_TMPL

def build_sentence_window_index(
    documents, llm, storage_context, embed_model="local:BAAI/bge-small-en-v1.5"
):
    """
    Index ``documents`` using a sentence-window node parser.

    :param documents: Documents handed to ``VectorStoreIndex.from_documents``.
    :param llm: LLM placed in the service context.
    :param storage_context: Storage context the index is built against.
    :param embed_model: Embedding model (or model spec string) placed in the
        service context.

    :return: ``(sentence_context, sentence_index)`` - the service context and
        the index that was built with it.
    """
    # Sentence-window parser settings: window size plus the metadata keys
    # given to the parser for the window and the original text.
    parser_settings = {
        "window_size": 3,
        "window_metadata_key": "window",
        "original_text_metadata_key": "original_text",
    }
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=SentenceWindowNodeParser.from_defaults(**parser_settings),
    )

    # Build the index with that service context against the supplied storage.
    sentence_index = VectorStoreIndex.from_documents(
        documents,
        service_context=sentence_context,
        storage_context=storage_context,
    )
    return sentence_context, sentence_index


def get_sentence_window_query_engine(
    qa_prompt_tmpl,
    sentence_context,
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=3,
):
    """
    Create a query engine over ``sentence_index``.

    :param qa_prompt_tmpl: Template string used for the question-answer prompt.
    :param sentence_context: Service context given to the response synthesizer.
    :param sentence_index: Index from which the query engine is created.
    :param similarity_top_k: ``similarity_top_k`` passed to the query engine.
    :param rerank_top_n: ``top_n`` passed to the reranker.

    :return: The query engine returned by ``sentence_index.as_query_engine``.
    """
    # Node postprocessors, passed in this order: metadata replacement keyed on
    # "window", then the sentence-transformer reranker.
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(
            top_n=rerank_top_n, model="BAAI/bge-reranker-base"
        ),
    ]

    # Prompts for the synthesizer: the caller's QA template and the default refine template.
    text_qa_template = Prompt(
        qa_prompt_tmpl, prompt_type=PromptType.QUESTION_ANSWER
    )
    refine_template = Prompt(
        DEFAULT_REFINE_PROMPT_TMPL, prompt_type=PromptType.REFINE
    )
    synthesizer = get_response_synthesizer(
        service_context=sentence_context,
        response_mode=ResponseMode.COMPACT,
        text_qa_template=text_qa_template,
        refine_template=refine_template,
    )

    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
        response_synthesizer=synthesizer,
    )

In [23]:
# Build the service context and the index over the cleaned documents; the recorded
# run shows vectors being upserted during this call.
# NOTE(review): re-running this cell presumably upserts the documents again — confirm
# before re-running against the same Pinecone index.
sentence_context, sentence_index = build_sentence_window_index(
    cleaned_docs,
    llm=llm,
    storage_context=storage_context,
    embed_model="local:BAAI/bge-small-en-v1.5"
)

Upserted vectors:   0%|          | 0/2048 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/2048 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/642 [00:00<?, ?it/s]

In [24]:
# Create the query engine with the default similarity_top_k=6 and rerank_top_n=3.
sentence_window_engine = get_sentence_window_query_engine(qa_prompt_tmpl, sentence_context, sentence_index)

In [25]:
# Run a sample question through the query engine and print the response as text.
window_response = sentence_window_engine.query(
    "what is the punishment for making false claim in court?"
)
print(str(window_response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



The person who fraudulently or dishonestly makes a false claim in a Court of Justice shall be punished with imprisonment of either description for a term which may extend to two years, and shall also be liable to fine. (Section 209, Penal Code)
