In [5]:
pip install langchain openai sentence-transformers pymupdf faiss-gpu dill

Collecting langchain
  Using cached langchain-0.1.17-py3-none-any.whl (867 kB)
Collecting openai
  Using cached openai-1.25.1-py3-none-any.whl (312 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Collecting pymupdf
  Using cached PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
Collecting dill
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Using cached dataclasses_json-0.6.5-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.36 (from langchain)
  Using cached langchain_community-0.0.36-py3-none-any.whl (2.0 MB)
Collecting langchain-core<0.2.0,>=0.1.48 (from langchain)
  Using cached langchain_core-0.1.50-py3-none-an

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import textwrap
import torch

In [None]:
# NOTE(review): this only constructs a torch.device object. The result is not
# assigned or passed to anything in this notebook, so the line has no effect
# beyond being echoed as the cell's output (the cell also shows no execution
# count, i.e. it was not run in the recorded session).
torch.device('cpu')

In [7]:
class Runnable:
    """Minimal base class for objects that expose a ``run`` entry point."""

    def run(self, *args, **kwargs):
        """Execute the runnable; concrete subclasses must override this.

        Accepts arbitrary positional and keyword arguments so that overrides
        are free to define their own call signature.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = "Subclass must implement abstract method"
        raise NotImplementedError(message)

In [8]:

def load_pdf_data(file_path):
    """Read a PDF through ``PyMuPDFLoader`` and return the loaded documents.

    Args:
        file_path: Location of the PDF, forwarded to the loader unchanged.

    Returns:
        Whatever ``PyMuPDFLoader(...).load()`` returns for that file.
    """
    return PyMuPDFLoader(file_path=file_path).load()

In [9]:
# Responsible for splitting the documents into several chunks

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split, passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents=documents)

In [10]:
def load_embedding_model(model_path, normalize_embedding=True, device="cpu"):
    """Build a ``HuggingFaceEmbeddings`` instance for the given model.

    Args:
        model_path: Model name or path, passed as ``model_name``.
        normalize_embedding: Passed to the encoder as ``normalize_embeddings``.
        device: Device string placed in ``model_kwargs`` (for example ``"cpu"``
            or ``"cuda"``). Defaults to ``"cpu"``, which was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': normalize_embedding},
    )

def create_embeddings(chunks, embedding_model, storing_path = "C:\\Users\\nehan\\vectorstore"):
    """Index the chunks in a FAISS vector store, save it, and return it.

    Args:
        chunks: Document chunks handed to ``FAISS.from_documents``.
        embedding_model: Embedding model used to build the index.
        storing_path: Destination passed to ``save_local``.
            NOTE(review): the default is a user-specific Windows path; on a
            non-Windows host the backslashes are ordinary filename characters,
            so callers should normally pass this argument explicitly.

    Returns:
        The FAISS vector store that was built and saved.
    """
    index = FAISS.from_documents(chunks, embedding_model)
    # Persist before returning so the index can be reloaded in a later session.
    index.save_local(storing_path)
    return index

In [11]:
# Prompt text used to build the PromptTemplate further down in this notebook.
# `{context}` and `{question}` are the template placeholders (presumably filled
# by the retrieval chain with the retrieved text and the user's query).
# A backslash at the end of a line inside the string joins it with the next
# line, so the three lines under "### System:" form a single line of prompt text.
template = """
### System:
You have to answer the user's \
questions using only the context provided to you. If you don't know the answer, \
just say you don't know. Don't try to make up an answer.

### Context:
{context}

### User:
{question}

### Response:
"""

In [12]:
def load_qa_chain(retriever, llm, prompt):
    """Assemble a ``RetrievalQA`` chain of type ``"stuff"``.

    Args:
        retriever: Retriever the chain uses to fetch context.
        llm: Language model the chain calls.
        prompt: Prompt forwarded through ``chain_type_kwargs``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type``, configured to also
        return its source documents.
    """
    chain_options = dict(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={'prompt': prompt},
    )
    return RetrievalQA.from_chain_type(**chain_options)

In [13]:
def get_response(query, chain):
    """Run ``query`` through ``chain`` and print the answer wrapped at 100 columns.

    Args:
        query: Question text, sent to the chain under the ``'query'`` key.
        chain: Object exposing ``invoke(inputs)`` or, failing that, callable as
            ``chain(inputs)``. It must return a mapping with a ``'result'`` entry.

    Returns:
        None. The wrapped answer is printed, not returned.
    """
    payload = {'query': query}

    # Calling the chain object directly triggers a deprecation warning in the
    # recorded run; prefer `invoke` and keep the direct call only as a fallback
    # for plain callables that have no `invoke` method.
    invoke = getattr(chain, "invoke", None)
    response = invoke(payload) if callable(invoke) else chain(payload)

    wrapped_text = textwrap.fill(response['result'], width=100)
    print(wrapped_text)

In [14]:
from langchain.llms import OpenAI
from langchain import PromptTemplate

In [15]:
class OpenAIWrapper(Runnable):
    """Adapter that exposes a wrapped client through the ``run`` interface."""

    def __init__(self, openai_instance):
        """Store the object that ``run`` will delegate to."""
        self.openai_instance = openai_instance

    def run(self, *args, **kwargs):
        """Forward the call to the wrapped object and return its result.

        Delegation order: the wrapped object's ``run`` method if it has one
        (the original behaviour), then ``invoke``, then calling the object
        itself. The fallbacks exist because LLM-style objects generally expose
        ``invoke`` or ``__call__`` rather than ``run``.

        Raises:
            AttributeError: if the wrapped object offers none of the above.
        """
        target = self.openai_instance
        for method_name in ("run", "invoke"):
            method = getattr(target, method_name, None)
            if callable(method):
                return method(*args, **kwargs)
        if callable(target):
            return target(*args, **kwargs)
        raise AttributeError(
            f"{type(target).__name__!r} object has no 'run' or 'invoke' method and is not callable"
        )

In [16]:
import os
from langchain.llms import OpenAI

# Do not paste a real key into the notebook: it would be saved with the file.
# Keep a key that is already present in the environment, and otherwise prompt
# for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed when prompting

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [17]:
# Instantiate the LangChain OpenAI LLM with its default settings
# (presumably authenticated through the OPENAI_API_KEY environment variable
# set in the previous cell -- confirm).
llm = OpenAI()

# Build the embedding model via load_embedding_model, which uses the CPU and
# normalized embeddings by default; "all-MiniLM-L6-v2" is passed as the model name.
embed = load_embedding_model(model_path = "all-MiniLM-L6-v2")

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
# Load the PDF and split it into chunks using split_docs' defaults
# (chunk_size=1000, chunk_overlap=20). The path is an absolute location on the
# machine that ran this notebook; adjust it for other environments.
docs = load_pdf_data(file_path=r"/content/COI.pdf")
documents = split_docs(documents=docs)

# Embed the chunks into a FAISS vector store; create_embeddings also saves the
# index to its default storing_path because no path is passed here.
vectorstore = create_embeddings(documents, embed)

# Expose the vector store as a retriever for the QA chain
retriever = vectorstore.as_retriever()

In [21]:
# Build the PromptTemplate from the `template` string defined earlier
# (its placeholders are {context} and {question})
prompt = PromptTemplate.from_template(template)

# Assemble the RetrievalQA chain from the retriever, the LLM and the prompt
chain = load_qa_chain(retriever, llm, prompt)

In [24]:
# Example question; get_response prints the chain's answer wrapped at 100 columns.
get_response("tell me about right of self-defence", chain)

  warn_deprecated(


The Constitution of India protects the rights of individuals to safeguard public property and to
abjure violence. This includes the right of self-defence, where a person has the right to protect
themselves and their property from harm or danger. However, this right is limited to the extent of
the force necessary to protect oneself and cannot be used as an excuse for committing violence.
