In [1]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [2]:
from pinecone import Pinecone, ServerlessSpec

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [7]:
def load_data(data):
    loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)

    documents = loader.load()
    return documents

In [8]:
extracted_data = load_data("data/")

Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 560 0 (offset 0)


In [9]:
# create text chucks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=40)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [10]:
text_chunks = text_split(extracted_data)
print("length of text chunks: ", len(text_chunks))

length of text chunks:  2793


In [11]:
def download_huggingface_embedding():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings             

In [12]:
embeddings = download_huggingface_embedding()

In [13]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
query_result = embeddings.embed_query("hi")
print('Length', len(query_result))

Length 384


In [15]:
persist_directory = 'db'

In [16]:
vectordb = Chroma.from_documents(documents=text_chunks, embedding=embeddings, persist_directory=persist_directory)

In [17]:
vectordb.persist()

  warn_deprecated(


In [18]:
vectordb = None

In [19]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [20]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x2282bd4f910>

In [21]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002282BD4F910>, search_kwargs={'k': 2})

In [22]:
docs = retriever.invoke("What are pasta ingredients?")

In [23]:
docs

[Document(page_content='protein  35 g\ncarbohydrates  47 g\npotassium  734 mgmain dishes         pastas', metadata={'page': 88, 'source': 'data\\dinners_cookbook_508-compliant.pdf'}),
 Document(page_content='protein  35 g\ncarbohydrates  47 g\npotassium  734 mgmain dishes         pastas', metadata={'page': 88, 'source': 'data\\dinners_cookbook_508-compliant.pdf'})]

In [24]:
prompt_template='''
Use the following pieces of information to answer the user's question.
If you don't know the answer,just say that you don't know,don't try to makeup the answer.
Context: {context}
Question: {question}

Only return the helpful aswer below and nothing else.
Helpful Answer:
'''

In [25]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=['context','question'])
chain_type_kwargs = {'prompt': PROMPT}

In [26]:
llm=CTransformers(
    model="TheBloke/Llama-2-7B-Chat-GGML",
    model_type="llama",
    config={'max_new_tokens':2048,'context_length' : 1024,'temperature':0.8}
)

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1000.79it/s]


In [27]:
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(search_kwargs={"k":2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [28]:
import warnings 
warnings.filterwarnings('ignore') 

In [29]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])

Response :  To cook pasta, follow these steps:
1. Place spaghetti into boiling water and cook as per packet instructions.
Response :  Do not open the lid during the first 10 minutes of cooking the rice, as this will cause the rice to cook unevenly. After 15 minutes, open the lid and again mix only the top of the rice, close the lid and then cook for another 10 minutes. When the rice is cooked, remove from the heat and gently mix all the ingredients together. Serve on a large flat plate, along with a fresh salad.
Response :  The ingredients needed for making bean burgers are tinned kidney beans (£0.30), tinned mushrooms (£0.39), eggs (£0.69), bread (£0.36), lettuce (optional, £0.69), salt and pepper (optional).
Please note that the prices are based on Lidl Supermarket's prices.


KeyboardInterrupt: 