In [18]:

from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.llms import CTransformers
from dotenv import load_dotenv

import os

load_dotenv()

True

In [2]:
def load_pdf(data_dir):
    """Load PDFs from a directory

    Args:
        data_dir (str): path to the directory where the pdf data is stored

    Returns:
        _type_: _description_
    """
    loader = DirectoryLoader(data_dir,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    documents = loader.load()

    return documents

In [3]:
extracted_data = load_pdf("../data/")

In [4]:
def text_split(extracted_data):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    documents = text_splitter.split_documents(extracted_data)

    return documents

In [5]:
text_chunks = text_split(extracted_data)
print("Length of text chunks: ", len(text_chunks))

Length of text chunks:  8595


In [6]:
def download_hugging_face_embeddings():
    """Download the Hugging Face embeddings

    Returns:
        _type_: _description_
    """
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [7]:
embeddings = download_hugging_face_embeddings()

In [8]:
query_result = embeddings.embed_query("Hello wolrd")

print(query_result)

[-0.08045594394207001, 0.03474036976695061, 0.04197043552994728, 0.03294938802719116, -0.010872787795960903, -0.12379560619592667, 0.07415361702442169, -0.004162982106208801, 0.02425968088209629, -0.03343380615115166, 0.023308511823415756, 0.0097760409116745, -0.025727173313498497, -0.03764554113149643, 0.021908342838287354, -0.02695627138018608, -0.001193429110571742, 0.03142089024186134, -0.12250585108995438, 0.009405793622136116, -0.025801697745919228, 0.07865899801254272, 0.024827634915709496, 0.020943669602274895, -0.04023496434092522, -0.04566634073853493, 0.05064922198653221, 0.06266313791275024, -0.020599951967597008, 0.018608979880809784, 0.10433362424373627, -0.021139707416296005, 0.05518507584929466, -0.003499421291053295, -0.011035345494747162, 0.06912390142679214, 0.02409300021827221, -0.023133961483836174, 0.010791191831231117, 0.027800604701042175, -0.024782443419098854, -0.011795461177825928, -0.017575206235051155, 0.031164443120360374, 0.020911864936351776, 0.034649036

In [9]:
index_name = "medical-chatbot"

docsearch = PineconeVectorStore.from_documents(text_chunks, embeddings, index_name=index_name)

In [11]:
query = "What are allergies?"
docs = docsearch.similarity_search(query, k=3)
print(docs[0].page_content)

8th ed. St. Louis: Mosby, 1996.
ORGANIZATIONS
American Academy of Ophthalmology. 655 Beach Street, PO
Box 7424, San Francisco, CA 94120-7424. <http://www.eyenet.org>.KEY TERMS
Allergen —A substance capable of inducing an
allergic response.
Allergic reaction —An immune system reaction to
a substance in the environment; symptomsinclude rash, inflammation, sneezing, itchy wateryeyes, and runny nose.
Conjunctiva —The mucous membrane that covers
the white part of the eyes and lines the eyelids.


In [12]:
prompt_template = """
Use the following information to answer the user's question.
If you don't know the answer, you can ask for more information.

Context: {context}
Question: {question}

Only respond with the information that is relevant to the user's question.
Answer:
"""

In [19]:
prompt=PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [20]:
llm=CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin", 
                  model_type="llama", 
                  config={'max_new_tokens':512,
                          'temperature':0.8
                        })

In [23]:
rag_chain = (
    {"context": docsearch.as_retriever(search_kwargs={'k':2}), "question": RunnablePassthrough()}
    | prompt
    | llm 
    | StrOutputParser()
)

In [24]:
rag_chain.invoke("Tell me about allergies")

Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (512).
Number of tokens (528) exceeded maximum context length (512).
Number o

'Allergy is an immune system reaction to a substance in the environment. Symptoms include rash, inflammation, sneezing, itchy watery eyes, and runny nose. Allergens are foreign substances that can induce an allergic response. Examples of this includes mites, such as they can include mites, such as they can be ith. Common allergies, such as well known as they include dust or immune. Examples include dust or cause the body. Examples of any time. Examples of some common ones when in some examples include pollen and are class=" Aller, such as they include mites. Common allergensuring to environmental medicine is something in a substances. Conj and may include mite, such as well-a '