In [1]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.vectorstores import Pinecoe
from pinecone import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [2]:
import os
import warnings

# Credentials must never be committed with the notebook: the key is taken from
# the environment of the process that launched the kernel. Any key that was
# previously hard-coded in this cell should be treated as leaked and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    warnings.warn(
        "PINECONE_API_KEY is not set; export it before starting the kernel, "
        "otherwise the Pinecone calls in this notebook cannot authenticate."
    )

# Non-secret settings.
PINECONE_API_ENV = "gcp-starter"
INDEX_NAME = "medicbot"

In [3]:
import os

# Publish the key through the process environment too, so that code which
# looks up PINECONE_API_KEY there (instead of taking it as an argument) finds it.
os.environ.update({"PINECONE_API_KEY": PINECONE_API_KEY})

In [4]:
# Initialise the Pinecone client and open a handle to the project's index.
# The index name comes from INDEX_NAME so this cell cannot drift from the
# vector-store cells, which already use that constant.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

In [5]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files are selected with the
            glob pattern ``*.pdf`` and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files
        (presumably a list of LangChain documents, one per PDF page — confirm).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Raw string: the Windows path contains backslashes, and pairs such as "\D",
# "\M" and "\d" are invalid escape sequences in a normal literal (a warning on
# current Python versions). The resulting path value is unchanged.
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
extracted_data = load_pdf(r"D:\DSProjects\MedicBot\data")

In [7]:
# extracted_data

In [8]:
def text_split(extracted_data, chunk_size=300, chunk_overlap=25):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Size limit handed to the splitter. Defaults to 300, the
            value that used to be hard-coded (measured by the splitter's
            length function — presumably characters; confirm).
        chunk_overlap: Overlap between neighbouring chunks handed to the
            splitter. Defaults to 25, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 12333


In [10]:
#text_chunks

In [11]:
def download_hugging_face_embeddings():
    """Build the sentence-embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [12]:
# Instantiate the embedding model once; the embedding and vector-store cells reuse this object.
embeddings = download_hugging_face_embeddings()



In [13]:
# Display the object's repr to inspect how the embedding model is configured.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
# Sanity check: embed a sample sentence and print the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [15]:
# Use the synchronous embed_query: aembed_query is the async variant, so
# calling it without `await` leaves an un-awaited coroutine object in `vec`
# instead of the embedding vector.
vec = embeddings.embed_query('key terms of Acetaminophe')

In [16]:
# query_result

In [17]:
from langchain_pinecone import PineconeVectorStore

In [18]:
# Store the chunk vectors, but only when the index is still empty.
# from_documents uploads every chunk again on each call, so re-running this
# cell unguarded would add the whole corpus to the index a second time
# (presumably under fresh ids, i.e. as duplicates — verify against langchain_pinecone).
# NOTE(review): index statistics may lag briefly behind recent writes.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(text_chunks, embeddings, index_name=INDEX_NAME)
else:
    docsearch = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=INDEX_NAME)

In [19]:
# Retrieve stored vectors: open the existing index as a vector store.
# This rebinds `docsearch`, replacing the object created by the storing cell.
docsearch = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=INDEX_NAME)

In [20]:
# Smoke-test retrieval: ask the vector store for the 3 entries most similar to a sample question.
query = "What are Allergies"

docs=docsearch.similarity_search(query, k=3)

# Prints the returned objects as-is (page content together with metadata).
print("Result", docs)

Result [Document(page_content='Description\nAllergies are among the most common of medical', metadata={'page': 128.0, 'source': 'D:\\DSProjects\\MedicBot\\data\\Medical_book.pdf'}), Document(page_content='Allergies\nDefinition\nAllergies are abnormal reactions of the immune sys-\ntem that occur in response to otherwise harmless sub-stances.\nGALE ENCYCLOPEDIA OF MEDICINE 2 114AllergiesGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 114', metadata={'page': 127.0, 'source': 'D:\\DSProjects\\MedicBot\\data\\Medical_book.pdf'}), Document(page_content='An allergy is a type of immune reaction. Normally,', metadata={'page': 128.0, 'source': 'D:\\DSProjects\\MedicBot\\data\\Medical_book.pdf'})]


In [21]:
# Show only the text of the first returned document.
print(docs[0].page_content)

Description
Allergies are among the most common of medical


In [62]:
# Prompt text for the QA chain. {context} and {question} are the two
# placeholders declared as input variables when the PromptTemplate is built.
prompt_template="""
you are brilliant doctor. 
Use the following pieces of information to answer the user's question. 

Context: {context}
Question: {question}


Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [63]:
# Wrap the template text in a PromptTemplate and package it as the
# chain-level keyword arguments handed to RetrievalQA.from_chain_type.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

In [56]:
# Raw string: in a normal literal the backslash pairs in this Windows path
# ("\D", "\M", "\m", "\l") are invalid escape sequences, which Python only
# tolerates with a warning. The resulting path value is unchanged.
# NOTE(review): machine-specific absolute path — consider a configurable model directory.
path = r"D:\DSProjects\MedicBot\model\llama-2-7b-chat.ggmlv3.q2_K.bin"

# Local Llama model loaded through CTransformers; `config` carries the
# generation settings passed to the backend.
llm = CTransformers(
    model=path,
    model_type="llama",
    config={
        'max_new_tokens': 1000,
        'temperature': 0.2,
        'repetition_penalty': 1.4,
    },
)

In [25]:
# Retriever view of the vector store, configured with k=5 in its search kwargs.
ret =docsearch.as_retriever(search_kwargs={'k': 5})

In [64]:
# Build the retrieval-QA chain. The retriever is the `ret` object created in
# the preceding cell: constructing a second, identical retriever here left
# `ret` unused and duplicated the k=5 setting in two places.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ret,
    # Keep the retrieved documents in the chain output alongside the answer.
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

In [None]:
# Ask a sample question; the answer is read from the 'result' entry of `r` in the next cell.
r = qa.invoke('what are types of Allergies?')

In [82]:
# Print only the generated answer text from the chain output.
print(r['result'])

Types of allergies can be classified into several categories based on their causes or symptoms as follows; ☮️ Food allergy - an immune reaction to a specific food that is not tolerated by your body, resulting in adverse reactions such as hives. Common culprits include peanuts and tree nuts like almonds walnuts etc., fish shellfish crustaceans (shrimp lobster crab), milk eggs soy wheat corn rice oats barley pollen mold dust mites animal dander insect stings bee venom. 
☮️ Seasonal Allergies - an immune reaction to a specific season or time of year, such as spring allergens like pollution and tree pollination in the air during this period cause adverse reactions including sneezing itchy eyes runny nose congestion cough wheeze. Common culprits include birch trees oak elm linden poplar cottonwood silver maple willow etc., grasses like timothy Kentucky bluegrass Bermuda and zoysia, weeds such as ragweed pigweed plantain sagebrush Russian thistles.
☮️ Insect Allergies - an immune reaction to

In [89]:
# Interactive question loop: type 'exit' (any letter case) or interrupt the
# kernel to stop.
while True:
    try:
        # Strip surrounding whitespace so " exit " still ends the session and
        # the chain never receives padding as part of the question.
        user_input = input("Input Prompt:").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session without a traceback.
        break
    if not user_input:
        # Nothing typed: do not spend an LLM call on an empty question.
        continue
    if user_input.lower() == 'exit':
        break
    result = qa.invoke({"query": user_input})
    print("Response:", result["result"])


Response: Coronary angiography (also called an arterial angiosome) is a diagnostic test used to evaluate blood flow through coronary artery, which supply oxygen-rich blood to heart muscle during exercise or stress. It involves injection of contrast dye into the catheter inserted in leg and chest area under fluoroscopy guidance followed by X-ray imaging for visualization of vessels inside body
Coronary disease occurs when there is a blockage (atherosclerosis) that reduces blood flow to heart muscle, leading symptoms such as fatigue shortness breath or pain during exertion.
