In [31]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.llms import CTransformers
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore


In [2]:

import os
from getpass import getpass

# Credentials must not be stored in the notebook. The Pinecone API key is taken
# from the PINECONE_API_KEY environment variable; if it is not set, the user is
# prompted for it (getpass does not echo the value into the cell output).
# NOTE(review): the key that used to be hard-coded here has been exposed in the
# file history and should be revoked/rotated.
if not os.environ.get("PINECONE_API_KEY"):
  os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

# Name of the Pinecone index used by the cells that follow.
index_name = "medical-chatbot"

In [3]:
# Extract data from PDF files.
def load_pdf(data):
  """Load every file matching ``*.pdf`` in the directory ``data``.

  Args:
    data: Directory path handed to ``DirectoryLoader``.

  Returns:
    The result of ``DirectoryLoader(...).load()``, with each matching file
    read through ``PyPDFLoader``.
  """
  return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [4]:
# Data extraction: run load_pdf on the relative "data/" directory and keep the
# loaded documents for the splitting step.
extracted_data = load_pdf("data/")

In [5]:
# Data split and chunk creation.
def text_splitter(extracted_data, chunk_size=500, chunk_overlap=20):
  """Split the loaded documents into smaller chunks.

  Args:
    extracted_data: Documents to split; passed unchanged to
      ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to 500,
      the value that was previously hard-coded.
    chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
      to 20, the value that was previously hard-coded.

  Returns:
    The chunks produced by ``split_documents`` for ``extracted_data``.
  """
  splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return splitter.split_documents(extracted_data)

In [6]:
# Split the extracted documents into chunks; text_chunks is what later gets
# embedded and uploaded to the Pinecone index.
text_chunks = text_splitter(extracted_data)

In [7]:
# Build the embedding model used for both indexing and querying.
def download_embedding_model():
  """Return a ``HuggingFaceEmbeddings`` object for all-MiniLM-L6-v2.

  Returns:
    A ``HuggingFaceEmbeddings`` instance constructed with
    ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
  """
  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [8]:
# Embedding model object, shared by the Pinecone upload and the query cells.
embeddings = download_embedding_model()

In [9]:
# Display the embedding object's repr to confirm which model and settings were loaded.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
# Smoke test for the embedding model: embed a short string and display the vector.
# NOTE(review): this echoes the entire vector into the notebook output; showing
# len(query_result) and a short slice would keep the output readable.
query_result = embeddings.embed_query("hello world")
query_result

[-0.03447728976607323,
 0.03102322667837143,
 0.006735010538250208,
 0.026108987629413605,
 -0.03936200588941574,
 -0.16030244529247284,
 0.06692402064800262,
 -0.0064414627850055695,
 -0.047450508922338486,
 0.01475893147289753,
 0.07087531685829163,
 0.05552755668759346,
 0.01919335313141346,
 -0.026251355186104774,
 -0.010109529830515385,
 -0.026940520852804184,
 0.02230735309422016,
 -0.0222266037017107,
 -0.1496925950050354,
 -0.01749310828745365,
 0.007676230277866125,
 0.05435226112604141,
 0.0032544718123972416,
 0.031725943088531494,
 -0.08462142199277878,
 -0.02940600924193859,
 0.05159568786621094,
 0.04812406748533249,
 -0.00331477215513587,
 -0.05827920883893967,
 0.041969284415245056,
 0.02221072092652321,
 0.128188818693161,
 -0.022338923066854477,
 -0.011656236834824085,
 0.06292837113142014,
 -0.03287626430392265,
 -0.09122605621814728,
 -0.03117534890770912,
 0.05269956588745117,
 0.04703482612967491,
 -0.0842030718922615,
 -0.030056139454245567,
 -0.02074483036994934

In [32]:
# Initializing Pinecone.
# The API key is read from the PINECONE_API_KEY environment variable rather than
# being written into the notebook; a key that has been committed to source
# should be revoked/rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# Use the shared index_name instead of repeating the index name literal.
index = pc.Index(index_name)
# Create embeddings for each of the text chunks and store them in the Pinecone index.
# NOTE(review): each run of this cell uploads all chunks again — presumably this
# duplicates vectors in the index on re-runs; confirm and guard if so.
docsearch = PineconeVectorStore.from_texts(
  [t.page_content for t in text_chunks],
  embeddings,
  index_name=index_name,
)

In [33]:
# Testing retrieval with a sample query.
# Rebind docsearch to a vector store attached to the already-populated index,
# so this cell does not need the upload above to be re-run.
docsearch = PineconeVectorStore.from_existing_index(index_name,embeddings)
query = "what are allergies"
# Ask for the 3 most similar chunks and print them for manual inspection.
doc = docsearch.similarity_search(query,k=3)
print(doc)

[Document(page_content='ORGANIZATIONS\nAmerican Academy of Ophthalmology. 655 Beach Street, PO\nBox 7424, San Francisco, CA 94120-7424. <http://www.\neyenet.org>.KEY TERMS\nAllergen —A substance capable of inducing an\nallergic response.\nAllergic reaction —An immune system reaction to\na substance in the environment; symptoms\ninclude rash, inflammation, sneezing, itchy watery\neyes, and runny nose.\nConjunctiva —The mucous membrane that covers\nthe white part of the eyes and lines the eyelids.'), Document(page_content='Although environmental medicine is gaining more\nrespect within conventional medicine, detoxificationKEY TERMS\nAllergen —A foreign substance, such as mites in\nhouse dust or animal dander, that when\ninhaled,causes the airways to narrow and pro-\nduces symptoms of asthma.\nAntibody —A protein, also called immunoglobu-\nlin, produced by immune system cells to remove\nantigens (the foreign substances that trigger the\nimmune response).\nFibromyalgia —A condition of debi

In [34]:
# Prompt text for the question-answering chain. {context} and {question} are
# placeholders; they are declared as the template's input variables where
# PROMPT is built from this string.
prompt_template="""
Use the following piece of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up the answer.
Context: {context}
Question: {question}

Only return the helpful answer below nothing else.
Helpful answer:
"""

In [36]:
# Wrap the raw prompt text in a PromptTemplate with its two placeholders declared.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"])
# Extra keyword arguments passed to RetrievalQA.from_chain_type as chain_type_kwargs.
chain_type_kwargs = {"prompt":PROMPT}

In [42]:
# Loading our model: a model file referenced by a relative path, opened through
# CTransformers with model_type "llama".
# The config limits generation to 512 new tokens and sets temperature to 0.8.
llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type = "llama",
                    config = {'max_new_tokens':512,
                              'temperature':0.8})

In [46]:
# Build the retrieval question-answering chain from the LLM, the Pinecone-backed
# retriever and the custom prompt.
# The retriever is configured with k=2 (the manual similarity test earlier used k=3),
# and return_source_documents=True is requested alongside the answer.
qa = RetrievalQA.from_chain_type(
  llm = llm,
  chain_type = "stuff",
  retriever = docsearch.as_retriever(search_kwargs = {'k' : 2}),
  return_source_documents = True,
  chain_type_kwargs = chain_type_kwargs
)

In [None]:
# Interactive question/answer loop.
# Type "exit" or "quit" (or interrupt the kernel / send EOF) to leave the loop;
# without an exit path the cell would never finish.
while True:
  try:
    user_input = input("Input Prompt:")
  except (EOFError, KeyboardInterrupt):
    # No more input is available or the user interrupted: stop cleanly.
    break
  question = user_input.strip()
  if question.lower() in {"exit", "quit"}:
    break
  if not question:
    # Skip blank lines instead of sending an empty query to the chain.
    continue
  result = qa.invoke({"query" : user_input})
  print("response: ",result["result"])