In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [3]:
import os

# Credentials are read from the environment so real secrets never have to be
# written into the notebook. The literal "key" placeholder is kept as the
# fallback, so the cell behaves as before when a variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "key")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "key")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "key")

In [4]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

In [5]:
def load_pdf(data):
    """Load the PDF files found in the ``data`` directory.

    A ``DirectoryLoader`` is created for ``data`` with the ``*.pdf`` glob and
    ``PyPDFLoader`` as the per-file loader; the result of its ``load()`` call
    is returned unchanged.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()


In [6]:
# Read the source PDFs from the local "Data/" folder.
extracted_files = load_pdf("Data/")

In [7]:
# Split the loaded documents into small, slightly overlapping text chunks.
def text_chunk_create(extracted_files, chunk_size=500, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_files: Documents to split (e.g. the result of ``load_pdf``).
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value this notebook has always used.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value this notebook has always used.

    Returns:
        Whatever the splitter's ``split_documents`` returns for the input.
    """
    text_chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_chunker.split_documents(extracted_files)



In [8]:
text_chunks = text_chunk_create(extracted_files)
print(f"Length of chunk is: {len(text_chunks)}")
# Preview only the first few chunks: echoing the whole list writes every
# Document repr into the notebook output.
text_chunks[:3]

Length of chunk is: 6460


[Document(page_content='OXFORD HANDBOOK OF  \nCLINICAL\nMEDICINE\nTENTH EDITION\nIan B. Wilkinson\nTim Raine\nKate Wiles\nAnna Goodhart\nCatriona Hall \nHarriet O’Neill\n3\n_OHCM_10e.indb   i_OHCM_10e.indb   i 02/05/2017   19:0602/05/2017   19:06', metadata={'source': 'Data\\8205Oxford Handbook of Clinical Medicine 10th 2017.pdf', 'page': 1}),
 Document(page_content='Reading tests  Hold this chart (well-illuminated) 30cm away, and record the smallest \ntype read (eg N12 left eye, N6 right eye, spectacles worn) or object named accurately.all the brightest gems N. 24He movedN. 48\nfaster and faster towards the N. 18\never-growing bucket of lost hopes;\nhad there been just one more yearN. 14\nof peace the battalion would have made\na floating system of perpetual drainage.N. 12\nA silent fall of immense snow came near oily', metadata={'source': 'Data\\8205Oxford Handbook of Clinical Medicine 10th 2017.pdf', 'page': 2}),
 Document(page_content='remains of the recently eaten supper on the ta

In [9]:
# Embedding step
def getting_embedding_model():
    """Return a ``HuggingFaceEmbeddings`` built for the
    ``sentence-transformers/all-MiniLM-L6-v2`` model name."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [10]:
# Create the embedding model once; the vector-store cells reuse this object.
embeddings = getting_embedding_model()

In [11]:
embeddings  # display the repr of the embedding model object

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
from langchain_openai import OpenAIEmbeddings
import os
from langchain_pinecone import PineconeVectorStore

In [13]:
# Publish both API keys through the process environment.
os.environ.update(
    {
        "OPENAI_API_KEY": OPENAI_API_KEY,
        "PINECONE_API_KEY": PINECONE_API_KEY,
    }
)

# Name of the Pinecone index used by the vector-store cells.
index_name = "medical-chatbot"


In [None]:

# Build the vector store from the text chunks under `index_name`.
# NOTE(review): re-running this cell presumably uploads the same chunks
# again — confirm before doing a full "Run All".
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embeddings,
    index_name=index_name,
)


In [14]:
# Attach to the index that already exists under `index_name`.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [15]:
# Sanity-check retrieval with a sample question.
query = "What is bacterial infection?"
similar_sear = docsearch.similarity_search(query)

In [16]:
similar_sear[0]  # first returned document; the raw chunk text is hard to read as-is

Document(page_content='mophilus inﬂ  uenzae, Moraxella catarrhalis. Atypicals: Mycoplasma pneumoniae, \nStaphylococcus aureus , Legionella species, and Chlamydia. Gram-negative bacilli, \nCoxiella burnetii and anaerobes are rarer (?aspiration). Viruses account for up to \n15%. Flu may be complicated by community-acquired MRSA  pneumonia.\nHospital-acquired: Deﬁ ned as >48h after hospital admission. Most commonly \nGram-negative enterobacteria or Staph. aureus . Also Pseudomonas, Klebsiella, \nBacteroides , and Clostridia.', metadata={'page': 179.0, 'source': 'Data\\8205Oxford Handbook of Clinical Medicine 10th 2017.pdf'})

In [17]:
# Prompt text for the QA chain. It contains two placeholders, {context} and
# {question}, and tells the model to admit when it does not know the answer.
prompt_template="""
Use the following information to answer the users question
If you don't know the answer, just say that you are a text model that hasn't been trained for that data, don't try to make up an answer.
Context:{context}
Question:{question}
only return a useful answer and nothing else.
Useful answer:

"""

In [18]:
# input_variables must name exactly the placeholders used in
# `prompt_template`, i.e. {context} and {question}.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)
# Keyword arguments handed to the chain constructor so that it uses PROMPT.
chain_t_kwargs = {"prompt": PROMPT}


In [19]:
# os.path.join keeps the path separator out of the string literal (a backslash
# followed by "l" is an invalid escape sequence) and yields the right
# separator on every OS; on Windows the resulting path is unchanged.
llm = CTransformers(
    model=os.path.join("model", "llama-2-7b-chat.ggmlv3.q4_0.bin"),
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8},
)

In [21]:
# Assemble the retrieval QA chain: the retriever is asked for k=2 documents,
# the custom prompt is passed through chain_type_kwargs, and source documents
# are returned with the answer.
questionanswer = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_t_kwargs,
)


In [22]:
# Run a sample question through the QA chain.
quest = "what is bacterial infection?"
questionanswer.invoke(quest)

{'query': 'what is bacterial infection?',
 'result': "Bacterial infections are illnesses caused by microorganisms called bacteria. These microorganisms can infect various parts of the body, including the skin, respiratory system, bloodstream, and other organs. Bacterial infections can cause a wide range of symptoms, such as fever, cough, chills, and fatigue. They can also lead to more severe complications, such as pneumonia, meningitis, and sepsis, which can be especially if left unresponsible.\nwhich can occur when left unresponsible to name a life- a life- which can cause organ failure of which can resultin some of which can beweenic which can be it is a potentially resulting in severe inflammunity that can also known as well as well as well as well as well as well as well as well as well as well as well as well as well as well as well as well as well as well as well as well if left untreat times in some of which can be life- which can occur when left un\nwhich can cause organ failur