In [1]:
# Importing required libraries
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os

  from tqdm.autonotebook import tqdm


ModuleNotFoundError: No module named 'pwd'

In [14]:
# Pinecone configuration
# SECURITY: the API key is read from the environment instead of being hardcoded,
# so it never ends up in version control or in a shared notebook. A key that was
# previously committed in this cell must be treated as leaked and rotated.
# Set it before starting the kernel, e.g. `export PINECONE_API_KEY=...`.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    # Warn instead of raising so the cell still runs; the Pinecone calls further
    # down are expected to fail until the variable is provided.
    print("WARNING: PINECONE_API_KEY is not set in the environment.")
PINECONE_INDEX_NAME = "chatbot-medicine"

### Extract data and create embedding vectors

In [16]:
# Creating a loader to load pdf data
def load_pdf_data(data_path):
    """Load the PDF files matching ``*.pdf`` under ``data_path``.

    Args:
        data_path: Directory handed to ``DirectoryLoader`` as its ``path``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    # Each matched file is parsed with PyPDFLoader; a progress bar is shown
    # and multithreaded loading is enabled.
    loader_options = {
        "path": data_path,
        "glob": "*.pdf",
        "loader_cls": PyPDFLoader,
        "show_progress": True,
        "use_multithreading": True,
    }
    return DirectoryLoader(**loader_options).load()

In [17]:
# Load the PDFs from the relative "data" directory (resolved against the
# kernel's working directory).
pdf_docs = load_pdf_data("data")

100%|██████████| 1/1 [00:14<00:00, 14.40s/it]


In [18]:
# Spot-check a single loaded document.
# NOTE(review): index 50 looks like an arbitrary sample — TODO confirm.
pdf_docs[50]

Document(page_content='person who has stopped breathing (respiratory arrest)\nand/or whose heart has stopped (cardiac arrest).\nPurpose\nCPR is performed to restore and maintain breathing\nand circulation and to provide oxygen and blood flow tothe heart, brain, and other vital organs. CPR should beperformed if a person is unconscious and not breathing.Respiratory and cardiac arrest can be caused by allergicreactions, an ineffective heartbeat, asphyxiation, breath-ing passages that are blocked, choking , drowning, drug\nreactions or overdoses, electric shock, exposure to cold,severe shock, or trauma. CPR can be performed bytrained bystanders or healthcare professionals on infants,children, and adults. It should always be performed bythe person on the scene who is most experienced in CPR.\nPrecautions\nCPR should never be performed on a healthy person\nbecause it can cause serious injury to a beating heart byinterfering with normal heartbeats.\nDescription\nCPR is part of the emergency c

In [19]:
# Splitting the data into chunks
def get_text_chunks(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        data: Documents to split; passed straight to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    # Initialize the text splitter with the requested sizing
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Get the split text/chunks using split_documents
    return text_splitter.split_documents(data)

In [20]:
# Split the loaded documents into chunks, then display one chunk as a sanity check.
doc_chunks = get_text_chunks(pdf_docs)
doc_chunks[5]

Document(page_content='mation presented in this publication, the Gale Group neither guaranteesthe accuracy of the data contained herein nor assumes any responsibili-ty for errors, omissions or discrepancies. The Gale Group accepts nopayment for listing, and inclusion in the publication of any organiza-tion, agency, institution, publication, service, or individual does notimply endorsement of the editor or publisher. Errors brought to theattention of the publisher and verified to the satisfaction of the publish-er', metadata={'source': 'data\\medical-book.pdf', 'page': 2})

In [21]:
# Initializing the embedding model
def get_hudding_face_embedding():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    minilm_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=minilm_model_name)

In [22]:
# Instantiate the embedding model once so the same object can be reused by later cells.
minilm_embedding = get_hudding_face_embedding()

In [23]:
# Adding pinecone api key to script environment
# Exports the key as the PINECONE_API_KEY environment variable for this process;
# presumably the Pinecone client picks it up from there — verify against the library docs.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [24]:
# Creating a vector store in pinecone
# NOTE(review): this presumably embeds and uploads every chunk each time the
# cell runs — confirm whether re-running it duplicates vectors in the index.
vectorstore = PineconeVectorStore.from_documents(
    doc_chunks,
    embedding=minilm_embedding,
    index_name=PINECONE_INDEX_NAME,
)

In [45]:
# Sanity-check retrieval: run a similarity search for a sample question and
# display the documents the vector store returns.
ex_query = "What is melanoma"
similar_search = vectorstore.similarity_search(query=ex_query)
similar_search

[Document(metadata={'page': 646.0, 'source': 'data\\medical-book.pdf'}, page_content='Intraocular melanoma is a rare cancer overall, yet it\nis the most common eye cancer seen in adults. It is whencancer cells are found in the uvea of the eye. The uveaincludes the iris (the colored portion of eye), the ciliarybody (an eye muscle that focuses the lens) and thechoroid (found in the back of the eye next to the retina).Intraocular cancer of the iris usually grows slowly'),
 Document(metadata={'page': 20.0, 'source': 'data\\medical-book.pdf'}, page_content='layers of cells covering the body’s surface and lining theinternal organs and various glands). Ninety percent ofhuman cancers fall into this category. Carcinomas can besubdivided into two types: adenocarcinomas and squa-\nmous cell carcinomas. Adenocarcinomas are cancers thatdevelop in an organ or a gland, while squamous cell car-cinomas refer to cancers that originate in the skin.\n• Melanomas also originate in the skin, usually in the\

In [47]:
# Adding LLM prompt engineering
# The literal must open with exactly three quotes: a fourth quote becomes the
# first character of the prompt text itself. {context} and {question} are the
# placeholders filled in when the template is formatted.
llm_prompt = """
Use the following information to answer the given question.
If the answer is unknown, mention that you don't know the answer to the particular question
and prompt the user to ask any other question. Do not make up any answer by yourself.

Context: {context}
Question: {question}

Provide only related helpful answers
Answer:
"""

In [48]:
# Creating a prompt template
# The declared input variables match the two placeholders used in llm_prompt:
# {context} and {question}.
prompt_template = PromptTemplate(template=llm_prompt, input_variables=["context", "question"])
# Prompt wrapped in a dict under the "prompt" key; the name suggests it is meant
# to be supplied as the chain's `chain_type_kwargs` — TODO confirm it is wired in.
chain_type_kwargs = {"prompt": prompt_template}


In [49]:
# Instantiate llama-2 llm model
# The model path is built with os.path.join rather than a backslash literal:
# a backslash followed by "l" is an invalid escape sequence in a normal string
# (Python warns about it), and a hard-coded backslash separator only works on
# Windows. On Windows the resulting path is the same as before.
LLAMA_MODEL_PATH = os.path.join("model", "llama-2-7b-chat.ggmlv3.q4_0.bin")
llm = CTransformers(model=LLAMA_MODEL_PATH,
                    model_type="llama",
                    # Generation settings passed through to the model backend.
                    config={'max_new_tokens': 500,
                            'temperature': 0.8}
                    )

In [50]:
# Creating question-answering chain object
# chain_type_kwargs carries the custom prompt template; without passing it the
# chain uses its built-in default prompt and the instructions in llm_prompt
# never reach the model.
# NOTE(review): the ValidationError recorded for this cell ("Can't instantiate
# abstract class BaseRetriever") looks like a mismatch between installed
# langchain / langchain-core / langchain-pinecone versions — confirm the
# package versions in the environment.
qa_obj = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)

ValidationError: 1 validation error for RetrievalQA
retriever
  Can't instantiate abstract class BaseRetriever with abstract methods _aget_relevant_documents, _get_relevant_documents (type=type_error)

In [40]:
# Question answering session
# The loop needs an explicit way out: with no exit condition the cell can only
# be stopped by interrupting the kernel, and the notebook never completes a
# "Run All". Typing one of the exit commands or submitting an empty line ends
# the session.
EXIT_COMMANDS = {"exit", "quit", "q"}
print("Type 'exit' or 'quit' (or press Enter on an empty line) to end the session.")
while True:
    user_input = input("Ask your query related to general medicine and disease: ").strip()
    if not user_input or user_input.lower() in EXIT_COMMANDS:
        break
    # The chain takes the question under the "query" key and returns the
    # answer under "result".
    result = qa_obj({"query": user_input})
    print("Result: ", result['result'])

NameError: name 'qa_obj' is not defined