# Install Required Packages


In [3]:
!pip install ctransformers
!pip install sentence-transformers
!pip install pinecone-client
!pip install langchain
!pip install pypdf
!pip install tiktoken
!pip install pinecone[grpc]
!pip install langchain-pinecone
!pip install langchain-community


Collecting langchain-community
  Downloading langchain_community-0.2.17-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.17-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

# Import necessary libraries
- We import modules for document handling, embeddings, vector search, and language models.


In [4]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers


# Function to load PDFs from a directory
- This function loads all PDF documents from the specified directory.


In [5]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

# Load PDF data
extract_data = load_pdf("/content/")
extract_data


[Document(metadata={'source': '/content/Medical_Records.pdf', 'page': 0}, page_content='429 © The Author(s) 2022\nS. Nundy et al., How to Practice Academic Medicine and Publish from \nDeveloping Countries?, https://doi.org/10.1007/978-981-16-5248-6_45\n45Medical Records\n‘By computerising health records, we can avoid dangerous medical mistakes, reduce costs \nand improve care’—George W.\xa0Bush, former US President (1946–)\n45.1  What Are Medical Records?\nThe term of Medical Records is vast and vaguely defined. It encompasses, but is not \nnecessarily limited to, the admission sheet, history sheet, progress of a patient as \nwell as the charts of his or her vital parameters, intake–output data, medications \ngiven, referrals, and discharge summary. Medical certificates, birth certificates, \nMedico-legal case sheets also fall into this purview.\nThere is a lack of uniformity in the dimensions defined and encompassed by \nmedical records in the literature.\n45.2  Is There a\xa0Legal Ba

# Function to split the extracted data into chunks
- Splits large documents into smaller text chunks using RecursiveCharacterTextSplitter for easier processing.


In [6]:
def text_split(extract_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extract_data)
    return text_chunks

# Split the loaded PDF data
text_chunks = text_split(extract_data)
print('Text Chunks:', text_chunks)
print('Number of Chunks:', len(text_chunks))


Number of Chunks: 937


# Function to download embeddings from Hugging Face
- Retrieves a pre-trained embedding model for text queries.


In [7]:
def download_hugging_face_embeddings():
    embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embedding

# Download embeddings
embedding = download_hugging_face_embeddings()
embedding

# Test the embedding with a sample query
query_result = embedding.embed_query("Generative AI")
print("Query Embedding:", query_result)
print("Embedding Length:", len(query_result))


  embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Query Embedding: [-0.07440435141324997, -0.0594085194170475, 0.057583436369895935, 0.0006480728043243289, -0.02570318430662155, 0.022137867286801338, 0.0926656424999237, -0.01514868438243866, -0.005658809561282396, -0.041373610496520996, -0.034168630838394165, -0.0984947681427002, 0.05384870991110802, -0.028577709570527077, -0.03763485327363014, 0.04893640801310539, 0.045807134360075, 0.08285655081272125, -0.06154760718345642, -0.0737510472536087, -0.0027254601009190083, 0.06451704353094101, -0.017008081078529358, -0.03744758293032646, 0.046621304005384445, 0.09511931985616684, 0.02878565713763237, -0.028456008061766624, 0.08930286020040512, -0.09803933650255203, 0.04297880455851555, 0.06631941348314285, -0.018005480989813805, 0.02426755614578724, -0.07755503803491592, 0.07708828151226044, -0.11995290964841843, 0.05890905484557152, 0.059042736887931824, 0.019745366647839546, -0.012411638163030148, -0.016594918444752693, 0.02816745638847351, -0.062177807092666626, 0.06355020403862, 0.01

# Set up Pinecone vector store
- Configures the Pinecone vector store using the Pinecone API and embeds text chunks.


In [8]:
import os
from langchain.vectorstores import Pinecone
from dotenv import load_dotenv
load_dotenv()

# Set up environment variables
os.environ['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
index_name = 'medical-chatbot'

# Convert text chunks to list of page contents
texts = [t.page_content for t in text_chunks]

# Set up Pinecone vector store
vectorstore_from_texts = Pinecone.from_texts(
    texts,
    index_name=index_name,
    embedding=embedding
)


# Perform a similarity search
- Queries the vector store for similar documents using a search term.


In [9]:
# Initialize document search using Pinecone
docsearch = Pinecone.from_existing_index(index_name, embedding)

# Perform a similarity search
query = 'what is DNA'
docs = docsearch.similarity_search(query, k=3)
print("Search Results:", docs)


Search Results: [Document(page_content='organisms. Most DNA is located in the cell nucleus (where it is called nuclear DNA), but a small\namount of DNA can also be found in the mitochondria (where it is called mitochondrial DNA).\nDNA contains the code for building and maintaining an organism. The code is spelled out in\nthe order, or sequence, of four chemical bases—adenine (A), cytosine (C), guanine (G), and\nthymine (T)—in the same way that letters of the alphabet come together to form words,'), Document(page_content='organisms. Most DNA is located in the cell nucleus (where it is called nuclear DNA), but a small\namount of DNA can also be found in the mitochondria (where it is called mitochondrial DNA).\nDNA contains the code for building and maintaining an organism. The code is spelled out in\nthe order, or sequence, of four chemical bases—adenine (A), cytosine (C), guanine (G), and\nthymine (T)—in the same way that letters of the alphabet come together to form words,'), Document(

# Create a Prompt Template
- Sets up a prompt template for retrieving information based on context and user questions.


In [10]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
chain_type_kwargs = {'prompt': PROMPT}


# Set up the Language Model (Llama 2)
- Configures the CTransformers library to load the Llama-2-7B model for text generation.


In [11]:
llm = CTransformers(model='TheBloke/Llama-2-7B-Chat-GGML',
                    model_type='llama',
                    config={'max_new_tokens': 512, 'temperature': 0.8})


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

llama-2-7b-chat.ggmlv3.q2_K.bin:   0%|          | 0.00/2.87G [00:00<?, ?B/s]

# Build the QA Retrieval Chain
- Constructs a retrieval-based question-answering system using the document retriever and language model.


In [12]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)


# Interactive User Input for QA System
- Accepts user input in a loop and generates responses using the QA system.


In [None]:
while True:
    user_input = input("Input prompt: ")
    result = qa({'query': user_input})
    print("Response:", result['result'])


Input prompt: what is mitochondria


  result = qa({'query': user_input})


Response: Mitochondria are organelles found inside cells that play a crucial role in energy production. They generate energy by breaking down nutrients, such as proteins, fats, and carbohydrates, into ATP (adenosine triphosphate), the primary source of energy for cells.

Please note: The mitochondria is a type of organelle found inside cells that play an important role in generating energy from nutrients. It has its own DNA, known as mtDNA, and these mutations can have consequences on our health.
Input prompt: which is called the power of house of the cell
Response: The power of the cell is the ability to divide, grow, and differentiate into specialized cells.
Input prompt: which part of the human cell is known as power house of the cell ?
Response: The "powerhouse" of the cell is called the mitochondria.
