In [3]:
%pwd


'/home/bit/Medical-assistance-chatbot/research'

In [4]:
import os

# The notebook lives in research/; step up to the project root so relative
# paths such as 'Data/' and 'cached_data.joblib' resolve. The guard keeps the
# cell idempotent: re-running it no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [5]:
%pwd

'/home/bit/Medical-assistance-chatbot'

In [6]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
def load_pdf_file(data, glob="*.pdf"):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.
        glob: Filename pattern handed to ``DirectoryLoader``. Defaults to
            ``"*.pdf"`` so only names with a ``.pdf`` extension match (the
            earlier ``"*pdf"`` pattern also matched any name that merely
            ended in the letters "pdf").

    Returns:
        The documents returned by ``DirectoryLoader.load()``, each file being
        parsed with ``PyPDFLoader``.
    """
    loader = DirectoryLoader(
        data,
        glob=glob,
        loader_cls=PyPDFLoader,
    )
    return loader.load()

In [41]:
# extracted_data = load_pdf_file(data = 'Data/')
# Uncomment and re-run the line above to reload the PDFs if the cache file is missing.

In [9]:
#extracted_data

In [42]:
# import joblib
# joblib.dump(extracted_data, 'cached_data.joblib')
# Uncomment the two lines above to cache the loaded documents to disk.

['cached_data.joblib']

In [43]:
import joblib
from pathlib import Path

# Reuse the loaded PDF documents when a cache file exists; otherwise load the
# PDFs once and write the cache, so a fresh "Restart & Run All" still works.
# NOTE(review): joblib.load unpickles arbitrary objects — only load a cache
# file that this notebook itself produced.
CACHE_PATH = Path('cached_data.joblib')
if CACHE_PATH.exists():
    extracted_data = joblib.load(CACHE_PATH)
else:
    extracted_data = load_pdf_file(data='Data/')
    joblib.dump(extracted_data, CACHE_PATH)

In [72]:
def text_split(extracted_data, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split; passed straight to
            ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 1000,
            the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            200, the value previously hard-coded here.

    Returns:
        The chunks returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return text_splitter.split_documents(extracted_data)

In [73]:
# Chunk the loaded documents and report how many pieces were produced.
text_chunks = text_split(extracted_data)
print(f"Length of text chunks:  {len(text_chunks)}")

Length of text chunks:  3424


In [75]:
from langchain.embeddings import HuggingFaceEmbeddings

In [76]:
def download_embedding_model():
    """Build the embedding model used to turn text chunks into vectors.

    Returns a ``HuggingFaceEmbeddings`` instance configured with
    'sentence-transformers/all-MiniLM-L6-v2', which yields 384-dimensional
    vectors.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

In [77]:
# Show which huggingface_hub version is installed in this kernel.
import huggingface_hub
print(huggingface_hub.__version__)


0.30.1


In [78]:
# Instantiate the embedding model once; it is reused for the sample query and the vector store.
embeddings = download_embedding_model()

In [79]:
# Sanity check: embed a short string and confirm the length of the vector.
query_results = embeddings.embed_query("Hello world")
print(f"Length of query results:  {len(query_results)}")

Length of query results:  384


In [80]:
query_results[:5]

[-0.03447728976607323,
 0.03102320060133934,
 0.006734994240105152,
 0.026108931750059128,
 -0.039361972361803055]

In [81]:
# Load environment variables via python-dotenv; the API keys are read with
# os.getenv in the following cells.
from dotenv import load_dotenv
load_dotenv()

True

In [82]:
# Read the Pinecone key from the environment; os.getenv returns None when it is unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')


In [83]:
# Fail early with a readable message instead of the TypeError that
# os.environ raises when it is handed None for a missing key.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )
# Exported for authentication (no key is passed explicitly to the vector store).
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [84]:
# Name of the Pinecone index used by the vector-store cells.
index_name = 'medicalbot'

In [85]:
# from pinecone import Pinecone, ServerlessSpec
# pc = Pinecone(api_key=PINECONE_API_KEY)

In [86]:


# pc.create_index(
#     name='medicalbot',
#     dimension=384,  # dimension of your embeddings
#     metric='cosine',
#     spec=ServerlessSpec(

#         cloud="aws",

#         region="us-east-1"

#     ) 
# )

In [87]:
# Embed each text chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): this assumes the 'medicalbot' index already exists (the
# create_index cell is commented out). Re-running this cell presumably
# upserts every chunk again under new ids — confirm before re-executing.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    index_name = index_name,
    embedding = embeddings
)

In [89]:
# Attach to the existing Pinecone index instead of uploading documents.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [90]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x743b23f9b5b0>

In [137]:
# Similarity search configured with k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [121]:
# Smoke-test the retriever with a sample question.
retrieved_docs = retriever.invoke("What is acne?")

In [122]:
retrieved_docs

[Document(id='66e3d95e-780a-4d52-9ce6-7e543e181a1c', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 47.0, 'page_label': '48', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Medical_book.pdf', 'total_pages': 637.0}, page_content='KEY TERMS\nAdenoma —A type of noncancerous (benign)\ntumor that often involves the overgrowth of certain\ncells found in glands.\nGland—A collection of cells that releases certain\nchemicals, or hormones, that are important to the\nfunctioning of other organs or body systems.\nHormone —A chemical produced in one part of\nthe body that travels to another part of the body in\norder to exert an effect.\nHypothalamus —A structure within the brain\nresponsible for a large number of normal functions\nthroughout the body, including regulating sleep,\ntemperature, eating, and sexual development. The\nhypothalamus also regulates the functions of the\npituitary gland by directing the pit

In [94]:
# NOTE(review): load_dotenv() was already called earlier in this notebook, so
# this repeat looks redundant unless the cell is run on its own.
from dotenv import load_dotenv
load_dotenv()

True

In [95]:
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
# Fail early with a readable message instead of the TypeError that
# os.environ raises when it is handed None for a missing key.
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your .env file or environment."
    )
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [129]:
import google.generativeai as genai

# Authenticate the client with the key read from the environment.
genai.configure(api_key=GEMINI_API_KEY)

# Print name, display name and supported methods of every model the key can see.
print("Available Models:")
for model in genai.list_models():
    print(
        f"Name: {model.name}",
        f"Display Name: {model.display_name}",
        f"Supported Generation Methods: {model.supported_generation_methods}",
        "---",
        sep="\n",
    )

Available Models:
Name: models/chat-bison-001
Display Name: PaLM 2 Chat (Legacy)
Supported Generation Methods: ['generateMessage', 'countMessageTokens']
---
Name: models/text-bison-001
Display Name: PaLM 2 (Legacy)
Supported Generation Methods: ['generateText', 'countTextTokens', 'createTunedTextModel']
---
Name: models/embedding-gecko-001
Display Name: Embedding Gecko
Supported Generation Methods: ['embedText', 'countTextTokens']
---
Name: models/gemini-1.0-pro-vision-latest
Display Name: Gemini 1.0 Pro Vision
Supported Generation Methods: ['generateContent', 'countTokens']
---
Name: models/gemini-pro-vision
Display Name: Gemini 1.0 Pro Vision
Supported Generation Methods: ['generateContent', 'countTokens']
---
Name: models/gemini-1.5-pro-latest
Display Name: Gemini 1.5 Pro Latest
Supported Generation Methods: ['generateContent', 'countTokens']
---
Name: models/gemini-1.5-pro-001
Display Name: Gemini 1.5 Pro 001
Supported Generation Methods: ['generateContent', 'countTokens', 'createC

In [130]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used to generate answers; the API key is passed explicitly.
llm = ChatGoogleGenerativeAI(
    google_api_key=GEMINI_API_KEY,
    model="gemini-1.5-pro-latest",
    max_output_tokens=500,
    temperature=0.4,
)

In [124]:
retriever.invoke("What is acne?")

[Document(id='3d8a7ad8-9ba7-4ad8-bc36-8b995b2f3c70', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Medical_book.pdf', 'total_pages': 637.0}, page_content='The goal of treating moderate acne is to decrease\ninflammation and prevent new comedone formation. One\neffective treatment is topical tretinoin along with a topical\nGALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='b5652f43-b3d6-4445-afaf-939cf6096fd2', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label'

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain_pinecone import Pinecone
from pinecone import Pinecone as PineconeClient



# Common question words that carry no topical signal; ignored when matching.
_STOPWORDS = frozenset({
    "a", "an", "and", "are", "about", "can", "do", "does", "for", "how",
    "in", "is", "it", "me", "of", "on", "or", "tell", "the", "to", "what",
    "when", "where", "which", "who", "why", "with",
})


def check_relevance(docs, query):
    """Return True when any document shares a meaningful word with the query.

    The query and each document are lower-cased and tokenised into words, so
    punctuation such as the "?" in "acne?" no longer blocks a match. Stopwords
    are dropped from the query, and matching is on whole words rather than
    substrings, so "is" no longer matches inside "this".

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        query: The user's question.

    Returns:
        False when ``docs`` is empty, every document is blank, or the query
        has no non-stopword terms; otherwise True if at least one query term
        occurs as a word in at least one document.
    """
    import re

    # Nothing retrieved, or only blank pages: cannot be relevant.
    if not docs or not any(doc.page_content.strip() for doc in docs):
        return False

    query_terms = {
        term
        for term in re.findall(r"\w+", query.lower())
        if term not in _STOPWORDS
    }
    if not query_terms:
        return False

    for doc in docs:
        doc_words = set(re.findall(r"\w+", doc.page_content.lower()))
        if query_terms & doc_words:
            return True
    return False

# Create a stricter prompt
# System message: answering rules followed by the {context} placeholder that
# the chain fills with the retrieved documents.
system_prompt = """You are a medical assistant for question-answering tasks. STRICTLY follow these rules:
1. Use ONLY the provided medical context to answer
2. If the context doesn't contain the answer, say: 'I don't have enough medical information to answer that.'
3. Never make up, infer, or add information not explicitly stated
4. If partial information exists, share ONLY what's explicitly mentioned
5. Never reference similar or related topics if not directly asked
6. Keep answers concise (2-3 sentences maximum)

Context: {context}"""

# Chat prompt: the system message carries the rules and the retrieved context,
# the human message carries the user's question.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Combine the LLM with the prompt, then put the retriever in front of it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)



# Example usage


In [150]:


# Example: a medical question; print only the generated answer.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?detailed explanation"})
print(response["answer"])



Acromegaly is a disorder marked by increased bone and soft tissue growth, along with other bodily disturbances, caused by the abnormal release of growth hormone (GH) from the pituitary gland.  In children whose growth plates haven't closed, this leads to exceptional long bone growth, a variant called gigantism, causing unusual height. When the abnormality occurs after bone growth stops, it's called acromegaly. It affects roughly 50 out of every 1 million people, impacting both men and women.  Symptoms develop gradually, often delaying diagnosis until middle age.  The body's processing and use of nutrients like fats and sugars is also altered. Without treatment, it can lead to premature death due to effects on the heart, lungs, brain, or colon cancer. Treatment can enable a normal lifespan.


In [151]:


# Example: a non-medical question, which rule 2 of system_prompt says to decline.
# NOTE(review): the recorded output does not match the refusal sentence in
# system_prompt — presumably from an earlier version of the chain; re-run to refresh.
response = rag_chain.invoke({"input": "What is statistics?"})
print(response["answer"])



I don't have any relevant information about that topic in my medical database.
