In [3]:
# cell 1: show the current working directory (expected to be the research/ folder)
%pwd

'c:\\Users\\Tolamo\\Chatbot-Medical-Diagnosis\\research'

In [5]:
# cell 2
import os
from pathlib import Path

# Step up to the project root only while the kernel is still inside research/.
# An unconditional os.chdir("../") climbs one more level every time this cell
# is re-run, which silently breaks the relative "Data/" path used further down.
if Path.cwd().name == "research":
    os.chdir("../")

In [6]:
# cell 3: confirm the working directory is now the project root
%pwd

'c:\\Users\\Tolamo\\Chatbot-Medical-Diagnosis'

In [7]:
# cell 4
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging

# Module-level logger used by the helper functions defined in this notebook.
logger = logging.getLogger(__name__)
# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [8]:
#cell 5
#Extract Data from PDF files
def load_pdf_file(data_path, glob="*.pdf"):
    """Load the PDF files found in ``data_path`` into LangChain documents.

    Files are discovered with ``DirectoryLoader`` and parsed with
    ``PyPDFLoader``. Loading is best-effort: any exception is logged (with its
    traceback) and an empty list is returned instead of propagating, so the
    caller must check for an empty result.

    Args:
        data_path: Directory that contains the PDF files.
        glob: File pattern handed to ``DirectoryLoader``. Defaults to
            ``"*.pdf"``, the pattern this notebook has always used.

    Returns:
        The list returned by ``loader.load()``, or ``[]`` if loading failed.
    """
    try:
        loader = DirectoryLoader(
            path=data_path,
            glob=glob,
            loader_cls=PyPDFLoader,
            show_progress=True,  # progress bar while the files are parsed
        )
        documents = loader.load()
    except Exception as e:
        # logger.exception keeps the traceback; the plain message alone is
        # rarely enough to tell a bad path from a corrupt PDF.
        logger.exception("Failed to load PDFs: %s", e)
        return []

    if documents:
        logger.info(f"Successfully loaded {len(documents)} medical documents")
    else:
        # An empty result is almost certainly a wrong path or pattern, so do
        # not report it as a success.
        logger.warning("No documents found in %s matching %r", data_path, glob)
    return documents

In [12]:
# cell 6
# Load every PDF under Data/ (a path relative to the project root).
extracted_data = load_pdf_file(data_path="Data/")

# load_pdf_file returns [] when loading fails, so stop here with a clear
# message instead of letting the later cells split and embed an empty corpus.
if not extracted_data:
    raise RuntimeError(
        "No documents were loaded from 'Data/'; check the working directory "
        "and the log output of load_pdf_file."
    )

100%|██████████| 1/1 [00:16<00:00, 16.59s/it]
INFO:__main__:Successfully loaded 409 medical documents


In [13]:
# cell 7
#Split the data into smaller chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split (the list produced by
            ``load_pdf_file``).
        chunk_size: Maximum chunk length passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the previously hard-coded value.

    Returns:
        The list of chunk documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [14]:
#cell 8
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print(f"Total number of chunks: {len(text_chunks)}")

Total number of chunks: 1733


In [15]:
# cell 9
from langchain.embeddings import HuggingFaceEmbeddings

def download_huggingface_embeddings():
    """Build the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )


In [16]:
# cell 10
# Instantiate the embedding model once; the indexing cells below reuse `embeddings`.

embeddings = download_huggingface_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [17]:
#cell 11
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the keys are read.
load_dotenv()

# Either value is None when the corresponding variable is not defined.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

In [18]:
# cell 12
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"
# NOTE(review): presumably the vector size of all-MiniLM-L6-v2 — confirm if the
# embedding model is ever changed.
dimension = 384

# Create the serverless index on the first run only; later runs reuse it.
if index_name in pc.list_indexes().names():
    print(f"Index {index_name} already exists")
else:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Created new index: {index_name}")

Index medicalbot already exists


In [19]:
# cell 13
import os

# Re-export the credentials through os.environ. os.environ only accepts str
# values, so assigning a missing key (None) would fail with an opaque
# "TypeError: str expected, not NoneType"; check first and name the variable.
for _var_name, _var_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("HUGGINGFACE_API_TOKEN", HUGGINGFACE_API_TOKEN),
):
    if not _var_value:
        raise EnvironmentError(
            f"{_var_name} is not set; define it in the environment or in the .env file"
        )
    os.environ[_var_name] = _var_value

In [20]:
# cell 14 - Optimized Document Embedding
import hashlib

from langchain_pinecone import PineconeVectorStore
from tqdm.auto import tqdm
import time

# Deterministic vector ids derived from each chunk's source, page and text.
# When no ids are supplied the vector store assigns fresh ones on every call,
# so each re-run of this cell stored another full copy of the corpus and the
# retriever started returning the same passage several times. With stable ids
# a re-run overwrites the existing vectors instead of adding new ones.
chunk_ids = [
    hashlib.sha256(
        f"{doc.metadata.get('source')}|{doc.metadata.get('page')}|{doc.page_content}".encode("utf-8")
    ).hexdigest()
    for doc in text_chunks
]

try:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        index_name=index_name,
        batch_size=100,
        namespace="medical_knowledge",
        ids=chunk_ids,
    )
    print("Successfully embedded all chunks!")

except Exception as e:
    print(f" Embedding failed: {str(e)}")

    # Best-effort fallback: keep a local FAISS copy so the embedding work is
    # not lost when the Pinecone upload fails.
    from langchain_community.vectorstores import FAISS
    faiss_store = FAISS.from_documents(text_chunks, embeddings)
    faiss_store.save_local("medical_chatbot_fallback")
    print("Saved embeddings locally as fallback")

Successfully embedded all chunks!


In [21]:
# cell 15 - Enhanced Pinecone Index Loading and Upsert
import hashlib

from langchain_pinecone import PineconeVectorStore
from tqdm.auto import tqdm
import time

# NOTE(review): cell 14 uploads with namespace="medical_knowledge", while no
# namespace is passed here, so this store (and the retriever built from it)
# may be reading a different namespace — confirm which one is intended.

try:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings
    )
    print(f" Successfully connected to Pinecone index '{index_name}'")

    index_stats = docsearch._index.describe_index_stats()
    print(f"Index contains {index_stats['total_vector_count']} vectors")

    batch_size = 100
    failed_chunks = []

    for i in tqdm(range(0, len(text_chunks), batch_size),
                desc="Embedding medical chunks"):
        batch = text_chunks[i:i + batch_size]
        # Stable ids (hash of source, page and text) make the upload
        # idempotent: re-running this cell overwrites the same vectors instead
        # of inserting another copy of every chunk under a new id.
        batch_ids = [
            hashlib.sha256(
                f"{doc.metadata.get('source')}|{doc.metadata.get('page')}|{doc.page_content}".encode("utf-8")
            ).hexdigest()
            for doc in batch
        ]
        try:
            docsearch.add_documents(batch, ids=batch_ids)
        except Exception as e:
            # Keep going: one failed batch should not abort the whole upload.
            print(f" Failed on batch {i//batch_size}: {str(e)}")
            failed_chunks.extend(batch)

    if failed_chunks:
        print(f" Failed to embed {len(failed_chunks)} chunks")

        # Persist only the chunks that could not be uploaded.
        from langchain_community.vectorstores import FAISS
        faiss_store = FAISS.from_documents(failed_chunks, embeddings)
        faiss_store.save_local("failed_medical_embeddings")
        print(" Saved failed chunks to local FAISS storage")

except Exception as e:
    # Connecting to the index (or reading its stats) failed: fall back to a
    # local FAISS store that holds every chunk.
    print(f" Critical error: {str(e)}")
    from langchain_community.vectorstores import FAISS
    faiss_store = FAISS.from_documents(text_chunks, embeddings)
    faiss_store.save_local("full_medical_knowbase")
    print(" All chunks saved to local FAISS storage")

 Successfully connected to Pinecone index 'medicalbot'
Index contains 8665 vectors


Embedding medical chunks:   0%|          | 0/18 [00:00<?, ?it/s]

In [22]:
# cell 16
# Wrap the vector store in a similarity retriever that returns 3 results per query (k=3).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [23]:
# cell 17
# Smoke-test the retriever with a sample question before wiring up the LLM.
retrieved_docs = retriever.invoke("What is the treatment for diabetes?")

In [48]:
# cell 18
# Display the retrieved Document objects (page content plus source/page metadata).
retrieved_docs

[Document(id='07b4ab43-f8c3-4c99-b756-d821da1f6304', metadata={'creationdate': '2025-01-30T11:41:24+00:00', 'creator': 'Chromium', 'moddate': '2025-01-30T11:41:24+00:00', 'page': 32.0, 'page_label': '33', 'producer': 'Skia/PDF m91', 'source': 'Data\\medbook.pdf', 'total_pages': 409.0}, page_content='dose aspirin, tramadol), or anticipate hypoglycaemia (e.g.\xa0administer quinine IV in a glucose infusion).\nIn diabetic patients:\nAvoid missing meals, increase intake of carbohydrates if necessary.\nAdjust dosage of insulin according to blood glucose levels and physical activity.\nAdjust dosage of oral antidiabetics, taking into account possible drug interactions.\nFootnotes'),
 Document(id='92c63da1-9647-47de-a3df-f4ce9569f26a', metadata={'creationdate': '2025-01-30T11:41:24+00:00', 'creator': 'Chromium', 'moddate': '2025-01-30T11:41:24+00:00', 'page': 32.0, 'page_label': '33', 'producer': 'Skia/PDF m91', 'source': 'Data\\medbook.pdf', 'total_pages': 409.0}, page_content='dose aspirin, t

In [41]:
# cell 19
from langchain_community.llms import HuggingFaceHub

# Hosted Llama 3.1 8B Instruct model used to generate the answers.
# model_kwargs carries the generation settings. The Hugging Face
# text-generation task names the output-length cap `max_new_tokens`;
# `max_tokens` is the OpenAI-style name, so the intended 500-token limit was
# not being expressed with the parameter this task expects.
llm = HuggingFaceHub(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    huggingfacehub_api_token=HUGGINGFACE_API_TOKEN,
    model_kwargs={"temperature": 0.4, "max_new_tokens": 500},
)

In [57]:
# cell 20
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from  langchain_core.prompts import ChatPromptTemplate

# System instructions for the assistant. The fragments below are joined by
# implicit string concatenation, so every sentence needs its own trailing
# space and every guideline its own newline; otherwise they run together
# (e.g. "Guidelines:Use simple language...Put URGENT symptoms...").
# {context} is the only template placeholder; do not add other braces here.
system_prompt = (
    "You are EDI and you are a compassionate medical assistant. "
    "You will be provided with relevant medical documents to answer the user's question. "
    "Start with brief empathy (e.g. 'I understand this is concerning...'). "
    "If you don't know the answer, just say that you don't know. "
    "End with clear next steps (e.g. 'You should consult a doctor if...'). "
    "Use three sentences maximum to answer the question and keep the answer concise.\n"
    "Guidelines:\n"
    "- Use simple language (8th grade level)\n"
    "- Put URGENT symptoms in ALL CAPS\n"
    "- Never diagnose, only suggest possibilities\n"
    "- If unsure: 'This requires professional evaluation'"
    "\n\n"
    "{context}"
)

# Two-message chat template: the system message carries the instructions and
# the {context} slot; the human message carries the user's question via {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [58]:
# cell 21
# Assemble the RAG pipeline: a documents chain built from `llm` and `prompt`,
# wrapped in a retrieval chain that is driven by `retriever`.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)