### Import Libraries

In [38]:
import os
import warnings
warnings.filterwarnings("ignore")
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import sentence_transformers
from dotenv import load_dotenv
load_dotenv(override=True)


True

In [2]:
os.chdir("..")
%pwd

'c:\\Users\\mhmdh\\AI-Medical-ChatBot'

In [3]:
# Extract data from pdf files
def load_pdf_file(data):
     loader = DirectoryLoader(path=data,
                              glob="*.pdf",
                              loader_cls=PyPDFLoader)
     
     document = loader.load()
     return document


extracted_data = load_pdf_file("Data/")

In [4]:
# extracted_data

In [5]:
# Split the data into chunks
def text_split(extracted_data):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
     text_chunks = text_splitter.split_documents(extracted_data)
     return text_chunks

text_chunks = text_split(extracted_data)
print(f"Length of the text chunks: {len(text_chunks)}")

Length of the text chunks: 39994


In [6]:
# Download the embedding from HuggingFace
def download_hugginface_embedding():
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     return embeddings

embeddings = download_hugginface_embedding()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [7]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [8]:
# Test for embedding model
emb_qu = embeddings.embed_query("Hello World")
print(emb_qu)
print(f"Length {len(emb_qu)}")

[-0.03447727486491203, 0.03102317824959755, 0.006734970025718212, 0.026108985766768456, -0.03936202451586723, -0.16030244529247284, 0.06692401319742203, -0.006441489793360233, -0.0474504791200161, 0.014758856035768986, 0.07087527960538864, 0.05552763119339943, 0.019193334504961967, -0.026251312345266342, -0.01010954286903143, -0.02694045566022396, 0.022307461127638817, -0.022226648405194283, -0.14969263970851898, -0.017493007704615593, 0.00767625542357564, 0.05435224249958992, 0.0032543970737606287, 0.031725890934467316, -0.0846213847398758, -0.02940601296722889, 0.05159561336040497, 0.04812406003475189, -0.0033148222137242556, -0.058279167860746384, 0.04196927323937416, 0.022210685536265373, 0.1281888335943222, -0.022338971495628357, -0.011656315997242928, 0.06292839348316193, -0.032876335084438324, -0.09122604131698608, -0.031175347045063972, 0.0526994913816452, 0.04703482985496521, -0.08420311659574509, -0.030056199058890343, -0.02074483036994934, 0.009517835453152657, -0.0037217906

In [23]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [10]:
# Initialize Pinecone and create the index if not exists
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# from pinecone.grpc import PineconeGRPC as Pinecone
# from pinecone import ServerlessSpec
from pinecone import Pinecone, ServerlessSpec


pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medicalbot"
pc.create_index(name=index_name,
                dimension=384,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws",region="us-east-1"))

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-u6z9n1i.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [11]:
index_name

'medicalbot'

In [66]:

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

In [67]:
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [None]:
from langchain_pinecone import PineconeVectorStore
from tqdm import tqdm
import time

def initialize_pinecone_vector_store(documents, index_name, embedding, retries=3, delay=5):
    for attempt in range(retries):
        try:
            docsearch = PineconeVectorStore.from_documents([], index_name=index_name, embedding=embedding)
            print("✅ Pinecone Vector Store created.")

            for i in tqdm(range(0, len(documents)), desc="Uploading documents", unit="doc"):
                docsearch.add_documents([documents[i]])

            print("✅ All documents uploaded successfully.")
            return docsearch

        except Exception as e:
            print(f"\n❌ Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                raise e


docsearch = initialize_pinecone_vector_store(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)


✅ Pinecone Vector Store created.


Uploading documents: 100%|██████████| 39994/39994 [5:37:26<00:00,  1.98doc/s]   

✅ All documents uploaded successfully.





In [None]:
# from langchain_pinecone import PineconeVectorStore
# from tqdm import tqdm
# import time

# def initialize_pinecone_vector_store_with_progress(documents, index_name, embedding, batch_size=32, retries=3, delay=5):
#     for attempt in range(retries):
#         try:
#             print("Starting to upload documents to Pinecone with progress bar...")
            
#             for i in tqdm(range(0, len(documents), batch_size), desc="Uploading batches"):
#                 batch = documents[i:i+batch_size]
#                 PineconeVectorStore.from_documents(
#                     documents=batch,
#                     index_name=index_name,
#                     embedding=embedding
#                 )
            
#             print("✅ Pinecone Vector Store initialized successfully.")
#             return True
        
#         except Exception as e:
#             print(f"❌ Attempt {attempt + 1} failed: {e}")
#             if attempt < retries - 1:
#                 print(f"⏳ Retrying after {delay} seconds...")
#                 time.sleep(delay)
#             else:
#                 raise e


# success = initialize_pinecone_vector_store_with_progress(
#     documents=text_chunks, 
#     index_name=index_name,
#     embedding=embeddings,
#     batch_size=32 
# )


In [56]:
# Load the existing index
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name,
                                                    embedding=embeddings)

docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1b4917c8250>

In [57]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [60]:
retrieved_docs = retriever.invoke("What is Acute kidney failure?")
retrieved_docs

[Document(id='5f64bdeb-de16-44ae-b338-f807d3f705d3', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 75.0, 'page_label': '46', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Acute kidney failure'),
 Document(id='2863a229-019e-4b38-b165-2f393e0c37c8', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 75.0, 'page_label': '46', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Acute kidney failure\nDefinition\nAcute kidney failure occurs when illness, infec-\ntion, or injury damages the kidneys. Temporarily, the\nkidneys cannot adequately remove fluids and wastes\nfrom the 

In [65]:
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=GEMINI_API_KEY)
system_prompt = (
     "you are an assistant for question-answering task. "
     "Use the following pieces of retrieved context to answer"
     "the question. if you don't know the answer, say that you don't know."
     "use four sentences maximum and keep that"
     "answer concise ."
     "\n\n"
     "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "how to solve the Acute lymphangitis"})
print(response["answer"])

Acute lymphangitis is treated with large doses of antibiotics, usually penicillin, administered intravenously. A broad-spectrum antibiotic may also be used. Early treatment with antibiotics usually leads to complete recovery. If left untreated, it can cause tissue damage.
