In [1]:
import warnings
# NOTE(review): with no category or module filter this silences every warning
# raised for the rest of the session, including deprecation notices.
warnings.filterwarnings("ignore")

In [2]:
%pwd

'c:\\Downloads\\LLM Projects\\MediGuide-AI\\research'

In [3]:
import os

# Step up from research/ to the project root exactly once. An unconditional
# chdir("../") climbs one more level every time this cell is re-run, which
# silently breaks the relative 'Data/' path used further down.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Downloads\\LLM Projects\\MediGuide-AI'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load every file matching ``*.pdf`` under ``data``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        file being read through ``PyPDFLoader``.
    """
    pdf_directory_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_directory_loader.load()


In [10]:
# Load the PDFs from the Data/ directory (relative to the current working directory).
extracted_data=load_pdf_file(data='Data/')

In [11]:
# extracted_data

In [12]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``; defaults
            to 500, the value previously hard-coded here.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``;
            defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [13]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 39994


In [14]:
# text_chunks

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

In [16]:
# Download the embeddings model from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Build the Hugging Face embedding wrapper used for indexing and querying.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model previously hard-coded here, whose query
            embedding has length 384 in this notebook. A model with a
            different vector length needs a Pinecone index created with a
            matching ``dimension``.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [17]:
# Instantiate the embedding model once; it is reused for indexing and for queries.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [18]:
# Sanity check: embed a short string and print the vector length. This length
# has to equal the `dimension` the Pinecone index is created with.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [19]:
# query_result

In [49]:
# Load .env before reading the keys: in document order load_dotenv() is only
# called in a later cell, so on a fresh kernel both lookups would return None.
from dotenv import load_dotenv

load_dotenv()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Fail here with a readable message instead of later, when a None value is
# passed to the Pinecone client or assigned into os.environ (TypeError).
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

In [8]:
from pinecone import Pinecone, ServerlessSpec
import os

# Initialize Pinecone with API key
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define index name
index_name = "mediguide"

# Create the index only when it does not exist yet. create_index() is rejected
# for a name that is already taken, so an unconditional call breaks this cell
# on every re-run after the first.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length (384 for all-MiniLM-L6-v2)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Display the index description whether it was just created or already present.
pc.describe_index(index_name)

{
    "name": "mediguide",
    "metric": "cosine",
    "host": "mediguide-ukt67zh.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [38]:
from dotenv import load_dotenv
# NOTE(review): in document order this call comes after the cell that reads
# PINECONE_API_KEY / OPENAI_API_KEY from os.environ; confirm the intended
# execution order on a fresh kernel.
load_dotenv()

True

In [50]:
# Assigning os.getenv("OPENAI_API_KEY") back to the same variable changes
# nothing when it is set and raises an opaque TypeError when it is not.
# Check for presence explicitly instead.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to .env or export it before running this notebook"
    )

In [51]:

# Export the Pinecone key into this process's environment.
# NOTE(review): raises TypeError if PINECONE_API_KEY is None, because
# os.environ only accepts str values.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY


In [52]:
# Export the OpenAI key into this process's environment.
# NOTE(review): raises TypeError if OPENAI_API_KEY is None, because
# os.environ only accepts str values.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [22]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): every run of this cell sends all of text_chunks again;
# presumably that adds duplicate vectors to the index on re-run — confirm
# before re-executing, and prefer the "load existing index" cell afterwards.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [23]:
# Load the existing index

from langchain_pinecone import PineconeVectorStore
# Attach to the index named by index_name instead of passing documents again.
# This rebinds `docsearch`, replacing the object created by from_documents.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [24]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1e8c18ecc10>

In [25]:
# Wrap the vector store as a retriever using similarity search with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [66]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What is ADHD?")

In [67]:
retrieved_docs

[Document(id='5fe2ccf1-3f63-4907-808e-9b7c294a0c56', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 485.0, 'page_label': '456', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Attention-deficit/\nHyperactivity disorder\n(ADHD)\nDefinition\nAttention-deficit/hyperactivity disorder (ADHD)\nis a developmental disorder characterized by distract-\nibility, hyperactivity, impulsive behaviors, and the\ninability to remain focused on tasks or activities.\nDescription\nADHD, also known as hyperkinetic disorder\n(HKD) outside of the United States, is estimated to\naffect 3-9% of children, and afflicts boys more often\nthan girls. Although difficult to assess in infancy and'),
 Document(id='d0e6267f-f4c2-4853-81e4-f540325821d1', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'cr

In [53]:
from langchain_openai import OpenAI
# LLM used for answer generation, configured with temperature 0.4 and
# max_tokens 500.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [54]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model. The sentences are joined with
# single spaces, followed by a blank line and the {context} placeholder.
system_prompt = (
    " ".join(
        (
            "You are an assistant for question-answering tasks.",
            "Use the following pieces of retrieved context to answer the question.",
            "If you don't know the answer, say that you are not aware of this topic.",
            "Use three sentences maximum and keep the answer concise.",
        )
    )
    + "\n\n{context}"
)


# Two-message chat template: the system message carries the instructions and
# the {context} placeholder, the human message carries the question as {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [55]:
# Question-answering chain built from the LLM and the prompt template.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Retrieval chain combining the retriever with the question-answering chain;
# the cells below call it with an "input" key and read the "answer" key.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [69]:
# Sample query 1: a free-form symptom description.
response = rag_chain.invoke({"input": "I notice some blood cloted inside my nose every morning for last 6 months. What might be the problem?"})
print(response["answer"])



It is possible that you are experiencing symptoms of a disorder called Wegener's granulomatosis (WG). This disorder causes inflammation of the blood vessels, which can result in chronic runny nose, nosebleeds, and other symptoms such as fatigue and weakness. Other possible causes of nosebleeds include infections, drying of the nasal membranes, medications, trauma, and underlying disorders such as leukemia. It is important to consult a doctor for proper diagnosis and treatment.


In [57]:
# Sample query 2: a direct definition question.
response = rag_chain.invoke({"input": "what is Multiple Sclerosis?"})
print(response["answer"])



Multiple sclerosis is a chronic autoimmune disorder that affects movement, sensation, and bodily functions. It is caused by the destruction of the myelin insulation covering nerve fibers in the central nervous system. This disorder is more common in the United States, Canada, and Northern Europe and is very rare among Asians, North and South American Indians, and Eskimos. The cause of MS is believed to be an attack by the body's own immune system on the myelin sheath that insulates neurons in the brain and spinal cord.


In [68]:
# Sample query 3: a non-medical question, to check that the chain gives the
# "not aware of this topic" reply requested in the system prompt.
response = rag_chain.invoke({"input": "what is Fintech?"})
print(response["answer"])



I am not aware of this topic.
