# Data Loading

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_pdf_file(data: str) -> list:
    """Load every PDF found under a directory.

    Args:
        data: Path of the directory to search. The glob pattern "**/*.pdf"
            is handed to DirectoryLoader, with PyPDFLoader as the per-file
            loader class.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader).load()

In [7]:
# Load every PDF found under the relative "Data/" directory.
extracted_data = load_pdf_file("Data/")

# Chunking

In [8]:
def text_splitter(documents, chunk_size: int = 500, chunk_overlap: int = 20):
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split, e.g. the result of ``load_pdf_file``.
        chunk_size: Value forwarded as ``chunk_size`` to
            RecursiveCharacterTextSplitter. Defaults to 500.
        chunk_overlap: Value forwarded as ``chunk_overlap`` to
            RecursiveCharacterTextSplitter. Defaults to 20.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Named `splitter` rather than `text_splitter` so the local does not
    # shadow this function's own name inside its body.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

In [9]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_splitter(extracted_data)
print(f"Total number of chunks: {len(text_chunks)}")

Total number of chunks: 6973


# Embedding

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

In [11]:
def download_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [14]:
embeddings = download_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Pinecone

In [17]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Read environment variables from a local .env file, if one is present.
load_dotenv()

# Instantiate the Pinecone client; os.getenv returns None when
# PINECONE_API_KEY is not set in the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Get a handle to the "medibot" index.
# NOTE(review): nothing in this notebook creates the index, so it presumably
# has to exist already — confirm.
index = pc.Index("medibot")

In [18]:
# Inspect the index (dimension, metric, current vector count) before ingesting.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

# Vectorstore

In [19]:
from langchain_pinecone import PineconeVectorStore

# Re-running this cell must not upload the corpus a second time: from_documents
# embeds and upserts every chunk again under newly generated ids, which
# duplicates vectors in the index and skews retrieval. Only ingest when the
# index is still empty; otherwise attach to the vectors already stored.
# NOTE(review): a non-empty index is assumed to hold this same corpus — clear
# the index before re-ingesting if the source PDFs or chunk settings change.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name="medibot",
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name="medibot",
        embedding=embeddings,
    )

# Retriever

In [20]:
# Similarity-search retriever configured to return the top 3 chunks per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [21]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What is the treatment for diabetes?")
retrieved_docs

[Document(id='50cd1367-0ed7-484f-8db1-d65192e614fa', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 434.0, 'page_label': '435', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 1047\nDiabetes mellitus'),
 Document(id='882a0899-0ea7-4f60-9afa-f6b9f9e796b4', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 438.0, 'page_label': '439', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='available to treat diabetes include metformin, acarbose,\nand troglitizone. The choice of the right medication\ndepends in part on the individual patient

# LLM

In [23]:
from langchain_openai import ChatOpenAI
# Load variables from .env again so OPENAI_API_KEY can be read below.
load_dotenv()

# Chat model served from the models.github.ai inference endpoint.
llm = ChatOpenAI(
    model="openai/gpt-4o-mini",
    base_url="https://models.github.ai/inference",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.4,
    max_completion_tokens=500,
)

In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the question-answering step. "{context}" is left as a
# template placeholder (this is deliberately not an f-string) so the prompt
# template can fill it in later.
system_prompt = (
    "You are an AI assistant specialized in medical knowledge. "
    "Use the following pieces of context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise. "
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions (which carry the
# {context} placeholder) followed by the user's question via {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [33]:
# Chain that combines the retrieved documents with the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Retrieval chain: runs the retriever, then hands its documents to the
# question-answering chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [34]:
# Run the full RAG chain; the question goes in under "input" and the generated
# text is read back from the "answer" key of the result.
response = rag_chain.invoke({"input": "What is the treatment for diabetes?"})
print(response['answer'])

Treatment for diabetes typically includes medications such as metformin, acarbose, and troglitizone, chosen based on the individual patient's profile. Additionally, some herbs like fenugreek and bilberry may help manage blood sugar levels and other symptoms. Lifestyle changes, including diet and exercise, are also crucial in managing diabetes effectively.
