In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only entries matching the
            glob pattern '*.pdf' are picked up.

    Returns:
        Whatever `DirectoryLoader.load()` returns for that directory, with
        each matching file parsed by `PyPDFLoader`.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
%pwd

'f:\\dev\\AI\\Medical-Chatbot-GenAI\\research'

In [5]:
# The path is relative, so it is resolved against the notebook's working directory
# (the `research` folder reported by %pwd); '../data/' is therefore the project's
# top-level data folder.
extracted_data = load_pdf_files(data='../data/')

In [6]:
# Preview only the first few entries: the loader yields one Document per PDF page,
# so displaying the whole list floods the notebook output.
extracted_data[:3]

[Document(metadata={'producer': 'Qt 4.8.6', 'creator': 'wkhtmltopdf 0.12.2.1', 'creationdate': '2016-04-16T17:29:14+07:00', 'moddate': '2016-04-16T21:10:33+07:00', 'title': '', 'source': '..\\data\\washington_medical_book.pdf', 'total_pages': 1009, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'Qt 4.8.6', 'creator': 'wkhtmltopdf 0.12.2.1', 'creationdate': '2016-04-16T17:29:14+07:00', 'moddate': '2016-04-16T21:10:33+07:00', 'title': '', 'source': '..\\data\\washington_medical_book.pdf', 'total_pages': 1009, 'page': 1, 'page_label': '2'}, page_content='Dedication\nWe dedicate this manual to the outstanding medicine house staff at Washington University and\nBarnes-Jewish Hospital—their wisdom, dedication, and compassion continue to inspire us each and\nevery day.'),
 Document(metadata={'producer': 'Qt 4.8.6', 'creator': 'wkhtmltopdf 0.12.2.1', 'creationdate': '2016-04-16T17:29:14+07:00', 'moddate': '2016-04-16T21:10:33+07:00', 'title': '', 'source': '..\

In [7]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split, e.g. the result of `load_pdf_files`.
        chunk_size: Forwarded to `RecursiveCharacterTextSplitter` as the
            maximum chunk size. Defaults to 500, the value this notebook
            has always used.
        chunk_overlap: Forwarded to `RecursiveCharacterTextSplitter` as the
            overlap between consecutive chunks. Defaults to 20.

    Returns:
        The chunks produced by `split_documents` for `extracted_data`.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Chunk every loaded page and report how many chunks were produced; `text_chunks`
# is what gets embedded and uploaded to the vector store further down.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks: ", len(text_chunks))

Length of Text Chunks:  6040


In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
def download_hugging_face_embeddings():
    """Create the embedding model wrapper used throughout this notebook.

    Returns:
        A `HuggingFaceEmbeddings` instance configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [11]:
import sentence_transformers

In [12]:
# Build the embedding model once; the same object is reused below for the
# 'Hello world' smoke test and when populating the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [13]:
# Smoke-test the embedding model on a short string and display only the vector
# length (not the raw floats). This length has to equal the `dimension` the
# Pinecone index is created with.
query_result = embeddings.embed_query('Hello world')
len(query_result)

In [33]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# os.getenv returns None for unset variables. Fail here with a clear message
# instead of deep inside the Pinecone / OpenAI clients in later cells.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ('PINECONE_API_KEY', PINECONE_API_KEY),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Set them in the environment or in a .env file."
    )

In [26]:
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion

# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-genai-bot"

# Create the index only when no index with this name exists yet, so re-running
# this cell does not attempt a second creation.
existing_indexes = [index_info.name for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`
        # (presumably 384 for all-MiniLM-L6-v2 — TODO confirm via len(query_result)).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1
        )
    )

In [27]:
from langchain_pinecone import PineconeVectorStore

# Embedding and upserting every chunk is slow, and the stored records get
# generated IDs (see the `id` fields of retrieved documents), so running the
# upload again would add duplicate vectors. Ingest only when the index is still
# empty; otherwise just attach to the already-populated index.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [28]:
# Expose the vector store as a retriever that runs a similarity search and
# asks for the top 3 matches (k=3) per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [31]:
# Check retrieval on its own before wiring in the LLM. The query is spelled
# correctly because its text is what gets embedded and matched against the book.
retrieved_docs = retriever.invoke('What is diabetes?')
retrieved_docs

[Document(id='42fa0afa-f4c5-4138-8faf-00644de0d79c', metadata={'creationdate': '2016-04-16T17:29:14+07:00', 'creator': 'wkhtmltopdf 0.12.2.1', 'moddate': '2016-04-16T21:10:33+07:00', 'page': 789.0, 'page_label': '790', 'producer': 'Qt 4.8.6', 'source': '..\\data\\washington_medical_book.pdf', 'title': '', 'total_pages': 1009.0}, page_content='P.731\n23\nDiabetes Mellitus and Related Disorders\nCynthia J. Herrick\nJanet B. McGill\nDiabetes Mellitus\nGENERAL PRINCIPLES\nDiabetes mellitus (DM)\n is a group of metabolic diseases characterized by hyperglycemia resulting from defects in\ninsulin secretion, insulin action, or both. In 2012, DM was present in 12.3% of persons over the age of 20 years in the\nUnited States and 25.9% of those over the age of 65 years. A substantial percentage of affected persons are not'),
 Document(id='be25e7b3-22f4-443b-a808-8e639a9eba20', metadata={'creationdate': '2016-04-16T17:29:14+07:00', 'creator': 'wkhtmltopdf 0.12.2.1', 'moddate': '2016-04-16T21:10:33+

In [34]:
from langchain_openai import OpenAI
# LLM that writes the final answer. No API key is passed explicitly, so the client
# presumably reads OPENAI_API_KEY from the environment populated by load_dotenv()
# — TODO confirm. max_tokens=500 bounds the length of each generated answer.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [35]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answer-generation step. Adjacent string literals
# are joined with no separator, so every sentence fragment carries its own
# trailing space. `{context}` is the placeholder for the retrieved passages.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise"
    "\n \n"
    "{context}"
)

# Two-message chat prompt: the system message carries the instructions plus the
# `{context}` placeholder, and the human message is the user's question, supplied
# under the "input" key when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human","{input}"),
    ]
)

In [36]:
# Assemble the RAG pipeline in two steps: first a chain built from the LLM and the
# prompt, then a retrieval chain that pairs it with the retriever. The cells below
# read the generated text from the "answer" key of the result.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [39]:
# End-to-end check with a basic definition question. The query is spelled
# correctly because its text is passed to both the retriever and the LLM.
response = rag_chain.invoke({"input": "What is diabetes?"})
print(response["answer"])


Diabetes is a group of metabolic diseases that cause high levels of sugar in the blood due to problems with insulin secretion or action. It affects a large percentage of people in the United States, especially those over the age of 65. Tight control of blood sugar levels can help prevent complications such as diabetic retinopathy.


In [40]:
# Second end-to-end check, this time asking about symptoms. The query is spelled
# correctly because its text is passed to both the retriever and the LLM.
response = rag_chain.invoke({"input": "What are the Common Symptoms in diabetes?"})
print(response["answer"])



The common symptoms of diabetes include painless jaundice, anorexia, weight loss, and abdominal pain. Additionally, sudden onset of diabetes in patients over 50 years old may also be indicative of pancreatic cancer. Other ocular abnormalities associated with diabetes include cataract formation, dyskinetic pupils, glaucoma, optic neuropathy, extraocular muscle paresis, floaters, and fluctuating visual acuity.
