# Making Sure the Working Directory Is the Project Root

In [1]:
# Show the kernel's current working directory before changing it.
%pwd

'/Users/georgekuncheria/Desktop/GenAI/End-To-End_Projects/MedicalChatBot/research'

In [2]:
import os

# The notebook lives in the research/ sub-folder, while paths used later
# (e.g. "Data/") are relative to the project root. Step up one level only
# when still inside research/, so that re-running this cell does not keep
# climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
# Confirm the working directory after the chdir above.
%pwd

'/Users/georgekuncheria/Desktop/GenAI/End-To-End_Projects/MedicalChatBot'

# Import Important Libraries

In [4]:
from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Loading PDF Documents

In [5]:
def load_document(data):
    """Load every PDF found under a directory.

    Args:
        data: Path of the directory to search; the "**/*.pdf" glob also
            matches PDFs in nested sub-directories.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader).load()



In [6]:
# Load the PDFs under Data/ (relative to the current working directory).
extracted_data= load_document("Data/")

In [7]:
# Count the documents returned by the loader.
len(extracted_data)


637

# Creating Text Chunks

In [8]:
def text_split(extracted_data):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``load_document``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` configured
        with ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(extracted_data)


In [24]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of text chunks:", len(text_chunks))

Length of text chunks: 5859


In [28]:
# Spot-check the text of a single chunk (index 56 is an arbitrary pick).
text_chunks[56].page_content

'Dept. of Pediatrics, Division of\nMedical Genetics\nEmory University School of\nMedicine\nAtlanta, GA\nBethany Thivierge\nBiotechnical Writer/Editor\nTechnicality Resources\nRockland, ME\nMai Tran, Pharm.D.\nMedical Writer\nTroy, MI\nCarol Turkington\nMedical Writer\nLancaster, PA\nJudith Turner, B.S.\nMedical Writer\nSandy, UT\nAmy B. Tuteur, M.D.\nMedical Advisor\nSharon, MA\nSamuel Uretsky, Pharm.D.\nMedical Writer\nWantagh, NY\nGALE ENCYCLOPEDIA OF MEDICINE 2 XVII\nContributors'

# Creating HuggingFace Embeddings

In [10]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [11]:
def download_huggingface_embeddings():
    """Build the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [12]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_huggingface_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


## Check the Embedding Dimension (Must Match the Pinecone Index `dimension`)

In [13]:
# Embed a sample question and print the vector length; the Pinecone index
# created later must be given this same number as its `dimension`.
query_result = embeddings.embed_query("What is the treatment for diabetes?")
print("length of query result:", len(query_result))

length of query result: 384


# Initializing & Upserting to the Pinecone Database

In [14]:
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv


In [15]:
# Sanity check of the working directory before loading the .env file.
print(os.getcwd())  # Where Python is running from


/Users/georgekuncheria/Desktop/GenAI/End-To-End_Projects/MedicalChatBot


In [18]:
# Load variables from a .env file into the process environment so the API
# keys can be read with os.environ below.
load_dotenv()

True

In [40]:
# Read the Pinecone key from the environment.
# NOTE(review): .get() yields None when the variable is unset, so a missing
# key only surfaces later, when the Pinecone client is used.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [22]:
index_name = "medical-chatbot"

pc = Pinecone(api_key=PINECONE_API_KEY)

# create_index fails when an index with this name already exists, so only
# create it on the first run; this keeps the cell safe to re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length checked earlier
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Display the index configuration whether it was just created or already existed.
pc.describe_index(index_name)



{
    "name": "medical-chatbot",
    "metric": "cosine",
    "host": "medical-chatbot-si6xagf.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [23]:
# Describe index
description = pc.describe_index(index_name)
print(description)

# Check if the status is ready
if description.status['ready']:
    print(f"Index '{index_name}' is ready and connected!")
else:
    print(f"Index '{index_name}' is not ready yet.")

{'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'medical-chatbot-si6xagf.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medical-chatbot',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': None,
 'vector_type': 'dense'}
Index 'medical-chatbot' is ready and connected!


## Upserting To Pinecone Database

In [29]:
from langchain_pinecone import PineconeVectorStore

# from_documents embeds and uploads every chunk again on each call, so
# re-running this cell would store duplicate vectors. Upload only while the
# index is still empty; otherwise attach to the vectors already stored.
# NOTE(review): index stats may lag shortly after writes, and a partially
# completed earlier upload is not detected by this check.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    vector_store = PineconeVectorStore.from_documents(
        index_name=index_name,
        embedding=embeddings,
        documents=text_chunks
    )
else:
    vector_store = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings
    )


# Load existing index and Create a Retriever

In [30]:
from langchain_pinecone import PineconeVectorStore

In [31]:
# Connect to the already-populated index without uploading anything;
# queries will be embedded with the same embedding model.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [32]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x119c5a350>

In [33]:
# Similarity-search retriever that returns the 3 best-matching chunks per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [34]:
# Run a sample medical question through the retriever alone (no LLM yet).
retrieved_docs = retriever.invoke("What is the treatment for diabetes?")

In [35]:
# Inspect the retrieved documents and their metadata.
retrieved_docs

[Document(id='7ecc25e3-6302-44a3-b479-f9d95f978a68', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 274.0, 'page_label': '275', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Medical_book.pdf', 'total_pages': 637.0}, page_content='with a physician or pharmacist before combining tri-\ncyclic antidepressants with any other prescription or non-\nprescription (over-the-counter) medicine.\nNancy Ross-Flanigan\nAntidiabetic drugs\nDefinition\nAntidiabetic drugs are medicines that help control\nblood sugar levels in people with diabetes mellitus\n(sugar diabetes).\nPurpose\nDiabetes may be divided into type I and type II, for-\nmerly termed juvenile onset or insulin-dependent, and\nGALE ENCYCLOPEDIA OF MEDICINE 2 261\nAntidiabetic drugs'),
 Document(id='f2ae4f25-1b6f-4b56-bcb8-840a908cb399', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'pag

# Using OpenAI LLM

In [36]:
from langchain_openai import OpenAI

In [39]:
# Read the OpenAI key from the environment (None when the variable is unset).
# NOTE(review): this variable is not passed to OpenAI(...) in the next cell;
# presumably the client picks the key up from the environment itself — confirm.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [61]:
# LLM used to generate answers: moderate sampling temperature, replies capped
# at 500 tokens.
llm = OpenAI(max_tokens=500, temperature=0.5)



# Creating Chain & Prompt

In [62]:
from langchain.chains import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [63]:
# System message for the question-answering chain.
# Adjacent string literals are concatenated with no separator, so every
# sentence carries its own trailing space; without it the prompt would read
# "...task.Use the following...".
# "{context}" is a template placeholder, left for the chain to fill in.
system_prompt = (
    "You are an assistant for question answering task. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use 3 sentence maximum to answer the question and keep it concise."
    "\n\n"
    "{context}"
)

In [64]:
# Two-message chat template: the system instructions followed by the user's
# question, which is supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [65]:
# Build the answering chain from the LLM and the prompt, then wrap it with the
# retriever so that a single invoke() both retrieves context and answers.
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [73]:
# Ask a question unrelated to the medical source material to check that the
# chain falls back to "I don't know" as the system prompt instructs.
response = rag_chain.invoke({"input":"Who makes playstations?"})
print(response['answer'])



I don't know.
