In [2]:
import warnings
# Install a blanket "ignore" filter covering every warning category for the
# rest of the session. NOTE(review): libraries that register their own filters
# afterwards may still emit warnings, and real deprecation notices can be hidden.
warnings.filterwarnings("ignore")

In [1]:
# Show the kernel's current working directory before it is changed.
%pwd

'c:\\Users\\NoorNahiyanB\\Downloads\\Books\\MedPrepAI-RAG\\research'

In [3]:
import os

# The data folder is addressed relative to the working directory ("Data/"),
# while the notebook itself is started from a sub-folder. Step up one level
# only when Data/ is not already reachable, so that re-running this cell does
# not keep climbing the directory tree.
if not os.path.isdir("Data"):
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'c:\\Users\\NoorNahiyanB\\Downloads\\Books\\MedPrepAI-RAG'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Load the PDF files of a directory through LangChain
def load_pdf_file(data):
    """Load the PDF files found in the ``data`` directory.

    Args:
        data: Path of the directory to scan; entries are selected with the
            ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        file being read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [7]:
# Read the source PDFs from the Data/ folder (relative to the working directory).
extracted_data=load_pdf_file(data='Data/')

In [9]:
#extracted_data

In [10]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Maximum size of a chunk as measured by the splitter
            (defaults to 500, the value previously hard-coded here).
        chunk_overlap: Amount of content shared between consecutive chunks
            (defaults to 20, the value previously hard-coded here).

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [11]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5227


In [12]:
# text_chunks

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

In [14]:
# Build the Hugging Face embedding model used for indexing and for queries
def download_hugging_face_embeddings():
    """Create the embedding wrapper for ``sentence-transformers/all-MiniLM-L6-v2``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with that model name.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )


In [15]:
# Instantiate the embedding model once; it is reused for indexing and retrieval.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [16]:
# Sanity check: embed a sample query and print the vector length. The Pinecone
# index in this notebook is created with dimension=384, so this should print 384.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [17]:
# query_result

In [27]:
# Load .env before reading the keys: on a fresh top-to-bottom run no earlier
# cell has loaded it, so keys defined only in .env would silently come back
# as None here.
from dotenv import load_dotenv

load_dotenv()

PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

In [29]:
from pinecone import Pinecone, ServerlessSpec
import os

# Initialize Pinecone with API key
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define index name
index_name = "medprep"

# Creating an index whose name already exists fails, so only create it when it
# is missing; this keeps the cell safe to re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding vector length (384 for the model used here)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Show the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "medprep",
    "metric": "cosine",
    "host": "medprep-ukt67zh.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [22]:
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): in file order this cell comes after the cell that reads
# PINECONE_API_KEY / OPENAI_API_KEY; on a fresh top-to-bottom run those reads
# happen before this call unless that cell loads .env itself.
load_dotenv()

True

In [30]:
# os.environ only accepts strings: assigning the None that os.getenv returns
# for a missing variable raises an opaque TypeError, so fail with a clear
# message instead.
if os.getenv("OPENAI_API_KEY") is None:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; define it in .env or in the shell environment"
    )
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")


In [31]:
# os.environ only accepts strings: assigning the None that os.getenv returns
# for a missing variable raises an opaque TypeError, so fail with a clear
# message instead.
if os.getenv("PINECONE_API_KEY") is None:
    raise EnvironmentError(
        "PINECONE_API_KEY is not set; define it in .env or in the shell environment"
    )
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")


In [32]:
# Export the value held in OPENAI_API_KEY so libraries that read the
# environment can see it. NOTE(review): raises TypeError if OPENAI_API_KEY is None.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [33]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): re-running this cell presumably uploads all chunks again;
# confirm whether that creates duplicates before re-executing it.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [34]:
# Connect to the existing Pinecone index instead of uploading the chunks again.

from langchain_pinecone import PineconeVectorStore
# Pass the same embedding model that was used for the stored chunks so queries are embedded the same way.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [35]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x25f2299f340>

In [36]:
# Similarity-search retriever that returns the 3 best-matching chunks per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [38]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("Explain Estrogen Pathway?")

In [39]:
# Inspect the retrieved documents (metadata and page content).
retrieved_docs

[Document(id='5b4bf0ba-13ec-42fb-a79d-cb404db3bc7d', metadata={'author': 'Tao Le, Connie Qiu, Panagiotis Kaparaliotis, Kimberly Kallianos, Vikas Bhushan, Anup Chalise, Caroline Coleman and Sean Evans', 'creationdate': '2024-02-27T11:11:22+05:30', 'creator': 'Adobe InDesign 18.5 (Macintosh)', 'moddate': '2024-03-04T15:58:07+05:30', 'page': 668.0, 'page_label': '648', 'producer': 'Adobe PDF Library 17.0; modified using iText® 5.5.6 ©2000-2015 iText Group NV (AGPL-version)', 'source': 'Data\\First Aid for the USMLE Step 1 2024, 34th Edition.pdf', 'title': 'First Aid for the® USMLE Step 1 2024: A Student-to-Student Guide', 'total_pages': 868.0, 'trapped': '/False'}, page_content='RepRoductive  `\u2009REPRODUCTIVE—PHy SIOl OgyRepRoductive  `\u2009REPRODUCTIVE—PHy SIOl OgySectioN iii648\nEstrogen\nSOURCE Ovary (estradiol), placenta (estriol), adipose \ntissue (estrone via aromatization).\nPotency: estradiol > estrone > estriol.\nEstradiol is produced from 2 ovaries.\nFUNCTION Development of 

In [40]:
from langchain_openai import OpenAI
# OpenAI model used to generate the answers, configured with a sampling
# temperature of 0.4 and a 500-token limit per generation.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [41]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System message for the answering model: the instructions, a blank line, then
# the "{context}" placeholder that is filled in when the prompt is formatted.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "are not aware of this topic. Use three sentences maximum and keep the ",
        "answer concise.",
        "\n\n",
        "{context}",
    ]
)


# Chat template: the system message carries the instructions and the context
# placeholder, the human message carries the user's question ("{input}").
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [42]:
# Chain that combines the supplied documents with `prompt` and sends the result to `llm`.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: the retriever supplies the documents, then the chain above
# produces the answer (read from the "answer" key in the cells below).
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [43]:
# Ask a medical comparison question and print only the generated answer text.
response = rag_chain.invoke({"input": "What is the difference between rheumatoid arthritis and ostheoarthritis?"})
print(response["answer"])



Rheumatoid arthritis is an autoimmune disorder that causes inflammation and formation of pannus, while osteoarthritis is a mechanical disorder caused by wear and tear on articular cartilage. Rheumatoid arthritis also has a genetic component and is more common in females, while osteoarthritis is more common in older individuals and is not linked to a specific gene. Both conditions can lead to joint pain and damage, but they have different underlying causes and treatments.


In [45]:
# Ask a physiology question and print only the generated answer text.
response = rag_chain.invoke({"input": "what is the Relation Between Cerebral Perfusion pressure and partial pressure of carbon di oxide?"})
print(response["answer"])



The cerebral perfusion pressure (CPP) is directly proportional to the partial pressure of carbon dioxide (Pco2) until Pco2 reaches a level above 90 mm Hg. This means that as Pco2 increases, CPP also increases, until a certain threshold is reached. However, severe hypoxia can also modulate cerebral perfusion, and in this case, the relationship between CPP and Pco2 may not hold true. Additionally, CPP relies on a pressure gradient between mean arterial pressure (MAP) and intracranial pressure (ICP), and if CPP reaches 0, there is no cerebral perfusion and brain death may occur.


In [47]:
# Off-topic question: the system prompt tells the model to say it is not aware
# of the topic when it does not know the answer, so this checks that fallback.
response = rag_chain.invoke({"input": "Where is Bangladesh?"})
print(response["answer"])



I am not aware of this topic.
