In [1]:
# Sanity check that the kernel is running and executes cells.
print("ok")

ok


In [2]:
import os
# Move up to the project root so relative paths such as "Data/" resolve.
# Guarded on the presence of the Data directory so that re-running this cell
# does not climb one more directory level on every execution.
if not os.path.isdir("Data"):
    os.chdir("../")

In [3]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
def load_pdf(path):
    """Load the PDF files found under ``path`` into documents.

    Args:
        path: Directory handed to ``DirectoryLoader``; files are selected
            with the glob pattern ``**/*.pdf`` and parsed by ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(path, glob="**/*.pdf", loader_cls=PyPDFLoader).load()

In [5]:
# Load every PDF under Data/, resolved relative to the current working directory.
extracted_data = load_pdf("Data/")

In [6]:
def split_documents(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf``.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value used so far.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value used so far.

    Returns:
        The result of ``splitter.split_documents(extracted_data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [7]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = split_documents(extracted_data)
print(f"Number of text chunks: {len(text_chunks)}")

Number of text chunks: 5859


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Build the sentence-transformers embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [9]:
# Instantiate the embedding model once; later cells reuse it for queries and indexing.
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Embed a sample query to confirm the model works and to check the vector size.
query_result = embeddings.embed_query("What is the purpose of this trial?")
print(f"Query result: {len(query_result)}")  # Prints the embedding length (vector dimension), not the vector values

Query result: 384


In [25]:
from dotenv import load_dotenv
import os
# Pull variables from the .env file into the process environment.
load_dotenv()

# Either value is None when the corresponding variable is not set.
# NOTE(review): the variable is spelled GROK_API_KEY although it is passed to the
# Groq client later in the notebook; confirm the .env file uses the same spelling.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROK_API_KEY = os.environ.get("GROK_API_KEY")




In [13]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [15]:
index_name = "medical-bot"

# Create the serverless index only if it does not exist yet, so this cell can be re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the vectors produced by the embedding model
        metric="cosine",  # similarity measure used when searching the index
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [16]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only while the index is still empty. from_documents embeds and
# upserts every chunk on each call, so re-running this cell against a populated
# index would store the whole corpus again and return duplicate hits at query time.
# NOTE(review): a partially filled index (e.g. after an interrupted upload) counts
# as populated and is not topped up; clear the index before re-running in that case.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    vector_store = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    # Index already holds vectors: attach to it instead of uploading again.
    vector_store = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [17]:
from langchain_pinecone import PineconeVectorStore

# Attach to the already-created index without uploading any documents.
doc_search = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=index_name)

In [18]:
# Display the vector store object to confirm it was created.
doc_search

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2561e9986d0>

In [19]:
# Expose the vector store as a similarity retriever configured with k=3.
# NOTE(review): the name is spelled "retriver" and is referenced by later cells;
# rename every occurrence together if correcting it to "retriever".
retriver = doc_search.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [21]:
# Smoke-test retrieval with a sample question and inspect the returned documents.
retriver.invoke("What is acne?")

[Document(id='dacd4341-76aa-4019-aa87-2433620153cf', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='eb3f5600-f3b8-4200-b23c-067bc1cf3d32', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM 

In [27]:
from langchain_groq import ChatGroq

# Chat model served by Groq, authenticated with the key read from the environment.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.6, groq_api_key=GROK_API_KEY)

In [28]:
# Ask the model directly, without retrieved context, as a baseline for the RAG answer.
llm.invoke("What is acne?")

AIMessage(content="Acne is a common skin condition that occurs when the pores on the skin become clogged with dead skin cells, oil, and bacteria. It can cause a range of symptoms, including:\n\n1. **Pimples**: Small, red, and inflamed bumps on the skin, often filled with pus.\n2. **Blackheads**: Small, dark spots on the skin, caused by clogged pores.\n3. **Whiteheads**: Small, white bumps on the skin, caused by clogged pores.\n4. **Cysts**: Large, painful bumps under the skin, filled with pus.\n5. **Nodules**: Large, painful bumps under the skin, often inflamed.\n\nAcne can occur anywhere on the body, but it's most common on the:\n\n1. Face (especially the forehead, nose, and chin)\n2. Chest\n3. Back\n4. Shoulders\n\nAcne is caused by a combination of factors, including:\n\n1. **Overproduction of sebum**: The skin's oil glands produce too much oil, which can clog pores.\n2. **Dead skin cells**: Dead skin cells can accumulate and clog pores.\n3. **Bacteria**: A type of bacteria called P

In [36]:
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# System message for the RAG chain. "{context}" is the placeholder that the
# stuff-documents chain fills with the retrieved chunks.
# The text is assembled from adjacent string literals so that no stray quote
# character or trailing whitespace ends up in the prompt sent to the model.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions followed by the user's
# question, which is substituted for "{input}".
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

In [37]:
# Chain that places the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: retrieve documents for the input, then answer from them.
rag_chain = create_retrieval_chain(retriver, question_answer_chain)

In [38]:
# Run the whole pipeline on a sample question and show only the generated answer.
response = rag_chain.invoke({"input": "What is acne?"})
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne is also known as acne vulgaris, and it is the most common skin disease, affecting nearly 17 million people in the United States.
