In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_file(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``. Only entries
            matching the ``*.pdf`` glob are read, each through
            ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
# Load the PDFs from the relative path "../data".
extracted_data = load_pdf_file(data="../data")

In [None]:
# Peek at the first five loaded documents, then count all of them.
# NOTE(review): a notebook cell renders only its final expression, so the
# slice below is evaluated but not shown; only the length is displayed.
extracted_data[:5]
len(extracted_data)

In [None]:
from typing import List
from langchain.schema import Document
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build fresh Document objects that keep each input's page_content and
    reduce its metadata to a single 'source' entry.

    The 'source' value is read with ``metadata.get``, so it is None when an
    input document has no such key. The input list itself is not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

# Reduce every loaded document to its text plus the 'source' metadata key.
minimal_docs = filter_to_minimal_docs(extracted_data) 

In [None]:
# Show the first six trimmed documents.
minimal_docs[:6]

In [None]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [None]:
# Chunk the trimmed documents and report how many chunks were produced.
text_chunks = text_split(minimal_docs)
print(f"Number of chunks: {len(text_chunks)}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
def download_embeddings():
    """Create the HuggingFace embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        'sentence-transformers/all-MiniLM-L6-v2' (this model returns
        384-dimensional vectors).
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [None]:
# Instantiate the embedding model wrapper once; later cells reuse this object.
embeddings = download_embeddings()

In [None]:
# Display the embeddings object to inspect its configuration.
embeddings

In [None]:
from dotenv import load_dotenv
import os
# Runs before the key lookup below.
# NOTE(review): presumably this loads variables from a local .env file into
# the process environment — confirm the file location.
load_dotenv() 
from pinecone import Pinecone
# os.getenv yields None when PINECONE_API_KEY is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Lower-case alias of the same value, passed to the client constructor.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key) # to authenticate with Pinecone account

In [None]:
# Display the Pinecone client object.
pc

In [None]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"

# Create the serverless index only when it does not exist yet, so this cell
# can be re-run without attempting a second creation.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # same as the 384-dimension output noted for the embedding model
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, whether it was just created or already existed.
index = pc.Index(index_name)
    

In [None]:
from langchain_pinecone import PineconeVectorStore

# ----------------------------------------------------------------------
# --- INITIAL DATA INGESTION: ONLY execute this block the FIRST time ---
# This step creates vector embeddings from your documents (text_chunks)
# and uploads them to the Pinecone index (the persistent vector database).
# Once the documents have been uploaded, keep this block commented out so
# that re-running the notebook does not upload them a second time.
# ----------------------------------------------------------------------

# docsearch = PineconeVectorStore.from_documents(
#     documents=text_chunks,
#     embedding=embeddings,
#     index_name=index_name,
# )

In [None]:
# Attach a vector store to the existing Pinecone index named by index_name.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [None]:
# A single hand-written Document used to try out adding to the vector store.
dummy_docs = Document(
    metadata={"source": "Youtube"},
    page_content="The Phase 3 trial of 'Medication X' demonstrated an 85% efficacy rate, leading to its new classification as a first-line treatment for Condition Z.",
)

In [None]:
# Insert the sample document under a fixed id. Without an explicit id, every
# run of this cell would store the same text again under a newly generated
# id; with a fixed id a re-run overwrites the earlier vector instead, so the
# cell stays idempotent under Restart & Run All.
docsearch.add_documents(
    documents=[dummy_docs],
    ids=["dummy-medication-x"],
)

In [None]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
# Run one sample query through the retriever.
retrieved_docs = retriever.invoke("What is Acne")

In [None]:
# Three results are expected because k = 3; the next step refines these
# results with an LLM.
retrieved_docs

In [None]:
from langchain_openai import ChatOpenAI
# Fail fast with a clear message when the key is absent: assigning None into
# os.environ would otherwise raise an opaque "str expected" TypeError.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file "
        "before running this cell."
    )

# NOTE(review): "OPEN_API_KEY" is not the variable langchain_openai reads by
# default (that is OPENAI_API_KEY, which is already present in the environment
# at this point). The alias is kept only so that anything already reading it
# keeps working; drop it once no reader remains.
os.environ["OPEN_API_KEY"] = OPENAI_API_KEY

# OPENAI_KEY_VALUE = os.environ.get('OPEN_API_KEY') # Retrieve your custom variable

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
# System instructions for the model; "{context}" is the placeholder that the
# documents chain fills with the retrieved text.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message template: the system instructions above, then the user's
# question supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

chatModel = ChatOpenAI(model="gpt-4o")

# First chain answers from supplied documents; the second wraps it so the
# documents come from the retriever.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [None]:
# Ask the chain one question; the generated text is read from the "answer"
# key of the result.
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism"})
print("Response : ", response["answer"])