In [None]:
# Scratch cell: prints a single dot (presumably a kernel sanity check; safe to remove).
print(".")

Pipeline: PDF loader -> text splitter -> embeddings -> vector database

In [None]:
# Show the kernel's current working directory.
%pwd

In [None]:
import os

# Enter the project folder only when the kernel is not already inside it.
# An unconditional relative chdir raises FileNotFoundError when this cell is
# re-run, because the path is then resolved from inside "medical-chatbot".
if os.path.basename(os.getcwd()) != "medical-chatbot":
    os.chdir("medical-chatbot")

In [None]:
# Re-check the working directory after the os.chdir call.
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_documents(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files matching
            ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [None]:
# Read every PDF under the relative "data/" folder.
extracted_text = load_documents("data/")

In [None]:
# Preview only the first few loaded documents; echoing the whole list floods
# the notebook output with the full text of every document.
extracted_text[:3]

In [None]:
# Peek at the second loaded document with its line breaks flattened to spaces.
" ".join(extracted_text[1].page_content.split("\n"))

In [None]:
def textsplitter(text, chunk_size=1000, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        text: The documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            1000, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 100.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(text)

In [None]:
# Split the loaded documents into chunks for embedding.
chunks = textsplitter(extracted_text)

In [None]:
# Preview only the first few chunks; echoing the whole list floods the
# notebook output with every chunk's text.
chunks[:3]

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_HF_Embeddings():
    """Build the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_HF_Embeddings()

In [None]:
# Embed a sample sentence, then print the vector followed by its length
# (the length is the dimension the vector index has to be created with).
query = embeddings.embed_query("My name is Muhammad Hashir!")
print(query, len(query), sep="\n")

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the
# API keys can be read with os.environ below.
load_dotenv()

In [None]:
# Read both API keys from the environment; each is None when the variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")


In [None]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
index_name = "medicalbot"

# Create the serverless index only when it does not exist yet. An
# unconditional create_index call fails on every run after the first,
# which breaks "Restart Kernel -> Run All".
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the length of the embedding vectors
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
# Re-export the keys for libraries that read them from the environment.
# os.environ only accepts strings, so a key that was never loaded (None)
# would otherwise surface as an opaque TypeError; fail with a clear message.
for _name, _value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GEMINI_API_KEY", GEMINI_API_KEY),
):
    if _value is None:
        raise EnvironmentError(
            f"{_name} is not set; define it in the environment or in the .env file."
        )
    os.environ[_name] = _value

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and store the vectors in the Pinecone index.
# NOTE(review): re-running this cell presumably inserts the chunks again;
# confirm before running it more than once against the same index.
docsearch = PineconeVectorStore.from_documents(
    chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Attach to the already-populated index. This rebinds `docsearch`, so it can
# be used instead of the from_documents cell when the index has data.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [None]:
# Similarity-search retriever that returns the top 3 matches per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Run the retriever on its own with a sample question.
retrieve_docs = retriever.invoke("what is skin")

In [None]:
# Show the retrieved documents (at most 3, per the retriever's k setting).
retrieve_docs

In [None]:
import getpass
import os

# Provide the Gemini key under the GOOGLE_API_KEY name unless it is already set.
# When GEMINI_API_KEY was not loaded, prompt for the key instead of assigning
# None, which os.environ rejects with a TypeError.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY or getpass.getpass(
        "Enter your Google AI API key: "
    )

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used to generate the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.4,
    max_tokens=1000,
    max_retries=2,
    timeout=None,
)

In [None]:
# Call the model directly, without any retrieved context.
llm.invoke("What is acne")

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [None]:
# System instructions for the model. "{context}" is the placeholder for the
# retrieved documents (presumably filled in by the stuff-documents chain).
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# from_messages is the documented constructor and also works on langchain-core
# releases that do not accept a positional message list in __init__.
# "{input}" receives the user's question when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Chain that places the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: retrieve documents for the input, then answer from them.
# Bound to `rag_chain`, the name the following cells invoke (it was
# previously misspelled `drag_chain`, which caused a NameError there).
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask a medical question through the RAG chain and show only the answer text.
response = rag_chain.invoke({"input":"What is Acne"})
response["answer"]


In [None]:
# Ask a non-medical question; presumably a check that the model follows the
# system prompt and says it doesn't know when the context has no answer.
response = rag_chain.invoke({"input":"What is data science"})
response["answer"]
