In [None]:
# Smoke test: confirms the kernel is up and executing cells.
print("ok")

In [None]:
import os

In [None]:
# Show the current working directory before changing it.
%pwd

In [None]:
# Move up one level so that relative paths such as "Data/" resolve.
# Guarded on the presence of "Data" so that re-running this cell does not
# climb another directory level each time it is executed.
if not os.path.isdir("Data"):
    os.chdir("../")

In [None]:
# Show the working directory again to confirm the directory change.
%pwd

In [None]:
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment first;
# without this call the key is only found if it was already exported in the
# environment that launched the kernel.
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_file(data):
    """Load the PDF files found in the ``data`` directory.

    Args:
        data: Path of the directory to scan for PDF files.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        of which is parsed with ``PyPDFLoader``.
    """
    # The dot matters: a bare "*pdf" pattern would also match any file whose
    # name merely ends in "pdf", not only files with the .pdf extension.
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return loader.load()

In [None]:
# Read every PDF under the project's Data/ folder.
extracted_text = load_pdf_file(data="Data/")

In [None]:
# Preview only the first few loaded documents; echoing the whole list
# floods the cell output and bloats the saved notebook.
extracted_text[:3]

In [None]:
def text_split(extracted_text):
    """Break the loaded documents into smaller, slightly overlapping chunks.

    Builds a ``RecursiveCharacterTextSplitter`` with ``chunk_size=500`` and
    ``chunk_overlap=20`` and returns what its ``split_documents`` produces
    for ``extracted_text``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=500)
    return splitter.split_documents(extracted_text)

In [None]:
# Split the loaded documents into chunks for embedding.
chunked_text = text_split(extracted_text)

# Number of chunks produced (displayed as the cell output).
len(chunked_text)

In [None]:
# Preview only the first few chunks; echoing the whole list floods the
# cell output and bloats the saved notebook.
chunked_text[:3]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
def download_hugging_face_embidding():
    """Create and return the HuggingFace embedding model used by this notebook."""
    model_id = "sentence-transformers/all-MiniLM-l6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [None]:
# Instantiate the embedding model once; it is reused for indexing and retrieval.
embedding = download_hugging_face_embidding()

In [None]:
from pinecone import Pinecone,ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean slate, but only delete when the index actually exists:
# an unconditional delete raises on a first run, before "medibot" has ever
# been created.
if "medibot" in pc.list_indexes().names():
    pc.delete_index("medibot")

In [None]:
index_name = "medibot"

# Create a serverless index for the chunk embeddings. `dimension` has to
# equal the length of the vectors produced by the embedding model.
pc.create_index(
    name=index_name,
    spec=ServerlessSpec(region="us-east-1", cloud="aws"),
    metric="cosine",
    dimension=384,
)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks and store them in the Pinecone index.
docsearch = PineconeVectorStore.from_documents(
    index_name=index_name, embedding=embedding, documents=chunked_text
)

In [None]:
# Open a vector-store handle on the already-populated index.
dbsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding, index_name=index_name
)

In [None]:
# Similarity-search retriever configured with k=3.
retriever = dbsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [None]:
# Sanity-check the retriever on a sample question before wiring up the LLM.
retrieved_docs = retriever.invoke("What is Acne?")

In [None]:
# Inspect what the retriever returned for the sample question.
retrieved_docs

In [None]:
from langchain_ollama import OllamaLLM
# Sampling options are fields of OllamaLLM and must be passed directly.
# Wrapping them in a `configurable` dict (and spelling the token limit
# "num_predicts") meant neither setting was applied to the model.
llm = OllamaLLM(
    model="deepseek-r1:1.5b",
    stop=["<|eot_id|>"],
    temperature=0.6,
    num_predict=500,
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the assistant, followed by the retrieved context.
# The context placeholder is set off by a blank line and a label; when it was
# concatenated directly, the retrieved text ran straight into the final
# sentence of the instructions.
system_prompt = (
    """You are a highly knowledgeable and professional medical assistant. Your role is to provide accurate and reliable medical information based on the content of the provided medical PDF. You should analyze the user’s query and generate a response only if the relevant information exists in the PDF.

Guidelines:
1. **Medical Questions Only:** Answer only if the query is related to medical topics such as diseases, symptoms, treatments, medications, diagnostics, anatomy, physiology, or any other medical subject found in the PDF.
2. **Reference-Based Answers:** Ensure all responses are based on the content of the provided PDF. If the information is not found in the document, respond with:  
   *"I'm sorry, but I couldn't find relevant information in the provided medical document."*
3. **No Speculative or Personal Advice:** Do not provide personal medical advice, diagnoses, or suggest treatments. Always recommend consulting a healthcare professional.
4. **Reject Non-Medical Queries:** If the user asks about unrelated topics (e.g., politics, sports, technology), respond with:  
   *"I'm designed to provide only medical information. Please ask a medical-related question."*
5. **Concise and Professional Tone:** Use clear, professional, and easy-to-understand medical language while avoiding unnecessary complexity.

If the PDF contains relevant medical images, tables, or figures, describe them clearly in your response."""
"\n\nContext:\n{context}"
)

# Two-message chat template: the system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [None]:
# Chain that places the retrieved documents into the prompt and calls the LLM.
qna_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve documents first, then hand them to the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qna_chain,
)

In [None]:
import re

In [None]:
# Run the full pipeline on a sample question; fall back to "" if no answer key.
response = rag_chain.invoke({"input": "What is Acne? "})
raw_answer = response.get("answer", "")

# Drop any <think>...</think> block from the raw output. DOTALL lets the
# pattern span multiple lines.
clean_answer = re.sub(
    pattern=r"<think>.*?</think>",
    repl="",
    string=raw_answer,
    flags=re.DOTALL,
).strip()

print(clean_answer)