In [35]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [36]:
def load_pdf():
    loader = DirectoryLoader(
        '../data',
        glob='*.pdf',
        loader_cls=PyPDFLoader
    )   
    documents = loader.load()
    return documents

In [37]:
book = load_pdf()


In [38]:

from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(Document(page_content=doc.page_content, metadata={"source": src}))
    return minimal_docs


In [39]:
doc = filter_to_minimal_docs(book)

In [40]:
def text_split(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,

    )
    text = text_splitter.split_documents(documents)
    return text

In [41]:
text_chunks = text_split(doc)
print(f"Total number of text chunks: {len(text_chunks)}")

Total number of text chunks: 5859


In [42]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings_model():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings_model= HuggingFaceEmbeddings(
    model_name=model_name
    )
    return embeddings_model
    

In [43]:
embeddings_model = download_embeddings_model()

In [44]:
q1 = embeddings_model.embed_query("Hello")

In [45]:
from dotenv import load_dotenv
import os
load_dotenv()


True

In [46]:
pinecone = os.getenv('PINECONE_API_KEY')
os.environ["PINECONE_API_KEY"] = pinecone

In [47]:
from pinecone import Pinecone
pinecone_api_key = pinecone
pc = Pinecone(api_key=pinecone_api_key)

In [48]:
pc

<pinecone.pinecone.Pinecone at 0x2120a49ce30>

In [49]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,
        metric="cosine",
        spec = ServerlessSpec(cloud = "aws", region = "us-east-1")
        )
index = pc.Index(index_name)


In [None]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
                documents= text_chunks,
                embedding=  embeddings_model,
                index_name=index_name
            )

## If an index is laread available then you can load it in the following way

In [51]:
docsearch = PineconeVectorStore.from_existing_index(
    index_name= index_name,
    embedding=embeddings_model
)

## Add more data to the index Use this:
    doc = Document(
        page_content = "Something",
        metadata = {"source": "Something"}
    )
    dosearch.add_documents(documets = [doc])

In [84]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [88]:
ret_docs = retriever.invoke("What is Anthrax?")

In [89]:
ret_docs

[Document(id='ee75da72-2b5f-450a-b7ab-8ce20c85b74b', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='Anthrax\nDefinition\nAnthrax is a bacterial infection caused by Bacillus\nanthracis that primarily affects livestock but that can\noccasionally spread to humans, affecting either the skin,\nintestines, or lungs. In humans, the infection can often be\ntreated, but it is almost always fatal in animals.\nDescription\nAnthrax is most often found in the agricultural\nareas of South and Central America, southern and east-\nern Europe, Asia, Africa, the Caribbean, and the Middle'),
 Document(id='3b6f15d0-26a8-4351-91dc-4a178b0dc8c4', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='Avenue Appia 20, 1211 Geneva 27, Switzerland. (+00 41\n22) 791 21 11. <http://www.who.int>.\nGALE ENCYCLOPEDIA OF MEDICINE 2 225\nAnthrax\nGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 225'),
 Document(id='0ac46b11-dc2a-48f0-b345-925bd57a056e', metadata={'source': '..\\data\\Medical

In [97]:
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
# Set your key
groq_api_key = os.environ.get("GROQ_API_KEY")
os.environ['GROQ_API_KEY'] = groq_api_key


def generation(query, retriever):
    llm = ChatGroq(
            model_name="llama-3.3-70b-versatile",
            temperature=0.5
        )

    docs = retriever.invoke(query)
    context = "\n\n".join(d.page_content for d in docs)

    system_prompt = SystemMessagePromptTemplate.from_template(
    "You are a helpful medical assistant." 
    "Based only on the context below, answer the question concisely and clearly."
    "If you do not have enough information, say so. Use three sentence maximum"
    "and keep the answer consice."
    "\n\n"
    "{context}"
    )

    user_prompt = HumanMessagePromptTemplate.from_template(
    "Question: {query}")

    prompt = ChatPromptTemplate.from_messages([system_prompt , user_prompt])


    response = llm.invoke(prompt)


    return response.content
response = generation("What is anthrax?", retriever)

ValueError: Invalid input type <class 'langchain_core.prompts.chat.ChatPromptTemplate'>. Must be a PromptValue, str, or list of BaseMessages.

In [95]:
print(response)

Anthrax is a bacterial infection caused by Bacillus anthracis that primarily affects livestock but can occasionally spread to humans. It can affect the skin, intestines, or lungs in humans, and while it can often be treated, it is almost always fatal in animals. The infection is characterized by a coal-black sore and can be diagnosed through blood, skin, or respiratory secretion samples.
