In [62]:

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [63]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found under the directory ``data``.

    A ``DirectoryLoader`` selects files matching ``*.pdf`` and reads each of
    them with ``PyPDFLoader``. The result of ``loader.load()`` is returned.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [65]:
# Raw string literal: the Windows path contains backslash sequences (\P, \G, \D)
# that are not valid escapes in a normal string and trigger an invalid-escape
# warning. The resulting path value is unchanged.
extracted_data = load_pdf_file(data=r'D:\Projects\GenAi_Medical_Chatbot\Data')

In [66]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of
            ``load_pdf_file``).
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 20,
            the value previously hard-coded here.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [67]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))


Length of Text Chunks 5860


In [68]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_hugging_face_embeddings():
    """Create the embedding model used throughout this notebook.

    Returns a ``HuggingFaceEmbeddings`` instance configured for
    ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [69]:
# Instantiate the embedding model once; later cells pass `embeddings` to the
# Pinecone vector store.
embeddings = download_hugging_face_embeddings()

In [70]:
# Sanity check: embed a short query and report the vector dimensionality.
# This goes through the `embeddings` object built in the previous cell; the
# name `model` used here before is not defined in any visible cell, so the
# check failed with NameError on a fresh kernel.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [108]:
import os
from dotenv import load_dotenv
# Load variables from the .env file into the process environment.
load_dotenv()

# Each lookup yields None when the variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# Do not print these values: cell outputs are saved with the notebook.

In [73]:
import os
from pinecone import Pinecone  # Updated import for Pinecone 6.0.2
from tqdm.auto import tqdm

# Name of the existing Pinecone index this notebook works against.
index_name = "medibot"

# Pinecone client, created by instantiating the class with the API key
# (the 6.0.2-style initialisation).
pc = Pinecone(api_key=PINECONE_API_KEY)

# Handle to the existing index.
index = pc.Index(index_name)

In [74]:
# Show the number of text chunks.
print(len(text_chunks))

5860


In [None]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
#
# The recorded run below took several minutes, so the call is guarded by a flag
# rather than disabled by wrapping the whole cell in a string literal. With the
# flag left at False, running this cell only defines the function.
RUN_UPSERT = False  # set to True to (re)populate the index


def embed_and_upsert(text_chunks, embeddings_model, batch_size=100):
    """Embed document chunks and upsert them into the Pinecone ``index``.

    Args:
        text_chunks: Sequence of documents exposing ``page_content`` and a
            ``metadata`` mapping.
        embeddings_model: Object providing ``embed_documents(texts)``.
        batch_size: Number of chunks embedded and upserted per request.
            Must be positive. Defaults to 100.

    Returns:
        A status message stating how many chunks were processed.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    for start in tqdm(range(0, len(text_chunks), batch_size)):
        # Get the batch of documents and the texts to embed
        batch = text_chunks[start:start + batch_size]
        texts = [doc.page_content for doc in batch]

        # Generate embeddings
        embeds = embeddings_model.embed_documents(texts)

        # Create records in the format expected by Pinecone 6.0.2.
        # IDs are derived from the chunk's position in `text_chunks`.
        records = [
            {
                "id": f"doc_{start + offset}",
                "values": vector,
                "metadata": {
                    "text": doc.page_content,
                    "source": doc.metadata.get("source", ""),
                    "page": doc.metadata.get("page", 0),
                },
            }
            for offset, (doc, vector) in enumerate(zip(batch, embeds))
        ]

        # Upsert to Pinecone
        index.upsert(vectors=records)

    return f"Successfully embedded and upserted {len(text_chunks)} document chunks to Pinecone"


if RUN_UPSERT:
    result = embed_and_upsert(text_chunks, embeddings)
    print(result)

100%|██████████| 59/59 [05:46<00:00,  5.88s/it]

Successfully embedded and upserted 5860 document chunks to Pinecone





In [75]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Connect to the existing Pinecone index as a LangChain vector store, using the
# same embedding model for queries.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)


In [76]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [77]:
# Smoke-test the retriever on a sample question before wiring it into a chain.
retrieved_docs = retriever.invoke("What is Dandruff?")

In [78]:
# Display the retrieved documents (bare expression -> cell output).
retrieved_docs

[Document(id='doc_1223', metadata={'page': 139.0, 'source': 'D:\\Projects\\GenAi_Medical_Chatbot\\Data\\Medical_book.pdf'}, page_content='its own tissues.\nChemotherapy—The treatment of diseases, usual-\nly cancer, with drugs (chemicals).\nHair follicles—Tiny organs in the skin, each one of\nwhich grows a single hair.\nLupus erythematosus —An autoimmune disease\nthat can damage skin, joints, kidneys, and other\norgans.\nRingworm—A fungal infection of the skin, usually\nknown as tinea corporis.\nSystemic—Affecting all or most parts of the body.\ntime, minoxidil produces satisfactory results in about one'),
 Document(id='doc_1219', metadata={'page': 138.0, 'source': 'D:\\Projects\\GenAi_Medical_Chatbot\\Data\\Medical_book.pdf'}, page_content='Alopecia\nTop of balding male’s head.(Photograph by Kelly A. Quin.\nReproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 125'),
 Document(id='doc_1215', metadata={'page': 138.0, 'source': 'D:\\Projects\\GenAi_Medical_Chatbot\\D

In [115]:
#from langchain_openai import OpenAI

#llm = OpenAI(model="gpt-3.5-turbo", temperature=0.4, max_tokens=500)

from langchain_community.llms import HuggingFaceHub

# LLM used to generate answers: the google/flan-t5-base repo accessed through
# HuggingFaceHub with the token loaded from the environment.
llm = HuggingFaceHub(
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    repo_id="google/flan-t5-base",
    model_kwargs=dict(temperature=0.4, max_length=1024),
)


In [116]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System prompt for the question-answering chain.
# Adjacent string literals are joined with no separator, so every fragment that
# continues a sentence must end with a space (the earlier version produced
# "say thatSorry" and "keywords.Use"). `{context}` is the only template
# placeholder in this string.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say: "
    "'Sorry, that information is not available in the material provided. "
    "Try with different keywords.' "
    "Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions followed by the user's
# question, which is supplied through the `input` variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [117]:
# Chain that feeds the prompt (with documents inserted) to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve documents, then hand them to the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [122]:
# Run the RAG chain on a sample question and print only the "answer" field of
# the response.
# NOTE(review): "diziness" looks like a misspelling of "dizziness" — confirm
# whether the misspelled query is intentional.
response = rag_chain.invoke({"input": "what is diziness?"})
print(response["answer"])



hyperactivity
