In [1]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore  
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [56]:
pip install langchain-openai




In [8]:
# Extract text from PDF files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    documents = loader.load()
    return documents

In [9]:
extracted_data=load_pdf_files("../data")

In [10]:
extracted_data[10]

Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': '..\\data\\Medical_book.pdf', 'total_pages': 637, 'page': 10, 'page_label': '11'}, page_content='Rhonda Cloos, R.N.\nMedical Writer\nAustin, TX\nGloria Cooksey, C.N.E\nMedical Writer\nSacramento, CA\nAmy Cooper, M.A., M.S.I.\nMedical Writer\nVermillion, SD\nDavid A. Cramer, M.D.\nMedical Writer\nChicago, IL\nEsther Csapo Rastega, R.N., B.S.N.\nMedical Writer\nHolbrook, MA\nArnold Cua, M.D.\nPhysician\nBrooklyn, NY\nTish Davidson, A.M.\nMedical Writer\nFremont, California\nDominic De Bellis, Ph.D.\nMedical Writer/Editor\nMahopac, NY\nLori De Milto\nMedical Writer\nSicklerville, NJ\nRobert S. Dinsmoor\nMedical Writer\nSouth Hamilton, MA\nStephanie Dionne, B.S.\nMedical Writer\nAnn Arbor, MI\nMartin W. Dodge, Ph.D.\nTechnical Writer/Editor\nCentinela Hospital and Medical\nCenter\nInglewood, CA\nDavid Doermann\nMedical

In [11]:
len(extracted_data)

637

In [12]:
from typing import List
from langchain.schema import Document
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """
    minimal_docs: list[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src},
            )
        )
    
    return minimal_docs

In [13]:
minimal_docs=filter_to_minimal_docs(extracted_data)

In [17]:

# Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
    )
    texts_chunks= text_splitter.split_documents(minimal_docs)
    return texts_chunks

In [65]:
texts_chunks = text_split(minimal_docs)
print(f"Number of chunks:{len(texts_chunks)}")

Number of chunks:5859


In [32]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [67]:
pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [20]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings


In [21]:
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [70]:
query_text = "What is a common allergy?"
embedding_vector = embeddings.embed_query(query_text)

In [71]:
print("Sample of the embedding vector (first 20 numbers):")
print(embedding_vector[:20])

Sample of the embedding vector (first 20 numbers):
[0.04702260345220566, -0.006634933408349752, -0.01139666885137558, -0.031360842287540436, 0.033007990568876266, 0.03826579824090004, 0.07354530692100525, 0.08192341774702072, -0.08451011031866074, 0.027469314634799957, 0.0592004731297493, -0.06414079666137695, -0.026036029681563377, 0.05833462253212929, -0.04474414139986038, 0.06623373180627823, -0.011161021888256073, -0.023217888548970222, 0.014017648063600063, -0.05721267685294151]


In [22]:
import os
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [23]:
index_name = "medical-chatbot"

In [74]:
from langchain_pinecone import PineconeVectorStore

In [None]:
vector_store = PineconeVectorStore.from_documents(
    texts_chunks,
    embeddings,
    index_name=index_name
)

TypeError: VectorStore.from_documents() missing 1 required positional argument: 'embedding'

In [25]:
# --- Use this code EVERY time you restart your script or notebook ---

print("Connecting to existing index in Pinecone...")
vector_store = PineconeVectorStore.from_existing_index(
    index_name,
    embeddings
)
print("Connection successful!")

Connecting to existing index in Pinecone...
Connection successful!


In [27]:
pip install pinecone

Note: you may need to restart the kernel to use updated packages.


In [26]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [27]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='1811b710-fbe5-43b2-8394-d83e0536b4c5', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='d4a1434f-3129-432c-a390-1e9117f54591', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='ea02179f-578f-49a5-b56d-a4bcaa38bdc4', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin becom

In [28]:
prompt_template = """
You are a helpful and knowledgeable medical assistant. 
Use the retrieved context below to answer the user’s medical question. 
If the answer is not in the context, say "I don’t know." 
Keep your response clear, factual, and under three sentences. 
Do not provide personal medical advice, diagnoses, or treatment recommendations. 

Context: {context}
Question: {question}

Helpful answer:
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

In [6]:
# +++ NEW CODE TO ADD +++
from langchain_huggingface import HuggingFaceEndpoint

print("Connecting to Hugging Face Inference API...")

# Define the model you want to use from the Hub
repo_id = "microsoft/phi-3-mini-4k-instruct"

# Create the llm object that connects to the Inference API
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=512,
    temperature=0.8,
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
)

Connecting to Hugging Face Inference API...


In [29]:
rag_chain= RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={'k': 3}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [None]:
import os
os.environ["LANGCHAIN_TRACING_V2"] = "false"

In [35]:
# ---- Run a query ----
query = "What are Allergies"
result =rag_chain({"query": query})
print("Final Answer:", result["result"])
print("\nSource Documents:", result["source_documents"])

StopIteration: 

In [None]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

In [None]:
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

In [None]:
response = rag_chain.invoke({"input": "what is the Treatment of Acne?"})
print(response["answer"])