In [0]:
# 1. Uninstall any already-installed copies of the packages this notebook manages,
#    so the installs below do not mix with conflicting versions.
%pip uninstall -y langchain langchain-community databricks-langchain databricks-sql-connector sentence-transformers faiss-cpu huggingface_hub pypdf unstructured transformers

# 2. Install the packages the notebook needs. langchain and langchain-community are
#    pinned exactly, huggingface_hub and transformers are range-constrained, and the
#    remaining packages are unpinned.
#    NOTE(review): consider pinning the unpinned packages for reproducible runs.
%pip install langchain==0.1.16 langchain-community==0.0.33 sentence-transformers faiss-cpu "huggingface_hub>=0.21.0" "transformers>=4.38.2,<4.58.0" pypdf unstructured
# PyMuPDF provides the `fitz` module imported later for PDF text extraction.
%pip install pymupdf

In [0]:
# Restart the Python process so the packages installed above are the ones imported below.
dbutils.library.restartPython()

In [0]:

##### BeautifulSoup Webscraper used to read html response ######
import requests
from bs4 import BeautifulSoup

url = "https://www.who.int/news-room/fact-sheets/detail/headache-disorders"

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on HTTP errors so an error page is never indexed as medical content.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML and keep only its text content.
soup = BeautifulSoup(response.text, "html.parser")
text = soup.get_text()

In [0]:
###### Create document object with retrieved text and metadata addition ####
from langchain_core.documents import Document

# Wrap the scraped page text in a single Document that records its URL as the source,
# then expose it as a one-element list so it can be concatenated with other documents.
web_page_document = Document(
    page_content=text,
    metadata={"source": url},
)
documents1 = [web_page_document]
print(documents1)

In [0]:
%skip
####### FOR STRUCTURED PDF   #########

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load each PDF with PyPDFLoader.
migraine_pdf = "/Volumes/workspace/llm/medical_faq_bot/Migraine Headache - ed-migraine_headache.pdf"
dengue_pdf = "/Volumes/workspace/llm/medical_faq_bot/dengue_06092018.pdf"

loader = PyPDFLoader(migraine_pdf)
mig_documents = loader.load()
loader = PyPDFLoader(dengue_pdf)
dengue_documents = loader.load()

# Tag every loaded document with the topic of the PDF it came from.
for topic, topic_documents in (("migraine", mig_documents), ("dengue", dengue_documents)):
    for doc in topic_documents:
        doc.metadata["topic"] = topic

# Combined corpus: migraine documents first, then dengue documents.
documents = mig_documents + dengue_documents


In [0]:
##### FOR UNSTRUCTURED PDFs #####
import re
from langchain.schema import Document
import fitz  # PyMuPDF

def clean_text(text):
    """Normalize bullet markers and blank-line runs in extracted text.

    Args:
        text: The text to clean.

    Returns:
        The text with bullet markers replaced by "-" and every
        newline / optional-whitespace / newline run collapsed to one blank line.
    """
    # Normalize bullet points. Match the real bullet character (U+2022) as well
    # as the three-character mojibake it becomes when its UTF-8 bytes are
    # mis-decoded as Windows-1252. Escapes keep the pattern independent of how
    # this notebook file itself is encoded.
    text = re.sub(r'\u00e2\u20ac\u00a2|\u2022', '-', text)
    # Collapse runs of blank / whitespace-only lines into a single blank line.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text



def extract_text_from_pdf(pdf_path, verbose=True):
    """Read a PDF with PyMuPDF and return one cleaned Document per page.

    Args:
        pdf_path: Path of the PDF file to open.
        verbose: When True (the default), print each page's raw text as it is read.

    Returns:
        A list of Document objects in page order. Each holds the page text
        after clean_text() and metadata with the 1-based "page" number and the
        "source" path of the PDF.
    """
    documents = []
    # The context manager closes the PDF once every page has been read.
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            page = doc.load_page(page_index)
            # "text" selects PyMuPDF's plain-text output. Extract once and reuse
            # the result for both the preview and the Document.
            page_text = page.get_text("text")
            if verbose:
                print(f"--- Page {page_index + 1} ---\n{page_text}\n")
            documents.append(
                Document(
                    page_content=clean_text(page_text),
                    # "source" mirrors the key used for the scraped web document.
                    metadata={"page": page_index + 1, "source": str(pdf_path)},
                )
            )
    return documents

In [0]:
### Took sample migraine and dengue PDF documents to provide as inputs to the model (RAG)
# Locations of the two sample PDFs.
pdf_path_migraine = "/Volumes/workspace/llm/medical_faq_bot/Migraine Headache - ed-migraine_headache.pdf"
pdf_path_dengue = "/Volumes/workspace/llm/medical_faq_bot/dengue_06092018.pdf"

# Migraine material = pages of the migraine PDF followed by the scraped web document.
migraine_pdf_pages = extract_text_from_pdf(pdf_path_migraine)
migraine_docs = migraine_pdf_pages + documents1
dengue_docs = extract_text_from_pdf(pdf_path_dengue)

# Full corpus: migraine material first, then dengue.
documents = [*migraine_docs, *dengue_docs]

In [0]:
### Break large documents into smaller, overlapping chunks: the text is split into chunks of 500 characters, with 50 characters of overlap between consecutive chunks to preserve context ###


from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking parameters passed to the splitter.
chunk_size_setting = 500
chunk_overlap_setting = 50

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size_setting,
    chunk_overlap=chunk_overlap_setting,
)

# Split every collected document into chunks.
docs = splitter.split_documents(documents)

In [0]:
### Embeddings are created with HuggingFaceEmbeddings; a FAISS vector store (through LangChain) is used to store them ###

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Sentence-transformers model used to embed each chunk.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the FAISS store from the chunked documents and the embedding model.
vector_store = FAISS.from_documents(docs, embedding)

In [0]:
%skip
-- Turn on the Delta change data feed table property for workspace.llm.medical_docs.
-- NOTE(review): presumably a prerequisite for the Delta Sync index built from this table in a later cell; confirm.
ALTER TABLE workspace.llm.medical_docs
SET TBLPROPERTIES (delta.enableChangeDataFeed = true)

In [0]:
%skip
from databricks_langchain import DatabricksEmbeddings, DatabricksVectorSearch
from databricks.vector_search.client import VectorSearchClient

# Embedding client that calls the named Databricks serving endpoint.
embedding = DatabricksEmbeddings(
    endpoint="bge-large-en-endpoint"
)

vsc = VectorSearchClient()

# Keep the source table and the index as two distinct names and reuse each one,
# so the index created below is the same index the vector store wrapper opens.
# (Previously index_name held the source table name while the index was created
# under the "_v2" name.)
source_table_name = "workspace.llm.medical_docs"
index_name = "workspace.llm.medical_docs_v2"

vsc.create_delta_sync_index(
    source_table_name=source_table_name,
    index_name=index_name,
    primary_key="id",
    pipeline_type="TRIGGERED",  # Use CONTINUOUS or TRIGGERED
    # NOTE(review): endpoint_name and embedding_model_endpoint_name are given the
    # same value; confirm the vector search endpoint really shares the name of
    # the embedding model serving endpoint.
    endpoint_name="bge-large-en-endpoint",
    embedding_source_column="content",
    embedding_model_endpoint_name="bge-large-en-endpoint"
)

vector_store = DatabricksVectorSearch(
    index_name=index_name,
    databricks_vector_search_client=vsc,
    embedding=embedding
)

# NOTE(review): the index above is synced from the source table; confirm that
# writing documents through add_documents is supported for this index type.
vector_store.add_documents(docs)

In [0]:
%skip
import requests
import json

# Replace with the workspace URL and an access token before running.
# NOTE(review): prefer reading the token from a secret store over pasting it here.
workspace_url = ""
token = ""

headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

# Definition of the index to create.
payload = {
    "name": "workspace.llm.medical_docs_index",
    "source_table": "workspace.llm.medical_docs",
    "primary_key": "id",
    "embedding_column": "embedding_column_name"
}

# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.post(
    f"{workspace_url}/api/vector-search/indexes",
    headers=headers,
    data=json.dumps(payload),
    timeout=30,
)

# Error responses are not guaranteed to be JSON, so report the status and raw
# body instead of letting response.json() fail with an unrelated decode error.
if not response.ok:
    raise RuntimeError(
        f"Vector search index request failed with HTTP {response.status_code}: {response.text}"
    )

print(response.json())


%sql
-- Request 20 results for the query text 'dengue' from the vector search index.
-- NOTE(review): 'workspace.llm.medical_docs' is also used as the source table name elsewhere in this notebook; confirm it is the intended index name.
SELECT * FROM vector_search(
  index => 'workspace.llm.medical_docs',
  query_text => 'dengue',
  num_results => 20
)


In [0]:
%skip
# Step 1: Deploy the embedding model as a serving endpoint in Databricks Model Serving UI or via API.
# Make sure the endpoint name matches what you use in your code, e.g., "bge-large-en-endpoint".

# Step 2: After deployment, verify the endpoint exists.
# You can list endpoints using the MLflow Deployments SDK:
from mlflow.deployments import get_deploy_client
client = get_deploy_client("databricks")
# List the endpoints visible to the deployment client.
available_endpoints = client.list_endpoints()
print(available_endpoints)

# Step 3: Use the correct endpoint name in your code.
embedding_endpoint_name = "bge-large-en-endpoint"  # Ensure this matches the deployed endpoint
embedding = DatabricksEmbeddings(endpoint=embedding_endpoint_name)

In [0]:
%skip
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub

# Hosted flan-t5 model accessed through the Hugging Face Hub; the API token is left blank here.
hub_settings = {
    "repo_id": "google/flan-t5-base",
    "task": "text2text-generation",
    "huggingfacehub_api_token": "",
}
llm = HuggingFaceHub(**hub_settings)

# Retrieval QA chain over the vector store that also returns the source documents.
default_retriever = vector_store.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=default_retriever,
    return_source_documents=True,
)

# Ask a sample question and show the full response.
sample_question = "What are the symptoms of migraine?"
response = qa_chain.invoke(sample_question)
display(response)

This script performs Retrieval-Augmented Generation (RAG) for medical queries. It:
- Extracts disease-related keywords from a user query
- Matches them against a Delta table of diseases
- Uses a language model to answer the query when a match is found in the diseases list, drawing its context from the indexed documents (the migraine and dengue PDF files and the scraped WHO headache-disorders web page)
- Displays the answer only if the source documents contain the matched disease


In [0]:
import re
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


# Step 1: Words removed from the query before matching (you can expand this list)
stopwords = {
    "what", "are", "is", "the", "of", "in", "on", "and", "or", "to", "a", "an",
    "please", "explain", "each", "briefly", "common", "five", "symptoms",
}

# Shared word tokenizer for both the query and the disease names.
word_pattern = re.compile(r'\b\w+\b')

# Step 2: Lower-case the widget's query text, tokenize it, and drop the stopwords
query_text = dbutils.widgets.get("query")
query_tokens = set(word_pattern.findall(query_text.lower())).difference(stopwords)

# Step 3: Read the "diseases" column of the Delta table into a Python list
df_diseases = spark.table("workspace.llm.diseases")
disease_rows = df_diseases.select("diseases").collect()
disease_list = [disease_row["diseases"] for disease_row in disease_rows]

# Step 4: For every disease name, collect the query tokens that also occur in it
matched_diseases = []
for disease in disease_list:
    disease_tokens = set(word_pattern.findall(disease.lower()))
    shared_tokens = query_tokens & disease_tokens
    # Extending with an empty set adds nothing, so non-matching names are skipped.
    matched_diseases.extend(shared_tokens)

print(matched_diseases)


if matched_diseases:
    import html  # local import: only needed to escape the answer before rendering

    # Create a local pipeline with increased max_new_tokens
    hf_pipeline = pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        max_new_tokens=500  # Increase this value for longer outputs
    )

    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    # Ask the retriever for 3 chunks per query (k=3).
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff"
    )

    result = qa_chain.invoke(query_text)
    source_docs = result.get("source_documents", [])

    # Show the answer only when at least one retrieved chunk mentions one of the
    # matched terms (case-insensitive). Every matched term is checked, because
    # the order of matched_diseases comes from set iteration and is arbitrary.
    source_texts = [doc.page_content.lower() for doc in source_docs]
    answer_is_grounded = any(
        term in source_text
        for term in matched_diseases
        for source_text in source_texts
    )

    if answer_is_grounded:
        # Escape the generated text so it renders literally instead of as markup.
        # Building the string first also avoids reusing the f-string's own quote
        # character inside it, which is a SyntaxError before Python 3.12.
        answer_html = html.escape(str(result["result"]))
        displayHTML(f"<b>Answer:</b> {answer_html}")
    else:
        displayHTML("<b>Answer:</b> NO RESULT FOUND")