In [80]:
#load the dataset

import requests
import pandas as pd

# Endpoint and query parameters for the Hugging Face datasets-server "rows" API.
# Only the first 100 rows of the train split are requested.
url = "https://datasets-server.huggingface.co/rows"
params = {
    "dataset": "Nicolybgs/healthcare_data",
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 100
}

# Make the GET request; the timeout keeps the cell from hanging indefinitely
# if the server never answers.
response = requests.get(url, params=params, timeout=30)

# Fail loudly on an HTTP error. Silently skipping the DataFrame creation would
# leave `df` undefined (or stale from an earlier run) for every later cell.
response.raise_for_status()

# Parse the JSON response
data = response.json()

# Each entry of "rows" wraps the actual record under the "row" key.
rows = data.get('rows', [])
if not rows:
    raise ValueError("The datasets-server response contained no rows.")

# Convert the records to a Pandas DataFrame
df = pd.DataFrame([row['row'] for row in rows])


In [68]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Available Extra Rooms in Hospital  100 non-null    int64  
 1   Department                         100 non-null    object 
 2   Ward_Facility_Code                 100 non-null    object 
 3   doctor_name                        100 non-null    object 
 4   staff_available                    100 non-null    int64  
 5   patientid                          100 non-null    int64  
 6   Age                                100 non-null    object 
 7   gender                             100 non-null    object 
 8   Type of Admission                  100 non-null    object 
 9   Severity of Illness                100 non-null    object 
 10  health_conditions                  74 non-null     object 
 11  Visitors with Patient              100 non-null    int64  


In [69]:
import pandas as pd
from langchain.docstore.document import Document


# Define the function to format each row
def format_row(row):
    """Render one record as a single lower-cased line of "label: value" pairs.

    Args:
        row: Record indexable by column name (for example a DataFrame row
            or a dict) that contains every column listed below.

    Returns:
        str: The labelled fields joined by ", ", followed by two newline
        characters, with the whole string converted to lower case.
    """
    # (label shown in the text, column read from the row), in output order.
    labelled_columns = (
        ("Available Extra Rooms in Hospital", "Available Extra Rooms in Hospital"),
        ("Department", "Department"),
        ("Ward_Facility_Code", "Ward_Facility_Code"),
        ("Doctor Name", "doctor_name"),
        ("Staff Available", "staff_available"),
        ("Patient ID", "patientid"),
        ("Age", "Age"),
        ("Gender", "gender"),
        ("Type of Admission", "Type of Admission"),
        ("Severity of Illness", "Severity of Illness"),
        ("Health Conditions", "health_conditions"),
        ("Visitors with Patient", "Visitors with Patient"),
        ("Insurance", "Insurance"),
        ("Admission Deposit", "Admission_Deposit"),
        ("Stay (in days)", "Stay (in days)"),
    )
    fields = [f"{label}: {row[column]}" for label, column in labelled_columns]
    return (", ".join(fields) + "\n\n").lower()

# Render every row to text and keep the result in a derived column
df['formatted_text'] = df.apply(format_row, axis=1)

# Wrap each rendered string in a Document object, preserving row order
documents = [Document(page_content=text) for text in df['formatted_text']]



In [70]:
# Load the embedding model that is handed to the vector store below.
# NOTE(review): presumably fetches the model weights on first use — confirm.
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name  = "BAAI/bge-base-en-v1.5")

In [71]:
# Index all the documents into the Qdrant vector database, embedding them
# with `embeddings` and storing them in the "reranker" collection.
# NOTE(review): location=":memory:" presumably means a non-persistent,
# in-process store that is rebuilt on every run — confirm.
from langchain_community.vectorstores import Qdrant
qdrant = Qdrant.from_documents(
    documents,
    embeddings,
    location=":memory:",  
    collection_name="reranker",
)

In [72]:
# Expose the vector store as a retriever, passing k=70 as a search argument.
# NOTE(review): k is presumably the number of candidates returned per query,
# which are later narrowed down by the reranker — confirm.
retriever = qdrant.as_retriever(search_kwargs = {'k':70})

In [73]:
# Implement the re-ranking / compression retriever: the base retriever's
# results are passed through a cross-encoder reranker configured with top_n=10.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Cross-encoder model used by the reranker
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
# NOTE(review): top_n presumably caps how many reranked documents are kept — confirm.
compressor = CrossEncoderReranker(model=model, top_n=10)
# Combine the reranker with the vector-store retriever
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [79]:
# Initialise the LLM through the Ollama wrapper from langchain_community.llms.
# NOTE(review): presumably requires a running Ollama server that has a model
# registered under exactly this name — confirm.

from langchain_community.llms import Ollama

llm = Ollama(model="Qwen2-7B")

In [76]:
# Step 6: Define the Function to Retrieve and Respond
def retrieve_and_respond(question):
    """Answer a question from the reranked documents retrieved for it.

    Args:
        question: The user's question as a string.

    Returns:
        str: The LLM's answer text, or "No relevant information found."
        when the retriever returns no documents.
    """
    # Retrieve relevant documents using compression_retriever
    compression_documents = compression_retriever.invoke(question)

    # Guard clause: nothing retrieved, so there is nothing to ground an answer on.
    if not compression_documents:
        return "No relevant information found."

    # Combine the retrieved documents into one context string for the prompt
    combined_relevant_info = " ".join(doc.page_content for doc in compression_documents)
    prompt = f"Question: {question}\n\nRelevant Information: {combined_relevant_info}"
    response = llm.invoke(prompt)

    # Completion-style LLM wrappers return a plain string from invoke(), while
    # chat models return a message object exposing `.content`. Accept both, so
    # a string result no longer raises AttributeError on `.content`.
    if isinstance(response, str):
        return response
    return response.content


In [77]:
# Step 7: Create Gradio Chatbot Interface
import gradio as gr

def respond(message, chat_history):
    """Answer ``message`` and record the exchange in ``chat_history``.

    Returns an empty string (sent to the textbox output) together with the
    updated history list (sent to the chatbot output).
    """
    answer = retrieve_and_respond(message)
    chat_history.append((message, answer))
    return "", chat_history


# Layout and event wiring for the chat UI
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask any healthcare-related questions...")
    clear = gr.Button("Clear Chat")

    # Submitting the textbox runs respond and updates textbox and chat window
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    # The clear button's handler returns None as the new chatbot value
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot)

# Launch the Gradio interface in the Jupyter Notebook
demo.launch()


Running on local URL:  http://127.0.0.1:7877

To create a public link, set `share=True` in `launch()`.


