In [None]:
pip install -U transformers
from transformers import pipeline

In [None]:
pip install faiss-cpu

In [3]:
# !pip install -U sentence-transformers

In [4]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

In [5]:
# importing dataset from google drive
file_path = "/content/drive/MyDrive/ai-medical-chatbot.csv"

In [6]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Description,Patient,Doctor
0,Q. What does abutment of the nerve root mean?,"Hi doctor,I am just wondering what is abutting...",Hi. I have gone through your query with dilige...
1,Q. What should I do to reduce my weight gained...,"Hi doctor, I am a 22-year-old female who was d...",Hi. You have really done well with the hypothy...
2,Q. I have started to get lots of acne on my fa...,Hi doctor! I used to have clear skin but since...,Hi there Acne has multifactorial etiology. Onl...
3,Q. Why do I have uncomfortable feeling between...,"Hello doctor,I am having an uncomfortable feel...",Hello. The popping and discomfort what you fel...
4,Q. My symptoms after intercourse threatns me e...,"Hello doctor,Before two years had sex with a c...",Hello. The HIV test uses a finger prick blood ...


In [7]:
df.describe()

Unnamed: 0,Description,Patient,Doctor
count,256916,256916,256916
unique,228722,246006,242150
top,Q. Why do periods get delayed after first time...,"Hello doctor, My fiancee and I had unprotected...",Hi. For further doubts consult a sexologist on...
freq,1137,1137,1519


In [8]:
df.shape

(256916, 3)

# Data Preprocessing

In [9]:
# Combining Description and patient column fot better context understanding
df['query_text'] = df['Description'].astype(str).str.strip() + " - " + df['Patient'].astype(str).str.strip()

In [10]:
# checking for duplicates
df.duplicated().sum()

np.int64(10378)

In [11]:
# checking first duplicate line
df[df.duplicated()].head(3)

Unnamed: 0,Description,Patient,Doctor,query_text
8,Q. What does abutment of the nerve root mean?,"Hi doctor,I am just wondering what is abutting...",Hi. I have gone through your query with dilige...,Q. What does abutment of the nerve root mean? ...
25,Q. What does abutment of the nerve root mean?,"Hi doctor,I am just wondering what is abutting...",Hi. I have gone through your query with dilige...,Q. What does abutment of the nerve root mean? ...
26,Q. Will Nano-Leo give permanent solution for e...,"Hello doctor, I am 48 years old. I am experien...",Hi. For further doubts consult a sexologist on...,Q. Will Nano-Leo give permanent solution for e...


In [12]:
# Remove duplicates
df.drop_duplicates(subset='query_text', inplace=True)
df.shape

(246510, 4)

In [13]:
# Cleaning the text
def clean_text(text):
    text = str(text)
    text = re.sub(r'Q[.]?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\bHi doctor\b[:,]? ?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [14]:
# Apply cleaning to both fields
df['query_text'] = df['query_text'].apply(clean_text)
df['Doctor'] = df['Doctor'].apply(clean_text)

In [15]:
# # Optional: Truncate long entries to avoid model token limits
# df['query_text'] = df['query_text'].apply(lambda x: ' '.join(x.split()[:150]))
# df['Doctor'] = df['Doctor'].apply(lambda x: ' '.join(x.split()[:250]))

In [16]:
# Reset index
df.reset_index(drop=True, inplace=True)

# Preview the cleaned data
df[['query_text', 'Doctor']].head()

Unnamed: 0,query_text,Doctor
0,What does abutment of the nerve root mean? - I...,Hi. I have gone through your uery with diligen...
1,What should I do to reduce my weight gained du...,Hi. You have really done well with the hypothy...
2,"I have started to get lots of acne on my face,...",Hi there Acne has multifactorial etiology. Onl...
3,Why do I have uncomfortable feeling between th...,Hello. The popping and discomfort what you fel...
4,My symptoms after intercourse threatns me even...,Hello. The HIV test uses a finger prick blood ...


In [17]:
# Check GPU usage
print("Available FAISS GPUs:", faiss.get_num_gpus())

Available FAISS GPUs: 0


In [None]:
# Load model and embed text
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['query_text'].tolist(), show_progress_bar=True, convert_to_numpy=True)

In [19]:
# Build index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)  # Use Flat index (GPU-friendly fallback)
index.add(embeddings)

In [20]:
np.save("embeddings.npy", embeddings)
df.to_csv("processed.csv", index=False)


In [21]:
# function for getting patients query
def retrieve_top_k_answers(user_query, k=3):
    # Embed the user query
    query_embedding = model.encode([user_query], convert_to_numpy=True)

    # Search the FAISS index
    distances, indices = index.search(query_embedding, k)

    # Fetch top-k results from the dataframe
    results = []
    for i in indices[0]:
        results.append(df.iloc[i]['Doctor'])

    return results


# This gives you the retrieved context, i.e., candidate answers to feed into the generator.

In [22]:
# Example question
user_input = "What should I do to lose weight caused by hypothyroidism?"

# Retrieve top 3 similar answers
top_answers = retrieve_top_k_answers(user_input, k=3)

# Print results
for i, ans in enumerate(top_answers, 1):
    print(f"\n Answer {i}:\n{ans}")



 Answer 1:
Hi. You have really done well with the hypothyroidism problem. Your levels are normal with less medications which are very good. As it is genetically induced, it is very difficult to lose weight. My advice to you is, you should focus on maintaining normal levels of TSH (thyroid-stimulating hormone) and try to remain active, having a positive outlook in life. Or else, it will become very difficult to balance your life with the symptoms of hypothyroidism. Even though your weight has not reduced, be very careful in not putting on weight here afterward. Everyday brisk walking for 1 hour. If you have body pain, alternate with exercises and walking. Avoid all kinds of junk foods, processed, bakery products, rich sweets, fatty foods, sodas, alcohol, and smoking. Avoid partying and binge eating. Follow the food timings properly. Have small freuent meals. In between snacks should be strictly fruits or any kind of low-calorie foods. Have unsalted nuts around five daily. It can give a

# Now summarize that response

In [None]:
# Load summarizer
summarizer = pipeline("summarization", model="google/flan-t5-small", tokenizer="google/flan-t5-small")


In [46]:
def summarize_context_answers(context_list, max_tokens=256):
    # Join all top-k answers into one chunk
    joined_context = "\n".join(context_list)

    # HuggingFace's summarization pipeline expects text < 1024 tokens
    if len(joined_context.split()) > 1000: # Increased limit for truncation
        joined_context = " ".join(joined_context.split()[:1000])  # truncate if needed

    summary = summarizer(joined_context, max_length=400, min_length=60, do_sample=False)[0]['summary_text'] # Increased max_length for summarization
    return summary

# Now Adding the Generator

Install and Import Cohere SDK

In [None]:
!pip install cohere

In [None]:
import cohere
# MY API KEY (use your own)
co = cohere.Client("YOUR_COHERE_API_KEY")

In [27]:
import csv
from datetime import datetime

log_file = "/content/rag_chatbot_logs.csv"

# Create log file if it doesn't exist
with open(log_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Timestamp", "User_Query", "Top_K_Context", "Summary", "Final_Answer"])


In [28]:
# This will store Q&A pairs for the current user session
chat_history = []

In [45]:
def generate_answer_rag(user_query, k=3):
    # Step 1: Retrieve top-k context
    context_answers = retrieve_top_k_answers(user_query, k)

    # Step 2: Summarize top-k retrieved answers
    context_block = summarize_context_answers(context_answers)

    # Step 3: Build chat memory block (up to last 3 turns for context)
    memory_turns = chat_history[-3:]  # Only keep last 3 Q&A to avoid long prompts
    memory_block = ""
    for turn in memory_turns:
        memory_block += f"\nUser: {turn['user']}\nBot: {turn['bot']}\n"

    # Step 4: Construct the full prompt
    prompt = f"""You are a highly experienced and empathetic medical assistant with deep knowledge in general medicine, diagnostics, and patient care.

I need clear, accurate, and easy-to-understand information about the following medical question. Please use the provided context to guide your answer.

- Explain the possible causes.
- Recommend appropriate actions or steps I can take.
- Clearly state when I should consider seeing a doctor.
- Avoid medical jargon unless necessary. If you use any, explain it in simple terms.
- If there are any warning signs or urgent symptoms related to the issue, highlight them clearly.

Respond with care and empathy and structure your answer using the following sections:
 Possible Causes:
 Recommended Actions:
 When to See a Doctor:
 YOUR ANSWER SHOULD BE PRECISE AS OF 200 WORDS
Chat History:
{memory_block}

Summarized Context (from medical data):
{context_block}

Current Question:
{user_query}

Answer:"""

    # Step 5: Generate using Cohere
    response = co.generate(
        model='command-r-plus',
        prompt=prompt,
        max_tokens=500, # Increased max_tokens for longer response
        temperature=0.4,
        stop_sequences=["--"]
    )

    final_answer = response.generations[0].text.strip()

    # Step 6: Update chat history
    chat_history.append({
        "user": user_query,
        "bot": final_answer
    })

    # Step 7: Log to CSV (optional)
    with open(log_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([
            datetime.now().isoformat(),
            user_query,
            " || ".join(context_answers),
            context_block,
            final_answer
        ])

    return final_answer

In [30]:
print("\n THANK YOU FOR REACHING US :")
q1 = generate_answer_rag("I feel tired all the time.")
print("Answer 1:", q1)

q2 = generate_answer_rag("I also feel dizzy sometimes.")
print("Answer 2:", q2)

q3 = generate_answer_rag("Could this be related to my thyroid?")
print("Answer 3:", q3)


Your max_length is set to 200, but your input_length is only 174. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)



 THANK YOU FOR REACHING US :


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer 1: Possible Causes: 

There are several potential reasons why you may be feeling tired all the time. Some common causes include:

- Thyroid issues: Your thyroid gland produces hormones that regulate energy use and control how your body uses energy. Problems with your thyroid, such as an overactive or underactive thyroid, can lead to fatigue.

- Stress: Mental and emotional stress can take a toll on your energy levels. If you're constantly feeling stressed, it can interfere with the quality of your sleep and overall well-being.

- Sleep problems: Issues with sleep can leave you feeling tired during the day. This includes sleep apnea, a condition that causes disrupted breathing during sleep, resulting in frequent awakenings and non-restorative sleep.

- Other illnesses: Fatigue can also be a symptom of various illnesses, including anemia, heart disease, depression, and chronic fatigue syndrome, among others.

Recommended Actions: 

- Consult a general physician: The first step is 

Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer 2: Possible Causes:

Dizziness and constant fatigue can have several potential causes, and it's important to consider your overall health and any other symptoms you may be experiencing. Some common causes include:

- Inner ear problems: Dizziness could be related to an issue with your inner ear, which is responsible for balance and spatial orientation. This may include conditions like benign paroxysmal positional vertigo (BPPV), labyrinthitis, or Meniere's disease.

- Vestibular system disorders: The vestibular system, which includes the inner ear and associated brain regions, plays a crucial role in balance and spatial orientation. Disorders in this system can lead to dizziness and balance issues.

- Anxiety and stress: Mental health conditions, such as anxiety and prolonged stress, can sometimes manifest as physical symptoms, including dizziness and fatigue.

- Cardiovascular issues: In some cases, dizziness can be related to problems with your heart or blood circulation. This

In [31]:
pd.read_csv(log_file).tail(5)

Unnamed: 0,Timestamp,User_Query,Top_K_Context,Summary,Final_Answer
0,2025-07-10T12:23:59.223021,I feel tired all the time.,"Dear Nisar, Tiredness and un-refreshing mornin...","Dear Nisar, Tiredness and un-refreshing mornin...",Possible Causes: \n\nThere are several potenti...
1,2025-07-10T12:24:14.110171,I also feel dizzy sometimes.,Hi well come to HCM. You got to give detail de...,"Hi, Your symptoms are consistent with what cou...",Possible Causes:\n\nDizziness and constant fat...
2,2025-07-10T12:24:28.947170,Could this be related to my thyroid?,"Dear patient, Welcome to healthcareMagic forum...",Your results show that you have hyperthyroidis...,Possible Causes: \nBased on your provided cont...


In [32]:
pd.DataFrame(chat_history).to_csv("/content/chat_history.csv", index=False)

In [33]:
q1 = generate_answer_rag("my body temprature remains high for about two days.")
print (q1)

Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Possible Causes:

There are a number of reasons why you might be experiencing a high body temperature, including:

- Infection: An increase in body temperature, often referred to as a fever, is a common sign that your body is fighting off an infection. This could be due to a variety of factors, including bacteria, viruses, or parasites.

- Inflammatory response: Sometimes, the body's immune response to an injury, condition, or illness can result in an increased body temperature. This is your body's natural reaction to help combat and resolve the issue.

- Environment: Being in a hot or humid environment can cause your body temperature to rise temporarily. This is often accompanied by sweating as your body attempts to cool down.

- Clothing and bedding: Wearing too much clothing or bedding, especially during sleep, can lead to an increase in body temperature. This is more common in infants but can occur in adults as well.

- Medication: Certain medications can cause an increase in body 

In [35]:
# Define the destination path in your Google Drive
drive_path = "/content/drive/MyDrive/"

# Save the files to Google Drive
np.save(drive_path + "embeddings.npy", embeddings)
df.to_csv(drive_path + "processed.csv", index=False)

print(f"Saved embeddings.npy to {drive_path}embeddings.npy")
print(f"Saved processed.csv to {drive_path}processed.csv")

Saved embeddings.npy to /content/drive/MyDrive/embeddings.npy
Saved processed.csv to /content/drive/MyDrive/processed.csv


# creating gui

In [36]:
!pip install gradio



# IF YOU WANT TO INTERACT WITH GUI THROUGH COLAB

In [None]:
import gradio as gr

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🩺 Ask the AI Medical Assistant")

    chatbot = gr.Chatbot(label="AI Doctor", type="messages")
    query = gr.Textbox(placeholder="Ask your health-related question...", label="Your Question")
    send = gr.Button("Get Answer")

    def respond(message, history):
        answer = generate_answer_rag(message)
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": answer})
        return history

    send.click(respond, [query, chatbot], [chatbot])
    query.submit(respond, [query, chatbot], [chatbot])

demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d0edcaecd086f0a5ab.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
