<a href="https://www.kaggle.com/code/mohamedmaboshady/q-a-chatbot-gemini-fallback?scriptVersionId=250831649" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:

!pip install google-generativeai streamlit pandas

import shutil
import os
import pandas as pd
# Where the Kaggle dataset is mounted, and where the app expects to find it.
source_csv_path = '/kaggle/input/q-a-fallback/medquad.csv'
destination_csv_path = 'medquad.csv'  # Relative path, i.e. the current working directory

# Copy the CSV next to the notebook unless an earlier run already did so.
if os.path.exists(destination_csv_path):
    print(f"{destination_csv_path} already exists in working directory. Skipping copy.")
else:
    try:
        shutil.copy(source_csv_path, destination_csv_path)
        print(f"Copied {source_csv_path} to {destination_csv_path}")
    except FileNotFoundError:
        # The dataset is not attached to the notebook (or the path is wrong).
        print(f"Error: Source CSV not found at {source_csv_path}. Please ensure the dataset is added to your notebook.")
    except Exception as e:
        # Any other failure is reported but does not abort the cell.
        print(f"Error copying CSV: {e}")

print("Environment setup complete.")




Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.46.1
Copied /kaggle/input/q-a-fallback/medquad.csv to medquad.csv
Environment setup complete.


# app.py (this code would live in a separate .py file for Streamlit deployment)
import streamlit as st
import os
import pandas as pd
import google.generativeai as genai
import re # Import regex for better keyword extraction

# --- Configuration & Data Loading ---
# Resolve the Gemini API key and configure the client. Lookup order:
#   1. the GEMINI_API_KEY environment variable,
#   2. the Kaggle Secrets client (only importable inside a Kaggle notebook).
# Every failure path reports through st.error() and then calls st.stop().
try:
    # Attempt to get the API key from the environment (Streamlit Cloud / local deployment)
    gemini_api_key = os.environ.get('GEMINI_API_KEY')
    if not gemini_api_key:
        # Environment variable unset or empty: fall back to Kaggle Secrets.
        # The import is done lazily because the module only exists on Kaggle.
        try:
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            gemini_api_key = user_secrets.get_secret("GEMINI_API_KEY")
        except ImportError:
            # Not in a Kaggle environment, and the env var is not set either
            st.error("GEMINI_API_KEY environment variable not found. Please set it.")
            st.stop() # Stop the app if API key is missing

    # The secret lookup can succeed yet yield an empty value; treat that as missing too.
    if not gemini_api_key:
        st.error("GEMINI_API_KEY is empty. Please ensure it's correctly set in Kaggle Secrets or environment variables.")
        st.stop()

    genai.configure(api_key=gemini_api_key)
    print("Gemini API configured successfully.")
except Exception as e:
    # Catches anything raised above that derives from Exception, including
    # errors from the secrets client and from genai.configure().
    # NOTE(review): this relies on st.stop() not signalling via an Exception
    # subclass; otherwise the stops above would be re-reported here — confirm.
    st.error(f"Error configuring Gemini API: {e}")
    st.stop()

# Load the Q&A dataset
@st.cache_data  # Streamlit caches the returned frame so the CSV is not re-read on every rerun
def load_qa_data():
    """Read ``medquad.csv`` and normalise its header names.

    Lower-case headers (``question``, ``answer``, ``focus_area``) are renamed
    to the canonical ``Question``, ``Answer`` and ``Focus Area``. If a column
    is present under neither spelling, an error is shown and ``st.stop()`` is
    called.

    Returns:
        The loaded DataFrame, or an empty DataFrame with the three canonical
        columns when the file is missing or cannot be read.
    """
    data_path = 'medquad.csv'  # Assumed to sit next to app.py when deployed
    # Header as it may appear in the CSV -> name used by the rest of the app.
    canonical_headers = {
        'question': 'Question',
        'answer': 'Answer',
        'focus_area': 'Focus Area',
    }

    try:
        frame = pd.read_csv(data_path)

        # Collect every rename first, then apply them in a single pass.
        pending_renames = {}
        for raw_name, canonical_name in canonical_headers.items():
            if raw_name in frame.columns:
                pending_renames[raw_name] = canonical_name
            elif canonical_name not in frame.columns:
                # Neither spelling is present in the file.
                st.error(f"CSV must contain '{raw_name}' or '{canonical_name}' column. Please check your file header.")
                st.stop()
        frame = frame.rename(columns=pending_renames)

        print(f"Dataset loaded successfully from {data_path}. Shape: {frame.shape}")
        print("First 5 rows of your Q&A data (after potential renaming):")
        print(frame.head())
        return frame
    except FileNotFoundError:
        st.error(f"Dataset not found at {data_path}. Please ensure it's in the correct location.")
        # Hand back a correctly-shaped empty frame so callers can test `.empty`.
        return pd.DataFrame(columns=['Question', 'Answer', 'Focus Area'])
    except Exception as e:
        st.error(f"Error loading Q&A dataset: {e}")
        return pd.DataFrame(columns=['Question', 'Answer', 'Focus Area'])


qa_df = load_qa_data()

# Initialize the Gemini 2.5 Flash model
@st.cache_resource  # Keep a single model handle alive across Streamlit reruns
def load_gemini_model():
    """Build and return the ``gemini-2.5-flash`` generative model handle."""
    gemini_model = genai.GenerativeModel('gemini-2.5-flash')
    return gemini_model


model = load_gemini_model()

# --- Helper function to identify focus area ---
# Function words that carry no topical signal. They are removed from the
# user's keywords because they appear in almost every question and answer and
# would otherwise make every row (and every focus area) look relevant.
_STOPWORDS = frozenset({
    'a', 'about', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'by',
    'can', 'could', 'do', 'does', 'for', 'from', 'has', 'have', 'how', 'i',
    'if', 'in', 'is', 'it', 'me', 'my', 'of', 'on', 'or', 'please', 'should',
    'that', 'the', 'there', 'this', 'to', 'was', 'we', 'were', 'what', 'when',
    'where', 'which', 'who', 'why', 'will', 'with', 'would', 'you', 'your',
})


def _keyword_patterns(text):
    """Return one compiled, word-start-anchored regex per content word in *text*.

    Anchoring at a word start keeps the useful prefix behaviour ("symptom"
    still finds "symptoms") while preventing mid-word hits such as "is"
    inside "sinusitis". Stop words are dropped entirely.
    """
    content_words = set(re.findall(r'\b\w+\b', text.lower())) - _STOPWORDS
    # Sorted only to make the pattern order deterministic.
    return [re.compile(r'\b' + re.escape(word)) for word in sorted(content_words)]


def identify_focus_area_from_question(user_question, qa_dataframe):
    """
    Identifies the most probable focus area for the user's question.

    Scoring per unique, non-null value of the 'Focus Area' column:
      * +1000 plus the number of words in the focus area when its full name
        appears in the question as whole words;
      * +1 for every content keyword of the question (stop words removed)
        that starts a word of the focus area name.

    The first focus area with the highest strictly positive score wins.

    Args:
        user_question: The raw question text typed by the user.
        qa_dataframe: DataFrame with a 'Focus Area' column.

    Returns:
        The winning focus area value exactly as stored in the DataFrame, or
        None when nothing scored above zero.
    """
    user_question_lower = user_question.lower()
    keyword_patterns = _keyword_patterns(user_question_lower)

    identified_focus_area = None
    max_focus_score = 0

    for focus_area_name in qa_dataframe['Focus Area'].dropna().unique():
        focus_area_lower = str(focus_area_name).strip().lower()
        if not focus_area_lower:
            # A blank name is a substring of every question; never let it win.
            continue

        current_score = 0

        # Strongest signal: the question contains the focus area name itself.
        # The cheap substring test runs first; the regex then confirms that the
        # hit sits on word boundaries (so "als" does not match inside "also").
        if focus_area_lower in user_question_lower and re.search(
            r'(?<!\w)' + re.escape(focus_area_lower) + r'(?!\w)',
            user_question_lower,
        ):
            current_score += 1000  # A very high bonus for direct containment
            current_score += len(focus_area_lower.split())  # Prefer longer names

        # Additional signal: content keywords that start a word of the name.
        current_score += sum(
            1 for pattern in keyword_patterns if pattern.search(focus_area_lower)
        )

        if current_score > max_focus_score:
            max_focus_score = current_score
            identified_focus_area = focus_area_name

    return identified_focus_area  # None if nothing matched


# --- Primary Retrieval function from dataset ---
def retrieve_context_from_dataset(user_question, qa_dataframe, top_n=3):
    """
    Retrieves relevant Q&A pairs from the dataframe by first identifying a
    focus area, then ranking the rows of that area by keyword overlap.

    An exact (case-insensitive) question match is placed first. Remaining
    slots are filled with the rows matching the most content keywords of the
    question (stop words ignored, keywords matched at word starts). Identical
    Q&A pairs are only included once.

    Args:
        user_question: The raw question text typed by the user.
        qa_dataframe: DataFrame with 'Question', 'Answer' and 'Focus Area'.
        top_n: Maximum number of keyword-ranked pairs to return (an exact
            match is always included).

    Returns:
        "Q: ...\\nA: ..." entries joined by blank lines; an empty string when
        nothing relevant is found; or a fixed explanatory message when the
        DataFrame is empty or lacks a required column.
    """
    required_columns = ('Question', 'Answer', 'Focus Area')
    if qa_dataframe.empty or any(col not in qa_dataframe.columns for col in required_columns):
        return "Knowledge base is empty or required columns are missing. Cannot retrieve context."

    user_question_lower = user_question.lower()
    keyword_patterns = _keyword_patterns(user_question_lower)

    # Identify the most probable focus area using the helper function
    identified_focus_area = identify_focus_area_from_question(user_question, qa_dataframe)

    # Narrow the search to that focus area when one was identified
    filtered_df = qa_dataframe
    if identified_focus_area is not None:
        print(f"Attempting to filter by identified focus area: '{identified_focus_area}'.")
        # astype(str) keeps the comparison safe for non-string cell values.
        normalized_focus = qa_dataframe['Focus Area'].astype(str).str.strip().str.lower()
        target_focus = str(identified_focus_area).strip().lower()
        temp_filtered_df = qa_dataframe[normalized_focus == target_focus]

        if not temp_filtered_df.empty:
            filtered_df = temp_filtered_df
        else:
            print(f"Filtering by '{identified_focus_area}' resulted in an empty DataFrame. Proceeding with full search.")

    found_qa_pairs = []

    # Prioritize an exact question match from the filtered DataFrame
    questions_lower = filtered_df['Question'].astype(str).str.lower()
    exact_match_row = filtered_df[questions_lower == user_question_lower]
    if not exact_match_row.empty:
        found_qa_pairs.append(f"Q: {exact_match_row.iloc[0]['Question']}\nA: {exact_match_row.iloc[0]['Answer']}")
        # If an exact match is found and we only need one, return it immediately
        if top_n == 1:
            return "\n\n".join(found_qa_pairs)

    # Score every remaining row by how many distinct keywords it mentions
    seen_pairs = set(found_qa_pairs)
    potential_matches = []
    for question, answer in zip(filtered_df['Question'], filtered_df['Answer']):
        qa_pair_string = f"Q: {question}\nA: {answer}"
        if qa_pair_string in seen_pairs:
            # Already selected as the exact match, or a duplicate CSV row.
            continue

        searchable_text = f"{question}\n{answer}".lower()
        keyword_overlap_count = sum(
            1 for pattern in keyword_patterns if pattern.search(searchable_text)
        )

        if keyword_overlap_count > 0:
            seen_pairs.add(qa_pair_string)
            potential_matches.append((keyword_overlap_count, qa_pair_string))

    # Highest overlap first; the sort is stable, so ties keep dataset order.
    potential_matches.sort(key=lambda match: match[0], reverse=True)

    remaining_slots = max(top_n - len(found_qa_pairs), 0)
    found_qa_pairs.extend(pair for _, pair in potential_matches[:remaining_slots])

    # Empty string when no relevant pairs were found.
    return "\n\n".join(found_qa_pairs)


# --- Chatbot response generation function with smart Gemini fallback ---
def generate_response_with_context(user_prompt, qa_dataframe):
    """Answer *user_prompt* with Gemini, grounded in the dataset when possible.

    Context is retrieved from ``qa_dataframe`` first. When some is found it is
    embedded in a retrieval-augmented prompt; otherwise the prompt asks Gemini
    to answer from general knowledge, optionally steered by the identified
    focus area. Uses the module-level ``model``.

    Args:
        user_prompt: The question typed by the user.
        qa_dataframe: The loaded Q&A DataFrame.

    Returns:
        The model's answer text, or a fixed user-facing message when the
        chatbot is not initialised, the knowledge base is empty, the request
        was blocked for safety reasons, or generation failed.
    """
    if model is None:
        return "Chatbot not initialized. Please check API configuration."
    if qa_dataframe.empty:
        return "Knowledge base is empty. Please load your Q&A dataset."

    # 1. Attempt to retrieve context from the primary dataset
    context_from_dataset = retrieve_context_from_dataset(user_prompt, qa_dataframe)

    # Identify focus area for potential use in fallback or general guidance
    identified_focus_area = identify_focus_area_from_question(user_prompt, qa_dataframe)

    # 2. Build the prompt. Dataset context, when present, takes priority.
    if context_from_dataset:
        print("Dataset context found. Using RAG prompt.")
        full_prompt = f"""
        You are a helpful question-answering assistant specializing in medical topics.
        Use the following provided information to answer the user's question.
        If the provided information is incomplete or does not fully answer the question,
        you may supplement it with your general knowledge about the topic,
        especially if a focus area like '{identified_focus_area}' (if identified) is relevant.
        Do not make up answers if you truly don't know, even with general knowledge.

        --- Provided Information ---
        {context_from_dataset}
        ---

        User's Question: {user_prompt}
        """
    else:
        # No dataset context: tell Gemini to use general knowledge, guided by
        # the identified focus area if there is one.
        print("No dataset context found. Falling back to Gemini's general knowledge.")
        if identified_focus_area:
            full_prompt = f"""
            You are a helpful question-answering assistant specializing in medical topics.
            I could not find specific information in my knowledge base.
            Please answer the following question based on your general knowledge about '{identified_focus_area}'.
            If you truly don't know, state that you cannot answer.

            User's Question: {user_prompt}
            """
        else:
            # If no focus area identified either, just a general medical question
            full_prompt = f"""
            You are a helpful question-answering assistant specializing in medical topics.
            I could not find specific information in my knowledge base.
            Please answer the following question based on your general knowledge.
            If you truly don't know, state that you cannot answer.

            User's Question: {user_prompt}
            """

    blocked_message = "I'm sorry, I cannot answer that question due to content safety policies."

    # 3. Call the model and translate failures into user-facing messages.
    try:
        response = model.generate_content(full_prompt)
        # A blocked prompt comes back as a response carrying a block reason;
        # reading `.text` on it raises, so inspect the feedback first.
        prompt_feedback = getattr(response, 'prompt_feedback', None)
        if getattr(prompt_feedback, 'block_reason', None):
            return blocked_message
        final_response = response.text
    except Exception as e:
        print(f"Error generating content from Gemini: {e}")
        # getattr chain: the exception may lack `.response`, or that response
        # may lack `prompt_feedback`; neither case may raise inside the handler.
        error_feedback = getattr(getattr(e, 'response', None), 'prompt_feedback', None)
        if getattr(error_feedback, 'block_reason', None):
            final_response = blocked_message
        else:
            final_response = "I'm having trouble understanding or responding right now. Please try again."

    return final_response

# --- Streamlit UI ---
st.set_page_config(
    page_title="Custom Q&A Chatbot with Gemini 2.5 Flash",
    layout="centered",
)

st.title("📚 Custom Q&A Chatbot (Powered by Gemini 2.5 Flash)")
st.markdown("Ask questions based on the medical knowledge in my dataset! If I don't have specific data, I'll try to use my general knowledge for the topic.")

# Create the conversation transcript in session state on first use.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the stored transcript so earlier turns stay visible after a rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Handle a newly submitted question, if there is one.
prompt = st.chat_input("Ask your question here...")
if prompt:
    # Record and echo the user's turn.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Produce, show and record the assistant's turn.
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # The loaded dataframe is passed through for context retrieval.
            full_response = generate_response_with_context(prompt, qa_df)
        st.markdown(full_response)
    st.session_state.messages.append({"role": "assistant", "content": full_response})



# Manual smoke tests: each section sends one question through
# generate_response_with_context() and prints the question and the reply.
# NOTE(review): these run at module level, so they execute every time this
# file runs; presumably they are notebook cells rather than part of app.py.
print("--- Chatbot Testing with CSV Context ---")


# Question 5: a symptoms question about a single condition (sinusitis)
question5 = "What are the symptoms of sinusitis?"
response5 = generate_response_with_context(question5, qa_df)
print(f"question: {question5}")
print(f"Response: {response5}\n")


# Further questions covering retrieval and the general-knowledge fallback
print("--- Chatbot Testing with CSV Context ---")

# Whether each question below has a matching row depends on the loaded CSV;
# the "expected" notes are assumptions, not checked by this script.

# Question 1: a symptoms question, presumably answerable from the CSV
question1 = "What are the symptoms of high blood pressure?"
response1 = generate_response_with_context(question1, qa_df)
print(f"question: {question1}")
print(f"Response: {response1}\n")


# Question 2: loosely phrased, lower-case question (testing retrieval);
# the misspelling "treament" is part of the input string
question2 = "what is the treament plan of Diabetes type 2."
response2 = generate_response_with_context(question2, qa_df)
print(f"question: {question2}")
print(f"Response: {response2}\n")

# Question 3: a non-medical question (testing fallback/general knowledge)
question3 = "What is the capital of Japan?"
response3 = generate_response_with_context(question3, qa_df)
print(f"Question: {question3}")
print(f"Response: {response3}\n")

# Question 4: a topic that may or may not be covered by the CSV
question4 = "What is the purpose of HIPAA?"
response4 = generate_response_with_context(question4, qa_df)
print(f"Question: {question4}")
print(f"Response: {response4}\n")
# Expected: if "HIPAA" appears in the CSV the answer should draw on it;
# otherwise the reply comes from the general-knowledge fallback prompt.


# Question 8: long, informal request asking for a list of medicines and brands
question8 = "what is the medicines of treament of Diabetes type 1 ,plz give me list of medicines and brands also."
response8 = generate_response_with_context(question8, qa_df)
print(f"question: {question8}")
print(f"Response: {response8}\n")

# Question 7: same style of request as question 8, for Diabetes type 2
question7 = "what is the medicines of treament of Diabetes type 2."
response7 = generate_response_with_context(question7, qa_df)
print(f"question: {question7}")
print(f"Response: {response7}\n")