In [1]:
import gradio as gr
import google.generativeai as genai
import os
import io
from PIL import Image
from gtts import gTTS
import tempfile

In [5]:
# Read the key once and reject both "unset" and "set but empty": an empty
# string would pass an `is None` check and only fail later, on the first
# request to the API.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise ValueError("Please set the GOOGLE_API_KEY environment variable.")
genai.configure(api_key=google_api_key)

In [8]:
# Identifier of the Gemini model used by every request in this notebook.
GEMINI_MODEL_NAME = 'gemini-2.5-flash'
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

# Smoke test: send a trivial prompt and display the text of the reply to
# confirm that the API key and the model name are accepted.
greeting_response = model.generate_content("Hello")
greeting_response.text

'Hello! How can I help you today?'

In [None]:
def _build_prompt_parts(text_input, image_input):
    """Assemble the system instructions, the farmer's question and the optional image."""
    prompt_parts = [
        "You are an AI assistant specialized in providing advice to farmers. "
        "Analyze the following information to answer the farmer's question. "
        "Be practical and concise. If you cannot provide a specific answer, "
        "provide general helpful advice.",
        f"\n\nFarmer's Question: {text_input}"
    ]
    # Compare against None explicitly instead of relying on the truthiness
    # of the image object.
    if image_input is not None:
        prompt_parts.append(image_input)
    return prompt_parts


def _synthesize_speech(text):
    """Write `text` as spoken English to a temporary MP3 file and return its path."""
    # Reserve the file name and close the handle before gTTS writes to it.
    # The path is known *before* the save starts, so a failed save can
    # always be cleaned up.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        audio_path = fp.name
    try:
        gTTS(text=text, lang='en').save(audio_path)
    except Exception:
        # Do not leave an empty or partial file behind when synthesis fails.
        if os.path.exists(audio_path):
            os.remove(audio_path)
        raise
    return audio_path


def handle_multimodal_query(text_input, image_input, audio_input=None):
    """
    Handles farming-related queries by processing a combination of text,
    image, and audio input using a Generative AI model.

    Args:
        text_input (str | None): The text description of the problem.
        image_input (PIL.Image.Image | None): An optional image of the crop
            or pest; when given it is sent to the model with the text.
        audio_input (str | None): Optional recorded audio. It is only
            acknowledged in the log; it is NOT transcribed or sent to the
            model.

    Returns:
        tuple: ``(text, audio_path)``.
            * On success: the model's answer and the path of an MP3 file
              containing the spoken answer.
            * If speech synthesis fails: the model's answer and ``None``.
            * If neither text nor image is provided: a message asking for
              input and ``None`` (the model is not called).
            * If the model call fails: ``"An error occurred: ..."`` and
              ``None``.
    """
    question = (text_input or "").strip()

    # --- Handle Audio Input (Placeholder for Transcription) ---
    # NOTE: A real-world application would use a speech-to-text model here
    # to transcribe the audio into text. For this demo, we simply
    # acknowledge the audio file and process the other inputs.
    if audio_input:
        print(f"Audio file received at: {audio_input}")
        print("Placeholder: A speech-to-text model would transcribe this audio.")
        # For a real implementation, you would do something like:
        # transcribed_text = transcribe_audio(audio_input)
        # question = f"{question}\nTranscribed audio: {transcribed_text}"

    # Nothing usable to send: audio is not transcribed, so without text or
    # an image the prompt would contain an empty question.
    if not question and image_input is None:
        return "Please describe your farming issue in text or upload an image.", None

    # --- Call the Generative AI Model ---
    try:
        prompt_parts = _build_prompt_parts(question, image_input)
        print("Sending request to Gemini model...")
        response = model.generate_content(prompt_parts)
        ai_text_response = response.text
    except Exception as e:
        error_message = f"An error occurred: {e}"
        print(error_message)
        return error_message, None

    # --- Generate Audio Output from the AI's Response ---
    # A text-to-speech failure must not discard an answer we already have,
    # so it is handled separately and degrades to a text-only response.
    print("Generating audio response...")
    try:
        audio_output_path = _synthesize_speech(ai_text_response)
    except Exception as e:
        print(f"Audio generation failed: {e}")
        return ai_text_response, None

    print("Audio file generated successfully.")
    return ai_text_response, audio_output_path

In [10]:
# --- Build the Gradio Interface ---
# Define the input components for the UI.
text_box = gr.Textbox(
    label="Describe your farming issue:",
    placeholder="e.g., 'My tomato plant leaves are turning yellow.'",
    interactive=True
)
image_box = gr.Image(
    type="pil",
    label="Upload an image of the plant (optional):",
    interactive=True
)
# type="filepath" hands the handler a path to the recording, which is what
# handle_multimodal_query documents and logs. The component's default type
# would pass a (sample_rate, numpy array) tuple instead.
audio_box = gr.Audio(
    label="Record your question (optional):",
    sources=["microphone"],
    type="filepath",
    interactive=True
)

# Define the output components for the UI.
text_output = gr.Textbox(
    label="AI Response (Text):",
    placeholder="The AI's response will appear here...",
    interactive=False
)
audio_output = gr.Audio(
    label="AI Response (Audio):",
    interactive=False
)

# Create the Gradio interface. Keep a reference to it so the running server
# can be stopped later from another cell with `demo.close()`.
# NOTE(review): newer Gradio releases appear to rename `allow_flagging` to
# `flagging_mode` - confirm against the installed version before switching.
demo = gr.Interface(
    fn=handle_multimodal_query,
    inputs=[text_box, image_box, audio_box],
    outputs=[text_output, audio_output],
    title="Farm AI Assistant Demo",
    description="Ask a farming question and get a text and audio response. "
                "You can use text, an image, or both.",
    allow_flagging="auto",
)
demo.launch()




* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Sending request to Gemini model...
Generating audio response...
Audio file generated successfully.
Created dataset file at: .gradio/flagged/dataset1.csv
