In [18]:
import gradio as gr
import google.generativeai as genai
import os
import io
from PIL import Image
from gtts import gTTS
import tempfile
import torch
from transformers import pipeline
from groq import Groq
import elevenlabs
from elevenlabs.client import ElevenLabs
import platform
import subprocess
from uuid import uuid4
from dotenv import load_dotenv

In [7]:
# Pull variables from a local .env file (when one exists) into the process environment.
load_dotenv()

# Required credentials, each paired with the error raised when it is absent.
# Dict order is the check order, so the first missing key is the one reported.
missing_key_errors = {
    "GROQ_API_KEY": "Please set the GROQ_API_KEY environment variable.",
    "GOOGLE_API_KEY": "Please set the GOOGLE_API_KEY environment variable.",
    "ELEVENLABS_API_KEY": "Please set the ELEVENLABS_API_KEY environment variable.",
}
for env_var, error_text in missing_key_errors.items():
    if os.getenv(env_var) is None:
        raise ValueError(error_text)

# Shared Gemini model handle used by the query handler defined in a later cell.
model = genai.GenerativeModel('gemini-2.5-flash')
# Smoke test: as the cell's final expression, the reply text is shown as the cell output.
model.generate_content("Hello").text

'Hello! How can I help you today?'

In [19]:
def handle_multimodal_query(text_input, image_input, audio_input=None):
    """
    Answer a farming question supplied as text, an image, recorded audio,
    or any combination of the three.

    Recorded audio is transcribed with ``transcribe_with_groq`` and appended
    to the typed text. The combined question (plus the image, when given) is
    sent to the module-level Gemini ``model``, and the reply is converted to
    speech with ``text_to_speech_with_elevenlabs``.

    Args:
        text_input (str | None): The text description of the problem.
        image_input (PIL.Image.Image | None): A PIL Image object of the crop
            or pest.
        audio_input (str | None): The file path to a recorded audio file.

    Returns:
        tuple: ``(response_text, audio_path)``. When nothing was submitted, or
               when any step raises, the first element is a human-readable
               message and the second is ``None``.
    """
    # Normalise a missing text value so it never reaches the prompt as the
    # literal string "None".
    text_input = text_input or ""
    has_image = image_input is not None

    print("Received inputs:")
    print(f"Text input: {text_input}")
    if has_image:
        print(f"Image input: {image_input}")
    if audio_input:
        print(f"Audio input: {audio_input}")

    # Nothing to answer: skip the remote model and speech calls entirely.
    if not text_input.strip() and not has_image and not audio_input:
        return "Please provide a question as text, an image, or an audio recording.", None

    question_id = str(uuid4())
    try:
        # --- Transcribe Audio Input ---
        # The recording is converted to text and appended to whatever the
        # user typed, so the model sees a single combined question.
        if audio_input:
            print(f"Audio file received at: {audio_input}")
            stt_model = "whisper-large-v3"
            transcribed_text = transcribe_with_groq(stt_model, audio_input, os.getenv("GROQ_API_KEY"))
            text_input = f"{text_input}\nTranscribed audio: {transcribed_text}"

        # --- Construct the Multimodal Prompt ---
        prompt_parts = [
            "You are an AI assistant specialized in providing advice to farmers. "
            "Analyze the following information to answer the farmer's question. "
            "Be practical and concise. If you cannot provide a specific answer, "
            "provide general helpful advice. Keep your answer concise (max 2 sentences). No preamble, start your answer right away please. No astricks please.",
            f"\n\nFarmer's Question: {text_input}"
        ]

        if has_image:
            # The image travels as its own prompt part so the model can
            # analyze it alongside the text.
            prompt_parts.append(image_input)

        # --- Call the Generative AI Model ---
        print("Sending request to Gemini model...")
        response = model.generate_content(prompt_parts)
        ai_text_response = response.text
        print(f"AI Response generated: {ai_text_response}")

        # --- Convert AI Text Response to Speech ---
        print("Converting AI response to speech...")
        # A fresh UUID per request keeps each response in its own file.
        output_filepath = f"generated_speech_responses/speech_response_{question_id}.mp3"
        text_to_speech_with_elevenlabs(input_text=ai_text_response, output_filepath=output_filepath)
        print(f"Generated speech saved to: {output_filepath}")
        return ai_text_response, output_filepath

    except Exception as e:
        # Deliberately broad: any failure (transcription, model, speech) is
        # returned as the text result instead of raising to the caller.
        error_message = f"An error occurred: {e}"
        print(error_message)
        return error_message, None
    
def transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY):
    """
    Transcribe an audio file through the Groq transcription API.

    Args:
        stt_model (str): Name of the speech-to-text model to request.
        audio_filepath (str): Path of the audio file to upload.
        GROQ_API_KEY (str): API key used to construct the Groq client.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    client = Groq(api_key=GROQ_API_KEY)

    # The context manager closes the file handle even when the API call raises.
    with open(audio_filepath, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model=stt_model,
            file=audio_file,
            language="en",  # transcription language is fixed to English
        )
    print(transcription)
    return transcription.text

def text_to_speech_with_elevenlabs(input_text, output_filepath):
    """
    Synthesize speech for ``input_text`` with ElevenLabs, save it as an MP3
    at ``output_filepath``, then try to play it on the local machine.

    The API key is read from the ``ELEVENLABS_API_KEY`` environment variable.
    Playback is best-effort: any exception raised while launching the player
    is printed, not propagated.

    Args:
        input_text (str): Text to convert to speech.
        output_filepath (str): Destination path for the MP3 file. Missing
            parent directories are created.

    Returns:
        None
    """
    # Create the destination folder up front so the save cannot fail on a
    # missing directory after the synthesis request has been made.
    target_dir = os.path.dirname(output_filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
    audio = client.text_to_speech.convert(
        text=input_text,
        voice_id="aGb0TwKthRLQTPThYRqI",
        output_format="mp3_44100_128",
        model_id="eleven_turbo_v2"
    )
    elevenlabs.save(audio, output_filepath)

    # Play the file with a per-OS command. subprocess.run waits for the
    # player to exit, so this function returns only after playback ends.
    # NOTE(review): Media.SoundPlayer and aplay are WAV/PCM-oriented and may
    # not decode this MP3 output - confirm, or switch to mpg123/ffplay.
    os_name = platform.system()
    try:
        if os_name == "Darwin":  # macOS
            subprocess.run(['afplay', output_filepath])
        elif os_name == "Windows":  # Windows
            subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{output_filepath}").PlaySync();'])
        elif os_name == "Linux":  # Linux
            subprocess.run(['aplay', output_filepath])  # Alternative: use 'mpg123' or 'ffplay'
        else:
            raise OSError("Unsupported operating system")
    except Exception as e:
        print(f"An error occurred while trying to play the audio: {e}")


In [29]:
# --- Step 4: Build the Gradio Interface ---
# Input components. Their order must match the positional parameters of
# handle_multimodal_query: (text_input, image_input, audio_input).
text_box = gr.Textbox(
    label="Describe your farming issue:", 
    placeholder="e.g., 'My tomato plant leaves are turning yellow.'", 
    interactive=True
)
image_box = gr.Image(
    type="pil",  # handler receives a PIL image object
    label="Upload an image of the plant (optional):", 
    interactive=True
)
audio_box = gr.Audio(
    label="Record your question (optional):", 
    sources=["microphone"], 
    interactive=True,
    type="filepath"  # handler receives the path of the recorded file
)

# Output components. Their order must match the handler's return tuple:
# (response text, path to the generated audio file).
text_output = gr.Textbox(
    label="AI Response (Text):", 
    placeholder="The AI's response will appear here...", 
    interactive=False
)
audio_output = gr.Audio(
    label="AI Response (Audio):", 
    interactive=False,
    type="filepath"   # Explicitly state you'll return a filepath
)

# Create the Gradio interface and start the local server.
# The description lists every accepted input, including the voice recording.
gr.Interface(
    fn=handle_multimodal_query,
    inputs=[text_box, image_box, audio_box],
    outputs=[text_output, audio_output],
    title="Farm AI Assistant Demo",
    description="Ask a farming question and get a text and audio response. "
                "You can use text, an image, a voice recording, or any combination.",
).launch()


* Running on local URL:  http://127.0.0.1:7872
* To create a public link, set `share=True` in `launch()`.


