# In [None]:
import os
import gradio as gr
import requests
import io
from PIL import Image
from groq import Groq
from huggingface_hub import login
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Read the three required credentials from the environment.
groq_api_key = os.getenv("groq_api_key")
hf_token = os.getenv("token")
image_api_key = os.getenv("imagetoken")

# Fail fast on the first credential that is missing or empty. The order of the
# pairs determines which error is reported when several are absent.
for _credential, _missing_message in (
    (groq_api_key, "Groq API key not found. Please ensure 'groq_api_key' is set as an environment variable."),
    (hf_token, "Hugging Face API token not found. Please ensure 'token' is set as an environment variable."),
    (image_api_key, "Image generation API token not found. Please ensure 'imagetoken' is set as an environment variable."),
):
    if not _credential:
        raise ValueError(_missing_message)

# Groq client (used by the transcription step)
client = Groq(api_key=groq_api_key)

# Log in to the Hugging Face Hub first, then load the MBart-50 model and its
# tokenizer from the same checkpoint.
login(token=hf_token)
_MBART_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(_MBART_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(_MBART_CHECKPOINT)

# Stable Diffusion inference endpoint and the bearer-token header sent with it
API_URL = "https://api-inference.huggingface.co/models/CompVis/stable-diffusion-v1-4"
headers = {"Authorization": "Bearer {}".format(image_api_key)}

# Function to transcribe audio using Groq
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with Groq's Whisper endpoint.

    The request asks for Tamil (``language="ta"``) and a ``verbose_json``
    response. This function does not raise for failures inside the request:
    every outcome is reported as a string.

    Args:
        audio_path: Path of the audio file to send, or ``None``.

    Returns:
        The ``text`` attribute of the transcription response on success;
        otherwise a human-readable message describing what went wrong.
    """
    if audio_path is None:
        return "Please upload an audio file."

    not_present = object()  # distinguishes "no text attribute" from any real value
    try:
        print("Transcribing audio...")
        with open(audio_path, "rb") as audio_handle:
            upload = (os.path.basename(audio_path), audio_handle.read())
            result = client.audio.transcriptions.create(
                file=upload,
                model="whisper-large-v3",
                response_format="verbose_json",
                language="ta",
            )
        print(f"Transcription Response: {result}")  # Debugging output
        text = getattr(result, "text", not_present)
        if text is not_present:
            return "Transcription response does not contain expected text."
        return text
    except Exception as exc:
        detail = str(exc)
        print(f"Transcription error: {detail}")
        return f"An error occurred during transcription: {detail}"

# Function to translate Tamil to English using MBart
def translate_tamil_to_english(tamil_text):
    """Translate *tamil_text* to English with the module-level MBart model.

    Sets the tokenizer's source language to ``ta_IN`` and forces ``en_XX`` as
    the first generated token, then decodes the first generated sequence.

    Args:
        tamil_text: Tamil text to translate; may be ``None``.

    Returns:
        The decoded English text, or ``"No text to translate"`` when the input
        is ``None``, empty, or whitespace only.
    """
    has_content = tamil_text is not None and tamil_text.strip()
    if not has_content:
        return "No text to translate"

    print("Translating text...")
    tokenizer.src_lang = "ta_IN"
    model_inputs = tokenizer(tamil_text, return_tensors="pt")
    english_bos_id = tokenizer.lang_code_to_id["en_XX"]
    output_ids = model.generate(**model_inputs, forced_bos_token_id=english_bos_id)
    first_sequence = output_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Function to generate image using Hugging Face API
def generate_image(prompt, timeout=120):
    """Request an image for *prompt* from the Stable Diffusion inference API.

    Failures are reported as a string rather than raised, so callers can tell
    success from failure by the type of the result.

    Args:
        prompt: Text prompt sent as the ``inputs`` field of the JSON payload.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The raw response body (``bytes``) when the API answers with HTTP 200;
        otherwise a ``str`` starting with ``"Error generating image: "``.
    """
    try:
        response = requests.post(
            API_URL,
            headers=headers,
            json={"inputs": prompt},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same string-error contract
        # as a non-200 response instead of escaping to the caller.
        return f"Error generating image: {exc}"

    if response.status_code == 200:
        return response.content
    return f"Error generating image: {response.status_code}, {response.text}"

# Combined pipeline: audio and/or text -> Tamil text -> English -> image

# Failure messages that transcribe_audio returns in place of a transcription.
# They are matched by prefix so that genuine text which merely contains the
# word "error" is not mistaken for a failure.
_TRANSCRIPTION_FAILURE_PREFIXES = (
    "An error occurred during transcription:",
    "Transcription response does not contain expected text.",
    "Please upload an audio file.",
)

# Sentinel that translate_tamil_to_english returns for empty input.
_NO_TRANSLATION_SENTINEL = "No text to translate"


def _is_transcription_failure(text):
    """Return True if *text* is not a string or is a transcribe_audio failure message."""
    return not isinstance(text, str) or text.startswith(_TRANSCRIPTION_FAILURE_PREFIXES)


def process_audio(audio_file, tamil_text_input):
    """Run the transcription -> translation -> image pipeline.

    The audio file is tried first. If it is absent or its transcription fails,
    the typed Tamil text is used instead. A failure in any later stage is
    logged and leaves that stage's output as ``None``, so results from earlier
    stages are still returned.

    Args:
        audio_file: Path to an audio file, or a falsy value if none was given.
        tamil_text_input: Tamil text typed by the user, or a falsy value.

    Returns:
        A ``(transcription, translation, image)`` tuple. ``translation`` is
        ``None`` when translation was not possible. ``image`` is a PIL image
        or ``None``; it is never an error string, because the image output
        component cannot render one. When there is no usable input at all the
        tuple is ``("No valid audio file or text provided", None, None)``.
    """
    transcription = None
    translation = None
    image = None

    # Step 1: Process Audio Transcription if Audio is Provided
    if audio_file:
        print(f"Received audio file: {audio_file}")  # Debugging output for audio file path
        try:
            transcription = transcribe_audio(audio_file)
        except Exception as e:
            print(f"Error accessing audio file: {e}")
            transcription = None
        else:
            print(f"Transcription: {transcription}")  # Debugging output
            if _is_transcription_failure(transcription):
                transcription = None  # Do not translate an error message

    # Step 2: Use Tamil Text Input Directly if No Valid Audio Transcription
    if not transcription and tamil_text_input:
        transcription = tamil_text_input
        print(f"Using Tamil Text Input: {tamil_text_input}")  # Debugging output

    # Step 3: Translate Transcription if Available
    if transcription:
        try:
            translation = translate_tamil_to_english(transcription)
        except Exception as e:
            # Keep the transcription visible even if the model call fails.
            print(f"Translation error: {e}")
            translation = None
        else:
            print(f"Translation: {translation}")  # Debugging output
            if translation == _NO_TRANSLATION_SENTINEL:
                translation = None

    # Step 4: Generate Image if Translation is Available
    if translation:
        try:
            image_result = generate_image(translation)
        except Exception as e:
            print(f"Image generation error: {e}")
            image_result = None

        if isinstance(image_result, str):
            # generate_image reports failures as a string; log it rather than
            # passing it on as if it were an image.
            print(f"Image Generation Response: {image_result}")
        elif image_result is not None:
            # Log the size only; dumping raw image bytes floods the console.
            print(f"Image Generation Response: {len(image_result)} bytes")
            try:
                image = Image.open(io.BytesIO(image_result))
            except Exception as e:
                print(f"Error opening image: {e}")
                image = None

    # Final Output Handling
    if transcription or translation:
        return transcription, translation, image
    return "No valid audio file or text provided", None, None



# Gradio interface: components are built first, in the same order in which
# they are passed to the Interface below.
audio_upload = gr.Audio(type="filepath", label="Upload Audio File (Optional)")
tamil_text_box = gr.Textbox(label="Enter Tamil Text (Optional)")

transcription_box = gr.Textbox(label="Transcribed Text (Tamil)")
translation_box = gr.Textbox(label="Translated Text (English)")
generated_image = gr.Image(type="pil", label="Generated Image")

# The outputs list lines up positionally with the three values that
# process_audio returns.
iface = gr.Interface(
    fn=process_audio,
    inputs=[audio_upload, tamil_text_box],
    outputs=[transcription_box, translation_box, generated_image],
    title="Audio to Image with Translation",
    description="Upload an audio file, transcribe it, translate it to English, and generate an image based on the translation.",
)

# Launch the interface with debug mode enabled
iface.launch(debug=True)
