# In [None]:
import gradio as gr
import numpy as np
import torch
import soundfile as sf
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from huggingface_hub import hf_hub_download

# Orange-accented variant of Gradio's Base theme used by the interface below.
theme = gr.themes.Base(primary_hue="orange", secondary_hue="orange")
# Fill secondary buttons with the accent colour; the "_dark" entry supplies
# the separate value used for the dark variant.
theme = theme.set(
    button_secondary_background_fill="*primary_500",
    button_secondary_background_fill_dark="*secondary_600",
)

# Load ASR and Text Generation Pipelines
speech_recognition = pipeline("automatic-speech-recognition", model=r"C:\Users\user\Documents\project\101stt")
chatbot = pipeline("text-generation", model=r"C:\Users\user\Documents\project\10bot")

# bot() generates replies through the bare module-level names `tokenizer` and
# `model`; bind them to the text-generation pipeline's own tokenizer and model
# so those names exist.  Binding at load time keeps them valid even if the
# name `chatbot` is rebound to something else later on.
tokenizer = chatbot.tokenizer
model = chatbot.model

# Function to handle audio transcription
def transcribe_audio(audio):
    """Transcribe a recording with the module-level ``speech_recognition`` pipeline.

    Args:
        audio: ``(sampling_rate, samples)`` pair, or ``None`` when nothing has
            been recorded.  ``samples`` is a NumPy array; a 2-D array is
            treated as ``(n_samples, n_channels)`` and averaged down to mono.

    Returns:
        The recognised text, or an empty string when there is no audio to
        transcribe.
    """
    # No recording yet: nothing to unpack, so report an empty transcript.
    if audio is None:
        return ""

    sr, y = audio
    y = y.astype(np.float32)

    # The recogniser is fed a single channel, so average the channels of a
    # multi-channel recording.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # An empty buffer has no peak to normalise against.
    if y.size == 0:
        return ""

    # Peak-normalise, but leave an all-zero (silent) signal untouched rather
    # than dividing by zero and filling it with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    return speech_recognition({"sampling_rate": sr, "raw": y})["text"]


# Function to queue the user's message in the chat history
def generate_response(message, history):
    """Append the user's message to the chat history as an unanswered turn.

    Args:
        message: Text of the user's message.
        history: Existing chat history as a sequence of ``[user, bot]`` pairs;
            ``None`` is treated as an empty history.

    Returns:
        A ``("", new_history)`` tuple: the empty string clears the input box,
        and ``new_history`` is a new list ending in ``[message, None]``.  The
        ``history`` argument itself is not modified.
    """
    pending_turn = [message, None]  # the reply slot is filled in by a later step
    return "", [*(history or []), pending_turn]

def bot(history):
    """Fill in the chatbot's reply for the latest turn of the chat history.

    Relies on the module-level names ``tokenizer`` and ``model`` for encoding,
    generation and decoding.

    Args:
        history: Chat history as a list of ``[user, bot]`` pairs whose last
            entry holds the user message that still needs a reply.

    Returns:
        The same ``history`` object with its last entry replaced by a
        ``(user_message, reply)`` tuple.  An empty or missing history is
        returned unchanged.
    """
    # Nothing to answer yet.
    if not history:
        return history

    user_message = history[-1][0]

    # Encode the user's turn, terminated by the end-of-sequence token.
    input_ids = tokenizer.encode(
        user_message + tokenizer.eos_token, return_tensors="pt"
    )

    # generate a response
    output_ids = model.generate(
        input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    )

    # The generated sequence starts with the prompt tokens, so keep only what
    # follows them.  Decoding just that slice avoids splitting the text on a
    # hard-coded separator string, which fails when the separator is absent.
    reply_ids = output_ids[0, input_ids.shape[-1]:]
    reply = tokenizer.decode(reply_ids, skip_special_tokens=True)

    history[-1] = (user_message, reply)
    return history


# Text-to-Speech (TTS)
def load_speaker_embedding(filepath: str) -> torch.Tensor:
    """Read an array saved as a NumPy ``.npy`` file into a tensor.

    Args:
        filepath: Location of the ``.npy`` file to load.

    Returns:
        The stored array as a tensor with one extra leading axis of size 1.
    """
    stored_values = np.load(filepath)
    embedding = torch.tensor(stored_values)
    # Prepend a size-1 axis (presumably the batch dimension the TTS model
    # expects -- confirm against the caller).
    return torch.unsqueeze(embedding, dim=0)

# Locations of the local SpeechT5 checkpoint, its vocoder and the speaker
# embedding used when the caller does not supply one.
_TTS_MODEL_DIR = r"C:\Users\user\Documents\project\t5tts"
_TTS_VOCODER_DIR = r"C:\Users\user\Documents\project\t5tts\hifigan"
_DEFAULT_SPEAKER_EMBEDDING_PATH = r"C:\Users\user\Documents\project\t5tts\speaker_em.npy"

# (processor, model, vocoder) once loaded; None until the first request.
_tts_components = None


def _get_tts_components():
    """Return the SpeechT5 processor, model and vocoder, loading them once."""
    global _tts_components
    if _tts_components is None:
        # Build the full tuple before assigning so that a failed load never
        # leaves a partially filled cache behind.
        _tts_components = (
            SpeechT5Processor.from_pretrained(_TTS_MODEL_DIR),
            SpeechT5ForTextToSpeech.from_pretrained(_TTS_MODEL_DIR),
            SpeechT5HifiGan.from_pretrained(_TTS_VOCODER_DIR),
        )
    return _tts_components


def synthesize_speech(text: str, embedding_file_path: str = _DEFAULT_SPEAKER_EMBEDDING_PATH):
    """Convert ``text`` to speech and save it as a WAV file.

    Args:
        text: The text to speak.
        embedding_file_path: Path of the ``.npy`` speaker embedding to
            condition the voice on.  Defaults to the embedding stored next to
            the local SpeechT5 checkpoint.

    Returns:
        The path of the written file, ``"tts_example.wav"``.  The file name is
        fixed, so each call overwrites the previous output.
    """
    # Reuse the already-loaded components instead of re-reading three
    # checkpoints from disk on every request.
    processor, tts_model, vocoder = _get_tts_components()

    # Prepare inputs and load the requested speaker embedding
    inputs = processor(text=text, return_tensors="pt")
    speaker_embedding = load_speaker_embedding(embedding_file_path)

    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)

    # Save generated audio to .wav file (16 kHz sample rate) and return path
    output_file = "tts_example.wav"
    sf.write(output_file, speech.numpy(), samplerate=16000)
    return output_file



# Gradio Interface

# Speaker embedding handed to synthesize_speech by the TTS button.
SPEAKER_EMBEDDING_PATH = r"C:\Users\user\Documents\project\t5tts\speaker_em.npy"


def speak_last_reply(history):
    """Synthesize speech for the most recent chatbot reply.

    The TTS button receives the whole chat history, while synthesize_speech
    takes a text string and an embedding path; this adapter bridges the two.

    Args:
        history: Chat history as a list of ``[user, bot]`` pairs.

    Returns:
        The path returned by synthesize_speech, or ``None`` (no audio) when
        there is no reply to speak yet.
    """
    if not history:
        return None
    reply = history[-1][1]
    if not reply:
        return None
    return synthesize_speech(reply, SPEAKER_EMBEDDING_PATH)


with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Hausa Chatbot tare da Gane Magana da TTS")

    # Chat display holding the message history.  Named `chat_window` so it
    # does not overwrite the text-generation pipeline stored in `chatbot`.
    chat_window = gr.Chatbot()

    # Audio input, transcription, and text-to-speech components
    with gr.Row():
        audio_input = gr.Audio(label="Yi Rikodin Magana")
        transcription_output = gr.Textbox(label="Rubutaccen Magana")

    tts_audio = gr.Audio(label="Maganar da aka Haifa daga TTS")

    # Buttons
    transcribe_button = gr.Button("Fassara Sauti")
    response_button = gr.Button("Samu Amsa daga Chatbot")
    tts_button = gr.Button("Haifar da Magana")

    # Actions for Buttons
    transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
    # First move the transcript into the history (and clear the textbox),
    # then let the bot fill in its reply.
    response_button.click(
        generate_response,
        [transcription_output, chat_window],
        [transcription_output, chat_window],
        queue=False,
    ).then(bot, chat_window, chat_window)
    # Speak only the latest reply instead of passing the raw history to the
    # two-argument synthesize_speech.
    tts_button.click(fn=speak_last_reply, inputs=chat_window, outputs=tts_audio)

# Launch interface
demo.launch()