<a href="https://colab.research.google.com/github/MK316/Myapps/blob/main/mrkim21apps/tts-pitch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TTS with pitch contour (240202)

In [None]:
!pip install gtts matplotlib librosa IPython

In [None]:
from ipywidgets import Button
from IPython.display import Audio, display


In [None]:
# !pip install gtts matplotlib librosa IPython ipywidgets

import io
from gtts import gTTS
from ipywidgets import Button, Output
from IPython.display import Audio, display
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Function to extract and plot the pitch contour
def plot_pitch_contour(audio_file_path):
    """Show the waveform of an audio file with its pitch track overlaid.

    The pitch (F0) is estimated with ``librosa.pyin`` between the notes C2
    and C6. Frames with a usable pitch estimate are drawn as red dots on
    top of the waveform, and the figure is displayed with ``plt.show()``.

    Args:
        audio_file_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        None. The plot is shown as a side effect.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_file_path, sr=None)
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C6')
    pitch, _voiced_flag, _voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)

    # Non-finite estimates (pyin's fill value for unvoiced frames is NaN by
    # default) are mapped to 0 so that a simple "> 0" test selects the
    # frames worth plotting.
    pitch = np.where(np.isfinite(pitch), pitch, 0.0)
    voiced = pitch > 0
    times = librosa.times_like(pitch, sr=sr)

    plt.figure(figsize=(14, 5))
    # NOTE(review): the waveform shares the Hz axis with the pitch dots, so
    # with ylim(0, 350) it is presumably squeezed into a thin band near 0 --
    # confirm whether a secondary axis is wanted.
    librosa.display.waveshow(y, sr=sr)
    # A single vectorized call draws every marker; 'ro' has no line style,
    # so the result is the same set of red dots without creating one
    # artist per frame.
    plt.plot(times[voiced], pitch[voiced], 'ro')
    plt.title('Pitch Contour')
    plt.xlabel('Time (s)')
    plt.ylabel('Pitch (Hz)')
    plt.ylim(0, 350)
    plt.show()

# Function to generate speech and plot pitch contour
def generate_speech_and_plot_contour(text, lang='en', output_file='/content/output.mp3'):
    """Synthesize ``text`` with gTTS, play it, and plot its pitch contour.

    Args:
        text: The sentence to synthesize.
        lang: Language code passed to ``gTTS`` (for example ``'en'`` or ``'ko'``).
        output_file: Where the synthesized audio is written. gTTS produces
            MP3 data, so the default uses an ``.mp3`` extension; this lets
            ``IPython.display.Audio`` pick a matching MIME type from the
            file name.

    Returns:
        None. The audio player and the plot are displayed as side effects.
    """
    tts = gTTS(text=text, lang=lang)

    # Stream the synthesized audio straight into the file; no in-memory
    # copy is needed.
    with open(output_file, "wb") as f:
        tts.write_to_fp(f)

    # Display the generated audio
    print(f"This is one possible intonation of: {text}")
    display(Audio(output_file))

    # Plot the pitch contour
    plot_pitch_contour(output_file)

# Example text for English and Korean. The click handlers below read these
# module-level names at click time, so reassigning them changes what is spoken.
text_en = "Hello, how are you?"
text_ko = "안녕하세요, 어떻게 지내세요?"

# Output area that captures everything the handlers print or display
output = Output()


# Click callbacks: each run replaces the previous contents of `output`
@output.capture(clear_output=True)
def on_click_en(b):
    """Speak and plot the English sample sentence."""
    generate_speech_and_plot_contour(text_en, 'en')


@output.capture(clear_output=True)
def on_click_ko(b):
    """Speak and plot the Korean sample sentence."""
    generate_speech_and_plot_contour(text_ko, 'ko')


# Create each button and attach its callback right away
button_en = Button(description="Speak English")
button_en.on_click(on_click_en)

button_ko = Button(description="Speak Korean")
button_ko.on_click(on_click_ko)

# Show both buttons followed by the shared output area
display(button_en, button_ko, output)


# Gradio

In [None]:
!pip install gradio

In [None]:
import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from gtts import gTTS
import io
import os

# This function is adapted to work with Gradio
def generate_speech(text, lang='en', output_file='output.mp3'):
    """Synthesize ``text`` with gTTS and save it to ``output_file``.

    Args:
        text: The sentence to synthesize.
        lang: Language code passed to ``gTTS`` (for example ``'en'`` or ``'ko'``).
        output_file: Destination path for the audio. gTTS produces MP3
            data, so the default uses an ``.mp3`` extension rather than
            labelling MP3 bytes as a WAV file.

    Returns:
        The path the audio was written to (``output_file``).
    """
    tts = gTTS(text=text, lang=lang)
    # Write the synthesized audio directly to disk instead of buffering it
    # in memory first.
    with open(output_file, "wb") as f:
        tts.write_to_fp(f)
    return output_file

def plot_pitch_contour(audio_file_path):
    """Render the waveform and pitch track of an audio file to a PNG image.

    The pitch (F0) is estimated with ``librosa.pyin`` between the notes C2
    and C6. Frames with a usable pitch estimate are drawn as red dots on
    top of the waveform.

    Args:
        audio_file_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        The path of the saved image, ``'pitch_contour.png'``.
    """
    image_path = 'pitch_contour.png'

    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_file_path, sr=None)
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C6')
    pitch, _voiced_flag, _voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)

    # Non-finite estimates (pyin's fill value for unvoiced frames is NaN by
    # default) are mapped to 0 so that a simple "> 0" test selects the
    # frames worth plotting.
    pitch = np.where(np.isfinite(pitch), pitch, 0.0)
    voiced = pitch > 0
    times = librosa.times_like(pitch, sr=sr)

    plt.figure(figsize=(14, 5))
    try:
        librosa.display.waveshow(y, sr=sr)
        # A single vectorized call draws every marker; 'ro' has no line
        # style, so the result is the same set of red dots without
        # creating one artist per frame.
        plt.plot(times[voiced], pitch[voiced], 'ro')
        plt.title('Pitch Contour')
        plt.xlabel('Time (s)')
        plt.ylabel('Pitch (Hz)')
        plt.ylim(0, 350)
        plt.savefig(image_path)
    finally:
        # Always release the figure, even if plotting or saving fails, so a
        # long-running app does not accumulate open figures.
        plt.close()
    return image_path

def generate_and_plot(text, lang):
    """Synthesize ``text`` in ``lang`` and plot the pitch contour of the result.

    Returns:
        A ``(audio_path, image_path)`` pair: the value returned by
        ``generate_speech`` and the value returned by ``plot_pitch_contour``
        for that audio.
    """
    speech_path = generate_speech(text, lang)
    return speech_path, plot_pitch_contour(speech_path)

# Gradio interface: a text box and a language selector feed generate_and_plot,
# whose two return values fill the audio player and the image panel.
iface = gr.Interface(
    fn=generate_and_plot,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Radio(choices=['en', 'ko'], label="Language"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Image(label="Pitch Contour"),
    ],
    title="Speech Generation and Pitch Contour Visualization",
    description="Generates speech from text and visualizes the pitch contour. Select a language and enter text to see the results.",
)

# Start the web UI
iface.launch()
