In [None]:
import os
import subprocess
from pytube import YouTube
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import whisper
import librosa
import numpy as np
import torch
import gradio as gr
import sys

# Raise the interpreter's recursion ceiling far above the default.
# NOTE(review): nothing in this file recurses deeply; a limit this large can
# let runaway recursion crash the process instead of raising RecursionError.
# Confirm it is still needed.
sys.setrecursionlimit(100_000_000)

# Hugging Face checkpoint shared by the processor and the generation model.
_HF_WHISPER_CHECKPOINT = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(_HF_WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(_HF_WHISPER_CHECKPOINT)

# Separate openai-whisper model, loaded directly onto the GPU; it is used for
# spoken-language detection in translate_speech.
whisper_model = whisper.load_model("small", device="cuda")

def translate_speech(audio_file, target_language):
    """Detect the spoken language of an audio file and decode it to text.

    Two models are involved: the openai-whisper ``whisper_model`` estimates
    the spoken language, and the Hugging Face ``model``/``processor`` pair
    generates the text with the decoder language set to ``target_language``.

    Args:
        audio_file: Path to an audio file readable by librosa and whisper.
        target_language: Language name passed to ``model.generate`` (the UI
            supplies values such as ``"english"`` or ``"japanese"``).

    Returns:
        A ``(detected_language, text)`` tuple. ``detected_language`` is the
        highest-probability key returned by ``whisper_model.detect_language``
        and ``text`` is the first decoded sequence.

    Raises:
        gr.Error: If no audio file or no target language was supplied, so the
            Gradio UI shows a readable message instead of a raw traceback.
    """
    # Gradio passes None when the user has not recorded/uploaded audio or has
    # not picked a language; fail early with a clear message.
    if not audio_file:
        raise gr.Error("No audio was provided. Please record or upload audio first.")
    if not target_language:
        raise gr.Error("Please select a target language.")

    # Features for the Hugging Face model: 16 kHz mono with a leading batch axis.
    audio_input, sampling_rate = librosa.load(audio_file, sr=16000, mono=True)
    audio_input = np.expand_dims(audio_input, axis=0)
    input_features = processor(
        audio_input, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features
    model.to("cuda")
    input_features = input_features.to("cuda")

    # Language detection with the openai-whisper model. The spectrogram must
    # live on *that* model's device, not on the Hugging Face model's device.
    og_audio = whisper.load_audio(audio_file)
    og_audio = whisper.pad_or_trim(og_audio)
    mel = whisper.log_mel_spectrogram(og_audio).to(whisper_model.device)
    _, probs = whisper_model.detect_language(mel)
    detected_lng = max(probs, key=probs.get)

    # NOTE(review): the task is "transcribe" with the decoder language forced
    # to the target language; confirm this yields the intended translation.
    predicted_ids = model.generate(
        input_features, language=str(target_language), task="transcribe"
    )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return detected_lng, transcription

def get_audio_from_youtube(url):
    """Download the first audio-only stream of a YouTube video.

    The stream is saved into the current working directory and the file is
    then renamed so that its extension is ``.mp3``. The rename does not
    transcode anything; the file keeps whatever container pytube downloaded.

    Args:
        url: URL of the YouTube video.

    Returns:
        Path of the renamed ``.mp3`` file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # .first() yields None when the filter matches nothing; report that
        # explicitly instead of failing with an AttributeError below.
        raise ValueError(f"No audio-only stream available for {url!r}")

    downloaded = stream.download(output_path=".")
    base, _ = os.path.splitext(downloaded)
    new_file = base + '.mp3'
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises on Windows when the same video is requested twice.
    os.replace(downloaded, new_file)
    return new_file

def translate_mic(micorphone_input, target_language):
    """Run translate_speech on a microphone recording.

    Args:
        micorphone_input: Path of the recorded audio file.
        target_language: Language name forwarded to translate_speech.

    Returns:
        Whatever translate_speech returns for the recording.
    """
    result = translate_speech(micorphone_input, target_language)
    return result

def translate_audio_file(audio_file, target_language):
    """Run translate_speech on an uploaded audio file.

    Args:
        audio_file: Path of the uploaded audio file.
        target_language: Language name forwarded to translate_speech.

    Returns:
        Whatever translate_speech returns for the file.
    """
    result = translate_speech(audio_file, target_language)
    return result

def translate_youtube_video(url, target_language):
    """Download a YouTube video's audio and run translate_speech on it.

    Args:
        url: URL of the YouTube video.
        target_language: Language name forwarded to translate_speech.

    Returns:
        Whatever translate_speech returns for the downloaded audio.
    """
    return translate_speech(get_audio_from_youtube(url), target_language)

# Shut down any Gradio servers left running by earlier executions of this cell.
gr.close_all()

# Languages offered in every tab's dropdown; kept in one place so the tabs
# cannot drift apart.
LANGUAGE_CHOICES = ("english", "japanese", "korean", "chinese", "german", "spanish", "russian")

with gr.Blocks() as demo:
    gr.Markdown("## Whisper Translation System")

    with gr.Tab("Translate Speech"):
        with gr.Row():
            microphone_input = gr.Audio(source="microphone", type="filepath", label="Record your speech")
            language_dropdown_mic = gr.Dropdown(list(LANGUAGE_CHOICES), label="Select Target Language")
            translate_mic_button = gr.Button("Translate Speech")
        with gr.Row():
            detected_language_mic = gr.Textbox(label="Detected Language")
            translated_text_mic = gr.Textbox(label="Translated Text")
            # Exactly one click handler: a second, output-less registration
            # would run the whole translation twice per click.
            translate_mic_button.click(
                translate_mic,
                inputs=[microphone_input, language_dropdown_mic],
                outputs=[detected_language_mic, translated_text_mic],
            )

    with gr.Tab("Translate Audio File"):
        with gr.Row():
            audio_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
            language_dropdown = gr.Dropdown(list(LANGUAGE_CHOICES), label="Select Target Language")
            translate_button = gr.Button("Translate Audio")
        with gr.Row():
            detected_language = gr.Textbox(label="Detected Language")
            translated_text = gr.Textbox(label="Translated Text")
            translate_button.click(
                translate_audio_file,
                inputs=[audio_input, language_dropdown],
                outputs=[detected_language, translated_text],
            )

    with gr.Tab("Translate YouTube Video"):
        with gr.Row():
            youtube_input = gr.Textbox(label="Enter YouTube URL")
            language_dropdown_youtube = gr.Dropdown(list(LANGUAGE_CHOICES), label="Select Target Language")
            translate_youtube_button = gr.Button("Translate Video")
        with gr.Row():
            detected_language_youtube = gr.Textbox(label="Detected Language")
            translated_text_youtube = gr.Textbox(label="Translated Text")
            translate_youtube_button.click(
                translate_youtube_video,
                inputs=[youtube_input, language_dropdown_youtube],
                outputs=[detected_language_youtube, translated_text_youtube],
            )

# Queue incoming requests and start the server. share=True also requests a
# public Gradio share link, so the app is reachable from outside this machine.
demo.queue().launch(share=True)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://7c5b9839ed7fa57b09.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




  audio_input, sampling_rate = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
