# 유튜브 자막 생성 Application

Colab 환경에서 유튜브 자막 생성 애플리케이션을 만들어봅시다. 애플리케이션 사용자의 유스케이스는 아래와 같습니다.


## Colab 환경 설정
필요한 Python 패키지들을 설치합니다.

In [None]:
# Set to False when running locally instead of on Colab.
using_colab = True

In [None]:
# On Colab, download the project's requirements file and install the
# packages it lists. Skipped entirely when using_colab is False.
if using_colab:
    !wget https://raw.githubusercontent.com/mrsyee/dl_apps/main/speech_recognition/requirements.txt
    !pip install -r requirements.txt

## Import dependencies

In [None]:
import os

import torch
import gradio as gr
from pytube import YouTube
import whisper
from whisper.utils import get_writer

## UI 구성

In [None]:
# Layout-only preview of the UI: inputs in a narrow left column (scale=1)
# and the video player in a wider right column (scale=4). No event handler
# is attached in this cell.
with gr.Blocks() as app:
    gr.Markdown("# Speech to Text")

    with gr.Row():
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            # File component restricted to .srt subtitle files.
            subtile = gr.File(label="Subtitle", file_types=[".srt"])
            # Caption spelling corrected (was "Transcibe!").
            submit_btn = gr.Button(value="Transcribe!")

        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)


In [None]:
# Start the app without embedding it in the notebook output (inline=False)
# and request a public share link (share=True).
app.launch(inline=False, share=True)

In [None]:
# Shut down the running app.
app.close()

## 유튜브 링크에서 영상 가져오기

In [None]:
# Sample video used throughout the walkthrough cells below.
youtube_link = "https://youtu.be/Or6zvOnSDXA"
yt = YouTube(youtube_link)

In [None]:
# Query the progressive MP4 video streams, sorted by resolution in
# descending order, and print each candidate for inspection.
streams = (
    yt.streams
    .filter(progressive=True, file_extension="mp4", type="video")
    .order_by("resolution")
    .desc()
)
for stream in streams:
    print(stream)

In [None]:
# Video titles can contain characters that are invalid in file names
# (for example a path separator), which would make the download fail.
# Sanitize the title with pytube's own helper before using it as a name.
from pytube.helpers import safe_filename

youtube_video_path = f"{safe_filename(yt.title)}.mp4"
# streams[0] is the first entry of the resolution-sorted (descending) query.
streams[0].download(filename=youtube_video_path)

In [None]:
from IPython.display import HTML
from base64 import b64encode

# Read the downloaded video into memory and encode it as a base64 data URL
# so it can be embedded directly in an HTML <video> tag.
with open(youtube_video_path, "rb") as f:
    video = f.read()

data_url = "data:video/mp4;base64," + b64encode(video).decode()

# The HTML object is the cell's final expression, so the notebook renders it.
HTML(f"""
<video width=400 controls>
      <source src="{data_url}" type="video/mp4">
</video>
""")

## Writer 클래스를 활용해 SRT 자막 파일 만들기
ref: https://github.com/openai/whisper/blob/main/whisper/utils.py#L235

In [None]:
# Target file name for the audio track extracted from the video.
audio_file_name = "audio_from_youtube.webm"

# Audio-only streams sorted by their "abr" attribute, highest first;
# the first entry is the one that gets downloaded.
audio_streams = (
    yt.streams
    .filter(type="audio")
    .order_by("abr")
    .desc()
)
audio_streams[0].download(filename=audio_file_name)

In [None]:
# Load the "large" Whisper model.
model = whisper.load_model("large")

In [None]:
# Transcribe the downloaded audio file; the bare name on the last line
# makes the notebook display the result.
transcript = model.transcribe(audio_file_name)
transcript

In [None]:
# Build an SRT writer targeting the current directory, write the transcript,
# then print the generated subtitle file.
srt_writer = get_writer(output_format="srt", output_dir=".")
srt_writer(transcript, audio_file_name)
!cat audio_from_youtube.srt

In [None]:
# Drop the model reference and clear PyTorch's CUDA cache before the next
# section loads the model again.
del model
torch.cuda.empty_cache()

## Whisper 추론기

In [None]:
class WhisperInferencer:
    """Transcribe audio files with Whisper and save the result as SRT."""

    def __init__(self):
        # NOTE(review): output_dir is not read anywhere in this class; the
        # writer below is configured to write into the current directory.
        self.output_dir = "outputs"
        self.model = whisper.load_model("large")
        self.srt_writer = get_writer(output_format="srt", output_dir=".")

    def inference(self, audio_file_path: str) -> str:
        """Transcribe an audio file and write its subtitles as an SRT file.

        Args:
            audio_file_path: Path of the audio file to transcribe.

        Returns:
            The SRT file name: the audio file's base name with its final
            extension replaced by ``.srt``.
        """
        transcript = self.model.transcribe(audio_file_path)
        self.srt_writer(transcript, audio_file_path)

        # Strip only the final extension. Splitting on the first "." would
        # truncate names such as "my.talk.webm" to "my", which no longer
        # matches the file Whisper's writer creates (it uses os.path.splitext).
        stem, _ = os.path.splitext(os.path.basename(audio_file_path))

        return f"{stem}.srt"

# Single shared inferencer instance used by the transcribe handler.
whipser_inferencer = WhisperInferencer()

In [None]:
def transcribe(link: str):
    """Download a YouTube video and its audio, then generate SRT subtitles.

    Args:
        link: URL of the YouTube video.

    Returns:
        A 2-tuple ``(transcript_file, [video_file_name, transcript_file])``:
        the subtitle file name, and the video/subtitle pair.

    Raises:
        gr.Error: If ``link`` is blank, or no matching video or audio
            stream exists for the link.
    """
    # Fail early with a readable message instead of an opaque pytube error.
    if not link or not link.strip():
        raise gr.Error("Please enter a YouTube link.")

    video_file_name = "video_from_youtube.mp4"
    audio_file_name = "audio_from_youtube.webm"
    yt = YouTube(link.strip())

    # Extract video: first progressive MP4 stream after sorting by
    # resolution in descending order. first() yields None when the query is
    # empty, which avoids an IndexError from indexing an empty result.
    video_stream = (
        yt.streams
        .filter(progressive=True, file_extension="mp4", type="video")
        .order_by("resolution")
        .desc()
        .first()
    )
    if video_stream is None:
        raise gr.Error("No progressive MP4 video stream is available for this link.")
    video_stream.download(filename=video_file_name)

    # Extract audio: first audio stream after sorting by "abr" descending.
    audio_stream = yt.streams.filter(type="audio").order_by("abr").desc().first()
    if audio_stream is None:
        raise gr.Error("No audio stream is available for this link.")
    audio_stream.download(filename=audio_file_name)

    transcript_file = whipser_inferencer.inference(audio_file_name)
    return transcript_file, [video_file_name, transcript_file]

In [None]:
# Same layout as the preview, now wired to the transcribe handler.
with gr.Blocks() as app:
    gr.Markdown("# Speech to Text")

    with gr.Row():
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            # File component restricted to .srt subtitle files.
            subtile = gr.File(label="Subtitle", file_types=[".srt"])
            # Caption spelling corrected (was "Transcibe!").
            submit_btn = gr.Button(value="Transcribe!")

        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)

    # On click, pass the link text to transcribe and route its two return
    # values to the subtitle file component and the video component.
    submit_btn.click(transcribe, [link], [subtile, output_video])

In [None]:
# Start the app without embedding it in the notebook output (inline=False)
# and request a public share link (share=True).
app.launch(inline=False, share=True)

In [None]:
# Shut down the app, drop the inferencer (and the model it holds), and
# clear PyTorch's CUDA cache.
app.close()
del whipser_inferencer
torch.cuda.empty_cache()

## 최종 App 구현

In [None]:
import os

import gradio as gr
import whisper
from pytube import YouTube
from whisper.utils import get_writer

class WhisperInferencer:
    """Transcribe audio files with Whisper and save the result as SRT."""

    def __init__(self):
        # NOTE(review): output_dir is not read anywhere in this class; the
        # writer below is configured to write into the current directory.
        self.output_dir = "outputs"
        self.model = whisper.load_model("large")
        self.srt_writer = get_writer(output_format="srt", output_dir=".")

    def inference(self, audio_file_path: str) -> str:
        """Transcribe an audio file and write its subtitles as an SRT file.

        Args:
            audio_file_path: Path of the audio file to transcribe.

        Returns:
            The SRT file name: the audio file's base name with its final
            extension replaced by ``.srt``.
        """
        transcript = self.model.transcribe(audio_file_path)
        self.srt_writer(transcript, audio_file_path)

        # Strip only the final extension. Splitting on the first "." would
        # truncate names such as "my.talk.webm" to "my", which no longer
        # matches the file Whisper's writer creates (it uses os.path.splitext).
        stem, _ = os.path.splitext(os.path.basename(audio_file_path))

        return f"{stem}.srt"

# Single shared inferencer instance used by the transcribe handler.
whipser_inferencer = WhisperInferencer()

def transcribe(link: str):
    """Download a YouTube video and its audio, then generate SRT subtitles.

    Args:
        link: URL of the YouTube video.

    Returns:
        A 2-tuple ``(transcript_file, [video_file_name, transcript_file])``:
        the subtitle file name, and the video/subtitle pair.

    Raises:
        gr.Error: If ``link`` is blank, or no matching video or audio
            stream exists for the link.
    """
    # Fail early with a readable message instead of an opaque pytube error.
    if not link or not link.strip():
        raise gr.Error("Please enter a YouTube link.")

    video_file_name = "video_from_youtube.mp4"
    audio_file_name = "audio_from_youtube.webm"
    yt = YouTube(link.strip())

    # Extract video: first progressive MP4 stream after sorting by
    # resolution in descending order. first() yields None when the query is
    # empty, which avoids an IndexError from indexing an empty result.
    video_stream = (
        yt.streams
        .filter(progressive=True, file_extension="mp4", type="video")
        .order_by("resolution")
        .desc()
        .first()
    )
    if video_stream is None:
        raise gr.Error("No progressive MP4 video stream is available for this link.")
    video_stream.download(filename=video_file_name)

    # Extract audio: first audio stream after sorting by "abr" descending.
    audio_stream = yt.streams.filter(type="audio").order_by("abr").desc().first()
    if audio_stream is None:
        raise gr.Error("No audio stream is available for this link.")
    audio_stream.download(filename=audio_file_name)

    transcript_file = whipser_inferencer.inference(audio_file_name)
    return transcript_file, [video_file_name, transcript_file]

# Build the Gradio app: inputs in a narrow left column (scale=1), video
# player in a wider right column (scale=4).
with gr.Blocks() as app:
    gr.Markdown("# Speech to Text")

    with gr.Row():
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            # File component restricted to .srt subtitle files.
            subtile = gr.File(label="Subtitle", file_types=[".srt"])
            # Caption spelling corrected (was "Transcibe!").
            submit_btn = gr.Button(value="Transcribe!")

        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)

    # On click, pass the link text to transcribe and route its two return
    # values to the subtitle file component and the video component.
    submit_btn.click(transcribe, [link], [subtile, output_video])

# Start the app without embedding it in the notebook output (inline=False)
# and request a public share link (share=True).
app.launch(inline=False, share=True)

In [None]:
# Shut down the app, drop the inferencer (and the model it holds), and
# clear PyTorch's CUDA cache.
# NOTE(review): torch is not imported in the final-app cell above; this
# relies on the earlier `import torch` cell having been run.
app.close()
del whipser_inferencer
torch.cuda.empty_cache()