# 유튜브 자막 생성 Application

Colab 환경에서 유튜브 자막 생성 애플리케이션을 만들어봅시다. 애플리케이션 사용자의 유스케이스는 아래와 같습니다.


## 패키지 및 예제 데이터 다운로드하기
Python 패키지들을 설치합니다. Colab이 아닌 환경에서 실행하는 경우에는 이 셀을 실행하지 않습니다.

In [None]:
# Fetch the Colab requirements file from the project repository, then install from it.
!wget https://raw.githubusercontent.com/mrsyee/dl_apps/main/speech_recognition/requirements-colab.txt
!pip install -r requirements-colab.txt

## 패키지 불러오기

In [None]:
import os

import torch
import gradio as gr

# 24.10.01 youtube 링크 형식 변경 등 서비스들이 변경된 사항이 있어
# 기존 pytube로는 대응이 안되는 경우가 생겼습니다.
# 이에 대응하기 위해 pytube -> pytubefix 로 변경합니다.
from pytubefix import YouTube
from openai import OpenAI
import whisper
from whisper.utils import get_writer

## 애플리케이션 UI 구현하기

In [None]:
# Lay out the UI only: a narrow input column beside a wider video column.
# No event handler is attached in this version.
with gr.Blocks() as app:
    gr.Markdown("# Youtube 자막 생성기")

    with gr.Row():
        # Left column (scale=1): link input, subtitle file slot, trigger button.
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            subtitle = gr.File(label="Subtitle", file_types=[".srt"])
            transcribe_btn = gr.Button(value="자막 생성!")

        # Right column (scale=4): video output component.
        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)


In [None]:
# Start the app without inline embedding; share=True asks Gradio for a shareable link.
app.launch(inline=False, share=True)

In [None]:
# Stop the server started by app.launch().
app.close()

## 유튜브 링크에서 영상 가져오기

In [None]:
# Sample video link used for the walkthrough cells that follow.
youtube_link = "https://youtu.be/Or6zvOnSDXA?si=pVZvl2yAc9K8KJm6"
# Wrap the link in a YouTube object so its streams can be queried.
yt = YouTube(youtube_link)

In [None]:
# Keep only progressive MP4 video streams, sorted by resolution in descending
# order, and print each candidate.
streams = yt.streams.filter(progressive=True, file_extension="mp4", type="video").order_by("resolution").desc()
for stream in streams:
    print(stream)

In [None]:
# Download the first stream of the sorted query (highest resolution) to a fixed local file name.
youtube_video_path = "video_from_youtube.mp4"
streams[0].download(filename=youtube_video_path)

In [None]:
from IPython.display import HTML
from base64 import b64encode

# Read the downloaded file and embed it as a base64 data URL so the player
# below needs no separate file server.
with open(youtube_video_path,"rb") as f:
    video = f.read()
    data_url = "data:video/mp4;base64," + b64encode(video).decode()

# Last expression of the cell: an HTML5 <video> element pointing at the data URL.
HTML(f"""
<video width=400 controls>
      <source src="{data_url}" type="video/mp4">
</video>
""")

In [None]:
def get_video(link: str) -> str:
    """Download the highest-resolution progressive MP4 stream of a YouTube video.

    Args:
        link: URL of the YouTube video.

    Returns:
        Name of the downloaded file (always ``video_from_youtube.mp4``).

    Raises:
        ValueError: If the video offers no progressive MP4 video stream.
    """
    video_file_name = "video_from_youtube.mp4"
    yt = YouTube(link)

    # Extract video: .first() yields None on an empty query, which lets us
    # report a clear error instead of an IndexError from streams[0].
    best_stream = (
        yt.streams.filter(progressive=True, file_extension="mp4", type="video")
        .order_by("resolution")
        .desc()
        .first()
    )
    if best_stream is None:
        raise ValueError(f"No progressive MP4 video stream found for {link!r}")
    best_stream.download(filename=video_file_name)

    return video_file_name

In [None]:
# Same layout as before, now with the button wired to the video downloader.
with gr.Blocks() as app:
    gr.Markdown("# Youtube 자막 생성기")

    with gr.Row():
        # Left column (scale=1): link input, subtitle file slot, trigger button.
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            subtitle = gr.File(label="Subtitle", file_types=[".srt"])
            transcribe_btn = gr.Button(value="자막 생성!")

        # Right column (scale=4): video output component.
        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)

    # On click, pass the link text to get_video and show the returned file in the video component.
    transcribe_btn.click(get_video, [link], [output_video])

In [None]:
# Start the app without inline embedding; share=True asks Gradio for a shareable link.
app.launch(inline=False, share=True)

In [None]:
# Stop the server started by app.launch().
app.close()

## Writer 클래스를 활용해 SRT 자막 파일 만들기 - 라이브러리 활용
ref: https://github.com/openai/whisper/blob/main/whisper/utils.py#L235

In [None]:
# Load the "large" Whisper model for local transcription.
model = whisper.load_model("large")

In [None]:
audio_file_name = "audio_from_youtube.webm"

# Take the audio streams sorted by "abr" in descending order and download the first one.
# NOTE(review): the file is always named .webm whatever container the chosen
# stream uses — confirm nothing downstream relies on the extension.
audio_streams = yt.streams.filter(type="audio").order_by("abr").desc()
audio_streams[0].download(filename=audio_file_name)

In [None]:
# Run the local Whisper model on the downloaded audio file.
transcript = model.transcribe(audio_file_name)

In [None]:
# Display the raw transcription result as the cell output.
transcript

In [None]:
# Build an SRT writer that outputs into the current directory, then write the
# transcript; the audio file name is passed along with the result.
srt_writer = get_writer(output_format="srt", output_dir=".")
srt_writer(transcript, audio_file_name)

In [None]:
# Print the generated subtitle file.
!cat audio_from_youtube.srt

In [None]:
# Drop the local model reference and release PyTorch's cached GPU memory.
del model
torch.cuda.empty_cache()

## Writer 클래스를 활용해 SRT 자막 파일 만들기 - API 활용

In [None]:
# Placeholder value: replace it with a real key before running, and avoid
# saving a real key into the notebook.
os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY>"

In [None]:
# Create the OpenAI client from the key set above and name the transcription model to use.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
model_name = "whisper-1"

In [None]:
# Upload the audio file to the transcription endpoint, requesting the
# "verbose_json" response format.
with open(audio_file_name, "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model=model_name,
        file=audio_file,
        response_format="verbose_json"
    )

In [None]:
# Pretty-print the API response as indented JSON.
print(transcription.model_dump_json(indent=2))

In [None]:
# Write the API transcription as an SRT file in the current directory.
# model_dump() converts the response recursively into plain dicts; dict(...)
# converts only the top level, which can leave each segment as a model object
# that the Whisper writer cannot index by key (segment["start"], ...).
srt_writer = get_writer(output_format="srt", output_dir=".")
srt_writer(transcription.model_dump(), audio_file_name)

In [None]:
# Print the subtitle file written from the API response.
!cat audio_from_youtube.srt

## 위스퍼 추론기 구현하기

In [None]:
class WhisperInferencer:
    """Transcribe audio with a local Whisper model and write SRT subtitles."""

    def __init__(self):
        """Load the "large" Whisper model and an SRT writer for the current directory."""
        self.model = whisper.load_model("large")
        self.srt_writer = get_writer(output_format="srt", output_dir=".")

    def inference(self, audio_file_path: str) -> str:
        """Transcribe an audio file and write the result as an SRT file.

        Args:
            audio_file_path: Path of the audio file to transcribe.

        Returns:
            Name of the subtitle file: the audio file's base name with its
            final extension replaced by ``.srt``.
        """
        transcript = self.model.transcribe(audio_file_path)
        self.srt_writer(transcript, audio_file_path)

        # Strip only the final extension so that names containing extra dots
        # (e.g. "talk.part1.webm") map to "talk.part1.srt" instead of being
        # truncated to "talk.srt" by splitting on the first dot.
        stem, _ = os.path.splitext(os.path.basename(audio_file_path))

        return f"{stem}.srt"

# Module-level inferencer instance; constructing it loads the Whisper model.
whipser_inferencer = WhisperInferencer()

In [None]:
def transcribe(link: str):
    """Download a YouTube video and generate an SRT subtitle file for it.

    Args:
        link: URL of the YouTube video.

    Returns:
        A pair ``(subtitle_file, [video_file, subtitle_file])``: the subtitle
        file name on its own, and the video file name paired with the
        subtitle file name.

    Raises:
        ValueError: If the video has no progressive MP4 video stream or no
            audio stream.
    """
    video_file_name = "video_from_youtube.mp4"
    audio_file_name = "audio_from_youtube.webm"
    yt = YouTube(link)

    # Extract video: .first() yields None on an empty query, which lets us
    # report a clear error instead of an IndexError from streams[0].
    video_stream = (
        yt.streams.filter(progressive=True, file_extension="mp4", type="video")
        .order_by("resolution")
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError(f"No progressive MP4 video stream found for {link!r}")

    # Extract audio: first stream of the query sorted by "abr" descending.
    audio_stream = yt.streams.filter(type="audio").order_by("abr").desc().first()
    if audio_stream is None:
        raise ValueError(f"No audio stream found for {link!r}")

    # Both streams are known to exist before anything is downloaded.
    video_stream.download(filename=video_file_name)
    audio_stream.download(filename=audio_file_name)

    transcript_file = whipser_inferencer.inference(audio_file_name)
    return transcript_file, [video_file_name, transcript_file]

In [None]:
# Same layout as before, now with the button wired to the full transcription pipeline.
with gr.Blocks() as app:
    gr.Markdown("# Youtube 자막 생성기")

    with gr.Row():
        # Left column (scale=1): link input, subtitle file slot, trigger button.
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            subtitle = gr.File(label="Subtitle", file_types=[".srt"])
            transcribe_btn = gr.Button(value="자막 생성!")

        # Right column (scale=4): video output component.
        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)

    # On click, pass the link text to transcribe; its two return values go to
    # the subtitle file component and the video component, in that order.
    transcribe_btn.click(transcribe, [link], [subtitle, output_video])

In [None]:
# Start the app without inline embedding; share=True asks Gradio for a shareable link.
app.launch(inline=False, share=True)

In [None]:
# Stop the app, then drop the inferencer and release PyTorch's cached GPU memory.
app.close()
del whipser_inferencer
torch.cuda.empty_cache()

## 최종 App 구현

In [None]:
import os

import gradio as gr
import whisper
# Use pytubefix rather than pytube, matching the import at the top of this
# notebook: pytube no longer copes with current YouTube links.
from pytubefix import YouTube
from whisper.utils import get_writer

class WhisperInferencer:
    """Transcribe audio with a local Whisper model and write SRT subtitles."""

    def __init__(self):
        """Load the "large" Whisper model and an SRT writer for the current directory."""
        self.model = whisper.load_model("large")
        self.srt_writer = get_writer(output_format="srt", output_dir=".")

    def inference(self, audio_file_path: str) -> str:
        """Transcribe an audio file and write the result as an SRT file.

        Args:
            audio_file_path: Path of the audio file to transcribe.

        Returns:
            Name of the subtitle file: the audio file's base name with its
            final extension replaced by ``.srt``.
        """
        transcript = self.model.transcribe(audio_file_path)
        self.srt_writer(transcript, audio_file_path)

        # Strip only the final extension so that names containing extra dots
        # (e.g. "talk.part1.webm") map to "talk.part1.srt" instead of being
        # truncated to "talk.srt" by splitting on the first dot.
        stem, _ = os.path.splitext(os.path.basename(audio_file_path))

        return f"{stem}.srt"

# Module-level inferencer instance; constructing it loads the Whisper model.
whipser_inferencer = WhisperInferencer()

def transcribe(link: str):
    """Download a YouTube video and generate an SRT subtitle file for it.

    Args:
        link: URL of the YouTube video.

    Returns:
        A pair ``(subtitle_file, [video_file, subtitle_file])``: the subtitle
        file name on its own, and the video file name paired with the
        subtitle file name.

    Raises:
        ValueError: If the video has no progressive MP4 video stream or no
            audio stream.
    """
    video_file_name = "video_from_youtube.mp4"
    audio_file_name = "audio_from_youtube.webm"
    yt = YouTube(link)

    # Extract video: .first() yields None on an empty query, which lets us
    # report a clear error instead of an IndexError from streams[0].
    video_stream = (
        yt.streams.filter(progressive=True, file_extension="mp4", type="video")
        .order_by("resolution")
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError(f"No progressive MP4 video stream found for {link!r}")

    # Extract audio: first stream of the query sorted by "abr" descending.
    audio_stream = yt.streams.filter(type="audio").order_by("abr").desc().first()
    if audio_stream is None:
        raise ValueError(f"No audio stream found for {link!r}")

    # Both streams are known to exist before anything is downloaded.
    video_stream.download(filename=video_file_name)
    audio_stream.download(filename=audio_file_name)

    transcript_file = whipser_inferencer.inference(audio_file_name)
    return transcript_file, [video_file_name, transcript_file]

# Build the Gradio app: a narrow input column beside a wider video column.
with gr.Blocks() as app:
    gr.Markdown("# Youtube 자막 생성기")

    with gr.Row():
        # Left column (scale=1): link input, subtitle file slot, trigger button.
        with gr.Column(scale=1):
            link = gr.Textbox(label="Youtube Link")
            subtitle = gr.File(label="Subtitle", file_types=[".srt"])
            transcribe_btn = gr.Button(value="자막 생성!")

        # Right column (scale=4): video output component.
        with gr.Column(scale=4):
            output_video = gr.Video(label="Output", height=500)

    # On click, pass the link text to transcribe; its two return values go to
    # the subtitle file component and the video component, in that order.
    transcribe_btn.click(transcribe, [link], [subtitle, output_video])

# Start the app without inline embedding; share=True asks Gradio for a shareable link.
app.launch(inline=False, share=True)

In [None]:
# Stop the app, then drop the inferencer and release PyTorch's cached GPU memory.
# NOTE(review): the final-app cell does not import torch; this relies on the
# import made near the top of the notebook.
app.close()
del whipser_inferencer
torch.cuda.empty_cache()