# 生成Apple PodCast字幕



In [None]:
#@markdown # **Check GPU type** 🕵️

#@markdown The type of GPU you get assigned in your Colab session determines the speed at which the audio will be transcribed.
#@markdown The higher the number of floating point operations per second (FLOPS), the faster the transcription.
#@markdown But even the least powerful GPU available in Colab is able to run any Whisper model.
#@markdown Make sure you've selected `GPU` as hardware accelerator for the Notebook (Runtime → Change runtime type → Hardware accelerator).

#@markdown |  GPU   |  GPU RAM   | FP32 teraFLOPS |     Availability   |
#@markdown |:------:|:----------:|:--------------:|:------------------:|
#@markdown |  T4    |    16 GB   |       8.1      |         Free       |
#@markdown | P100   |    16 GB   |      10.6      |      Colab Pro     |
#@markdown | V100   |    16 GB   |      15.7      |  Colab Pro (Rare)  |

#@markdown ---
#@markdown **Factory reset your Notebook's runtime if you want to get assigned a new GPU.**

# List the GPUs attached to this runtime (one line per device).
!nvidia-smi -L

# Print the full nvidia-smi status report for the attached GPU(s).
!nvidia-smi
     

In [1]:
#@markdown **配置Whisper/Setup Whisper** 🏗️
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install git+https://github.com/yinruiqing/pyannote-whisper.git
!pip install requests beautifulsoup4 pyannote.audio pydub
!pip install git+https://github.com/openai/whisper.git

import torch
import sys
import whisper
import numpy as np
import warnings
import shutil
from IPython.display import Markdown
import os
import requests
import re
from pydub import AudioSegment
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from pathlib import Path
from pyannote.audio import Pipeline
from pyannote_whisper.utils import diarize_text


# Report the device that is actually usable: the first CUDA GPU when one is
# attached, otherwise the CPU. Hard-coding 'cuda:0' would print a GPU name even
# on a CPU-only runtime.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using device:', device, file=sys.stderr)

print('Whisper installed，please execute next cell')

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pyannote.audio
  Downloading pyannote.audio-2.1.1-py2.py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.7/390.7 KB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sndfile
  Downloading sndfile-0.2.0.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting triton==2.0.0
  Downloading triton-2.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken==0.3.1
  Downloading tiktoken-0.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-xuesl58n
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-xuesl58n
  Resolved https://github.com/openai/whisper.git to commit 6dea21fd7f7253bfe450f1e2512a0fe47ee2d258
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Whisper installed，please execute next cell


Using device: cuda:0


In [2]:
#@markdown # **Model selection** 🧠

#@markdown As of the first public release, there are 5 model sizes to play with (the 4 smaller ones also come in an English-only variant):

#@markdown |  Size  | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
#@markdown |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
#@markdown |  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~32x      |
#@markdown |  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~16x      |
#@markdown | small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~6x       |
#@markdown | medium |   769 M    |    `medium.en`     |      `medium`      |     ~5 GB     |      ~2x       |
#@markdown | large  |   1550 M   |        N/A         |      `large`       |    ~10 GB     |       1x       |

#@markdown ---

Model = 'large-v2' #@param ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large', 'large-v2']
#@markdown ---
#@markdown **Run this cell again if you change the model.**

# Validate the selection before loading anything: whisper.load_model() raises for
# a name it does not know, so a check placed after the load could never display
# the help message. A path to a local checkpoint file is still let through.
if Model not in whisper.available_models() and not os.path.isfile(Model):
    display(Markdown(
        f"**{Model} model is no longer available.** Please select one of the following: - {' - '.join(whisper.available_models())}"
    ))
    raise RuntimeError(f"Unknown Whisper model: {Model!r}")

# Access tokens must not be stored in the notebook. Read the Hugging Face token
# from the HF_TOKEN environment variable, or ask for it interactively.
# NOTE(review): the token that was previously committed here should be revoked.
from getpass import getpass
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# load pyannote speaker-diarization
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization",
                                    use_auth_token=hf_token)

whisper_model = whisper.load_model(Model)

display(Markdown(
    f"**{Model} model is selected.**"
))

Downloading (…)lve/main/config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading (…)/2022.07/config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

100%|█████████████████████████████████████| 2.87G/2.87G [00:37<00:00, 81.8MiB/s]


**large-v2 model is selected.**

In [1]:
#@markdown # **Apple Podcast selection** 🎙️

#@markdown Enter the URL of the Apple Podcast you want to transcribe.

#@markdown ---
#@markdown #### **Apple Podcast**
URL = "https://podcasts.apple.com/us/podcast/whats-next-%E7%A7%91%E6%8A%80%E6%97%A9%E7%9F%A5%E9%81%93/id1494812579?i=1000602286723" #@param {type:"string"}
#@markdown ---
#@markdown **Run this cell again if you change the podcast URL.**

def convert_audio_to_wav(input_file: str):
    """Convert an MP3 or M4A file to WAV and return the path of the WAV file.

    The WAV file is written next to the input, with the input's extension
    replaced by ``.wav``. The extension check is case-insensitive.

    Raises:
        ValueError: if ``input_file`` ends in neither ``.mp3`` nor ``.m4a``.
    """
    wav_path = os.path.splitext(input_file)[0] + ".wav"

    lowered = input_file.lower()
    is_mp3 = lowered.endswith(".mp3")
    if not (is_mp3 or lowered.endswith(".m4a")):
        raise ValueError("Unsupported audio format. Please provide an MP3 or M4A file.")

    # Load with the loader that matches the extension.
    if is_mp3:
        segment = AudioSegment.from_mp3(input_file)
    else:
        segment = AudioSegment.from_file(input_file, "m4a")

    segment.export(wav_path, format="wav")
    return wav_path


def find_audio_url(html: str) -> str:
    """Return the last ``https://...`` match ending in ``.mp3`` or ``.m4a`` in ``html``.

    When the text contains several matches, the final one wins. Returns
    ``None`` when there is no match at all.
    """
    last_hit = None
    # Walk every match and keep only the final one.
    for last_hit in re.finditer(r'https://[^\s^"]+(?:\.mp3|\.m4a)', html):
        pass

    if last_hit is None:
        return None
    return last_hit.group(0)

def get_file_extension(url: str) -> str:
    """Return the extension (leading dot included) of the path component of ``url``.

    Query string and fragment are ignored; an empty string is returned when
    the path has no extension.
    """
    url_path = urlparse(url).path
    return os.path.splitext(url_path)[1]

def download_apple_podcast(url: str, output_folder: str = 'downloads', timeout: float = 30):
    """Download the audio of an Apple Podcasts episode page and convert it to WAV.

    Args:
        url: URL of the episode page.
        output_folder: Directory the audio is saved into; created if missing.
        timeout: Seconds to wait for the server on each request (connection
            and each read). Without it a stalled server would hang the cell.

    Returns:
        ``(episode_title, wav_path)`` on success, or ``None`` (after printing
        an error) when the page cannot be fetched, or no audio URL or episode
        title can be found in it.

    Raises:
        requests.HTTPError: if the audio download returns an error status.
        ValueError: from ``convert_audio_to_wav`` if the downloaded file's
            extension is neither ``.mp3`` nor ``.m4a``.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(
            f"Error: Unable to fetch the podcast page. Status code: {response.status_code}")
        return

    audio_url = find_audio_url(response.text)

    if not audio_url:
        print("Error: Unable to find the podcast audio url.")
        return

    # Parse the page only once we know there is audio to download.
    soup = BeautifulSoup(response.text, 'html.parser')
    episode_title = soup.find('span', {'class': 'product-header__title'})

    if not episode_title:
        print("Error: Unable to find the podcast title.")
        return

    # '/' would be read as a directory separator in the output path.
    episode_title = episode_title.text.strip().replace('/', '-')

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    output_file = os.path.join(output_folder, f"{episode_title}{get_file_extension(audio_url)}")

    # Stream to disk so the whole episode is never held in memory.
    with requests.get(audio_url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(output_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)

    output_file = convert_audio_to_wav(output_file)

    return episode_title, output_file


# Download the episode; `title` and `filepath` are only set on success and are
# read by the cells below.
result = download_apple_podcast(URL)
if result:
  title, filepath = result
  print(f"Downloaded podcast episode '{title}' to '{filepath}'")
else:
  print("Error: Unable to download podcast.")

NameError: ignored

In [None]:
#@markdown # **Run the model** 🚀

#@markdown Run this cell to transcribe the downloaded podcast audio. This can take a while and varies based on the length of the audio and the number of parameters of the model selected above.

#@markdown ## **Parameters** ⚙️

#@markdown ### **Behavior control**
#@markdown ---
language = "Auto detection" #@param ['Auto detection', 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Azerbaijani', 'Bashkir', 'Basque', 'Belarusian', 'Bengali', 'Bosnian', 'Breton', 'Bulgarian', 'Burmese', 'Castilian', 'Catalan', 'Chinese', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'Estonian', 'Faroese', 'Finnish', 'Flemish', 'French', 'Galician', 'Georgian', 'German', 'Greek', 'Gujarati', 'Haitian', 'Haitian Creole', 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hungarian', 'Icelandic', 'Indonesian', 'Italian', 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Lao', 'Latin', 'Latvian', 'Letzeburgesch', 'Lingala', 'Lithuanian', 'Luxembourgish', 'Macedonian', 'Malagasy', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Moldavian', 'Moldovan', 'Mongolian', 'Myanmar', 'Nepali', 'Norwegian', 'Nynorsk', 'Occitan', 'Panjabi', 'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi', 'Pushto', 'Romanian', 'Russian', 'Sanskrit', 'Serbian', 'Shona', 'Sindhi', 'Sinhala', 'Sinhalese', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese', 'Swahili', 'Swedish', 'Tagalog', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai', 'Tibetan', 'Turkish', 'Turkmen', 'Ukrainian', 'Urdu', 'Uzbek', 'Valencian', 'Vietnamese', 'Welsh', 'Yiddish', 'Yoruba']
#@markdown > Language spoken in the audio, use `Auto detection` to let Whisper detect the language.
#@markdown ---
task = 'transcribe' #@param ['transcribe', 'translate']
#@markdown > Whether to perform X->X speech recognition (`transcribe`) or X->English translation (`translate`).
#@markdown ---
verbose = 'Live transcription' #@param ['Live transcription', 'Progress bar', 'None']
#@markdown > Whether to print out the progress and debug messages.
#@markdown ---

# Translate the form's verbosity labels into the values passed as `verbose`
# to whisper.transcribe below.
verbose_lut = {
    'Live transcription': True,
    'Progress bar': False,
    'None': None
}

# None lets Whisper detect the language itself.
language = (None if language == "Auto detection" else language)
verbose = verbose_lut[verbose]


if Model.endswith(".en") and language not in {"en", "English"}:
    # Warn only when a non-English language was explicitly chosen. With auto
    # detection (language is None) there is nothing to contradict, and the old
    # message would have complained about receiving 'None'.
    if language is not None:
        warnings.warn(f"{Model} is an English-only model but receipted '{language}'; using English instead.")
    language = "en"

import time
from whisper.utils import WriteSRT

display(Markdown(f"### {filepath}"))

# Output files sit next to the audio file and share its base name.
audio_path_local = Path(filepath).resolve()
output_stem = os.path.splitext(filepath)[0]
subtitle_file = output_stem + ".srt"
transcript_with_speakers_file = output_stem + ".txt"
print("audio local path:", audio_path_local)

# --- Transcription (timed) ---
transcribe_options = dict(verbose=verbose, task=task, language=language)
started_at = time.time()
print('Transcribe in progress...')
transcription = whisper.transcribe(
    model=whisper_model,
    audio=str(audio_path_local),
    **transcribe_options
)
elapsed = time.time() - started_at
print(f'Time consumpution {elapsed}s for transcribing')

# --- Subtitles: write the transcription as an SRT file ---
srt_writer = WriteSRT(audio_path_local.parent)
with open(subtitle_file, "w", encoding="utf-8") as srt_handle:
    srt_writer.write_result(transcription, srt_handle)
print(f"Write SRT file to '{subtitle_file}'")

# Release cached GPU memory before running the diarization pipeline.
torch.cuda.empty_cache()

# --- Speaker diarization, merged with the transcription ---
diarization_result = pipeline(audio_path_local)
final_result = diarize_text(transcription, diarization_result)

transcript_entries = []
for segment, speaker, sentence in final_result:
    entry = f'[{segment.start:.2f} --> {segment.end:.2f}]\n{speaker}: {sentence}'
    print(entry)
    transcript_entries.append(entry)


print("Save Transcript With Speakers file", transcript_with_speakers_file)
with open(transcript_with_speakers_file, "w", encoding="utf-8") as transcript_handle:
    transcript_handle.write("\n\n".join(transcript_entries))

torch.cuda.empty_cache()

display(Markdown(f"**Transcript file created: {transcript_with_speakers_file}**"))



### downloads/108.跟傅盛和王俊煜聊：大模型、产品经理和热门AI应‪用‬.wav

audio local path: /content/downloads/108.跟傅盛和王俊煜聊：大模型、产品经理和热门AI应‪用‬.wav
Transcribe in progress...
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: Chinese
[00:00.000 --> 00:07.720] 大家好 我是潘乱 欢迎来到乱盘书
[00:07.720 --> 00:10.760] 这是一档关注商业科技和互联网的对话节目
[00:10.760 --> 00:14.920] 这一期我们的直播主题是聊大模型和产品经理
[00:14.920 --> 00:18.640] 两位嘉宾是移动互联网时代著名的产品经理
[00:18.640 --> 00:22.560] 猎豹移动的创始人傅胜和豌豆莢的创始人王军玉
[00:22.560 --> 00:26.960] 就是我们聊了包括基于大模型的一些热门产品
[00:26.960 --> 00:30.160] 以及AI时代需要什么样的产品经理等等
[00:30.160 --> 00:33.240] 这次直播是我跟傅胜老师第一次通过
[00:33.240 --> 00:34.880] 就是下面口部一个广告
[00:34.880 --> 00:38.920] 本节目由极客公务员旗下的创始人社区Fundpark发起
[00:38.920 --> 00:41.080] 我跟张鹏老师一起主持
[00:41.080 --> 00:43.920] 如果有AI方向的创业者想加群
[00:43.920 --> 00:46.160] 请搜索关注Fundpark公众号
[00:46.160 --> 00:49.720] 并回复大模型三个字就可以收到报名表单
[00:49.720 --> 00:52.000] OK 我们直接切入正题
[00:52.680 --> 00:56.480] 军玉最近应该是全力投入在研究GDP4
[00:56.520 --> 01:00.960] 就是从你的视角来去看GDP4在这一波相对于3.5
[01:00.960 --> 01:03.440]

In [None]:
#@markdown # **Download the subtitle file** 🎆

from google.colab import files

# (announcement, path) pairs, offered in this order: subtitles, speaker
# transcript, then the audio itself.
pending_downloads = (
    (f"**Download Subtitle: {subtitle_file}**", subtitle_file),
    (f"**Download Transcript With Speakers: {transcript_with_speakers_file}**", transcript_with_speakers_file),
    (f"**Download Audio: {filepath}**", filepath),
)

for announcement, download_path in pending_downloads:
    display(Markdown(announcement))
    files.download(download_path)
