In [1]:
%%capture
!pip install torch "torchaudio<0.12" pytube transformers datasets "moviepy==1.0.3" accelerate

### Download audio

In [2]:
from pathlib import Path
from pytube import YouTube

In [3]:
# Video whose audio track is downloaded and transcribed.
link = "https://www.youtube.com/watch?v=TvLEOmJREf8"

# All artefacts (downloaded audio, converted audio, transcript) go under ./output.
data_folder = Path.cwd().joinpath("output")

In [4]:
yt = YouTube(link)
audio_streams = yt.streams.filter(only_audio=True, subtype="mp4")

# All MP4 audio streams share the same default filename, so downloading each
# of them only overwrites the previous one. Fetch a single stream instead:
# order_by("abr") sorts by bitrate, so last() is the highest-bitrate stream.
stream = audio_streams.order_by("abr").last()
if stream is None:
    raise RuntimeError(f"No MP4 audio stream available for {link}")

# download() returns the path of the written file.
outpath = stream.download(output_path=str(data_folder))

In [8]:
# Use the path returned by the download step. Taking the first entry of
# data_folder.iterdir() is order-dependent and picks up unrelated files
# (e.g. a previously converted .mp3 or a transcript) when the cell is re-run.
audio_path = Path(outpath)
filename = audio_path.name

### Convert MP4 to MP3

In [10]:
from moviepy.editor import AudioFileClip

# Same location and stem as the downloaded file, with an .mp3 extension.
converted_audio_path = audio_path.with_suffix(".mp3")

# The context manager closes the clip (and its underlying reader) once the
# MP3 has been written, instead of leaving it open for the kernel's lifetime.
with AudioFileClip(str(audio_path)) as video:
    video.write_audiofile(str(converted_audio_path))

MoviePy - Writing audio in /Users/johanleduc/workspace/youtube_summarizer/output/Éclairage public  pourquoi la couverture aux LED est-elle si faible alors quelle permet une éc….mp3


                                                                      

MoviePy - Done.




### Transcribe with Whisper

In [11]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset, Dataset
import torch

In [12]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Checkpoint sizes that can be selected; index 0 picks "tiny".
sizes = ["tiny", "small", "base", "medium", "large"]
size = sizes[0]

# Processor and model are loaded from the same checkpoint name.
checkpoint = f"openai/whisper-{size}"
processor = WhisperProcessor.from_pretrained(checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to(device)

Downloading: 100%|██████████| 185k/185k [00:00<00:00, 458kB/s] 
Downloading: 100%|██████████| 828/828 [00:00<00:00, 301kB/s]
Downloading: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.10MB/s]
Downloading: 100%|██████████| 494k/494k [00:00<00:00, 1.17MB/s]
Downloading: 100%|██████████| 52.7k/52.7k [00:00<00:00, 299kB/s]
Downloading: 100%|██████████| 2.11k/2.11k [00:00<00:00, 580kB/s]
Downloading: 100%|██████████| 2.06k/2.06k [00:00<00:00, 673kB/s]
Downloading: 100%|██████████| 1.96k/1.96k [00:00<00:00, 526kB/s]
Downloading: 100%|██████████| 151M/151M [00:03<00:00, 44.9MB/s] 


In [13]:
# Force the decoder prompt so that generation performs French transcription.
french_transcribe_ids = processor.get_decoder_prompt_ids(language="fr", task="transcribe")
model.config.forced_decoder_ids = french_transcribe_ids

In [14]:
FRAGMENT_LENGTH = 30  # duration of one fragment, in seconds


def split_audio(tracks, fragment_length=None):
    """Cut every audio track of a batch into consecutive fixed-length fragments.

    Args:
        tracks: batch mapping whose "audio" entry is a list of items, each
            providing "array" (the samples) and "sampling_rate".
        fragment_length: fragment duration in seconds. When omitted, the
            module-level FRAGMENT_LENGTH is used.

    Returns:
        A dict {"fragment": [...]} holding the fragments of all tracks, in
        track order and then in time order. The last fragment of a track is
        shorter when the track length is not a multiple of the fragment size.
        An empty batch yields an empty list.
    """
    seconds = FRAGMENT_LENGTH if fragment_length is None else fragment_length

    fragments = []
    # Process every track of the batch, not only the first one.
    for track in tracks["audio"]:
        samples = track["array"]
        fragment_size = seconds * track["sampling_rate"]
        # Slicing clamps at the end of the sequence, so no explicit min() is needed.
        fragments.extend(
            samples[start:start + fragment_size]
            for start in range(0, len(samples), fragment_size)
        )
    return {"fragment": fragments}

def transcribe(fragment):
    """Transcribe one audio fragment with the loaded Whisper model.

    Args:
        fragment: mapping whose "fragment" entry holds the audio samples;
            they are passed to the processor as 16 kHz audio.

    Returns:
        A dict {"transcription": ...} containing the output of
        processor.batch_decode for the generated token ids.
    """
    # Turn the raw samples into model input features as PyTorch tensors.
    encoded = processor(
        fragment["fragment"],
        sampling_rate=16_000,
        return_tensors="pt",
    )
    features_on_device = encoded.input_features.to(device)

    # Generate token ids, then decode them without the special tokens.
    token_ids = model.generate(features_on_device)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return {"transcription": decoded}

In [15]:
# One-row dataset pointing at the converted audio file.
track_ds = Dataset.from_dict({"audio": [str(converted_audio_path)]})

# Casting to Audio decodes the file at a 16 kHz sampling rate.
decoded_ds = track_ds.cast_column("audio", Audio(sampling_rate=16_000))

# Batched map: one input row expands into many fragment rows.
fragment_ds = decoded_ds.map(split_audio, remove_columns=["audio"], batched=True)

# Row-wise map: adds a "transcription" column for each fragment.
result_ds = fragment_ds.map(transcribe)

100%|██████████| 1/1 [00:00<00:00,  1.09ba/s]
100%|██████████| 5/5 [00:12<00:00,  2.59s/ex]


In [19]:
from functools import reduce

# Join the per-fragment texts with single spaces. Stripping each piece and
# using str.join avoids the leading space and the repeated string copies
# produced by folding with an f-string.
transcription = " ".join(
    text.strip()
    for texts in result_ds["transcription"]
    for text in texts
)

In [20]:
transcription_file = data_folder / f"transcript_{size}.txt"
# Write as UTF-8 explicitly: the transcript contains accented characters and
# the platform default encoding is not guaranteed to represent them.
transcription_file.write_text(transcription, encoding="utf-8")