# Whisper batch

Some quick imports and setting the working directory to coincide with the `git` root, and import our `transcribe` module.

In [1]:
import subprocess
import os
import requests
import librosa
import torch
import warnings
import sys

from git import Repo
from transformers import pipeline
from datasets import load_dataset
from IPython.display import Audio

from pyannote.audio import Pipeline as Pyannote_Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook



os.chdir(subprocess
         .check_output(["git", "rev-parse", "--show-toplevel"])
         .decode('utf-8')
         .strip())

sys.path.append("src/madrs-transcribe/")
import transcribe

if not Repo("data").head.commit.hexsha == '5222be26ac04c2e2d498373a78372d6072faf080':
    warnings.warn("Data repository not in the same state as when this notebook was written.")

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [2]:
recordings_dir = "data/interviews/audio/ambient"
out_dir = "temp/outfiles/diarized"
os.makedirs(out_dir, exist_ok=True)
recordings = [recording for recording in os.listdir(recordings_dir) if recording.endswith(".wav")]
print("recordings to transcribe:")
recordings

recordings to transcribe:


['MAD-007.wav',
 'MAD-002.wav',
 'MAD-005.wav',
 'MAD-006.wav',
 'MAD-003.wav',
 'MAD-001.wav',
 'MAD-004.wav']

In [3]:
print("Beginning batch transcription...")
for recording in recordings:
    print("Transcribing " + recording + "...")
    transcription = transcribe.Pipeline()(os.path.join(recordings_dir, recording))
    transcription_path = os.path.join(out_dir, os.path.splitext(recording)[0] + ".txt")
    print("Writing transcription to " + transcription_path + "...")
    with open(transcription_path, "w", encoding = "utf8") as transcription_file:
        transcription_file.write(transcription)

Beginning batch transcription...
Transcribing MAD-007.wav...


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)
You have passed language=english, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=english.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Writing transcription to temp/outfiles/diarized/MAD-007.txt...
Transcribing MAD-002.wav...


  std = sequences.std(dim=-1, correction=1)


Writing transcription to temp/outfiles/diarized/MAD-002.txt...
Transcribing MAD-005.wav...


  std = sequences.std(dim=-1, correction=1)


Writing transcription to temp/outfiles/diarized/MAD-005.txt...
Transcribing MAD-006.wav...


  std = sequences.std(dim=-1, correction=1)


Writing transcription to temp/outfiles/diarized/MAD-006.txt...
Transcribing MAD-003.wav...


  std = sequences.std(dim=-1, correction=1)


Writing transcription to temp/outfiles/diarized/MAD-003.txt...
Transcribing MAD-001.wav...


  std = sequences.std(dim=-1, correction=1)


Writing transcription to temp/outfiles/diarized/MAD-001.txt...
Transcribing MAD-004.wav...


  std = sequences.std(dim=-1, correction=1)


Writing transcription to temp/outfiles/diarized/MAD-004.txt...


In [5]:
out_dir = "temp/outfiles/undiarized"
os.makedirs(out_dir, exist_ok=True)
print("Beginning batch transcription (undiarized)...")
for recording in recordings:
    print("Transcribing " + recording + "...")
    transcription = transcribe.Pipeline(diarize = False)(os.path.join(recordings_dir, recording))
    transcription_path = os.path.join(out_dir, os.path.splitext(recording)[0] + ".txt")
    print("Writing transcription to " + transcription_path + "...")
    with open(transcription_path, "w", encoding = "utf8") as transcription_file:
        transcription_file.write(transcription)

Beginning batch transcription (undiarized)...
Transcribing MAD-007.wav...
Writing transcription to temp/outfiles/undiarized/MAD-007.txt...
Transcribing MAD-002.wav...
Writing transcription to temp/outfiles/undiarized/MAD-002.txt...
Transcribing MAD-005.wav...
Writing transcription to temp/outfiles/undiarized/MAD-005.txt...
Transcribing MAD-006.wav...
Writing transcription to temp/outfiles/undiarized/MAD-006.txt...
Transcribing MAD-003.wav...
Writing transcription to temp/outfiles/undiarized/MAD-003.txt...
Transcribing MAD-001.wav...
Writing transcription to temp/outfiles/undiarized/MAD-001.txt...
Transcribing MAD-004.wav...
Writing transcription to temp/outfiles/undiarized/MAD-004.txt...
