In [None]:
import json
from pathlib import Path

# Load local credentials from secrets.json in the working directory.
# Later cells read `secrets.get("hf_token")` from this dict.
secrets_file = Path(".") / "secrets.json"
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default.
with secrets_file.open(encoding="utf-8") as f:
    secrets = json.load(f)

In [2]:
import whisperx
import gc 
import torch

# Pipeline configuration.
# Run on the GPU when torch reports CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Input recording; later cells read `audio_file` again when cutting clips.
audio_file = "sample/two-mates-having-a-chat.m4a"
batch_size = 16 # transcription batch size; reduce if low on GPU mem
compute_type = "float16" if device == "cuda" else "int8" # float16 on GPU, int8 on CPU; switch to "int8" on GPU too if low on memory (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

# Optional: keep the downloaded model weights under a local directory.
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

# `audio` is reused by the alignment and diarization steps below.
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# Free the transcription model if low on GPU resources. The reference has to be
# dropped *before* collecting, otherwise the model is still reachable.
# del model; gc.collect(); torch.cuda.empty_cache()

# 2. Align whisper output
# The alignment model is selected by the language reported in the transcription result.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
# NOTE: `result` is rebound here to the aligned output; the raw transcription
# result is no longer reachable under this name after this line.
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# Free the alignment model if low on GPU resources (drop the reference first).
# del model_a; gc.collect(); torch.cuda.empty_cache()

# 3. Assign speaker labels
# The Hugging Face token comes from the `secrets` dict loaded in the first cell;
# `.get` yields None when the "hf_token" key is absent.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=secrets.get("hf_token"), device=device)
# Run diarization; if the number of speakers is known, use the bounded variant below instead.
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
# `result` is rebound once more: aligned output combined with the diarization segments.
result = whisperx.assign_word_speakers(diarize_segments, result)


  torchaudio.set_audio_backend("soundfile")
  backend = torchaudio.get_audio_backend()
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import (
  torchaudio.set_audio_backend(backend)
  from torchaudio.backend.common import AudioMetaData


No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.99) in first 30s of audio...
[{'text': " Mate, let me tell you, it was a ripper of a day out on the water. I hooked this beautiful barramundi. I swear it was as big as a croc. Yeah, sounds like a good time, Michael. But you know, I reckon fishing's a bit overrated. Give me a cold beer and a good book any day. Fair dinkum, Scott. But there's nothing like the thrill of reeling in a big one. Feeling that tug on the line, mate. It's like a dance with nature. You know what I mean?", 'start': 1.254, 'end': 24.445}, {'text': " Yeah, I get you, mate. But for me, it's all about the simple things. Sitting back, soaking up the sun, and enjoying the serenity of the bush. True, true. But there's something about the Aussie bush that gets 

  std = sequences.std(dim=-1, correction=1)


                              segment label     speaker      start        end  \
0   [ 00:00:01.264 -->  00:00:08.412]     A  SPEAKER_00   1.264856   8.412564   
1   [ 00:00:09.431 -->  00:00:15.764]     B  SPEAKER_01   9.431239  15.764007   
2   [ 00:00:16.612 -->  00:00:17.699]     C  SPEAKER_00  16.612903  17.699491   
3   [ 00:00:17.852 -->  00:00:21.926]     D  SPEAKER_00  17.852292  21.926995   
4   [ 00:00:22.555 -->  00:00:24.439]     E  SPEAKER_00  22.555178  24.439728   
5   [ 00:00:25.169 -->  00:00:31.825]     F  SPEAKER_01  25.169779  31.825127   
6   [ 00:00:32.691 -->  00:00:36.409]     G  SPEAKER_00  32.691002  36.409168   
7   [ 00:00:36.833 -->  00:00:41.553]     H  SPEAKER_00  36.833616  41.553480   
8   [ 00:00:42.385 -->  00:00:48.904]     I  SPEAKER_01  42.385399  48.904924   
9   [ 00:00:51.298 -->  00:00:56.409]     J  SPEAKER_00  51.298812  56.409168   
10  [ 00:00:56.935 -->  00:01:01.332]     K  SPEAKER_01  56.935484  61.332767   

    intersection      union

In [10]:
# Print the transcript one segment per line, prefixed with its speaker label.
for segment in result["segments"]:
    # A segment that received no speaker label has no "speaker" key; fall back
    # to a placeholder instead of raising KeyError halfway through the output.
    print(segment.get("speaker", "UNKNOWN"), segment["text"])

SPEAKER_00  Mate, let me tell you, it was a ripper of a day out on the water.
SPEAKER_00 I hooked this beautiful barramundi.
SPEAKER_00 I swear it was as big as a croc.
SPEAKER_01 Yeah, sounds like a good time, Michael.
SPEAKER_01 But you know, I reckon fishing's a bit overrated.
SPEAKER_01 Give me a cold beer and a good book any day.
SPEAKER_00 Fair dinkum, Scott.
SPEAKER_00 But there's nothing like the thrill of reeling in a big one.
SPEAKER_00 Feeling that tug on the line, mate.
SPEAKER_00 It's like a dance with nature.
SPEAKER_00 You know what I mean?
SPEAKER_01  Yeah, I get you, mate.
SPEAKER_01 But for me, it's all about the simple things.
SPEAKER_01 Sitting back, soaking up the sun, and enjoying the serenity of the bush.
SPEAKER_00 True, true.
SPEAKER_00 But there's something about the Aussie bush that gets the blood pumping, mate.
SPEAKER_00 The sounds of the kookaburras, the smell of the eucalyptus trees.
SPEAKER_00 It's like being in paradise.
SPEAKER_01 No arguments there, M

In [6]:
import ffmpeg

# Create a directory to save the audio files
output_dir = Path("output_audio")
output_dir.mkdir(exist_ok=True)

# Pass 1: group the segment time ranges by speaker, in transcript order.
# `speakers` maps speaker label -> list of (per-segment path, start, end).
# The per-segment path is only recorded here; nothing is written to it.
speakers = {}
for i, segment in enumerate(result["segments"]):
    # Segments without a "speaker" key are grouped under a placeholder label
    # rather than aborting the export with KeyError.
    speaker = segment.get("speaker", "UNKNOWN")
    output_file = output_dir / f"{i}_{speaker}.wav"
    speakers.setdefault(speaker, []).append((output_file, segment["start"], segment["end"]))

# Pass 2: write one WAV per speaker. This runs once, after every segment has
# been collected, so each speaker file is rendered a single time instead of
# being re-rendered for every segment.
for speaker, speaker_segments in speakers.items():
    # One trimmed audio input per time range. Concatenating the real clips
    # directly avoids a dummy seed input, which ffmpeg would try to open as a file.
    clips = [
        ffmpeg.input(audio_file, ss=clip_start, to=clip_end).audio
        for _, clip_start, clip_end in speaker_segments
    ]
    speaker_file = output_dir / f"{speaker}.wav"
    (
        ffmpeg
        .concat(*clips, v=0, a=1)  # audio-only concat: 0 video streams, 1 audio stream
        .output(str(speaker_file), format="wav", loglevel="quiet")
        .run(overwrite_output=True)
    )
# for i, segment in enumerate(result["segments"]):
#     start_time = segment["start"]
#     end_time = segment["end"]
#     speaker = segment["speaker"]
#     output_file = output_dir / f"{i}_{speaker}.wav"
    
#     # Extract the audio segment for the speaker
#     (
#         ffmpeg
#         .input(audio_file, ss=start_time, to=end_time)
#         .output(str(output_file), format='wav',loglevel="quiet",)
#         .run(overwrite_output=True)
#     )