````sh
pip3.12 install huggingface_hub pyannote.audio torch faster-whisper ipython ipykernel
````

In [1]:
import os
from huggingface_hub import login

# To persist your Hugging Face token, run this in your terminal:
# echo 'export HF_TOKEN="hf_*******************************"' >> $HOME/.bashrc

# If HF_TOKEN is not set, token is None and login() prompts a login interface instead.
login(token=os.getenv("HF_TOKEN"))

# FROM https://github.com/yinruiqing/pyannote-whisper
from pyannote.audio import Pipeline
import torch
import utils # CREDIT: https://github.com/yinruiqing/pyannote-whisper
from faster_whisper import WhisperModel
from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
%%time

# Choose the torch device for the diarization pipeline:
# Apple MPS if available, otherwise CUDA if available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained pyannote diarization pipeline and move it to that device.
pipeline = (
    Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
    .to(device)
)

# Recording to transcribe and diarize (path relative to the notebook).
audio_file = "241118_1543.wav"

# The Whisper model is created on the CPU in float32, independently of `device` above.
model = WhisperModel("medium.en", device="cpu", compute_type="float32")
segments, info = model.transcribe(audio_file, beam_size=5)
# Keep the segments in a list so that later cells can reuse them.
generated_segments = list(segments)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# To inspect the raw transcript, loop over `generated_segments` rather than
# `segments` (faster-whisper documents `segments` as a generator, and it has
# already been consumed by list() above):
#for segment in generated_segments:
#    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
INFO:faster_whisper:Processing audio with duration 00:44.643


Detected language 'en' with probability 1.000000
CPU times: user 43.1 s, sys: 6.79 s, total: 49.9 s
Wall time: 33.9 s


In [3]:
# utility function to convert generated segments from faster_whisper to whisper format

def to_whisper_format(generated_segments):
    """Repackage faster_whisper segments as a whisper-style result dict.

    Args:
        generated_segments: Iterable of segment objects. Each one must expose
            the attributes named in ``field_names`` below; a missing attribute
            raises ``AttributeError``.

    Returns:
        A dict ``{"segments": [...]}`` holding one plain dict per input
        segment, in input order. Each dict maps every field name to the
        corresponding attribute value of the segment.
    """
    # Order matters: it fixes the key order of every per-segment dict.
    field_names = (
        "id",
        "seek",
        "start",
        "end",
        "text",
        "tokens",
        "avg_logprob",
        "compression_ratio",
        "no_speech_prob",
        "words",
        "temperature",
    )
    return {
        "segments": [
            {name: getattr(segment, name) for name in field_names}
            for segment in generated_segments
        ]
    }

In [5]:
# Run the speaker-diarization pipeline on the recording.
diarization_result = pipeline(audio_file)
# Combine the transcript (converted to whisper format) with the diarization.
# The loop below unpacks each entry of `result` as (segment, speaker, sentence).
# NOTE(review): `result` is iterated here and passed to utils.write_to_txt in a
# later cell, so it is assumed to be re-iterable (not a one-shot generator) — confirm.
result = utils.diarize_text(to_whisper_format(generated_segments), diarization_result)

# Clear the output this cell has produced so far, so only the transcript remains.
clear_output()
# Print one line per entry as start;end;speaker;sentence (times with two decimals).
for seg, spk, sentence in result:
    print(f'{seg.start:.2f};{seg.end:.2f};{spk};{sentence}')

0.00;2.48;SPEAKER_00; So, welcome to this interview today.
2.48;8.92;SPEAKER_00; I'm sitting here with Natalie, and we're going to have just a little interview with two questions.
8.92;14.52;SPEAKER_00; My name is Lars, and I'm handing over to my interviewee to introduce herself.
14.52;15.52;SPEAKER_01; Okay.
15.52;16.52;SPEAKER_01; Thank you.
16.52;17.52;SPEAKER_01; My name is Natalie.
17.52;21.20;SPEAKER_01; I'm an employee at GAG, and we are testing this new device.
21.20;24.56;SPEAKER_00; Okay, thank you.
24.56;29.24;SPEAKER_00; My first question to you is, when is your next field trip?
29.24;31.64;SPEAKER_01; That is a good question, Lars.
31.64;32.64;SPEAKER_01; I'm not sure.
32.64;34.28;SPEAKER_01; We're hoping for May.
34.28;37.56;SPEAKER_00; Okay, you're hoping for May.
37.56;39.02;SPEAKER_00; When was your last field trip?
39.02;41.02;SPEAKER_01; I last went in July.
41.02;43.32;SPEAKER_00; Okay, thank you very much for the interview.


In [6]:
# Only available in VS code: the notebook's file path is read from the
# `__vsc_ipynb_file__` variable in the kernel's user namespace.

from IPython import get_ipython

ip = get_ipython()
# `path` stays None when the variable is not defined (e.g. outside VS code).
path = ip.user_ns.get('__vsc_ipynb_file__')

In [8]:
# Write the diarized transcript to a semicolon-separated CSV file.
# The file is named after this notebook when its path is known. Outside VS Code
# `path` is None, and os.path.basename(None) raised TypeError, so in that case
# the name falls back to the audio file's base name without its extension.
if path is not None:
    output_stem = os.path.basename(path).removesuffix(".ipynb")
else:
    output_stem = os.path.splitext(os.path.basename(audio_file))[0]

utils.write_to_txt(result, output_stem + ".csv", semicolumn=True)