In [1]:
import whisper

In [2]:
from datasets import load_dataset, Dataset, Audio

  from .autonotebook import tqdm as notebook_tqdm


### Creating an audio dataset from .wav files

In [3]:
from glob import glob
from pathlib import Path

# Collect every .wav file in the voices directory.
files = glob('../voice_cloning/voices/*.wav')

# Build a dataset with one row per file. The 'audio' column starts out as the
# file path and is then cast to an Audio feature with a 16 kHz sampling rate.
# The speaker name is the file's basename without its extension; Path.stem is
# used instead of splitting on '/' and '.' so the result does not depend on
# the OS path separator and a dotted basename (e.g. "a.b.wav") is not cut at
# its first dot.
dataset = Dataset.from_dict({
    'name': [Path(elem).stem for elem in files],
    'path': files,
    'audio': files}).cast_column("audio", Audio(sampling_rate=16000))
dataset[0]

{'name': 'musk',
 'path': '../voice_cloning/voices/musk.wav',
 'audio': {'path': '../voice_cloning/voices/musk.wav',
  'array': array([-0.0107157 , -0.0207513 , -0.01781394, ..., -0.02346013,
          0.08692658,  0.        ]),
  'sampling_rate': 16000}}

In [4]:
# Index of the dataset row to inspect and transcribe.
id = 3
sample = dataset[id]
display(sample)

# Load the Whisper "base" checkpoint and transcribe the sample's waveform,
# passed as a float32 array; the cell's output is the transcribed text.
model = whisper.load_model("base")
waveform = sample['audio']['array'].astype('float32')
transcription = model.transcribe(waveform)
transcription['text']

{'name': 'malte',
 'path': '../voice_cloning/voices/malte.wav',
 'audio': {'path': '../voice_cloning/voices/malte.wav',
  'array': array([ 3.51592462e-06,  2.04406912e-04,  1.73166161e-04, ...,
          1.62316253e-04, -1.63853336e-02, -3.93715128e-03]),
  'sampling_rate': 16000}}

  checkpoint = torch.load(fp, map_location=device)


' Ich kann auch noch ein Beispiel Text von mir einsprechen, wo ich die ganze Zeit durchweg spreche und immer in der ungefähr gleichen Tunnelität und Tonlage und mal schauen ob das schon reicht.'

In [5]:
# Transcribe the audio file directly from its path and show only the text.
file_transcription = model.transcribe("audio/afjiv.wav")
file_transcription['text']

" I think if you're a leader and you don't understand the terms that you're using, that's probably the first start. It's really important that as a leader in the organisation you understand what digitisation means, you take the time to read widely in the sector. There are a lot of really good books. Kevin Kelly, who started Wide Magazine, has written a great book on various technologies. I think understanding the technologies, understanding what's out there so that you can separate the hype from the hope is really an important first step. And then making sure you understand the relevance of that for your function and how that fits into your business is the second step. I think two simple suggestions. One is I love the phrase brilliant at the basics, right? So how can you become brilliant at the basics? But beyond that, the fundamental thing I've seen which hasn't changed is so few organisations as a first step have truly taken control of their spend data. As a key first step on the dig

### Pipeline

1. Speaker diarization
2. Audio splitting based on speaker diarization
3. Chunk transcription

In [None]:
# Step 1 of the pipeline: speaker diarization.
from pyannote.audio import Pipeline

# Load the pretrained diarization pipeline and run it on the audio file,
# telling it to look for two speakers.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-2.1")
diarization = pipeline("audio/afjiv.wav", num_speakers=2)

# Print one line per track: start/end (from the turn object) and its label.
labelled_tracks = diarization.itertracks(yield_label=True)
for turn, _, speaker in labelled_tracks:
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.


  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


In [None]:
# Split the audio file into one WAV chunk per diarized speaker turn
import os

from pydub import AudioSegment

audio = AudioSegment.from_wav('audio/afjiv.wav')

# Directory that receives the chunk files. It is created up front so the
# export below has somewhere to write.
chunk_dir = "splitted_speaker_files"
os.makedirs(chunk_dir, exist_ok=True)

chunks = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
    # The audio segment is sliced in milliseconds; turn.start / turn.end are
    # multiplied by 1000 and truncated to int to get the slice bounds.
    start_ms = int(turn.start * 1000)
    end_ms = int(turn.end * 1000)
    chunk_audio = audio[start_ms:end_ms]

    # Build the path once and use it for BOTH the export and the recorded
    # 'file' entry. Previously the chunk was written to the working directory
    # while 'file' pointed into splitted_speaker_files/, so the transcription
    # cell looked for files that did not exist.
    chunk_file = f"{chunk_dir}/{speaker}_{start_ms}_{end_ms}.wav"
    chunk_audio.export(chunk_file, format="wav")

    chunks.append({
        'file': chunk_file,
        'speaker': speaker,
        'start': turn.start,
        'end': turn.end
    })
chunks

In [None]:
# Step 3 of the pipeline: transcribe every speaker chunk with Whisper.
import whisper

model = whisper.load_model("base")

# One record per chunk: the chunk's speaker/start/end copied over, plus the
# text Whisper returns for the chunk's file.
final_transcript = []
for chunk in chunks:
    result = model.transcribe(chunk['file'])
    record = {field: chunk[field] for field in ('speaker', 'start', 'end')}
    record['text'] = result['text']
    final_transcript.append(record)

In [None]:
# Render each transcript record as "[start–end] speaker: text" (times shown
# with two decimals), then print the lines in order.
transcript_lines = [
    f"[{entry['start']:.2f}–{entry['end']:.2f}] {entry['speaker']}: {entry['text']}"
    for entry in final_transcript
]
for line in transcript_lines:
    print(line)
