In [None]:
import os

import torch
from pyannote.audio import Pipeline

# Load the pretrained diarization pipeline. The Hugging Face access token is
# read from the HF_TOKEN environment variable instead of being embedded in the
# notebook: a token stored in source is leaked to everyone who can read the
# file and has to be revoked.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN"))
if pipeline is None:
    # NOTE(review): pyannote.audio 3.x appears to return None (after printing a
    # hint) when the gated model cannot be downloaded - fail here with a clear
    # message rather than with "'NoneType' object is not callable" below.
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'; set HF_TOKEN to a "
        "Hugging Face token that has access to the model.")

# send pipeline to GPU (when available)
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))

# apply pretrained pipeline
diarization = pipeline("wav/a.wav")

# print the result
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
# start=0.2s stop=1.5s speaker_0
# start=1.8s stop=3.9s speaker_1
# start=4.2s stop=5.7s speaker_0
# ...

In [None]:
from pyannote.core import Segment, Annotation, Timeline


def get_text_with_timestamp(transcribe_res):
    """Pair each transcription segment's text with its time span.

    Args:
        transcribe_res: Mapping with a 'segments' entry; every segment must
            provide 'start', 'end' and 'text'.

    Returns:
        list: ``(Segment(start, end), text)`` tuples, in input order.
    """
    return [
        (Segment(chunk['start'], chunk['end']), chunk['text'])
        for chunk in transcribe_res['segments']
    ]


def add_speaker_info_to_text(timestamp_texts, ann):
    """Attach a speaker label to every timestamped text.

    Args:
        timestamp_texts: Iterable of ``(segment, text)`` pairs.
        ann: Diarization result; the label for a segment is whatever
            ``ann.crop(segment).argmax()`` returns.

    Returns:
        list: ``(segment, speaker, text)`` tuples, in input order.
    """
    return [
        (span, ann.crop(span).argmax(), words)
        for span, words in timestamp_texts
    ]


def merge_cache(text_cache):
    """Collapse cached ``(segment, speaker, text)`` entries into one entry.

    The texts are concatenated without a separator, the speaker is taken from
    the first entry, and the span runs from the first entry's start to the
    last entry's end.

    Args:
        text_cache: Non-empty list of ``(segment, speaker, text)`` tuples.

    Returns:
        tuple: ``(Segment(start, end), speaker, sentence)``.
    """
    joined = ''.join(entry[-1] for entry in text_cache)
    head, tail = text_cache[0], text_cache[-1]
    span = Segment(head[0].start, tail[0].end)
    return span, head[1], joined


# Characters treated as sentence terminators: merge_sentence closes the
# utterance it is building when a segment's text ends with one of them.
PUNC_SENT_END = ['.', '?', '!']


def merge_sentence(spk_text):
    """Merge consecutive segments into per-speaker sentences.

    Segments are accumulated until either the speaker changes or a segment's
    text ends with a character from ``PUNC_SENT_END``; the accumulated
    segments are then combined with ``merge_cache``.

    Args:
        spk_text: Iterable of ``(segment, speaker, text)`` tuples in
            chronological order.

    Returns:
        list: Merged ``(segment, speaker, sentence)`` tuples.
    """
    merged_spk_text = []
    text_cache = []
    pre_spk = None
    for seg, spk, text in spk_text:
        # A speaker change closes whatever the previous speaker was saying.
        if text_cache and pre_spk is not None and spk != pre_spk:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        text_cache.append((seg, spk, text))
        pre_spk = spk

        # The end-of-sentence check also applies to the first segment after a
        # speaker change; previously that segment skipped the check, so a
        # complete sentence was glued onto the speaker's next one.
        if text and text[-1] in PUNC_SENT_END:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

    # Flush a trailing utterance that never reached a sentence terminator.
    if text_cache:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text


def diarize_text(transcribe_res, diarization_result):
    """Combine a transcription with a diarization into speaker-tagged sentences.

    Args:
        transcribe_res: Transcription result accepted by
            ``get_text_with_timestamp``.
        diarization_result: Diarization result accepted by
            ``add_speaker_info_to_text``.

    Returns:
        list: The output of ``merge_sentence`` for the speaker-tagged segments.
    """
    return merge_sentence(
        add_speaker_info_to_text(
            get_text_with_timestamp(transcribe_res),
            diarization_result,
        )
    )


def write_to_txt(spk_sent, file):
    """Write speaker-tagged sentences to a text file, one per line.

    Each line has the form ``"<start> <end> <speaker> <sentence>"`` with the
    times formatted to two decimals. An existing file is overwritten.

    Args:
        spk_sent: Iterable of ``(segment, speaker, sentence)`` tuples; the
            segment must expose ``start`` and ``end``.
        file: Path of the file to write.
    """
    # Explicit UTF-8 so transcripts with non-ASCII characters are written the
    # same way on every platform instead of depending on the locale encoding.
    with open(file, 'w', encoding='utf-8') as fp:
        fp.writelines(
            f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\n'
            for seg, spk, sentence in spk_sent
        )

In [None]:
import whisper

# Transcribe with the "tiny.en" Whisper model and attach the speaker labels
# from `diarization`, which must already exist in the kernel.
# NOTE(review): the diarization cell reads "wav/a.wav" while this cell reads
# "wav/a.m4a" - presumably the same recording in two formats; confirm.
model = whisper.load_model("tiny.en")
asr_result = model.transcribe("wav/a.m4a")
final_result = diarize_text(asr_result, diarization)

# One line per merged utterance: start, end, speaker, text.
for segment, speaker, sentence in final_result:
    print(f'{segment.start:.2f} {segment.end:.2f} {speaker} {sentence}')

In [None]:
!pip install pyannote.audio
!pip install -U openai-whisper

In [None]:
!pip install "numpy<=2.1"


In [None]:
!pip install --force-reinstall numba

In [31]:
import io
from pyannote.audio import Pipeline
import whisper
from pyannote.core import Segment, Annotation
import numpy as np
import soundfile as sf

def process_audio_array(waveform):
    """
    Run speaker diarization and speech-to-text on an in-memory waveform and
    return the speaker-tagged sentences as a dictionary.

    Args:
        waveform: Audio samples accepted by both ``torch.tensor`` and
            ``whisper``'s ``transcribe``. A leading channel axis is added
            before diarization and the samples are declared to be 16 kHz.
            (assumes a 1-D mono array such as the one returned by
            ``whisper.audio.load_audio`` - TODO confirm)

    Returns:
        dict: ``{"segments": [...]}`` where every entry has "start" and "end"
        (seconds, rounded to 2 decimals), "speaker" and "text".

    Raises:
        RuntimeError: If the diarization pipeline could not be loaded.
    """
    # Imported locally because this cell's import block brings in neither name.
    import os
    import torch

    # Initialize diarization pipeline. The Hugging Face token comes from the
    # HF_TOKEN environment variable; it must never be hard-coded in source.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ.get("HF_TOKEN")
    )
    if pipeline is None:
        # NOTE(review): pyannote.audio 3.x appears to return None when the
        # gated model cannot be downloaded - surface that explicitly.
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.1'; set HF_TOKEN "
            "to a Hugging Face token that has access to the model."
        )

    # Apply diarization to the in-memory samples: unsqueeze(0) adds the
    # leading axis, and the sample rate is fixed at 16 kHz.
    diarization = pipeline({
        "waveform": torch.tensor(waveform).unsqueeze(0),
        "sample_rate": 16000
    })

    # Initialize speech recognition model and transcribe the same samples.
    model = whisper.load_model("tiny.en")
    asr_result = model.transcribe(waveform)

    # Align transcription segments with speakers, then merge into sentences.
    timestamp_texts = get_text_with_timestamp(asr_result)
    spk_text = add_speaker_info_to_text(timestamp_texts, diarization)
    final_result = merge_sentence(spk_text)

    # Convert to dictionary format.
    return {
        "segments": [
            {
                "start": round(seg.start, 2),
                "end": round(seg.end, 2),
                "speaker": spk,
                "text": text
            }
            for seg, spk, text in final_result
        ]
    }

def get_text_with_timestamp(transcribe_res):
    """Turn a transcription result into ``(Segment, text)`` pairs.

    Args:
        transcribe_res: Mapping whose 'segments' entry is an iterable of
            mappings with 'start', 'end' and 'text'.

    Returns:
        list: One ``(Segment(start, end), text)`` tuple per input segment.
    """
    return [
        (Segment(entry['start'], entry['end']), entry['text'])
        for entry in transcribe_res['segments']
    ]

def add_speaker_info_to_text(timestamp_texts, ann):
    """Label each timestamped text with a speaker.

    Args:
        timestamp_texts: Iterable of ``(segment, text)`` pairs.
        ann: Diarization result; ``ann.crop(segment).argmax()`` supplies the
            label for each segment.

    Returns:
        list: ``(segment, speaker, text)`` tuples, in input order.
    """
    return [
        (window, ann.crop(window).argmax(), content)
        for window, content in timestamp_texts
    ]

def merge_cache(text_cache):
    """Combine cached ``(segment, speaker, text)`` entries into a single one.

    Texts are joined without a separator; the speaker comes from the first
    entry; the span starts at the first entry's start and ends at the last
    entry's end.

    Args:
        text_cache: Non-empty list of ``(segment, speaker, text)`` tuples.

    Returns:
        tuple: ``(Segment(start, end), speaker, sentence)``.
    """
    combined_text = ''.join(record[-1] for record in text_cache)
    first, last = text_cache[0], text_cache[-1]
    return Segment(first[0].start, last[0].end), first[1], combined_text

# Sentence-ending characters: merge_sentence finishes the current utterance
# when a segment's text ends with one of these.
PUNC_SENT_END = ['.', '?', '!']

def merge_sentence(spk_text):
    """Merge consecutive segments into per-speaker sentences.

    Segments are collected until the speaker changes or a segment's text ends
    with a character from ``PUNC_SENT_END``; each collected group is combined
    with ``merge_cache``.

    Args:
        spk_text: Iterable of ``(segment, speaker, text)`` tuples in
            chronological order.

    Returns:
        list: Merged ``(segment, speaker, sentence)`` tuples.
    """
    merged_spk_text = []
    text_cache = []
    pre_spk = None
    for seg, spk, text in spk_text:
        # A speaker change closes the previous speaker's pending utterance.
        if text_cache and pre_spk is not None and spk != pre_spk:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        text_cache.append((seg, spk, text))
        pre_spk = spk

        # Run the end-of-sentence check for every segment, including the first
        # one after a speaker change; skipping it there used to glue a finished
        # sentence onto the same speaker's following sentence.
        if text and text[-1] in PUNC_SENT_END:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

    # Emit whatever is left when the input ends mid-sentence.
    if text_cache:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text

In [33]:
from scipy.io.wavfile import read
import numpy as np
import torch
import whisper  # used below; previously only available if an earlier cell had imported it
from src.utils import process_audio_array as processssss

from pyannote.audio import Audio, Pipeline

audio = Audio()

# Example usage: load the recording as an in-memory waveform and run the
# combined diarization + transcription helper on it.
# NOTE(review): the helper declares the samples to be 16 kHz - presumably
# whisper.audio.load_audio resamples to that rate; confirm.
file_path = "wav/mauro_gioele_recording.wav"
waveform = whisper.audio.load_audio(file_path)
results = processssss(waveform)

# Format every segment once so the printed and the saved output cannot drift.
result_lines = [
    f"{segment['start']:.2f} {segment['end']:.2f} {segment['speaker']} {segment['text']}"
    for segment in results["segments"]
]

# Print the results
for line in result_lines:
    print(line)

# Optionally save results to a file (explicit UTF-8 so non-ASCII transcripts
# are written identically on every platform).
with open("diarization_results.txt", 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in result_lines)

  std = sequences.std(dim=-1, correction=1)


0.00 4.00 SPEAKER_02  Okay, so this is a test here with my already.
4.00 6.00 SPEAKER_02  My only problem is...
6.00 8.00 SPEAKER_01  I don't know.
8.00 10.00 SPEAKER_01  What do you like?
10.00 12.00 SPEAKER_03  I'll speak a bit later.
12.00 19.00 SPEAKER_01  Okay, I'm here at the plant presentation.
