In [None]:
import requests
import json

# List of audio file paths (up to 10 files)
audio_file_paths = [
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/in-7035555-9587853839-20240802-155950-1722603590.21580.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/out-3036641-3133-20240802-135335-1722596015.21347.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3101-000047e6-2024-08-02-15-28-09.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3213-000044b5-2024-08-02-09-34-26.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3223-0000454f-2024-08-02-10-35-26.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3353-000048a9-2024-08-02-18-28-28.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5101-000046c7-2024-08-02-13-00-10.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5101-0000459d-2024-08-02-11-16-40.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5101-00004669-2024-08-02-12-27-31.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5102-000045ea-2024-08-02-11-40-30.wav"
]

audio_file_paths = [
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13143_17303__2023_01_31__11_05_42_100.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13301_17384__2023_02_01__17_43_03_327.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13276_17360__2023_02_01__10_11_14_457.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13170_13171__2023_02_01__17_32_20_210.mp3"
]

# Step 1: Transcribe the audio files
print('Posting audio files for transcription')

# The handles are opened inside the `try` so that a missing or unreadable file is
# reported like any other failure, and every handle opened so far is still closed
# by the `finally` clause.
transcription_files = []

try:
    for file_path in audio_file_paths:
        transcription_files.append(('audio_files', open(file_path, 'rb')))

    transcription_response = requests.post(
        "http://localhost:8000/transcribe_mono",
        files=transcription_files
    )
    transcription_response.raise_for_status()
    transcription_data = transcription_response.json()
    transcription_results = transcription_data["results"]
except Exception as e:
    # Best-effort: report the problem and continue with an empty result list.
    print(f"Error during transcription: {e}")
    transcription_results = []
finally:
    # Close the file handles
    for _, file_obj in transcription_files:
        file_obj.close()

print('Posting audio files for transcription -- DONE')

# Inspect the transcription results for debugging.
# ensure_ascii=False keeps non-ASCII transcript text readable instead of \uXXXX escapes.
print("Transcription Results:")
print(json.dumps(transcription_results, indent=2, ensure_ascii=False))

# Step 2: Diarize using the transcription segments
print('Posting audio files for diarization')

# Pair every successfully transcribed file with its JSON-encoded segments.
# NOTE(review): relies on the service returning results in the same order as
# `audio_file_paths` — confirm against the /transcribe_mono contract.
transcription_segments_list = []
valid_audio_files = []

for idx, result in enumerate(transcription_results):
    if "segments" in result:
        valid_audio_files.append(audio_file_paths[idx])
        transcription_segments_list.append(json.dumps(result["segments"]))
    else:
        # Handle the error case
        print(f"Transcription failed for file: {result.get('file', 'unknown')}")
        print(f"Error message: {result.get('error', 'No error message available')}")

# Ensure we have valid files to process
if not valid_audio_files:
    print("No valid transcriptions were obtained. Exiting.")
    # Same effect as `exit(1)` in plain Python, but raised directly so that in a
    # notebook only this cell is aborted.
    # NOTE(review): under IPython/Jupyter `exit` is a session-shutdown helper — confirm.
    raise SystemExit(1)

# Form fields are sent as a list of tuples so the segments key can repeat once per file
data = [
    ('transcription_segments_list', segments_json)
    for segments_json in transcription_segments_list
]
# Add the num_speakers parameter (optional)
data.append(('num_speakers', '2'))  # Optional

# Open the audio files inside the `try` so a failing open() cannot leak the
# handles that were already opened.
diarization_files = []

try:
    for file_path in valid_audio_files:
        diarization_files.append(('audio_files', open(file_path, 'rb')))

    diarization_response = requests.post(
        "http://localhost:8001/diarize",
        files=diarization_files,
        data=data
    )
    diarization_response.raise_for_status()
    diarized_data = diarization_response.json()
    diarized_results = diarized_data["results"]
except Exception as e:
    # Best-effort: report the problem and continue with an empty result list.
    print(f"Error during diarization: {e}")
    diarized_results = []
finally:
    # Close the file handles
    for _, file_obj in diarization_files:
        file_obj.close()

print('Posting audio files for diarization -- DONE')

# Print the diarized segments of every file in 'diarized_results', or the reported
# error for files whose entry has no "diarized_segments" key.
for diarized_result in diarized_results:
    if "diarized_segments" in diarized_result:
        file_name = diarized_result['file']
        diarized_segments = diarized_result['diarized_segments']
        print(f"File: {file_name}")
        for segment in diarized_segments:
            start = segment['start']
            end = segment['end']
            text = segment['text']
            speaker = segment['speaker']
            print(f"[{start:.2f} - {end:.2f}] {speaker}: {text}")
        print()
    else:
        # Handle the error case. The messages must be printed explicitly: a bare
        # f-string inside a loop is evaluated and discarded.
        print(f"Diarization failed for file: {diarized_result.get('file', 'unknown')}")
        print(f"Error message: {diarized_result.get('error', 'No error message available')}")

In [None]:
audio_file_paths = [
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13143_17303__2023_01_31__11_05_42_100.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13301_17384__2023_02_01__17_43_03_327.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13276_17360__2023_02_01__10_11_14_457.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13170_13171__2023_02_01__17_32_20_210.mp3"
] 

stereo_paths = [
    'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/20210623080301062173700pri.wav'
]

In [None]:
import whisper
import warnings
import torchaudio
from tempfile import NamedTemporaryFile

#warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")

model = whisper.load_model("turbo", device="cuda")

In [None]:
# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio(audio_file_paths[0])
audio = whisper.pad_or_trim(audio)

In [None]:
# The module-level `whisper.transcribe` takes the model as its first argument, so
# passing only `audio` fails; call the loaded model's bound method instead.
model.transcribe(audio)

Torchaudio way

In [None]:
import whisper
import torchaudio
import warnings
import numpy as np
from tempfile import NamedTemporaryFile
import os

# Ignore FutureWarnings from whisper
#warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
#warnings.filterwarnings("ignore", category=UserWarning, module="whisper")

# Load the Whisper model on GPU
model = whisper.load_model("turbo", device="cuda")

# Path to your stereo audio file
stereo_path = 'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/20210623080301062173700pri.wav'

In [None]:
# Path to your stereo audio file
stereo_path = 'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/2021062309023419811401pri.wav'

# Load the stereo audio file
waveform, sample_rate = torchaudio.load(stereo_path)

# Ensure the audio is stereo (2 channels)
if waveform.shape[0] == 2:
    # Separate channels
    channel_0 = waveform[0].unsqueeze(0)  # Channel 0
    channel_1 = waveform[1].unsqueeze(0)  # Channel 1

    # Save each channel to a temporary file
    with NamedTemporaryFile(suffix=".wav", delete=False, dir='./temp') as temp_file_0, \
         NamedTemporaryFile(suffix=".wav", delete=False, dir='./temp') as temp_file_1:
        
        # Save channel 0
        torchaudio.save(temp_file_0.name, channel_0, sample_rate, encoding="PCM_S", bits_per_sample=16)
        channel_0_path = temp_file_0.name
        print(f"Channel 0 saved at: {channel_0_path}")

        # Save channel 1
        torchaudio.save(temp_file_1.name, channel_1, sample_rate, encoding="PCM_S", bits_per_sample=16)
        channel_1_path = temp_file_1.name
        print(f"Channel 1 saved at: {channel_1_path}")
        #whisper.DecodingOptions()
            # Transcribe each channel separately with additional options
        print("Transcribing Speaker 0...")
        whisper.DecodingOptions()
        result_speaker_0 = model.transcribe(
            channel_0_path,
            language="ru",
            initial_prompt='–ó–≤–æ–Ω–æ–∫ –≤ –∫–æ–º–ø–∞–Ω–∏—é, —ç—Ç–æ –∫–æ–ª–ª —Ü–µ–Ω—Ç—Ä –∑–∞—Å—Ç—Ä–æ–π—â–∏–∫–∞, —Ä–∞–∑–≥–æ–≤–æ—Ä –≤–µ–¥–µ—Ç —Å–æ—Ç—Ä—É–¥–Ω–∏–∫ –û–ª—å–≥–∞',
            temperature= (0.0, 0.1),
            logprob_threshold=-0.6,
            no_speech_threshold= 0.0,
            compression_ratio_hallucination_threshold=2.1,
            condition_on_previous_text=True,
            word_timestamps=True,
            hallucination_silence_threshold=1
        )

        print("Transcribing Speaker 1...")
        result_speaker_1 = model.transcribe(
                channel_1_path,
                language="ru",
                initial_prompt='–ó–≤–æ–Ω–æ–∫ –≤ –∫–æ–º–ø–∞–Ω–∏—é –§–°–ö, —ç—Ç–æ –∫–æ–ª–ª —Ü–µ–Ω—Ç—Ä –∑–∞—Å—Ç—Ä–æ–π—â–∏–∫–∞, –∫–ª–∏–µ–Ω—Ç –≥–æ–≤–æ—Ä–∏—Ç –æ –ø–æ–∫—É–ø–∫–µ –∫–≤–∞—Ä—Ç–∏—Ä—ã',
                temperature=(0.0, 0.1),
                no_speech_threshold= 0.0,
                condition_on_previous_text=True,
                word_timestamps=True,
                hallucination_silence_threshold=0.5
            )

        # Print transcriptions for each speaker
        print("Transcription for Speaker 0:")
        for segment in result_speaker_0["segments"]:
            print(f"{segment['start']}s - {segment['end']}s: {segment['text']} {segment['compression_ratio']}")

        print("\nTranscription for Speaker 1:")
        for segment in result_speaker_1["segments"]:
            print(f"{segment['start']}s - {segment['end']}s: {segment['text']}")

else:
    print("Error: Audio is not stereo.")

In [None]:
result_speaker_0

Chunked approach

In [None]:
import torchaudio
from tempfile import NamedTemporaryFile
import os

# Path to your stereo audio file
stereo_path = 'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/2021062309023419811401pri.wav'

# Load the stereo audio file
waveform, sample_rate = torchaudio.load(stereo_path)

# Parameters
chunk_duration = 900  # Chunk duration in seconds
num_channels = waveform.shape[0]
chunk_samples = chunk_duration * sample_rate  # Number of samples per chunk

# Ensure the audio is stereo (2 channels)
if num_channels == 2:
    for channel_idx in range(num_channels):
        # Select the channel waveform
        channel_waveform = waveform[channel_idx].unsqueeze(0)  # Single channel waveform
        channel_name = f"Speaker {channel_idx}"
        
        # Split into 90-second chunks
        num_chunks = (channel_waveform.shape[1] + chunk_samples - 1) // chunk_samples
        transcriptions = []

        for i in range(num_chunks):
            start_sample = i * chunk_samples
            end_sample = min((i + 1) * chunk_samples, channel_waveform.shape[1])
            chunk_waveform = channel_waveform[:, start_sample:end_sample]
            start_time = start_sample / sample_rate  # in seconds
            end_time = end_sample / sample_rate  # in seconds

            # Save each chunk to a temporary file
            with NamedTemporaryFile(suffix=".wav", delete=False, dir='./temp') as temp_file:
                torchaudio.save(temp_file.name, chunk_waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
                temp_path = temp_file.name
            
            # Transcribe each chunk with original timings
            print(f"Transcribing {channel_name}, chunk {i + 1}/{num_chunks}, from {start_time:.2f}s to {end_time:.2f}s...")
            whisper.DecodingOptions()
            result = model.transcribe(
                temp_path,
                language="ru",
                #initial_prompt='–ó–≤–æ–Ω–æ–∫ –≤ –∫–æ–º–ø–∞–Ω–∏—é –§–°–ö, —ç—Ç–æ –∫–æ–ª–ª —Ü–µ–Ω—Ç—Ä –∑–∞—Å—Ç—Ä–æ–π—â–∏–∫–∞' if channel_idx == 0 else '–ó–≤–æ–Ω–æ–∫ –≤ –∫–æ–º–ø–∞–Ω–∏—é –§–°–ö, —ç—Ç–æ –∫–æ–ª–ª —Ü–µ–Ω—Ç—Ä –∑–∞—Å—Ç—Ä–æ–π—â–∏–∫–∞, –∫–ª–∏–µ–Ω—Ç –≥–æ–≤–æ—Ä–∏—Ç –æ –ø–æ–∫—É–ø–∫–µ –∫–≤–∞—Ä—Ç–∏—Ä—ã',
                temperature=(0.0, 0.1),
                no_speech_threshold=0.3,
                suppress_tokens = [50365, 2933, 8893, 403, 1635, 10461, 40653, 413, 4775, 51, 284, 89, 453, 51864, 50366, 8567, 1435, 21403, 5627, 15363, 17781, 485, 51863],
                condition_on_previous_text=False,
                word_timestamps=True,
                compression_ratio_hallucination_threshold=2.1,
                fp16 = True
            )
            # Add –°—É–±—Ç–∏—Ç—Ä—ã —Å–¥–µ–ª–∞–ª DimaTorzok and other exceptions
            # Collect transcriptions with original chunk timing
            for segment in result["segments"]:
                segment['start'] += start_time
                segment['end'] += start_time
                transcriptions.append(segment)

            # Clean up temporary file
            os.remove(temp_path)

        # Print transcriptions for the current speaker
        print(f"\nTranscription for {channel_name}:")
        for segment in transcriptions:
            print(f"{segment['start']}s - {segment['end']}s: {segment['text']}  {segment['compression_ratio']}")
        # print(transcriptions)

else:
    print("Error: Audio is not stereo.")


In [None]:
import whisper
import torchaudio
import numpy as np
import torch

# Load the Whisper model on GPU
model = whisper.load_model("turbo", device="cuda")

# Path to your stereo audio file
stereo_path = 'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/2021062309023419811401pri.wav'

In [None]:
# Read the recording together with its native sample rate
waveform, sample_rate = torchaudio.load(stereo_path)

# Bring the whole waveform to 16000 Hz when it was stored at another rate
target_sample_rate = 16000
if sample_rate != target_sample_rate:
    waveform = torchaudio.functional.resample(
        waveform, sample_rate, target_sample_rate
    )
    sample_rate = target_sample_rate

# Chunking parameters
chunk_duration = 900  # seconds of audio handed to the model per call
num_channels = waveform.shape[0]
chunk_samples = int(chunk_duration * sample_rate)  # samples per chunk

# Decoding options shared by every chunk of every channel
transcribe_options = dict(
    language="ru",
    temperature=(0.0, 0.1),
    no_speech_threshold=0.3,
    suppress_tokens=[
        50365, 2933, 8893, 403, 1635, 10461, 40653,
        413, 4775, 51, 284, 89, 453, 51864, 50366,
        8567, 1435, 21403, 5627, 15363, 17781, 485,
        51863
    ],
    condition_on_previous_text=False,
    word_timestamps=True,
    compression_ratio_hallucination_threshold=2.1,
    fp16=True,
)

# Only two-channel recordings are processed; each channel is treated as one speaker
if num_channels != 2:
    print("Error: Audio is not stereo.")
else:
    for channel_idx in range(num_channels):
        # Keep a leading channel axis: shape [1, num_samples]
        channel_waveform = waveform[channel_idx].unsqueeze(0)
        channel_name = f"Speaker {channel_idx}"

        num_samples = channel_waveform.shape[1]
        num_chunks = -(-num_samples // chunk_samples)  # ceiling division
        transcriptions = []

        for i, start_sample in enumerate(range(0, num_samples, chunk_samples)):
            end_sample = min(start_sample + chunk_samples, num_samples)
            chunk_waveform = channel_waveform[:, start_sample:end_sample]
            start_time = start_sample / sample_rate  # chunk offset in seconds

            # The model is given a plain 1-D NumPy array
            chunk_numpy = chunk_waveform.squeeze().numpy()

            print(
                f"Transcribing {channel_name}, chunk {i + 1}/{num_chunks}, from {start_time:.2f}s..."
            )

            result = model.transcribe(audio=chunk_numpy, **transcribe_options)

            # Shift chunk-relative timestamps onto the timeline of the full recording
            for segment in result["segments"]:
                segment['start'] = segment['start'] + start_time
                segment['end'] = segment['end'] + start_time
            transcriptions.extend(result["segments"])

        # Show everything collected for this channel
        print(f"\nTranscription for {channel_name}:")
        for segment in transcriptions:
            print(
                f"{segment['start']:.2f}s - {segment['end']:.2f}s: {segment['text']}  {segment.get('compression_ratio', '')}"
            )


In [None]:
transcriptions

In [None]:
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
vad = load_silero_vad()
wav = read_audio('c:/Users/Alex/whisper_asr_implementation/Drafts/temp/tmpbshce7ez.wav')

In [None]:
speech_timestamps = get_speech_timestamps(
  wav,
  vad,
  min_speech_duration_ms=400,
  return_seconds=True,  # Return speech timestamps in seconds (default is samples)
  sampling_rate=16000
)

In [None]:
speech_timestamps

In [None]:
# Merge VAD segments that are within 0.5 seconds of each other
def merge_vad_segments(vad_segments, merge_threshold=0.5):
    """Merge consecutive VAD segments whose gap is at most ``merge_threshold``.

    Args:
        vad_segments: Sequence of dicts with numeric ``'start'`` and ``'end'`` keys.
            Each segment is only compared with the one before it, so the input is
            expected in ascending ``'start'`` order.
        merge_threshold: Largest gap (same unit as the timestamps) that is bridged.

    Returns:
        A new list of segment dicts. Neither the input list nor its dicts are
        modified, so the function can be re-run on the same VAD output.
    """
    if not vad_segments:
        return []

    merged_segments = []
    # Work on a copy: extending 'end' must not write into the caller's dict.
    current_segment = dict(vad_segments[0])

    for next_segment in vad_segments[1:]:
        if next_segment['start'] - current_segment['end'] <= merge_threshold:
            # max() keeps the end from moving backwards when the next segment
            # lies entirely inside the current one.
            current_segment['end'] = max(current_segment['end'], next_segment['end'])
        else:
            merged_segments.append(current_segment)
            current_segment = dict(next_segment)
    merged_segments.append(current_segment)

    return merged_segments


In [None]:
waveform, sample_rate = torchaudio.load('c:/Users/Alex/whisper_asr_implementation/Drafts/temp/tmpbshce7ez.wav')
num_channels = waveform.shape[0]
vad_output = speech_timestamps
# Apply merging
vad_output = merge_vad_segments(vad_output, merge_threshold=2)
# Process each channel separately
for channel_idx in range(num_channels):
    channel_waveform = waveform[channel_idx]  # Single channel waveform
    channel_name = f"Speaker {channel_idx}"

    transcriptions = []

    # Process each VAD segment
    for i, vad_segment in enumerate(vad_output):
        start_time = vad_segment['start']
        end_time = vad_segment['end']
        start_sample = int(start_time * sample_rate)
        end_sample = int(end_time * sample_rate)

        # Extract the audio segment
        segment_waveform = channel_waveform[start_sample:end_sample]

        # Check if the segment is non-empty
        if segment_waveform.numel() == 0:
            continue  # Skip empty segments

        # Reshape to (1, N) for a single channel
        segment_waveform = segment_waveform.unsqueeze(0)

        # Resample to 16 kHz if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            segment_waveform = resampler(segment_waveform)

        # Convert to numpy array as Whisper expects NumPy arrays
        segment_waveform_np = segment_waveform.squeeze(0).numpy()

        # Transcribe the segment
        print(f"Transcribing {channel_name}, segment {i + 1}/{len(vad_output)}, from {start_time:.2f}s to {end_time:.2f}s...")

        result = model.transcribe(
            audio=segment_waveform_np,
            language="ru",
            initial_prompt='–ó–≤–æ–Ω–æ–∫ –≤ –∫–æ–º–ø–∞–Ω–∏—é –§–°–ö, —ç—Ç–æ –∫–æ–ª–ª —Ü–µ–Ω—Ç—Ä –∑–∞—Å—Ç—Ä–æ–π—â–∏–∫–∞',
            temperature=(0.0, 0.1),
            no_speech_threshold=0.6,
            condition_on_previous_text=True,
            word_timestamps=True,
            hallucination_silence_threshold=0.1 
        )

        # Collect transcriptions with original timings
        for segment in result["segments"]:
            # Adjust the timestamps to the original audio timeline
            segment['start'] += start_time
            segment['end'] += start_time
            transcriptions.append(segment)

    # Print transcriptions for the current speaker
    print(f"\nTranscription for {channel_name}:")
    for segment in transcriptions:
        print(f"{segment['start']:.2f}s - {segment['end']:.2f}s: {segment['text']}")

In [None]:
audio = whisper.load_audio('c:/Users/Alex/whisper_asr_implementation/Drafts/temp/tmpbshce7ez.wav')
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio, n_mels=128).to(model.device)

# decode the audio
options = whisper.DecodingOptions(language='ru')
result = whisper.decode(model, mel, options)

# make log-Mel spectrogram and move to the same device as the model
result.text


Transformers Pipeline

In [None]:
# https://github.com/huggingface/transformers/pull/28556/files

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


# Use the first CUDA device with half precision when available, otherwise CPU with float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

# NOTE: this rebinds `model`, which earlier cells bind to the openai-whisper model
# returned by `whisper.load_model`; re-run the matching load cell before going back.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

# Cap the number of newly generated tokens per generate() call.
# (No static cache or compilation is configured here.)
model.generation_config.max_new_tokens = 256

# Processor for the same checkpoint; its tokenizer and feature extractor are used by the pipeline below
processor = AutoProcessor.from_pretrained(model_id)

generate_kwargs = {
    "condition_on_prev_tokens": True,
    "temperature": (0.0, 0.2, 0.4),
    "logprob_threshold": -0.4,
    "no_speech_threshold": 0.05,
    "return_timestamps": "word",
    #"task": "transcribe",
    "language": "russian",
    #"initial_prompt": "–§–°–ö"  # https://github.com/huggingface/transformers/issues/27317
    
}

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,  # When no is passed - sliding window
    batch_size=32,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs=generate_kwargs
)


In [None]:
result = pipe('c:/Users/Alex/whisper_asr_implementation/Drafts/temp/tmpxf4eqn1b.wav', return_timestamps=True)

for chunk in result["chunks"]:
    print(str(chunk["timestamp"]) + '  ' + chunk['text'])

Attempt 31-10

In [None]:
# List of audio file paths (up to 10 files)
audio_file_paths = [
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/in-7035555-9587853839-20240802-155950-1722603590.21580.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/out-3036641-3133-20240802-135335-1722596015.21347.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3101-000047e6-2024-08-02-15-28-09.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3213-000044b5-2024-08-02-09-34-26.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3223-0000454f-2024-08-02-10-35-26.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-3353-000048a9-2024-08-02-18-28-28.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5101-000046c7-2024-08-02-13-00-10.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5101-0000459d-2024-08-02-11-16-40.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5101-00004669-2024-08-02-12-27-31.wav",
    "E:/–ó–∞–ø–∏—Å–∏/–§–°–ö –°–ó/incom/08/02/PJSIP-5102-000045ea-2024-08-02-11-40-30.wav"
]

'''audio_file_paths = [
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13143_17303__2023_01_31__11_05_42_100.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13301_17384__2023_02_01__17_43_03_327.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13276_17360__2023_02_01__10_11_14_457.mp3",
    "E:/–ó–∞–ø–∏—Å–∏/BorAvto/–û–ü/mix_13170_13171__2023_02_01__17_32_20_210.mp3"
]'''

In [None]:
import whisper
import librosa

# Load the Whisper model
model = whisper.load_model("turbo")
stereo_path = 'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/2021062309023419811401pri.wav'
# Load the stereo audio file with librosa.
# The channels are later passed to `model.transcribe` as raw NumPy arrays, which
# Whisper interprets as 16 kHz audio, so resample to 16 kHz on load (loading at
# 8 kHz would make the model hear the recording at double speed).
audio, sr = librosa.load(stereo_path, sr=16000, mono=False)

# Ensure the audio has two channels. A mono file comes back as a 1-D array whose
# first dimension is the sample count, so check the dimensionality first.
if audio.ndim != 2 or audio.shape[0] != 2:
    raise ValueError("Audio file does not have two channels.")

# Separate the left and right channels
audio_left = audio[0]
audio_right = audio[1]


In [None]:
# Process the left channel
#audio_left = whisper.pad_or_trim(audio_left)

# Transcribe the left channel
result_left = model.transcribe(
    audio_left,
    verbose=True,
    language='ru'
)

#print("Left Channel Transcription:")
#print(result_left)

'''# Process the right channel
audio_right = whisper.pad_or_trim(audio_right)
mel_right = whisper.log_mel_spectrogram(audio_right, n_mels=128).to(model.device)

# Transcribe the right channel using the same options
result_right = whisper.decode(model, mel_right, options)
print("Right Channel Transcription:")'''


VAD approach

In [None]:
import whisper
import librosa
import numpy as np
import webrtcvad
import collections

# Step 1: Load and Resample the Audio File
audio_path = 'E:/–ó–∞–ø–∏—Å–∏/–§–°–ö/2021062309023419811401pri.wav'
audio_data, sr = librosa.load(audio_path, sr=8000, mono=False)  # Keep stereo channels

# Resample to 16 kHz
audio_data_16k = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

# Step 2: Split into Left and Right Channels
left_channel = audio_data_16k[0, :]
right_channel = audio_data_16k[1, :]

channels = [left_channel, right_channel]

# Load the Whisper Model
model = whisper.load_model("turbo")

# Step 3: Define VAD Functions
vad = webrtcvad.Vad(2)  # Aggressiveness mode (0-3)

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive, non-overlapping frames of ``audio``.

    Args:
        frame_duration_ms: Frame length in milliseconds.
        audio: Sliceable sequence of samples supporting ``len()``.
        sample_rate: Samples per second of ``audio``.

    Yields:
        Slices of exactly ``int(sample_rate * frame_duration_ms / 1000)`` samples.
        A trailing partial frame is dropped.
    """
    frame_length = int(sample_rate * frame_duration_ms / 1000)
    num_frames = len(audio) // frame_length
    for i in range(num_frames):
        yield audio[i * frame_length:(i + 1) * frame_length]

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, audio):
    """Split ``audio`` into voiced segments using a frame-level VAD with hysteresis.

    A segment starts once more than 90% of the last
    ``padding_duration_ms / frame_duration_ms`` frames are classified as speech and
    ends once more than 90% of them are classified as non-speech.

    Args:
        sample_rate: Samples per second of ``audio``; forwarded to ``vad.is_speech``.
        frame_duration_ms: Length of one analysis frame in milliseconds.
        padding_duration_ms: Length of the look-back window in milliseconds.
        vad: Object exposing ``is_speech(pcm_bytes, sample_rate) -> bool``.
        audio: 1-D NumPy array of samples; presumably floats nominally in [-1, 1],
            since they are scaled by 32767 to 16-bit PCM — confirm against the loader.

    Returns:
        List of NumPy arrays, one per voiced segment, holding the original
        (unscaled) samples.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    segments = []

    for frame in frame_generator(frame_duration_ms, audio, sample_rate):
        # Convert to 16-bit PCM. Clip first: samples slightly outside [-1, 1]
        # (e.g. resampling overshoot) would otherwise overflow int16 and wrap.
        pcm_frame = (np.clip(frame, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
        is_speech = vad.is_speech(pcm_frame, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > 0.9 * ring_buffer.maxlen:
                # Speech onset: keep the buffered frames so the segment start is not cut off
                triggered = True
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                # Speech offset: close the current segment
                triggered = False
                segments.append(np.concatenate(voiced_frames))
                ring_buffer.clear()
                voiced_frames = []
    # Flush a segment that is still open when the audio ends
    if voiced_frames:
        segments.append(np.concatenate(voiced_frames))
    return segments

# Step 4: Transcribe Each Channel
sample_rate = 16000
frame_duration_ms = 30
padding_duration_ms = 300

for idx, ch in enumerate(channels):
    # Apply VAD
    speech_segments = vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, ch)
    print(f"Processing Channel {idx + 1}")
    for i, segment in enumerate(speech_segments):
        # Transcribe with Whisper
        result = model.transcribe(segment, language='ru', fp16=False)
        print(f"Segment {i + 1}: {result['text']}")


# Diarization

In [None]:
import torch
from pyannote.audio import Pipeline
import os

# Diarization Pipeline
# The Hugging Face access token is read from the HF_TOKEN environment variable so
# the secret never lives in the notebook (set it before starting the kernel; a
# missing variable raises KeyError here). Any token that was previously committed
# in this cell should be revoked.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ["HF_TOKEN"]
)
# Run on the GPU when one is available, otherwise on the CPU
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
dataset = ["E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/1_–ò–≤—á–µ–Ω–∫–æ –î.–ê_2018-12-10_15-50-35_6136_89055400861_4H7P4LS4QL76B4U1JRH8R0HRAG000939_pcmu.wav",
           "E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/9_–°—Ç—Ä–∏–∂ –ê_2018-12-10_09-23-21_6127_89033605019_IBA75L9EK10JJ452OB2KJ0H51K007DO2_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2016-11-22_06-30-11_6136_89518759355_6IMC8TIPUP6BD8RC4L2T32JC6O07SK0M_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2016-11-22_06-32-02_6136_89518759355_6IMC8TIPUP6BD8RC4L2T32JC6O07SKAU_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2017-12-29_07-52-31_6130_89303431192_–°–∞–¥–æ–µ–≤ –ö.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-03-18_15-07-01_6132_89507086469_–ü–µ—Ä–µ–ª—ã–≥–∏–Ω –ò.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-03-23_05-35-26_6135_89025322333_38NDLA8KCD09R2FR3R9CO3JCT4017CD2_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-03-23_05-37-29_6135_89025322333_38NDLA8KCD09R2FR3R9CO3JCT4017CE9_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-03-26_11-09-32_6171_89610426801_–ì—Å–ø–æ—è–Ω –ú.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-03-29_10-00-38_6001_89535554036_–ê–≤—Ç–∞–µ–≤ –ê.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-06-06_12-52-10_6134_88462604792_3RKH6IPB751TR09NAMG5QSSGJK01CHLB_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-06-12_04-01-07_6001_89824570468_3RKH6IPB751TR09NAMG5QSSGJK01PAC9_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-06-18_10-00-20_6140_89677413899_R4P7NOLATT55TD0ANNFK8URBGG00R4AM_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-06-18_13-00-21_6129_89123448698_R4P7NOLATT55TD0ANNFK8URBGG00RJE2_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-07-04_09-14-52_Resources_89655729177_BJSQGAOFE15KN5QPPG58M763MC00KMBI_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-08-14_08-11-05_Resources_89103996696_9L22UTJ9DH4TT09F3KID43KHUS00L1LC_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-09-20_14-00-24_6133_89993680214_FTHN9PABGL6PN0RKA92QRCNI5S00V767_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-09-20_14-07-23_6133_89993680214_FTHN9PABGL6PN0RKA92QRCNI5S00V81R_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-10-17_06-27-29_4992679575_4997023679_38CBN0BE1569T2FAUHMUAT12TC00CV1V_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2018-10-20_14-17-38_6130_89260446126_38CBN0BE1569T2FAUHMUAT12TC00JKQO_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2022-02-23_05-15-49_6106_89134343721_0J8UE9S7N91VL6BMQFUNONT6VS03FMR4_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2022-04-08_08-35-58_6105_89113208278_DCM56DJNV90PR2IQHNAON84VMK00RUAA_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2022-04-11_07-29-22_6105_89113208278_DCM56DJNV90PR2IQHNAON84VMK0128T2_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2022-11-16_12-40-48_Resources_89204089388_1IE7MJESO941555IRFPUOP8EJO0DO534_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2023-01-02_12-31-54_6104_89652214529_1IE7MJESO941555IRFPUOP8EJO0I5MCA_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2023-01-02_12-53-54_6104_89103617383_1IE7MJESO941555IRFPUOP8EJO0I5MD1_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2023-01-02_13-06-43_9111471194_4997023679_1IE7MJESO941555IRFPUOP8EJO0I5MDD_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2023-02-27_05-35-48_Resources_89035610477_TRG4D987PP3P3AE4SA59SAUVVK083UAU_pcmu.wav",
"E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/2023-03-07_04-14-56_Resources_89122646096_TRG4D987PP3P3AE4SA59SAUVVK092JNT_pcmu.wav"]

In [None]:
i = 18

In [None]:
pipeline(dataset[i], num_speakers=2)

In [None]:
j = pipeline(dataset[i], num_speakers=2)

In [None]:
print(j)

In [None]:
segments

In [None]:
test

# Combine

In [None]:
import requests

url = "http://127.0.0.1:8000/transcribe_audio_bulk"

file_name = '2023-09-15_15-35-23_Resources_89688627131_TRG4D987PP3P3AE4SA59SAUVVK14OK43_pcmu.wav'
file_path = f'E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/{file_name}'

# Upload the recording to the transcription service. The file is opened in a
# `with` block so the handle is closed as soon as the request has been sent.
payload = {}
headers = {}

with open(file_path, 'rb') as audio_file:
    files = [
        ('files', (file_name, audio_file, 'audio/wav'))
    ]
    transcription = requests.request("POST", url, headers=headers, data=payload, files=files)
# Fail here with a clear HTTP error instead of later when the body is parsed as JSON
transcription.raise_for_status()


# Upload the same recording to the diarization service
url = "http://127.0.0.1:8001/diarize_audio_bulk"

payload = {'num_speakers': '2'}
headers = {}

with open(file_path, 'rb') as audio_file:
    files = [
        ('files', (file_name, audio_file, 'audio/wav'))
    ]
    segments = requests.request("POST", url, headers=headers, data=payload, files=files)
segments.raise_for_status()

In [None]:
segments = segments.json()
transcription = transcription.json()

In [None]:
def align_transcription_with_diarization(transcription, diarization, overlap_threshold=0.1):
    """
    Assign every transcribed word to the speaker(s) whose diarization
    segments it overlaps.

    Args:
        transcription: Mapping whose values are lists of segments. Each
            segment is a dict with a 'words' list, and each word is a dict
            with 'word', 'start' and 'end'. The mapping keys are not used.
        diarization: Sequence of dicts with 'start', 'end' and 'speaker'.
            It is iterated once per word, so it must be re-iterable.
        overlap_threshold: Minimum fraction of the word's duration that a
            single diarization segment has to cover for the word to be
            attributed to that segment's speaker.

    Returns:
        A list of dicts with 'word', 'start', 'end' and 'speaker', in the
        order the words appear in `transcription`. A word overlapping
        several speakers yields one entry per speaker; a word overlapping
        no segment yields no entry.
    """
    # Flatten the word list from the transcription data.
    words = [
        word
        for channel_segments in transcription.values()
        for segment in channel_segments
        for word in segment['words']
    ]

    aligned_words = []

    for word in words:
        word_start = word['start']
        word_end = word['end']
        word_duration = word_end - word_start
        word_text = word['word']

        # Speakers matched by this word, in first-match order. Each speaker is
        # recorded once even if the word straddles several of their segments;
        # otherwise the same word would be emitted (and later rendered) twice.
        matched_speakers = []

        for diarization_segment in diarization:
            segment_start = diarization_segment['start']
            segment_end = diarization_segment['end']
            segment_speaker = diarization_segment['speaker']

            if word_duration > 0:
                overlap_duration = max(
                    0, min(word_end, segment_end) - max(word_start, segment_start)
                )
                coverage = overlap_duration / word_duration
            else:
                # A zero-length word has no duration to divide by; treat it as
                # fully covered by any segment that contains its timestamp so
                # that it is not silently dropped from the transcript.
                coverage = 1.0 if segment_start <= word_start <= segment_end else 0.0

            if coverage >= overlap_threshold and segment_speaker not in matched_speakers:
                matched_speakers.append(segment_speaker)

        for speaker in matched_speakers:
            aligned_words.append({
                'word': word_text,
                'start': word_start,
                'end': word_end,
                'speaker': speaker
            })

    return aligned_words


In [None]:
# Attach a speaker label to every transcribed word of the selected recording.
test = align_transcription_with_diarization(
    transcription=transcription[file_name],
    diarization=segments[file_name]['diarization'],
)

In [None]:
def create_speech_bubbles(transcription, pause_threshold=0.5, max_duration=5.0):
    """
    Group speaker-labelled words into per-speaker "speech bubbles".

    Args:
        transcription: Iterable of dicts with 'word', 'start', 'end' and
            'speaker'. It is not modified; a sorted copy is used.
        pause_threshold: A gap longer than this between the end of a
            speaker's previous word and the start of their next word closes
            the current bubble.
        max_duration: A bubble is also closed when adding the next word
            would make it span more than this (end of the new word minus
            start of the bubble).

    Returns:
        A list of dicts with 'speaker', 'start', 'end', 'text' and
        'overlap', sorted by 'start'. 'overlap' is True when the bubble
        intersects in time with a bubble of a different speaker.
    """
    speech_bubbles = []
    speaker_bubbles = {}  # Holds the bubble currently being built for each speaker

    # Work on a sorted copy so the caller's list keeps its original order.
    ordered_words = sorted(transcription, key=lambda item: item['start'])

    for word_data in ordered_words:
        word = word_data['word']
        start_time = word_data['start']
        end_time = word_data['end']
        speaker = word_data['speaker']

        current_bubble = speaker_bubbles.get(speaker)
        if current_bubble is not None:
            # The bubble's 'end' is always the end of the speaker's last word.
            # It is compared directly (not via truthiness) so that an end time
            # of 0.0 still takes part in the pause check.
            has_long_pause = (start_time - current_bubble["end"]) > pause_threshold
            exceeds_max_duration = (end_time - current_bubble["start"]) > max_duration

            if not (has_long_pause or exceeds_max_duration):
                # Continue the current bubble
                current_bubble["text"] += " " + word
                current_bubble["end"] = end_time
                continue

            # Finalize the current bubble; a new one is started below
            speech_bubbles.append(current_bubble)

        speaker_bubbles[speaker] = {
            "speaker": speaker,
            "start": start_time,
            "end": end_time,
            "text": word,
            "overlap": False
        }

    # Append the bubbles that were still open when the words ran out
    speech_bubbles.extend(speaker_bubbles.values())

    speech_bubbles.sort(key=lambda bubble: bubble['start'])

    # Flag bubbles of different speakers that intersect in time
    for i, bubble_i in enumerate(speech_bubbles):
        for bubble_j in speech_bubbles[i + 1:]:
            # Bubbles are sorted by start, so nothing later can intersect either
            if bubble_j['start'] > bubble_i['end']:
                break
            if bubble_i['speaker'] == bubble_j['speaker']:
                continue
            if bubble_i['start'] < bubble_j['end'] and bubble_j['start'] < bubble_i['end']:
                bubble_i['overlap'] = True
                bubble_j['overlap'] = True

    return speech_bubbles

def generate_html_with_media_player(speech_bubbles, audio_file_url, output_filename="transcription_with_player.html"):
    """
    Write a standalone HTML page with an audio player and one chat-style
    bubble per speech bubble.

    Args:
        speech_bubbles: Iterable of dicts with 'speaker', 'text', 'start'
            and 'end'. Bubbles whose speaker is "SPEAKER_00" get the
            `speaker-0` style; every other speaker gets `speaker-1`.
        audio_file_url: Value placed in the `src` attribute of the audio
            source element.
        output_filename: Path of the HTML file to write (UTF-8).

    Returns:
        None. The page is written to `output_filename` and a confirmation
        line is printed.
    """
    # Function-scope imports keep this cell runnable on its own.
    import mimetypes
    from html import escape

    # Derive the MIME type from the file extension (e.g. .wav vs .mp3) and
    # fall back to the previously hard-coded value when it cannot be guessed.
    guessed_type = mimetypes.guess_type(str(audio_file_url))[0]
    audio_type = guessed_type if guessed_type and guessed_type.startswith("audio/") else "audio/mpeg"
    # Escaped so quotes or ampersands in the path cannot break the attribute.
    audio_src = escape(str(audio_file_url), quote=True)

    # Define the HTML structure with Plyr.js for the media player
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Speech Bubbles with Media Player</title>
        <link rel="stylesheet" href="https://cdn.plyr.io/3.7.8/plyr.css" />
        <style>
            body {{
                font-family: Arial, sans-serif;
                background-color: #f4f4f9;
                color: #333;
                padding: 20px;
                margin: 0;
                display: flex;
                flex-direction: column;
                align-items:center;
            }}
            .sticky-player {{
                position: fixed;
                top: 10px;
                left: 50%;
                transform: translateX(-50%);
                z-index: 1000;
                width: 90%;
                max-width: 600px;
                background-color: white;
                border: 1px solid #ccc;
                border-radius: 10px;
                box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
                padding: 10px;
            }}
            .bubble-container {{
                display: flex;
                flex-direction: column;
                gap: 10px;
                margin-top: 120px; /* To avoid overlapping with the fixed player */
            }}
            .bubble {{
                border-radius: 10px;
                padding: 15px;
                max-width: 70%;
                word-wrap: break-word;
            }}
            .bubble.speaker-0 {{
                background-color: #d1e7ff;
                color: #0a58ca;
                align-self: flex-start;
            }}
            .bubble.speaker-1 {{
                background-color: #ffe0e0;
                color: #c92a2a;
                align-self: flex-end;
            }}
            .timestamp {{
                font-size: 0.85em;
                color: #555;
                margin-top: 5px;
                text-align: right;
            }}
        </style>
    </head>
    <body>
        <div class="sticky-player">
            <audio id="player" controls>
                <source src="{audio_src}" type="{audio_type}">
                Your browser does not support the audio element.
            </audio>
        </div>
        <div class="bubble-container">
    """

    # Add bubbles for each speech segment
    for bubble in speech_bubbles:
        speaker_class = "speaker-0" if bubble["speaker"] == "SPEAKER_00" else "speaker-1"
        # Transcribed text is data, not markup: escape it before embedding.
        bubble_text = escape(str(bubble["text"]))
        html_content += f"""
        <div class="bubble {speaker_class}">
            <div class="text">{bubble_text}</div>
            <div class="timestamp">[{bubble["start"]:.2f} - {bubble["end"]:.2f}]</div>
        </div>
        """

    # Close the HTML structure
    html_content += """
        </div>
        <script src="https://cdn.plyr.io/3.7.8/plyr.polyfilled.js"></script>
        <script>
            const player = new Plyr('#player', {
                controls: ['play', 'progress', 'current-time', 'duration', 'mute', 'volume']
            });
        </script>
    </body>
    </html>
    """

    # Write to an HTML file
    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(html_content)

    print(f"HTML file with media player has been generated: {output_filename}")


In [None]:
# Group the speaker-labelled words into bubbles, then render the HTML preview.
bubbles = create_speech_bubbles(transcription=test)
generate_html_with_media_player(speech_bubbles=bubbles, audio_file_url=file_path)

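## Custom overlapping approach

Instead of a boolean `overlap` flag, each bubble stores the longest word sequence it shares with an overlapping bubble from another speaker, and the HTML preview renders those words in bold.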

In [None]:
def create_speech_bubbles_t(transcription, pause_threshold=0.5, max_duration=5.0):
    """
    Group speaker-labelled words into per-speaker bubbles and record the
    words shared by overlapping bubbles of different speakers.

    Args:
        transcription: Iterable of dicts with 'word', 'start', 'end' and
            'speaker'. It is not modified; a sorted copy is used.
        pause_threshold: A gap longer than this between the end of a
            speaker's previous word and the start of their next word closes
            the current bubble.
        max_duration: A bubble is also closed when adding the next word
            would make it span more than this.

    Returns:
        A list of dicts with 'speaker', 'start', 'end', 'text' and
        'overlap', sorted by 'start'. 'overlap' is "" or the longest run of
        consecutive words the bubble shares with a time-overlapping bubble
        of another speaker (a later match overwrites an earlier one).
    """
    speech_bubbles = []
    speaker_bubbles = {}  # Holds the bubble currently being built for each speaker

    # Work on a sorted copy so the caller's list keeps its original order.
    ordered_words = sorted(transcription, key=lambda item: item['start'])

    for word_data in ordered_words:
        word = word_data['word']
        start_time = word_data['start']
        end_time = word_data['end']
        speaker = word_data['speaker']

        current_bubble = speaker_bubbles.get(speaker)
        if current_bubble is not None:
            # The bubble's 'end' is always the end of the speaker's last word.
            # It is compared directly (not via truthiness) so that an end time
            # of 0.0 still takes part in the pause check.
            has_long_pause = (start_time - current_bubble["end"]) > pause_threshold
            exceeds_max_duration = (end_time - current_bubble["start"]) > max_duration

            if not (has_long_pause or exceeds_max_duration):
                # Continue the current bubble. Tokens that already carry their
                # own leading whitespace are appended verbatim; a single space
                # is inserted only when neither side provides one, so words
                # never get glued together (which would also defeat the
                # split()-based overlap detection below).
                text = current_bubble["text"]
                needs_space = bool(text) and bool(word) and not text[-1].isspace() and not word[0].isspace()
                current_bubble["text"] = text + (" " if needs_space else "") + word
                current_bubble["end"] = end_time
                continue

            # Finalize the current bubble; a new one is started below
            speech_bubbles.append(current_bubble)

        speaker_bubbles[speaker] = {
            "speaker": speaker,
            "start": start_time,
            "end": end_time,
            "text": word,
            "overlap": ""
        }

    # Append the bubbles that were still open when the words ran out
    speech_bubbles.extend(speaker_bubbles.values())

    speech_bubbles.sort(key=lambda bubble: bubble['start'])

    # Detect overlaps between bubbles of different speakers and capture the
    # exact shared word sequence
    for i, bubble_i in enumerate(speech_bubbles):
        for bubble_j in speech_bubbles[i + 1:]:
            # Bubbles are sorted by start, so nothing later can intersect either
            if bubble_j['start'] > bubble_i['end']:
                break
            if bubble_i['speaker'] == bubble_j['speaker']:
                continue
            if bubble_i['start'] < bubble_j['end'] and bubble_j['start'] < bubble_i['end']:
                overlap_sequence = _longest_shared_word_run(
                    bubble_i['text'].split(), bubble_j['text'].split()
                )
                if overlap_sequence:
                    overlap_text = " ".join(overlap_sequence)
                    bubble_i['overlap'] = overlap_text
                    bubble_j['overlap'] = overlap_text

    return speech_bubbles


def _longest_shared_word_run(words_a, words_b):
    """Return the longest run of consecutive words found in both lists (the first such run wins ties)."""
    best_run = []
    for idx_a, word_a in enumerate(words_a):
        for idx_b, word_b in enumerate(words_b):
            if word_a != word_b:
                continue
            length = 0
            while (
                idx_a + length < len(words_a)
                and idx_b + length < len(words_b)
                and words_a[idx_a + length] == words_b[idx_b + length]
            ):
                length += 1
            if length > len(best_run):
                best_run = words_a[idx_a:idx_a + length]
    return best_run



In [None]:
def generate_html_with_media_player_t(speech_bubbles, audio_file_url, output_filename="transcription_with_player.html"):
    """
    Write a standalone HTML page with an audio player and one chat-style
    bubble per speech bubble, rendering overlapping words in bold.

    Args:
        speech_bubbles: Iterable of dicts with 'speaker', 'text', 'start',
            'end' and 'overlap'. When 'overlap' is a non-empty string, every
            word of the bubble text that equals one of its
            whitespace-separated words is wrapped in an `overlap-word` span.
            Any other 'overlap' value (e.g. a boolean) disables highlighting
            for that bubble. Bubbles whose speaker is "SPEAKER_00" get the
            `speaker-0` style; every other speaker gets `speaker-1`.
        audio_file_url: Value placed in the `src` attribute of the audio
            source element.
        output_filename: Path of the HTML file to write (UTF-8).

    Returns:
        None. The page is written to `output_filename` and a confirmation
        line is printed.
    """
    # Function-scope imports keep this cell runnable on its own.
    import mimetypes
    import re
    from html import escape

    def render_text(bubble):
        """Escape the bubble text and wrap whole words that belong to the overlap."""
        raw_text = str(bubble["text"])
        overlap = bubble["overlap"]
        if not isinstance(overlap, str) or not overlap.split():
            return escape(raw_text)
        overlap_words = set(overlap.split())
        # Splitting on whitespace (kept via the capture group) means only whole
        # words are matched, each word is wrapped exactly once, and the markup
        # inserted for one word can never be rewritten while handling another.
        return "".join(
            f"<span class='overlap-word'>{escape(piece)}</span>" if piece in overlap_words else escape(piece)
            for piece in re.split(r"(\s+)", raw_text)
        )

    # Derive the MIME type from the file extension (e.g. .wav vs .mp3) and
    # fall back to the previously hard-coded value when it cannot be guessed.
    guessed_type = mimetypes.guess_type(str(audio_file_url))[0]
    audio_type = guessed_type if guessed_type and guessed_type.startswith("audio/") else "audio/mpeg"
    # Escaped so quotes or ampersands in the path cannot break the attribute.
    audio_src = escape(str(audio_file_url), quote=True)

    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Speech Bubbles with Media Player</title>
        <link rel="stylesheet" href="https://cdn.plyr.io/3.7.8/plyr.css" />
        <style>
            body {{
                font-family: Arial, sans-serif;
                background-color: #f4f4f9;
                color: #333;
                padding: 20px;
                margin: 0;
                display: flex;
                flex-direction: column;
                align-items:center;
            }}
            .sticky-player {{
                position: fixed;
                top: 10px;
                left: 50%;
                transform: translateX(-50%);
                z-index: 1000;
                width: 90%;
                max-width: 600px;
                background-color: white;
                border: 1px solid #ccc;
                border-radius: 10px;
                box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
                padding: 10px;
            }}
            .bubble-container {{
                display: flex;
                flex-direction: column;
                gap: 10px;
                margin-top: 120px;
            }}
            .bubble {{
                border-radius: 10px;
                padding: 15px;
                max-width: 70%;
                word-wrap: break-word;
            }}
            .bubble.speaker-0 {{
                background-color: #d1e7ff;
                color: #0a58ca;
                align-self: flex-start;
            }}
            .bubble.speaker-1 {{
                background-color: #ffe0e0;
                color: #c92a2a;
                align-self: flex-end;
            }}
            .overlap-word {{
                font-weight: bold;
            }}
            .timestamp {{
                font-size: 0.85em;
                color: #555;
                margin-top: 5px;
                text-align: right;
            }}
        </style>
    </head>
    <body>
        <div class="sticky-player">
            <audio id="player" controls>
                <source src="{audio_src}" type="{audio_type}">
                Your browser does not support the audio element.
            </audio>
        </div>
        <div class="bubble-container">
    """

    # Add bubbles for each speech segment
    for bubble in speech_bubbles:
        speaker_class = "speaker-0" if bubble["speaker"] == "SPEAKER_00" else "speaker-1"
        text = render_text(bubble)
        html_content += f"""
        <div class="bubble {speaker_class}">
            <div class="text">{text}</div>
            <div class="timestamp">[{bubble["start"]:.2f} - {bubble["end"]:.2f}]</div>
        </div>
        """

    # Close the HTML structure
    html_content += """
        </div>
        <script src="https://cdn.plyr.io/3.7.8/plyr.polyfilled.js"></script>
        <script>
            const player = new Plyr('#player', {
                controls: ['play', 'progress', 'current-time', 'duration', 'mute', 'volume']
            });
        </script>
    </body>
    </html>
    """

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(html_content)

    print(f"HTML file with media player has been generated: {output_filename}")


In [None]:
# Build bubbles that carry the shared word sequence, then render the preview
# with those words highlighted.
bubbles = create_speech_bubbles_t(transcription=test)
generate_html_with_media_player_t(speech_bubbles=bubbles, audio_file_url=file_path)

In [None]:
# Inspect the bubbles produced by the previous cell.
bubbles

# Test Services

In [None]:
import requests

# Endpoint of the transcription service exercised by the next cell.
url = "http://127.0.0.1:8000/transcribe_audio_bulk"

# Recording used for the service checks below; the commented-out values are an
# alternative recording and location.
file_name = 'mmvb3.wav'
#'2023-09-15_15-35-23_Resources_89688627131_TRG4D987PP3P3AE4SA59SAUVVK14OK43_pcmu.wav'
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = f'C:/Users/Alex/whisper_asr_implementation/Drafts/{file_name}'#f'E:/–ó–∞–ø–∏—Å–∏/–ú–∏–≥–∫—Ä–µ–¥–∏—Ç/1/{file_name}'


In [None]:
# Upload the recording to the transcription service.
payload = {}
headers = {}

# The context manager closes the upload handle once the request has finished.
with open(file_path, 'rb') as audio_file:
    files = [
        ('files', (file_name, audio_file, 'audio/wav'))
    ]
    transcription = requests.request("POST", url, headers=headers, data=payload, files=files)

# Only report success when the service actually answered with a 2xx status.
transcription.raise_for_status()
print('ASR -- OK')

In [None]:
# Show the decoded JSON body returned by the transcription service.
transcription.json()

In [None]:
# Upload the same recording to the diarization service.
url = "http://127.0.0.1:8001/diarize_audio_bulk"
# NOTE(review): a None value appears to be omitted from the form data by
# requests, leaving the speaker count to the service — confirm.
payload = {'num_speakers': None}
headers = {}

# The context manager closes the upload handle once the request has finished.
with open(file_path, 'rb') as audio_file:
    files = [
        ('files', (file_name, audio_file, 'audio/wav'))
    ]
    segments = requests.request("POST", url, headers=headers, data=payload, files=files)

# Only report success when the service actually answered with a 2xx status.
segments.raise_for_status()
print('DIARIZATION -- OK')


In [None]:
# Pick this recording's results out of the two bulk responses.
segments2 = segments.json()[file_name]['diarization']
transcription2 = transcription.json()[file_name]

# Send both to the alignment service, which returns the speech bubbles.
url = "http://127.0.0.1:8002/process-transcription"

process_payload = {
    "transcription": transcription2,
    "diarization": segments2
}

bubbles = requests.request("POST", url, json=process_payload)
# Only report success when the service actually answered with a 2xx status.
bubbles.raise_for_status()
print('ALLIGNMENT -- OK')

In [None]:
# Ask the service to render the bubbles as an HTML preview and save it locally.
url = "http://127.0.0.1:8002/generate-html"

bubbles_input2 = bubbles.json()['speech_bubbles']
query_params = {"audio_file_url": file_path}


html = requests.request("POST", url, params=query_params, json=bubbles_input2)
# Only report success when the service actually answered with a 2xx status.
html.raise_for_status()
print('HTML PREVIEW -- OK')

html_content = html.json()['html']

with open('./test_preview.html', "w", encoding="utf-8") as file:
    file.write(html_content)

In [None]:
# Four short string literals used for the sorting experiment in the next cells.
# NOTE(review): the literals look mis-encoded in this view (possibly emoji) —
# confirm the notebook's encoding.
races = ['üßëüèæ','üßëüèΩ','üßëüèª','üßëüèø']

In [None]:
# Sort the list in place (default string ordering); re-running the earlier
# assignment cell restores the original order.
races.sort()

In [None]:
# Show the list after sorting.
races