In [1]:
# Speech Diarization with Whisper (HF) + pyannote
import torch
import torchaudio
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from pyannote.audio import Pipeline
import os
import IPython.display as ipd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----------- 1. Load Local WAV File --------------
AUDIO_FILE = "bengali-convo-2.wav"  # replace with your file

# Preview audio
ipd.Audio(AUDIO_FILE)

# Load audio
waveform, sr = torchaudio.load(AUDIO_FILE)

waveform = waveform[0].numpy()  # mono
resampled = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
sr = 16000  # Whisper expects 16kHz

In [3]:
# -------------------Set model properties-----------------------------------
model_name_or_path = "openai/whisper-small"
language = "bengali"
task = "transcribe"

In [4]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path,language=language,task=task)

In [5]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

In [6]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path,language=language,task=task)

In [7]:
#---------------------------------Load LORA model from Hugging Face Hub-----------------------------------

from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

config = LoraConfig(r=32, lora_alpha=64, target_modules=["k_proj", "v_proj", "q_proj", "out_proj"], lora_dropout=0.05, bias="none")


# Load base model
base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

peft_model = get_peft_model(base_model, config)

# Load LoRA adapter
fine_tuned_model = PeftModel.from_pretrained(peft_model, "imonghose/whisper-small-bengali-lora-final")
# final_loaded_model = PeftModel.from_pretrained(peft_model, "trained_model")

# ‚úÖ Move model to GPU
fine_tuned_model = fine_tuned_model.to("cuda")




In [None]:
# ----------- 2. Load Whisper model from Hugging Face --------------
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)
model = fine_tuned_model

# Prepare input
inputs = processor(resampled, sampling_rate=sr, return_tensors="pt").input_features.to(device)

# Generate token ids
with torch.no_grad():
    op = model.generate(inputs, language='bengali', task='transcribe')
transcription = tokenizer.batch_decode(op, skip_special_tokens=True)[0]

print("Full Transcription:")
print(transcription)

In [None]:
#-----------------------Manually provide correct transcription for testing diarization-------------------
transcription = '‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßã ‡¶¨‡¶æ‡¶ú‡¶æ‡¶∞‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶ï‡¶≤‡¶æ ‡¶è‡¶®‡ßã ‡¶†‡¶ø‡¶ï ‡¶Ü‡¶õ‡ßá, ‡¶®‡¶ø‡¶Ø‡¶º‡ßá ‡¶Ü‡¶∏‡¶¨‡ßã‡•§'
# transcription = '‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ï‡ßá‡¶Æ‡¶® ‡¶Ü‡¶õ‡ßã ‡¶Ü‡¶ú ‡¶ï‡¶ø ‡¶ï‡¶∞‡¶õ‡ßã ‡¶≠‡¶æ‡¶≤‡ßã ‡¶Ü‡¶õ‡¶ø ‡¶¨‡¶á ‡¶™‡¶°‡¶º‡¶õ‡¶ø ‡¶ï‡ßÄ ‡¶¨‡¶á ‡¶™‡¶°‡¶º‡¶õ‡ßã ‡¶ó‡¶≤‡ßç‡¶™ ‡¶®‡¶æ‡¶ï‡¶ø ‡¶â‡¶™‡¶®‡ßç‡¶Ø‡¶æ‡¶∏ ‡¶â‡¶™‡¶®‡ßç‡¶Ø‡¶æ‡¶∏ ‡¶ñ‡ßÅ‡¶¨‡¶á ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶≤‡¶æ‡¶ó‡¶õ‡ßá'

In [None]:
# ----------- 3. Diarization using pyannote.audio --------------

from pyannote.audio import Pipeline

hf_token = "your_huggingface_token_here"

diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=hf_token
)

# Perform diarization
diarization_result = diarization_pipeline(AUDIO_FILE,num_speakers=2)


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\imong\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


In [15]:
speaker_segments = list(diarization_result.itertracks(yield_label=True))
speaker_segments

[(<Segment(1.78597, 3.10222)>, 'A', 'SPEAKER_00'),
 (<Segment(3.27097, 4.53659)>, 'B', 'SPEAKER_01'),
 (<Segment(4.53659, 7.91159)>, 'C', 'SPEAKER_00'),
 (<Segment(7.91159, 8.90722)>, 'D', 'SPEAKER_01')]

In [16]:
import numpy as np
from pyannote.core import Segment
import librosa

# Get total duration of the audio
duration = librosa.get_duration(filename=AUDIO_FILE)

# Split words and estimate per-word duration
words = transcription.split()
total_words = len(words)
avg_word_duration = duration / total_words

# Diarization segments
speaker_segments = list(diarization_result.itertracks(yield_label=True))

# Make sure we have diarization segments
if not speaker_segments:
    raise ValueError("No speaker segments found in diarization result.")

# Get start of first speaker segment
first_speaker_start = min([segment[0].start for segment in speaker_segments])

# Build timestamped word list (shifted to match first diarization start)
word_times = []
for i, word in enumerate(words):
    start_time = first_speaker_start + i * avg_word_duration
    end_time = first_speaker_start + (i + 1) * avg_word_duration
    word_times.append((word, start_time, end_time))

# Align each word to the correct speaker based on diarization
speaker_words = []
for word, start, end in word_times:
    for (seg_start, seg_end), _, speaker in speaker_segments:
        if seg_start <= start < seg_end:
            speaker_words.append((speaker, word))
            break

# Group consecutive words by speaker
final_output = []
if speaker_words:
    current_speaker, current_words = speaker_words[0][0], []
    for speaker, word in speaker_words:
        if speaker == current_speaker:
            current_words.append(word)
        else:
            final_output.append((current_speaker, " ".join(current_words)))
            current_speaker = speaker
            current_words = [word]
    final_output.append((current_speaker, " ".join(current_words)))  # last group


	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=AUDIO_FILE)


In [17]:
# ----------- 5. Display Final Result --------------

print("\nüó£Ô∏è Final Diarized Transcript:\n")
for speaker, text in final_output:
    print(f"{speaker} : \"{text}\"")



üó£Ô∏è Final Diarized Transcript:

SPEAKER_00 : "‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßã"
SPEAKER_01 : "‡¶¨‡¶æ‡¶ú‡¶æ‡¶∞‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡¶ø"
SPEAKER_00 : "‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶ï‡¶≤‡¶æ ‡¶è‡¶®‡ßã"
SPEAKER_01 : "‡¶†‡¶ø‡¶ï"
