In [7]:
import whisperx
import gc 
import os
import time
# --- Environment -------------------------------------------------------------
# Project directory. The working directory is switched to it before
# load_dotenv() runs (presumably so the project's .env file is picked up --
# TODO confirm where the .env actually lives).
script_dir = r"C:\Users\User\Desktop\Projects\Lucid"
os.chdir(script_dir)
from dotenv import load_dotenv
load_dotenv()

# --- Configuration -----------------------------------------------------------
device = "cuda"
# Built from script_dir so the project location is spelled out only once.
audio_file = os.path.join(script_dir, "Test_Audio.wav")
batch_size = 2  # reduce if low on GPU mem
# "int8" is already the low-memory choice (may reduce accuracy); the whisperx
# README uses "float16" when GPU memory allows.
compute_type = "int8"

# --- Model loading (happens before the timer starts, so it is not timed) -----
model = whisperx.load_model("Systran/faster-distil-whisper-medium.en", device, compute_type=compute_type)
model_a, metadata = whisperx.load_align_model(language_code="en", device=device)
# The Hugging Face token is read from the HUGGINGFACE_READ_KEY environment variable.
diarize_model = whisperx.DiarizationPipeline('pyannote/speaker-diarization-3.1', use_auth_token=os.getenv("HUGGINGFACE_READ_KEY"), device=device)
# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

# 1. Transcribe with whisper (batched) -- the timer starts here
start_time = time.time()
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment
print("--- %s seconds ---" % (time.time() - start_time))  # elapsed after transcription

# delete model if low on GPU resources (note: torch is not imported in this
# cell, and the second test cell reuses `model`)
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
print("Start of Alignment Results\n=====")
# Join the aligned segment texts into a single transcript line.
text_result = " ".join(segment["text"] for segment in result["segments"])
print(text_result.strip())
print("=====\nEnd of Alignment Results")
print("--- %s seconds ---" % (time.time() - start_time))  # elapsed after alignment
# delete model if low on GPU resources (the second test cell reuses `model_a`)
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
# add min/max number of speakers if known:
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
diarize_segments = diarize_model(audio)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs
print("--- %s seconds ---" % (time.time() - start_time))  # elapsed after diarization





Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.2+cu121. Bad things might happen unless you revert torch to 1.x.
[{'text': ' Hello, Lucid. Can you hear me?', 'start': 0.589, 'end': 2.688}]
--- 0.2495284080505371 seconds ---
Start of Alignment Results
=====
Hello, Lucid. Can you hear me?
=====
End of Alignment Results
--- 0.28153228759765625 seconds ---
                             segment label     speaker     start       end  \
0  [ 00:00:00.466 -->  00:00:02.606]     A  SPEAKER_00  0.466893  2.606112   

   intersection     union  
0         0.041  2.139219  
[{'start': 0.609, 'end': 1.457, 'text': ' Hello, Lucid.', 'words': [{'word': 'Hello,', 'start': 0.609, 'end': 0.952, 'score': 0.77, 'speaker': 'SPEAKER_00'}, {'word': 'Lucid.', 'start': 1.033, 'end': 1.457, 'score': 0.912, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 1.679, 'end'

In [None]:
# Second test: run the same transcribe -> align -> diarize pipeline on another
# recording. This cell relies on names created by the first cell: whisperx,
# time, model, model_a, metadata, diarize_model, device and batch_size.
print("=====\n\nSecond Test\n\n=====")

# 1. Transcribe (batched) -- the timer starts here
start_time = time.time()
audio = whisperx.load_audio(r"C:\Users\User\Desktop\Projects\Lucid\Second_Audio_Test.wav")
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment
print("--- %s seconds ---" % (time.time() - start_time))  # elapsed after transcription

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
print("After Alignment")
# `result` is a dict, so iterating it directly would only print its keys;
# iterate the aligned segments themselves instead.
for segment in result["segments"]:
    print(segment)
print("End of Alignment Results")
print("--- %s seconds ---" % (time.time() - start_time))  # elapsed after alignment
# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
# add min/max number of speakers if known:
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
diarize_segments = diarize_model(audio)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs
print("--- %s seconds ---" % (time.time() - start_time))  # elapsed after diarization

In [None]:
# NOTE(review): this looks like the aligned segments for the first test clip,
# pasted from an earlier run -- the entries carry word timings and scores but
# no 'speaker' keys, so presumably captured before speaker assignment; confirm.
# As a code cell it only evaluates and displays the literal.
[{'start': 0.609, 'end': 1.457, 'text': ' Hello, Lucid.', 'words': [{'word': 'Hello,', 'start': 0.609, 'end': 0.952, 'score': 0.77}, {'word': 'Lucid.', 'start': 1.033, 'end': 1.457, 'score': 0.912}]},
 {'start': 1.679, 'end': 2.325, 'text': 'Can you hear me?', 'words': [{'word': 'Can', 'start': 1.679, 'end': 1.881, 'score': 0.897}, {'word': 'you', 'start': 1.901, 'end': 2.022, 'score': 0.708}, {'word': 'hear', 'start': 2.062, 'end': 2.264, 'score': 0.905}, {'word': 'me?', 'start': 2.284, 'end': 2.325, 'score': 0.0}]}]