In [38]:
import requests
import json

# Load configuration from JSON file
with open('credentials.json', 'r') as config_file:
    config = json.load(config_file)

read_api_key = config['read_api']['api_token']
write_api_key =  config['write_api']['api_token']

import whisperx
import gc

device = "cpu"
audio_file = "F&H/f&h_english.wav"
batch_size = 4 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model_base = whisperx.load_model("base", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model_large = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result1 = model_base.transcribe(audio, batch_size=batch_size)
print(result1["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result1["language"], device=device)
result2 = whisperx.align(result1["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result2["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result3 = whisperx.assign_word_speakers(diarize_segments, result2)
print(diarize_segments)
print(result3["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.39) in first 30s of audio...
[{'text': " Hello, how are you? I'm fine. How about you? I'm good. Let's talk about Fiverr success. I'm recently created a count on Fiverr, but I have no success and can't get any order from there. So can you please tell me some guidance or some tips from how then I can succeed from there?", 'start': 0.896, 'end': 22.585}, {'text': " Okay, before proceeding further, I want to ask you, what niche are you working in? Currently, I am providing my services on data sciences and artificial intelligence. Wow, that's good. Okay, so you are not getting impressions or orders? Both any pressures, I 

In [39]:
result1["language"]

'en'

In [6]:
from pydub import AudioSegment

# Load the WAV file
audio = AudioSegment.from_wav("E:\\Audios\\Nawal.wav")

# Export the audio to WAV format again
audio.export("E:\\Audios\\Nawal_fixed.wav", format="wav")

CouldntDecodeError: Decoding failed. ffmpeg returned error code: 3199971767

Output from ffmpeg/avlib:

ffmpeg version 6.1.1-essentials_build-www.gyan.dev Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12.2.0 (Rev10, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --pkg-config=pkgconf --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-zlib --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-sdl2 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-libaom --enable-libopenjpeg --enable-libvpx --enable-mediafoundation --enable-libass --enable-libfreetype --enable-libfribidi --enable-libharfbuzz --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-ffnvcodec --enable-nvdec --enable-nvenc --enable-dxva2 --enable-d3d11va --enable-libvpl --enable-libgme --enable-libopenmpt --enable-libopencore-amrwb --enable-libmp3lame --enable-libtheora --enable-libvo-amrwbenc --enable-libgsm --enable-libopencore-amrnb --enable-libopus --enable-libspeex --enable-libvorbis --enable-librubberband
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 60. 16.100
  libavdevice    60.  3.100 / 60.  3.100
  libavfilter     9. 12.100 /  9. 12.100
  libswscale      7.  5.100 /  7.  5.100
  libswresample   4. 12.100 /  4. 12.100
  libpostproc    57.  3.100 / 57.  3.100
[wav @ 00000202cc9dbc40] invalid start code [0][0][0][28] in RIFF header
[in#0 @ 00000202cc9dbac0] Error opening input: Invalid data found when processing input
Error opening input file E:\Audios\Nawal.wav.
Error opening input files: Invalid data found when processing input


In [8]:
from openai import OpenAI

client = OpenAI()

audio_file = open(r"E:\\Audios\\urdu audio.mp3", "rb")
transcript2 = client.audio.transcriptions.create(
  file=audio_file,
  model="whisper-1",
  response_format="verbose_json",
  timestamp_granularities=["word","segment"],
  language='ur'
)

print(transcript2)

Transcription(text='السلام علیکم میں حبیبینک کا نمائندہ ہوں ہم آپ کو کارڈ لینے کی تجویز کریں گے کیا آپ انڈسٹر ہیں؟ وعلیکم سلام جی ضرور پلیس بتائیے آپ کے پاس کون سے کریڈ کارڈز اویلیبل ہیں؟ ہمارے کریڈ کارڈ میں کافی آسانیاں ہیں رقم نکروانے سے لے کر اور بھی بہت سی ایسی خاصوصیت ہیں جو دوسرے بینک کے کارڈ سے ہمیں الگ بناتی ہیں اچھا یہ تو بہت اچھی بات ہے پلیس کیا آپ ڈیٹیئر شیئر کر سکتے ہیں تاکہ میں اس کو دیکھ سکوں؟ ٹھیک ہے آپ ہمارے ویب سائٹ پر فارم فل کر دیں اس کے بعد ساری ڈیٹیئر آپ کو مل جائیں گے بہت شکریہ تو کیا میں آپ کو پرسنلی میسج کر سکتا ہوں؟ نہیں بالکل نہیں آپ ہمارے ویب سائٹ پر جا کے فارم فل کریں ٹھیک ہے میں ویب سائٹ پر جا کے کرتا ہوں تینکیو', task='transcribe', language='urdu', duration=41.130001068115234, segments=[{'id': 0, 'seek': 0, 'start': 1.059999942779541, 'end': 7.960000038146973, 'text': ' السلام علیکم میں حبیبینک کا نمائندہ ہوں ہم آپ کو کارڈ لینے کی تجویز کریں گے کیا آپ انڈسٹر ہیں؟', 'tokens': [50364, 21136, 37440, 11203, 29325, 2304, 27875, 11331, 3555, 4135, 3555, 32151, 6

In [10]:
print(transcript2.language)
print(transcript2.segments)
print(transcript2.text)

urdu
[{'id': 0, 'seek': 0, 'start': 1.2400000095367432, 'end': 4.679999828338623, 'text': ' Hi Sir, How are you?', 'tokens': [50364, 2421, 6144, 11, 1012, 366, 291, 30, 50514], 'temperature': 0.0, 'avg_logprob': -0.3140413165092468, 'compression_ratio': 1.4324324131011963, 'no_speech_prob': 0.2907930910587311}, {'id': 1, 'seek': 0, 'start': 4.940000057220459, 'end': 8.979999542236328, 'text': ' Hello, May I know who is speaking?', 'tokens': [50514, 2425, 11, 1891, 286, 458, 567, 307, 4124, 30, 50714], 'temperature': 0.0, 'avg_logprob': -0.3140413165092468, 'compression_ratio': 1.4324324131011963, 'no_speech_prob': 0.2907930910587311}, {'id': 2, 'seek': 0, 'start': 8.979999542236328, 'end': 12.180000305175781, 'text': ' I am speaking from the youth bank.', 'tokens': [50714, 286, 669, 4124, 490, 264, 7503, 3765, 13, 50864], 'temperature': 0.0, 'avg_logprob': -0.3140413165092468, 'compression_ratio': 1.4324324131011963, 'no_speech_prob': 0.2907930910587311}, {'id': 3, 'seek': 0, 'start': 

In [4]:
transcript2.segments[0]

{'id': 0,
 'seek': 0,
 'start': 0.0,
 'end': 2.0,
 'text': ' I want to know why Loki let us take him.',
 'tokens': [50364,
  286,
  528,
  281,
  458,
  983,
  37940,
  718,
  505,
  747,
  796,
  13,
  50464],
 'temperature': 0.0,
 'avg_logprob': -0.23701393604278564,
 'compression_ratio': 1.4493391513824463,
 'no_speech_prob': 0.1352676898241043}

In [5]:
for i in range(len(transcript2.segments)):
    text = transcript2.segments[i]['text']
    start = transcript2.segments[i]['start']
    end = transcript2.segments[i]['end']
    combine = text, "(" , start, "-" , end, ")"
    print(combine)

(' I want to know why Loki let us take him.', '(', 0.0, '-', 2.0, ')')
(" He's not leading an army from here.", '(', 2.0, '-', 3.5, ')')
(" I don't think we should be focusing on Loki.", '(', 3.5, '-', 5.0, ')')
(" That guy's brain is a bag full of cats.", '(', 5.0, '-', 7.0, ')')
(' You could smell crazy on him.', '(', 7.0, '-', 8.5, ')')
(" I've care how you speak.", '(', 8.5, '-', 11.0, ')')
(' Loki is beyond reason, but he is of Asgard.', '(', 11.0, '-', 13.5, ')')
(' And he is my brother.', '(', 13.5, '-', 15.0, ')')
(' He killed 80 people in two days.', '(', 15.0, '-', 17.5, ')')
(" He's adopted?", '(', 17.5, '-', 19.0, ')')


In [13]:
for i in range(len(transcript2.segments)):
    data = transcript2.segments[0]

print(data)
print("----------")
print(len(data))

{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.0, 'text': ' I want to know why Loki let us take him.', 'tokens': [50364, 286, 528, 281, 458, 983, 37940, 718, 505, 747, 796, 13, 50464], 'temperature': 0.0, 'avg_logprob': -0.23701393604278564, 'compression_ratio': 1.4493391513824463, 'no_speech_prob': 0.1352676898241043}
----------
10


In [32]:
filtered_segments = []

for i in range(len(transcript2.segments)):
    data = transcript2.segments[i]  # Access the current segment within the loop

    # Extract only 'text', 'start', and 'end'
    filtered_data = {'text': data['text'], 'start': data['start'], 'end': data['end']}
    
    # Append the filtered data to the list
    filtered_segments.append(filtered_data)

# Print the filtered segments
for segment in filtered_segments:
    print(segment)


{'text': ' I want to know why Loki let us take him.', 'start': 0.0, 'end': 2.0}
{'text': " He's not leading an army from here.", 'start': 2.0, 'end': 3.5}
{'text': " I don't think we should be focusing on Loki.", 'start': 3.5, 'end': 5.0}
{'text': " That guy's brain is a bag full of cats.", 'start': 5.0, 'end': 7.0}
{'text': ' You could smell crazy on him.', 'start': 7.0, 'end': 8.5}
{'text': " I've care how you speak.", 'start': 8.5, 'end': 11.0}
{'text': ' Loki is beyond reason, but he is of Asgard.', 'start': 11.0, 'end': 13.5}
{'text': ' And he is my brother.', 'start': 13.5, 'end': 15.0}
{'text': ' He killed 80 people in two days.', 'start': 15.0, 'end': 17.5}
{'text': " He's adopted?", 'start': 17.5, 'end': 19.0}


In [34]:
filtered_segments

[{'text': ' I want to know why Loki let us take him.',
  'start': 0.0,
  'end': 2.0},
 {'text': " He's not leading an army from here.", 'start': 2.0, 'end': 3.5},
 {'text': " I don't think we should be focusing on Loki.",
  'start': 3.5,
  'end': 5.0},
 {'text': " That guy's brain is a bag full of cats.",
  'start': 5.0,
  'end': 7.0},
 {'text': ' You could smell crazy on him.', 'start': 7.0, 'end': 8.5},
 {'text': " I've care how you speak.", 'start': 8.5, 'end': 11.0},
 {'text': ' Loki is beyond reason, but he is of Asgard.',
  'start': 11.0,
  'end': 13.5},
 {'text': ' And he is my brother.', 'start': 13.5, 'end': 15.0},
 {'text': ' He killed 80 people in two days.', 'start': 15.0, 'end': 17.5},
 {'text': " He's adopted?", 'start': 17.5, 'end': 19.0}]

In [26]:
for i in filtered_data:
    print(i)

text
start
end


In [17]:
for i in range(len(transcript2.segments)):
    data = transcript2.segments[i]  # Access the current segment within the loop
# Extract required data
    text = data['text']
    start = data['start']
    end = data['end']
    
    # Format and print the segment information
    print(f"{text} ({start}-{end})")

 I want to know why Loki let us take him. (0.0-2.0)
 He's not leading an army from here. (2.0-3.5)
 I don't think we should be focusing on Loki. (3.5-5.0)
 That guy's brain is a bag full of cats. (5.0-7.0)
 You could smell crazy on him. (7.0-8.5)
 I've care how you speak. (8.5-11.0)
 Loki is beyond reason, but he is of Asgard. (11.0-13.5)
 And he is my brother. (13.5-15.0)
 He killed 80 people in two days. (15.0-17.5)
 He's adopted? (17.5-19.0)


In [18]:
formatted_segments = []

for i in range(len(transcript2.segments)):
    data = transcript2.segments[i]  # Access the current segment within the loop
    
    # Extract required data
    text = data['text']
    start = data['start']
    end = data['end']
    
    # Format the segment information
    formatted_segment = f"{text} ({start}-{end})"
    
    # Append the formatted string to the list
    formatted_segments.append(formatted_segment)

# Print the formatted segments
for segment in formatted_segments:
    print(segment)

 I want to know why Loki let us take him. (0.0-2.0)
 He's not leading an army from here. (2.0-3.5)
 I don't think we should be focusing on Loki. (3.5-5.0)
 That guy's brain is a bag full of cats. (5.0-7.0)
 You could smell crazy on him. (7.0-8.5)
 I've care how you speak. (8.5-11.0)
 Loki is beyond reason, but he is of Asgard. (11.0-13.5)
 And he is my brother. (13.5-15.0)
 He killed 80 people in two days. (15.0-17.5)
 He's adopted? (17.5-19.0)


In [19]:
formatted_segment

" He's adopted? (17.5-19.0)"

In [31]:
# Keys to exclude
exclude_keys = ['id', 'seek', 'tokens', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob']

# Filter out unwanted keys
filtered_data = {key: value for key, value in filtered_segments.items() if key not in exclude_keys}
print(filtered_data)

AttributeError: 'list' object has no attribute 'items'

In [23]:
# Print the formatted segments
for i in filtered_data:
    print(i)

start
end
text


In [1]:
data = {
    'id': 0,
    'seek': 0,
    'start': 0.0,
    'end': 2.0,
    'text': ' I want to know why Loki let us take him.',
    'tokens': [50364, 286, 528, 281, 458, 983, 37940, 718, 505, 747, 796, 13, 50464],
    'temperature': 0.0,
    'avg_logprob': -0.23701393604278564,
    'compression_ratio': 1.4493391513824463,
    'no_speech_prob': 0.1352676898241043
}

# Keys to exclude
exclude_keys = ['id', 'seek', 'tokens', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob']

# Filter out unwanted keys
filtered_data = {key: value for key, value in data.items() if key not in exclude_keys}

print(filtered_data)

{'start': 0.0, 'end': 2.0, 'text': ' I want to know why Loki let us take him.'}


In [26]:
def extract_data(json_data):
    for segment in json_data:
       output = print(segment['speaker'] + ": " + segment['text'])
    return output

extract_data(result['segments'])

SPEAKER_00:  यह लाजा तो मेरे ले ले ले ले ले ले ले ले ले ले ले ले ले


In [2]:
# 1. Transcribe with original whisper (batched)
model_base = whisperx.load_model("base", device, compute_type=compute_type)

audio = whisperx.load_audio("hassan/two.wav")
result = model_base.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.99) in first 30s of audio...
[{'text': " Give yourself some credit. Please. Stark Tower is your baby. Give yourself 12% credit. 12%? An argument can be made for 15. 12%? Well, I did do all the heavy lifting. Literally, I lifted the heavy things. And sorry, but the security snafu, that was on you. Oh. My part of the elevator. You mean our elevator? It was teaming with sweaty workmen. I'm going to pay for that comment about percentages in some sort of way later, aren't I?", 'start': 0.009, 'end': 29.462}, {'text': ' Not gonna be that subtle.', 'start': 30.196, 'end': 31.476}]
[{'start': 0.569, 'end': 1.39, 'text': ' Gi

In [16]:
print(result['segments'][0]['speaker'] + ": " + result['segments'][0]['text'])
print(result['segments'][1]['speaker'] + ": " + result['segments'][1]['text'])
print(result['segments'][2]['speaker'] + ": " + result['segments'][2]['text'])
print(result['segments'][3]['speaker'] + ": " + result['segments'][3]['text'])
print(result['segments'][4]['text'])
print(result['segments'][5]['text'])
print(result['segments'][6]['text'])
print(result['segments'][7]['text'])
print(result['segments'][8]['text'])
print(result['segments'][9]['text'])
print(result['segments'][10]['text'])
print(result['segments'][11]['text'])
print(result['segments'][12]['text'])
print(result['segments'][13]['text'])
print(result['segments'][14]['text'])
print(result['segments'][15]['text'])

SPEAKER_00:  Give yourself some credit.
SPEAKER_00: Please.
SPEAKER_00: Stark Tower is your baby.
SPEAKER_00: Give yourself 12% credit.
12%?
An argument can be made for 15.
12%?
Well, I did do all the heavy lifting.
Literally, I lifted the heavy things.
And sorry, but the security snafu, that was on you.
Oh.
My part of the elevator.
You mean our elevator?
It was teaming with sweaty workmen.
I'm going to pay for that comment about percentages in some sort of way later, aren't I?
 Not gonna be that subtle.


In [13]:
def extract_data(json_data):
    for segment in json_data:
       output = print(segment['text'])
    return output

extract_data(result['segments'])

 Give yourself some credit.
Please.
Stark Tower is your baby.
Give yourself 12% credit.
12%?
An argument can be made for 15.
12%?
Well, I did do all the heavy lifting.
Literally, I lifted the heavy things.
And sorry, but the security snafu, that was on you.
Oh.
My part of the elevator.
You mean our elevator?
It was teaming with sweaty workmen.
I'm going to pay for that comment about percentages in some sort of way later, aren't I?
 Not gonna be that subtle.


In [17]:
# 1. Transcribe with original whisper (batched)
model_base = whisperx.load_model("base", device, compute_type=compute_type)

audio = whisperx.load_audio("hassan/three.wav")
result = model_base.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.98) in first 30s of audio...
[{'text': " How do we know we can trust him? He said he knows the form. He's a beaver. He shouldn't be saying anything.", 'start': 0.009, 'end': 9.241}]
[{'start': 0.67, 'end': 2.192, 'text': ' How do we know we can trust him?', 'words': [{'word': 'How', 'start': 0.67, 'end': 0.81, 'score': 0.976}, {'word': 'do', 'start': 0.85, 'end': 0.99, 'score': 0.911}, {'word': 'we', 'start': 1.01, 'end': 1.09, 'score': 0.996}, {'word': 'know', 'start': 1.13, 'end': 1.291, 'score': 0.848}, {'word': 'we', 'start': 1.331, 'end': 1.431, 'score': 0.912}, {'word': 'can', 'start': 1.451, 'end': 1.591, 'sco

In [19]:
def extract_data(json_data):
    for segment in json_data:
       output = print(segment['speaker'] + ": " + segment['text'])
    return output

extract_data(result['segments'])

SPEAKER_01:  How do we know we can trust him?
SPEAKER_01: He said he knows the form.
SPEAKER_01: He's a beaver.
SPEAKER_01: He shouldn't be saying anything.


In [20]:
# 1. Transcribe with original whisper (batched)
model_base = whisperx.load_model("base", device, compute_type=compute_type)

audio = whisperx.load_audio("hassan/four.wav")
result = model_base.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.97) in first 30s of audio...
[{'text': " I want to know why Loki let us take him. He's not leading an army from here. I don't think we should be focusing on Loki. That guy's brain is a bag full of cats. He gets smell crazy on him. I've cared how you speak. Loki is beyond reason, but he is a vows god, and he is my brother. He killed 80 people in two days. He's adopted?", 'start': 0.009, 'end': 18.507}]
[{'start': 0.089, 'end': 1.591, 'text': ' I want to know why Loki let us take him.', 'words': [{'word': 'I', 'start': 0.089, 'end': 0.149, 'score': 0.581}, {'word': 'want', 'start': 0.169, 'end': 0.309, 'score': 0.318},

In [21]:
def extract_data(json_data):
    for segment in json_data:
       output = print(segment['speaker'] + ": " + segment['text'])
    return output

extract_data(result['segments'])

SPEAKER_02:  I want to know why Loki let us take him.
SPEAKER_02: He's not leading an army from here.
SPEAKER_00: I don't think we should be focusing on Loki.
SPEAKER_00: That guy's brain is a bag full of cats.
SPEAKER_00: He gets smell crazy on him.
SPEAKER_02: I've cared how you speak.
SPEAKER_02: Loki is beyond reason, but he is a vows god, and he is my brother.
SPEAKER_01: He killed 80 people in two days.
SPEAKER_02: He's adopted?


In [22]:
for segment in result['segments']:
    print(segment['text'])

 I want to know why Loki let us take him.
He's not leading an army from here.
I don't think we should be focusing on Loki.
That guy's brain is a bag full of cats.
He gets smell crazy on him.
I've cared how you speak.
Loki is beyond reason, but he is a vows god, and he is my brother.
He killed 80 people in two days.
He's adopted?


In [57]:
audio = whisperx.load_audio("talking-people-6368.mp3")
transcribe_result = model_base.transcribe(audio, batch_size=batch_size)
print(transcribe_result["segments"])

Detected language: pl (0.95) in first 30s of audio...
[{'text': ' I co? Mają otwarte?', 'start': 6.493, 'end': 16.698}, {'text': ' Nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie,', 'start': 17.824, 'end': 45.998}, {'text': ' Dzień dobry, ale dźwięk zostanie.', 'start': 46.493, 'end': 68.848}]


In [59]:
print(transcribe_result["segments"][0]['text'])
print(transcribe_result["segments"][1]['text'])
print(transcribe_result["segments"][2]['text'])

 I co? Mają otwarte?
 Nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie,
 Dzień dobry, ale dźwięk zostanie.


In [64]:
# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=transcribe_result["language"], device=device)
aligned_result = whisperx.align(transcribe_result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(aligned_result["segments"])

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

[{'start': 11.616, 'end': 12.136, 'text': ' I co?', 'words': [{'word': 'I', 'start': 11.616, 'end': 11.716, 'score': 0.8}, {'word': 'co?', 'start': 12.056, 'end': 12.136, 'score': 0.395}]}, {'start': 12.176, 'end': 13.236, 'text': 'Mają otwarte?', 'words': [{'word': 'Mają', 'start': 12.176, 'end': 12.456, 'score': 0.956}, {'word': 'otwarte?', 'start': 12.476, 'end': 13.236, 'score': 0.654}]}, {'start': 18.284, 'end': 45.998, 'text': ' Nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie,',

In [70]:
print(aligned_result['segments'][0]['text'])
print(aligned_result['segments'][1]['text'])
print(aligned_result['segments'][2]['text'])
print(aligned_result['segments'][3]['text'])

 I co?
Mają otwarte?
 Nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie,
 Dzień dobry, ale dźwięk zostanie.


In [67]:
for segment in aligned_result['segments']:
    print(segment['text'])

 I co?
Mają otwarte?
 Nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie,
 Dzień dobry, ale dźwięk zostanie.


In [71]:
# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

assigned_word_result = whisperx.assign_word_speakers(diarize_segments, aligned_result)
print(diarize_segments)
print(assigned_word_result["segments"])

                              segment label     speaker      start        end  \
0   [ 00:00:00.008 -->  00:00:01.162]     A  SPEAKER_00   0.008489   1.162988   
1   [ 00:00:01.383 -->  00:00:03.998]     B  SPEAKER_01   1.383701   3.998302   
2   [ 00:00:01.383 -->  00:00:17.003]     C  SPEAKER_00   1.383701  17.003396   
3   [ 00:00:04.202 -->  00:00:04.303]     D  SPEAKER_01   4.202037   4.303905   
4   [ 00:00:05.831 -->  00:00:05.899]     E  SPEAKER_01   5.831919   5.899830   
5   [ 00:00:11.502 -->  00:00:14.083]     F  SPEAKER_01  11.502547  14.083192   
6   [ 00:00:17.325 -->  00:00:40.042]     G  SPEAKER_00  17.325976  40.042445   
7   [ 00:00:19.668 -->  00:00:20.976]     H  SPEAKER_01  19.668930  20.976231   
8   [ 00:00:21.825 -->  00:00:21.977]     I  SPEAKER_01  21.825127  21.977929   
9   [ 00:00:25.237 -->  00:00:25.645]     J  SPEAKER_01  25.237691  25.645161   
10  [ 00:00:26.358 -->  00:00:26.392]     K  SPEAKER_01  26.358234  26.392190   
11  [ 00:00:26.511 -->  00:0

In [72]:
extract_data(assigned_word_result['segments'])

SPEAKER_00:  I co?
SPEAKER_00: Mają otwarte?
SPEAKER_00:  Nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie, nie,
SPEAKER_00:  Dzień dobry, ale dźwięk zostanie.


### Large-v2

In [100]:
import requests
import json

# Load configuration from JSON file
with open('credentials.json', 'r') as config_file:
    config = json.load(config_file)

read_api_key = config['read_api']['api_token']
write_api_key =  config['write_api']['api_token']

import whisperx
import gc

device = "cpu"
audio_file = "talking-people-2-6400.mp3"
batch_size = 4 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model_base = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model_large = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model_base.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:12<00:00, 1.41MiB/s]
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: pl (0.68) in first 30s of audio...
[{'text': ' Zostawić się, uderzyć czasem i cześć. Zostawić się.', 'start': 0.009, 'end': 12.449}, {'text': ' Niech prosimy się spodziewać, jestem w ogóle, wiesz... Ja mam w ogóle flagi, flagi podwija, to wygląda jakbym nas wypraszała. Cześć! Cześć! Cześć! Cześć! Cześć! Cześć!', 'start': 13.063, 'end': 33.712}, {'text': ' Założyć nadzieję, że w pierwszym czasie, że dążym przyjmujemy i nie powiemy niczego.', 'start': 33.712, 'end': 55.35}, {'text': ' Można.', 'start': 55.35, 'end': 81.186}]


Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

[{'start': 0.009, 'end': 9.524, 'text': ' Zostawić się, uderzyć czasem i cześć.', 'words': [{'word': 'Zostawić', 'start': 0.009, 'end': 5.017, 'score': 0.818}, {'word': 'się,', 'start': 5.097, 'end': 5.658, 'score': 0.585}, {'word': 'uderzyć', 'start': 5.858, 'end': 7.802, 'score': 0.639}, {'word': 'czasem', 'start': 7.882, 'end': 8.623, 'score': 0.544}, {'word': 'i', 'start': 8.743, 'end': 8.823, 'score': 0.75}, {'word': 'cześć.', 'start': 8.863, 'end': 9.524, 'score': 0.352}]}, {'start': 9.584, 'end': 12.008, 'text': 'Zostawić się.', 'words': [{'word': 'Zostawić', 'start': 9.584, 'end': 10.927, 'score': 0.657}, {'word': 'się.', 'start': 11.127, 'end': 12.008, 'score': 0.547}]}, {'start': 13.123, 'end': 28.57, 'text': ' Niech prosimy się spodziewać, jestem w ogóle, wiesz... Ja mam w ogóle flagi, flagi podwija, to wygląda jakbym nas wypraszała.', 'words': [{'word': 'Niech', 'start': 13.123, 'end': 13.563, 'score': 0.453}, {'word': 'prosimy', 'start': 14.203, 'end': 14.604, 'score': 0.3

In [101]:
extract_data(result['segments'])

SPEAKER_00:  Zostawić się, uderzyć czasem i cześć.
SPEAKER_00: Zostawić się.
SPEAKER_00:  Niech prosimy się spodziewać, jestem w ogóle, wiesz... Ja mam w ogóle flagi, flagi podwija, to wygląda jakbym nas wypraszała.
SPEAKER_00: Cześć!
SPEAKER_00: Cześć!
SPEAKER_00: Cześć!
SPEAKER_00: Cześć!
SPEAKER_00: Cześć!
SPEAKER_00: Cześć!
SPEAKER_00:  Założyć nadzieję, że w pierwszym czasie, że dążym przyjmujemy i nie powiemy niczego.
SPEAKER_00:  Można.


In [91]:
dir(model_base)

['__abstractmethods__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_batch_size',
 '_ensure_tensor_on_device',
 '_forward',
 '_forward_params',
 '_num_workers',
 '_postprocess_params',
 '_preprocess_params',
 '_sanitize_parameters',
 '_vad_params',
 'call_count',
 'check_model_type',
 'default_input_names',
 'detect_language',
 'device',
 'device_placement',
 'ensure_tensor_on_device',
 'forward',
 'framework',
 'get_inference_context',
 'get_iterator',
 'iterate',
 'model',
 'options',
 'postprocess',
 'predict',
 'preprocess',
 'preset_language',
 'run_multi',
 'run_single',
 'save_pretrained',
 'suppress_numerals',
 'tok

In [103]:
print(model_base.check_model_type)

<bound method Pipeline.check_model_type of <whisperx.asr.FasterWhisperPipeline object at 0x0000025589859DE0>>


In [104]:
print(model_base.model.supported_languages)

['af', 'am', 'ar', 'as', 'az', 'ba', 'be', 'bg', 'bn', 'bo', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'gl', 'gu', 'ha', 'haw', 'he', 'hi', 'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jw', 'ka', 'kk', 'km', 'kn', 'ko', 'la', 'lb', 'ln', 'lo', 'lt', 'lv', 'mg', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'ne', 'nl', 'nn', 'no', 'oc', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'sa', 'sd', 'si', 'sk', 'sl', 'sn', 'so', 'sq', 'sr', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th', 'tk', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vi', 'yi', 'yo', 'zh', 'yue']


In [7]:
import whisperx
import gc

device = "cpu"
audio_file = "F&H/F&h_english.wav"
batch_size = 8 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

# save model to local path (optional)
model_dir = "C:\\Users\\User\\.cache\whisper\\"
model_large_off = whisperx.load_model("base", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
off_result = model_large_off.transcribe(audio, batch_size=batch_size)
print(off_result["segments"]) # before alignment

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.39) in first 30s of audio...
[{'text': " Hello, how are you? I'm fine. How about you? I'm good. Let's talk about Fiverr success. I'm recently created a count on Fiverr, but I have no success and can't get any order from there. So can you please tell me some guidance or some tips from how then I can succeed from there?", 'start': 0.896, 'end': 22.585}, {'text': " Okay, before proceeding further, I want to ask you, what niche are you working in? Currently, I am providing my services on data sciences and artificial intelligence. Wow, that's good. Okay, so you are not getting impressions or orders? Both any pressures, I 

In [4]:
for segment in off_result['segments']:
    print(segment['text'])

 Hello Farhan, how are you? I'm fine, how about you? I'm good. Let's talk about Fiverr success. I recently created an account on Fiverr but I have no success and can't get any order from there. So can you please tell me some guidance or some tips from how then I can succeed from there?
 Okay, before proceeding further, I want to ask you what niche are you working in? Currently, I am providing my services in data sciences and artificial intelligence. Wow, that's good. Okay, so you are not getting impressions or orders? Both. In impression, I also can't get more impression and the success rate is also not good.
 okay so i would advise you that you should go for sharing your fire profile in your whatsapp group facebook groups that is the first step in the in our next call i will tell you about the second tip that that is all for today is it feasible for me or or i can attempt some more techniques like no for now just focus on this technique okay and then i will tell you the more secrets
 

In [None]:
json_data = [{'start': 1.374, 'end': 11.759, 'text': ' السلام علیکم فرن، کیسے ہو؟ آپ خیریت ہو؟ وعلیکم السلام عماد جی، بالکل میں ٹھیک ہوں، آپ بتایا آپ کیسے ہیں؟ چکر لگا، यार मरको नے एक बाइक लेनी है, second hand bike यह हो, जो मनासिब rate में हो, तो इसके बारे में कोई आपका आइडिया है?', 'words': [{'word': 'السلام'}, {'word': 'علیکم'}, {'word': 'فرن،'}, {'word': 'کیسے'}, {'word': 'ہو؟'}, {'word': 'آپ'}, {'word': 'خیریت'}, {'word': 'ہو؟'}, {'word': 'وعلیکم'}, {'word': 'السلام'}, {'word': 'عماد'}, {'word': 'جی،'}, {'word': 'بالکل'}, {'word': 'میں'}, {'word': 'ٹھیک'}, {'word': 'ہوں،'}, {'word': 'آپ'}, {'word': 'بتایا'}, {'word': 'آپ'}, {'word': 'کیسے'}, {'word': 'ہیں؟'}, {'word': 'چکر'}, {'word': 'لگا،'}, {'word': 'यार', 'start': 6.517, 'end': 6.697, 'score': 0.552}, {'word': 'मरको', 'start': 6.797, 'end': 7.037, 'score': 0.431}, {'word': 'नے', 'start': 7.057, 'end': 7.077, 'score': 0.997}, {'word': 'एक', 'start': 7.097, 'end': 7.177, 'score': 0.001}, {'word': 'बाइक', 'start': 7.297, 'end': 7.477, 'score': 0.276}, {'word': 'लेनी', 'start': 7.517, 'end': 7.737, 'score': 0.495}, {'word': 'है,', 'start': 7.797, 'end': 7.937, 'score': 0.556}, {'word': 'second'}, {'word': 'hand'}, {'word': 'bike'}, {'word': 'यह', 'start': 8.418, 'end': 8.678, 'score': 0.014}, {'word': 'हो,', 'start': 8.698, 'end': 8.798, 'score': 0.006}, {'word': 'जो', 'start': 9.298, 'end': 9.578, 'score': 0.202}, {'word': 'मनासिब', 'start': 9.678, 'end': 9.998, 'score': 0.293}, {'word': 'rate'}, {'word': 'में', 'start': 10.038, 'end': 10.279, 'score': 0.056}, {'word': 'हो,', 'start': 10.299, 'end': 10.439, 'score': 0.1}, {'word': 'तो', 'start': 10.479, 'end': 10.519, 'score': 0.223}, {'word': 'इसके', 'start': 10.579, 'end': 10.819, 'score': 0.477}, {'word': 'बारे', 'start': 10.839, 'end': 10.999, 'score': 0.207}, {'word': 'में', 'start': 11.019, 'end': 11.139, 'score': 0.0}, {'word': 'कोई', 'start': 11.159, 'end': 11.219, 'score': 0.001}, {'word': 'आपका', 'start': 11.239, 'end': 11.359, 'score': 0.081}, {'word': 'आइडिया', 'start': 11.379, 'end': 11.619, 'score': 0.352}, {'word': 'है?', 'start': 11.659, 'end': 11.759, 'score': 0.62}]}, {'start': 12.34, 'end': 14.1, 'text': 'जी, बल्कुल आइडिया है, आप कौन सी बाइक लेना चाह', 'words': [{'word': 'जी,', 'start': 12.34, 'end': 12.42, 'score': 0.624}, {'word': 'बल्कुल', 'start': 12.48, 'end': 12.68, 'score': 0.302}, {'word': 'आइडिया', 'start': 12.7, 'end': 12.98, 'score': 0.357}, {'word': 'है,', 'start': 13.0, 'end': 13.04, 'score': 0.0}, {'word': 'आप', 'start': 13.08, 'end': 13.2, 'score': 0.003}, {'word': 'कौन', 'start': 13.3, 'end': 13.44, 'score': 0.177}, {'word': 'सी', 'start': 13.48, 'end': 13.54, 'score': 0.246}, {'word': 'बाइक', 'start': 13.58, 'end': 13.7, 'score': 0.244}, {'word': 'लेना', 'start': 13.76, 'end': 13.92, 'score': 0.287}, {'word': 'चाह', 'start': 14.02, 'end': 14.1, 'score': 0.347}]}, {'start': 43.78, 'end': 55.692, 'text': ' हाँ बस आप पता करें जो आपका जानने बादा है तो मैं उसको फिर देख लूँगा और इंशाला मैं ले लूँगा इरादा तो है मेरा लेने का आप उसको पता करें। ठीक है आप ऐसा करेंगा आप तला आ जाएगी है हमारे आफिस तो हम आपको बाईक दे लेंगे। ठ', 'words': [{'word': 'हाँ', 'start': 43.78, 'end': 44.0, 'score': 0.046}, {'word': 'बस', 'start': 44.04, 'end': 44.18, 'score': 0.435}, {'word': 'आप', 'start': 44.2, 'end': 44.341, 'score': 0.156}, {'word': 'पता', 'start': 44.481, 'end': 44.661, 'score': 0.385}, {'word': 'करें', 'start': 44.741, 'end': 45.462, 'score': 0.187}, {'word': 'जो', 'start': 45.482, 'end': 45.522, 'score': 0.0}, {'word': 'आपका', 'start': 45.562, 'end': 46.182, 'score': 0.0}, {'word': 'जानने', 'start': 46.603, 'end': 46.723, 'score': 0.09}, {'word': 'बादा', 'start': 46.783, 'end': 47.003, 'score': 0.163}, {'word': 'है', 'start': 47.063, 'end': 47.424, 'score': 0.0}, {'word': 'तो', 'start': 47.504, 'end': 47.564, 'score': 0.744}, {'word': 'मैं', 'start': 47.644, 'end': 48.004, 'score': 0.169}, {'word': 'उसको', 'start': 48.445, 'end': 48.825, 'score': 0.399}, {'word': 'फिर', 'start': 48.885, 'end': 48.985, 'score': 0.168}, {'word': 'देख', 'start': 49.145, 'end': 49.325, 'score': 0.466}, {'word': 'लूँगा', 'start': 49.345, 'end': 49.466, 'score': 0.463}, {'word': 'और', 'start': 49.506, 'end': 49.646, 'score': 0.334}, {'word': 'इंशाला', 'start': 49.846, 'end': 50.226, 'score': 0.344}, {'word': 'मैं', 'start': 50.266, 'end': 50.326, 'score': 0.332}, {'word': 'ले', 'start': 50.366, 'end': 50.487, 'score': 0.372}, {'word': 'लूँगा', 'start': 50.507, 'end': 50.747, 'score': 0.255}, {'word': 'इरादा', 'start': 51.067, 'end': 51.347, 'score': 0.312}, {'word': 'तो', 'start': 51.387, 'end': 51.427, 'score': 0.596}, {'word': 'है', 'start': 51.447, 'end': 51.508, 'score': 0.498}, {'word': 'मेरा', 'start': 51.548, 'end': 51.708, 'score': 0.448}, {'word': 'लेने', 'start': 51.748, 'end': 51.908, 'score': 0.575}, {'word': 'का', 'start': 51.948, 'end': 51.988, 'score': 0.974}, {'word': 'आप', 'start': 52.008, 'end': 52.048, 'score': 0.0}, {'word': 'उसको', 'start': 52.068, 'end': 52.148, 'score': 0.0}, {'word': 'पता', 'start': 52.168, 'end': 52.248, 'score': 0.024}, {'word': 'करें।', 'start': 52.288, 'end': 52.368, 'score': 0.246}, {'word': 'ठीक', 'start': 52.388, 'end': 52.448, 'score': 0.0}, {'word': 'है', 'start': 52.468, 'end': 52.529, 'score': 0.0}, {'word': 'आप', 'start': 52.549, 'end': 52.589, 'score': 0.0}, {'word': 'ऐसा', 'start': 52.629, 'end': 52.829, 'score': 0.356}, {'word': 'करेंगा', 'start': 52.869, 'end': 53.009, 'score': 0.133}, {'word': 'आप', 'start': 53.049, 'end': 53.109, 'score': 0.008}, {'word': 'तला', 'start': 53.209, 'end': 53.329, 'score': 0.08}, {'word': 'आ', 'start': 53.369, 'end': 53.429, 'score': 0.303}, {'word': 'जाएगी', 'start': 53.489, 'end': 53.65, 'score': 0.234}, {'word': 'है', 'start': 53.69, 'end': 53.77, 'score': 0.64}, {'word': 'हमारे', 'start': 53.79, 'end': 54.07, 'score': 0.189}, {'word': 'आफिस', 'start': 54.13, 'end': 54.33, 'score': 0.088}, {'word': 'तो', 'start': 54.47, 'end': 54.571, 'score': 0.164}, {'word': 'हम', 'start': 54.591, 'end': 54.671, 'score': 0.0}, {'word': 'आपको', 'start': 54.731, 'end': 54.931, 'score': 0.288}, {'word': 'बाईक', 'start': 54.991, 'end': 55.111, 'score': 0.559}, {'word': 'दे', 'start': 55.171, 'end': 55.251, 'score': 0.474}, {'word': 'लेंगे।', 'start': 55.291, 'end': 55.632, 'score': 0.028}, {'word': 'ठ', 'start': 55.672, 'end': 55.692, 'score': 0.002}]}]

print(json_data[0]['words'])
print("-------------------------------------------------------------")
print(json_data[1]['words'])
print("-------------------------------------------------------------")
print(json_data[2]['words'])
print("-------------------------------------------------------------")

In [96]:
import whisperx
import gc

device = "cpu"
audio_file = "hassan_urdu_neg3.wav"
batch_size = 4 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
# model_base = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
model_dir = "C:\\Users\\User\\.cache\\huggingface\\hub\\models--jonatasgrosman--wav2vec2-large-xlsr-53-polish\\snapshots\\6b1cea36bd8bc5f65ec8081667cd9c0207d51970\\"
model_large = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model_base.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: hi (0.55) in first 30s of audio...
[{'text': ' السلام علیکم میں حسان بات کر رہا ہوں حبیبی بینک سے میں آپ کو بتانا چاہتا ہوں کہ حبیبی بینک نے ایک نئی credit card scheme شروع کی ہے اگر آپ یہ service مجھ سے دیں گے تو میں کم سے کم charges میں آپ کو credit card service دے سکتا ہوں اس کی details میں آپ کو whatsapp کر دوں گا اور اگر آپ direct bank سے لےنا چاہتے ہیں تو پھر آپ کو یہ service costly پڑے گی', 'start': 0.009, 'end': 24.172}, {'text': ' अभी आप interested हैं तो मुझे बताए मैं ये service आपके number पर अक्टिवेट कर दूँगा या फिर आप मुझसे WhatsApp पर भी रापता कर सकते हैं शुक्रिया', 'start': 24.497, 'end': 33.439}]


Some weights of the model checkpoint at theainerd/Wav2Vec2-large-xlsr-hindi were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at theainerd/Wav2Vec2-large-xlsr-hindi and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably T

[{'start': 24.497, 'end': 33.098, 'text': ' अभी आप interested हैं तो मुझे बताए मैं ये service आपके number पर अक्टिवेट कर दूँगा या फिर आप मुझसे WhatsApp पर भी रापता कर सकते हैं शुक्रिया', 'words': [{'word': 'अभी', 'start': 24.497, 'end': 24.838, 'score': 0.439}, {'word': 'आप', 'start': 24.958, 'end': 25.199, 'score': 0.09}, {'word': 'interested'}, {'word': 'हैं', 'start': 25.68, 'end': 25.76, 'score': 0.469}, {'word': 'तो', 'start': 25.8, 'end': 25.9, 'score': 0.409}, {'word': 'मुझे', 'start': 25.941, 'end': 26.141, 'score': 0.554}, {'word': 'बताए', 'start': 26.201, 'end': 26.402, 'score': 0.522}, {'word': 'मैं', 'start': 26.542, 'end': 26.642, 'score': 0.458}, {'word': 'ये', 'start': 26.702, 'end': 26.783, 'score': 0.46}, {'word': 'service'}, {'word': 'आपके', 'start': 27.144, 'end': 27.845, 'score': 0.156}, {'word': 'number'}, {'word': 'पर', 'start': 27.966, 'end': 28.086, 'score': 0.594}, {'word': 'अक्टिवेट', 'start': 28.106, 'end': 28.667, 'score': 0.242}, {'word': 'कर', 'start': 28.

In [97]:
extract_data(result['segments'])

SPEAKER_00:  अभी आप interested हैं तो मुझे बताए मैं ये service आपके number पर अक्टिवेट कर दूँगा या फिर आप मुझसे WhatsApp पर भी रापता कर सकते हैं शुक्रिया


In [99]:
import whisperx
import gc

device = "cpu"
audio_file = "hassan_urdu_neg3.wav"
batch_size = 4 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
# model_base = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
model_dir = r"C:\\Users\\User\\.cache\\huggingface\\hub\\models--Systran--faster-whisper-large-v2\\snapshots\\f0fe81560cb8b68660e564f55dd99207059c092e\\"
model_large = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model_base.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=write_api_key, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

OSError: [Errno 28] No space left on device

In [None]:
import requests
import json

# Load configuration from JSON file
with open('credentials.json', 'r') as config_file:
    config = json.load(config_file)

read_api_key = config['read_api']['api_token']
write_api_key =  config['write_api']['api_token']

In [7]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

client = OpenAI()

audio_file= open("./F&H/f&h_urdu.wav", "rb")
transcription = client.audio.transcriptions.create(
  model="whisper-1",
  file=audio_file,
  response_format='verbose_json',
  timestamp_granularities=['word', 'segment'],
  
)
print(transcription)

Transcription(text='اسلام علیکم افراد کیسے ہو آپ خیریت ہو؟ وعلیکم السلام عماد جی بالکل میں ٹھیک ہوں آپ بتایا آپ کیسے ہیں؟ شکر لگا یار میرے کو نے ایک بائیک لینی ہے سیکرنٹ بائیک کیا ہو جو مناصف ریٹ میں ہو تو اس کے بارے میں کوئی آپ کا آئیڈیا ہے؟ جی بالکل آئیڈیا آپ کونسی بائیک لینے چاہ رہے ہیں؟ سیونٹی، اونٹو فائف، اونٹو فیٹی میں سیونٹی کے ہے اور میرا فوکس میں یونیک برینڈ ہے تو کوئی ایسی جو مناصف پرائیس میں میں کوئی بائیک مل جائے تیس ہزار کے اندر جی بالکل مل جائے گی لیکن آپ تھوڑی کالیٹی کومپرمائز ہوگی کیونکہ آپ کو پتا ہے کہ آرچکر مہنگاہی بہت ہے بہت ہی ٹھیک ہے تو تھرٹی تاؤزن میں جو آپ کو بائیک ملے گی وہ اتنی خاص نہیں ہوگی حالانکہ چلتا جائے گی آپ کا کام نکل جائے گا مگر اتنی اچھی کالیٹی نہیں ہوگی تو کیا میں آپ کے لئے وہ بائیک نکال دوں ہاں بس آپ پتا کریں جو آپ کا جاننے والا ہے تو میں اس کو پھر دیکھ لوں گا اور انشاءاللہ میں لے لوں گا ارادہ تو ہے میرا لینے کا ایسا کہنا آپ کل آجائیے ہمارا آفیس تو ہم آپ کو بائیک دے دیں گے اللہ حافظ', task='transcribe', language='urdu', duration=57.06999969482422, s

In [8]:
transcription.words

[{'word': 'اسلام', 'start': 1.2599999904632568, 'end': 1.7000000476837158},
 {'word': 'علیکم', 'start': 1.7000000476837158, 'end': 1.7000000476837158},
 {'word': 'افراد', 'start': 1.7000000476837158, 'end': 1.940000057220459},
 {'word': 'کیسے', 'start': 1.940000057220459, 'end': 2.2200000286102295},
 {'word': 'ہو', 'start': 2.2200000286102295, 'end': 2.240000009536743},
 {'word': 'آپ', 'start': 2.240000009536743, 'end': 2.380000114440918},
 {'word': 'خیریت', 'start': 2.380000114440918, 'end': 2.7200000286102295},
 {'word': 'ہو؟', 'start': 2.7200000286102295, 'end': 3.0199999809265137},
 {'word': 'وعلیکم', 'start': 3.0199999809265137, 'end': 3.3399999141693115},
 {'word': 'السلام', 'start': 3.3399999141693115, 'end': 3.640000104904175},
 {'word': 'عماد', 'start': 3.640000104904175, 'end': 4.0},
 {'word': 'جی', 'start': 4.0, 'end': 4.079999923706055},
 {'word': 'بالکل', 'start': 4.079999923706055, 'end': 4.360000133514404},
 {'word': 'میں', 'start': 4.360000133514404, 'end': 4.5799999237