In [2]:
from datasets import load_dataset
import IPython.display as ipd
from datasets import Dataset, DatasetDict
import torchaudio
import torch
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
import os

In [3]:
ds = load_dataset("yashtiwari/PaulMooney-Medical-ASR-Data")
ds.column_names

{'train': ['id', 'sentence', 'prompt', 'speaker_id', 'path'],
 'validation': ['id', 'sentence', 'prompt', 'speaker_id', 'path'],
 'test': ['id', 'sentence', 'prompt', 'speaker_id', 'path']}

In [4]:
import torch
import torchaudio

def resample_single_audio(sample):
    # Extract the waveform and sample rate from the audio array
    waveform = torch.tensor(sample["path"]["array"],dtype=torch.float32).unsqueeze(0)  # Add a batch dimension
    sample_rate = sample["path"]["sampling_rate"]
    
    # Resample audio to 16kHz
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform_resampled = resampler(waveform)
    
    # Update the sample with the resampled audio
    sample["path"]["array"] = waveform_resampled.squeeze().numpy().tolist()  # Remove batch dimension and convert to list
    sample["path"]["sampling_rate"] = 16000
    return sample

# Apply resampling to a single audio sample
resampled_audio = resample_single_audio(ds["test"][200])
print(resampled_audio["path"]["sampling_rate"])  # Should print 16000

16000


In [5]:
from transformers import WhisperTokenizer, WhisperFeatureExtractor

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small",language="English", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# tokenizer.save_pretrained("./whisper-small-en")
# feature_extractor.save_pretrained("./whisper-small-en")

In [6]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load model and processor
processor = WhisperProcessor.from_pretrained("./whisper-small-en", language="English", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("./whisper-small-en").to("cuda:6")

# Remove unnecessary forced decoder ids setting (it's not needed for transcribing task)
model.generation_config.forced_decoder_ids = None

# Extract audio sample (ensure resampled_audio contains valid data)
sample = resampled_audio["path"]
waveform = torch.tensor(sample["array"], dtype=torch.float32)  # Ensure correct data type

# Process audio to get input features
inputs = processor(waveform.numpy(), sampling_rate=sample["sampling_rate"], return_tensors="pt", padding=True)

# Move input features to GPU
input_features = inputs['input_features'].to("cuda:6")  # Use 'input_features', not 'input_values'

# Generate token IDs using input_features directly
predicted_ids = model.generate(input_features=input_features)

# Decode token IDs to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['Every morning when I wake up my neck feels like I slept on in wrong.']


In [7]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to("cuda:6")

# Extract audio sample
sample = resampled_audio["path"]
waveform = torch.tensor(sample["array"], dtype=torch.float32)  # Ensure correct data type

# Process audio
input_features = processor(waveform.numpy(), sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
# Move input to GPU
input_features = input_features.to("cuda:6")
# Generate token IDs
predicted_ids = model.generate(input_features=input_features)

# Decode token IDs to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


[' Every morning when I wake up my neck feels like I slept on in wrong.']


In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import evaluate

# Load test dataset and WER metric
wer_metric = evaluate.load("wer")

def resample_single_audio(sample):
    waveform = torch.tensor(sample["path"]["array"], dtype=torch.float32).unsqueeze(0)
    sample_rate = sample["path"]["sampling_rate"]
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform_resampled = resampler(waveform)
    sample["path"]["array"] = waveform_resampled.squeeze().numpy().tolist()
    sample["path"]["sampling_rate"] = 16000
    return sample

def transcribe(model, processor, waveform, sampling_rate):
    try:
        # Preprocess the waveform to get input features
        inputs = processor(
            waveform.numpy(), 
            sampling_rate=sampling_rate, 
            return_tensors="pt"
        )
        input_features = inputs['input_features']

        # Check the shape and pad to length 3000 if needed
        if input_features.shape[-1] < 3000:
            pad_length = 3000 - input_features.shape[-1]
            input_features = torch.nn.functional.pad(input_features, (0, pad_length), mode="constant", value=-100)

        # Move to GPU
        input_features = input_features.to("cuda:6")

        # Generate transcription
        predicted_ids = model.generate(input_features=input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription

    except Exception as e:
        print(f"Error during transcription: {e}")
        return None


# Load Fine-Tuned Whisper Model
processor_ft = WhisperProcessor.from_pretrained("./whisper-small-en", language="English", task="transcribe")
model_ft = WhisperForConditionalGeneration.from_pretrained("./whisper-small-en").to("cuda:6")
model_ft.generation_config.forced_decoder_ids = None

# Load OpenAI Whisper Model
processor_og = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")
model_og = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to("cuda:6")

# Initialize lists to store predictions and references
predictions_ft = []
predictions_og = []
references = []

# Loop through the entire test set
for i, sample in enumerate(ds["validation"]):
    # Resample the audio sample
    sample = resample_single_audio(sample)
    waveform = torch.tensor(sample["path"]["array"], dtype=torch.float32)
    sampling_rate = sample["path"]["sampling_rate"]
    ground_truth = sample["sentence"]
    
    try:
        # Transcribe using Fine-Tuned Whisper
        transcription_ft = transcribe(model_ft, processor_ft, waveform, sampling_rate)
        
        # Transcribe using OpenAI Whisper
        transcription_og = transcribe(model_og, processor_og, waveform, sampling_rate)

        # Append transcriptions and references if both are successful
        if transcription_ft and transcription_og:
            predictions_ft.append(transcription_ft)
            predictions_og.append(transcription_og)
            references.append(ground_truth)
    
        print(i)
        # Progress print every 50 samples
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1} samples...")

        # Clear CUDA cache periodically to save memory
        if (i + 1) % 100 == 0:
            torch.cuda.empty_cache()

    except Exception as e:
        print(f"Error processing sample {i} - {e}")
        continue

# Calculate Average WER
avg_wer_ft = wer_metric.compute(predictions=predictions_ft, references=references)
avg_wer_og = wer_metric.compute(predictions=predictions_og, references=references)

# Print the average WER scores
print(f"\nAverage WER for Fine-Tuned Whisper: {avg_wer_ft:.4f}")
print(f"Average WER for OpenAI Whisper: {avg_wer_og:.4f}")


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Processed 50 samples...
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Processed 100 samples...
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
Processed 150 samples...
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
Processed 200 samples...
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
Processed 2