In [1]:
! pip install -q transformers datasets evaluate jiwer

In [2]:
import pandas as pd
import torch
import torchaudio
from datasets import Dataset, DatasetDict
from transformers import (
    WhisperTokenizer,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import evaluate
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import numpy as np 

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [3]:
from huggingface_hub import login

login(new_session=False,
      write_permission=True, 
      token='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', 
      add_to_git_credential=True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import os
import pandas as pd
from datasets import Dataset, Audio

# Define base paths
base_path = "/kaggle/input/medical-speech-transcription-and-intent/Medical Speech, Transcription, and Intent"
csv_file_path = os.path.join(base_path, "overview-of-recordings.csv")
recordings_path = os.path.join(base_path, "recordings")

# Load CSV
df = pd.read_csv(csv_file_path)

# Function to find the subdirectory and file path
def find_subdirectory_and_path(file_name):
    for subdirectory in ['test', 'train', 'validate']:
        file_path = os.path.join(recordings_path, subdirectory, file_name)
        if os.path.exists(file_path):
            return subdirectory, file_path
    return None, None 

# Apply the function to find subdirectories and paths
df[['subdirectory', 'file_path']] = df['file_name'].apply(
    lambda file_name: pd.Series(find_subdirectory_and_path(file_name))
)

# Drop unnecessary columns
df = df.drop(['writer_id', 'speaker_id', 'file_download', 'file_name'], axis=1)

# Convert dataframe to dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train, test, and validate
train_data = dataset.filter(lambda x: x['subdirectory'] == 'train')
test_data = dataset.filter(lambda x: x['subdirectory'] == 'test')
val_data = dataset.filter(lambda x: x['subdirectory'] == 'validate')

# Cast and rename columns for each split
for split_name, split_data in zip(['train', 'test', 'validate'], [train_data, test_data, val_data]):
    split_data = split_data.cast_column("file_path", Audio())
    split_data = split_data.rename_column("file_path", "audio")
    split_data = split_data.rename_column("phrase", "text")
    if split_name == 'train':
        train_data = split_data
    elif split_name == 'test':
        test_data = split_data
    elif split_name == 'validate':
        val_data = split_data

# Remove unnecessary columns from each split
columns_to_remove = [
    "subdirectory", "prompt", 'audio_clipping', 'audio_clipping:confidence',
    'background_noise_audible', 'background_noise_audible:confidence',
    'overall_quality_of_the_audio', 'quiet_speaker', 'quiet_speaker:confidence'
]

train_data = train_data.remove_columns(columns_to_remove)
test_data = test_data.remove_columns(columns_to_remove)
val_data = val_data.remove_columns(columns_to_remove)

# Format preview for a single sample
def preview_sample(dataset):
    sample = dataset[0]  # First sample in the dataset
    return {
        "audio": {
            "path": sample["audio"]["path"],
            "array": sample["audio"]["array"],  # No need to convert; it's already a list or numpy array
            "sampling_rate": sample["audio"]["sampling_rate"],
        },
        "text": sample["text"]
    }

print("Train Data:", train_data)
train_sample = preview_sample(train_data)
print("Train Sample:", train_sample)


Filter:   0%|          | 0/6661 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6661 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6661 [00:00<?, ? examples/s]

Train Data: Dataset({
    features: ['text', 'audio'],
    num_rows: 381
})
Train Sample: {'audio': {'path': '/kaggle/input/medical-speech-transcription-and-intent/Medical Speech, Transcription, and Intent/recordings/train/1249120_44197979_23991689.wav', 'array': array([0.02072144, 0.01501465, 0.01168823, ..., 0.05114746, 0.10168457,
       0.07489014]), 'sampling_rate': 48000}, 'text': 'I have a sharp pain in my lower stomach.'}


In [5]:
train_data = train_data.cast_column("audio", Audio(sampling_rate=16000))
test_data = test_data.cast_column("audio", Audio(sampling_rate=16000))
val_data = val_data.cast_column("audio", Audio(sampling_rate=16000))

In [6]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\ï\`\√\d\\n]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch
    
train_data = train_data.map(remove_special_characters)
test_data = test_data.map(remove_special_characters)
val_data = val_data.map(remove_special_characters)


Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Map:   0%|          | 0/5895 [00:00<?, ? examples/s]

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

In [10]:
val_data


Dataset({
    features: ['text', 'audio'],
    num_rows: 385
})

# For FINETUNED Model:

In [12]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import evaluate

# Load the finetuned model and processor
model = WhisperForConditionalGeneration.from_pretrained("Kabir259/whisper-small_kabir")
processor = WhisperProcessor.from_pretrained("Kabir259/whisper-small_kabir")
metric = evaluate.load("wer")
model.generation_config.task = "transcribe"

# Move model to device
model = model.to(DEVICE)

# Function to prepare predictions
def compute_validation_wer(dataset):
    pred_strs = []
    label_strs = []
        
    for idx, example in enumerate(dataset):
        try:
           
            audio = example["audio"]

            input_features = processor.feature_extractor(
                audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
            ).input_features[0]  # Select the first feature array (consistent with training)

            input_features = input_features.to(DEVICE)  # Move input to the same device as the model

            labels = example["text"]  # Reference text

            # Generate predictions
            with torch.no_grad():
                outputs = model.generate(input_features.unsqueeze(0))  # Add batch dimension
            pred_str = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

            pred_strs.append(pred_str)
            label_strs.append(labels)
        except Exception as e:
            print(f"Error processing sample {idx + 1}: {e}")
            continue

    # Compute WER
    try:
        wer = metric.compute(predictions=pred_strs, references=label_strs)
        print(f"WER computed successfully.")
    except Exception as e:
        print(f"Error during WER computation: {e}")
        wer = None
    
    return wer

# Evaluate on validation set
try:
    val_wer = compute_validation_wer(val_data)
    if val_wer is not None:
        print(f"Validation WER: {val_wer:.3f}")
    else:
        print("Failed to compute WER.")
except Exception as e:
    print(f"Error during evaluation: {e}")


WER computed successfully.
Validation WER: 0.213


# For BASE model:

In [13]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor
import evaluate

# Base model details
model_id = "openai/whisper-small"
model = WhisperForConditionalGeneration.from_pretrained(model_id)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
tokenizer = WhisperTokenizer.from_pretrained(model_id, language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained(model_id, language="English", task="transcribe")

metric = evaluate.load("wer")

# Move model to device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(DEVICE)

# Function to prepare predictions
def compute_validation_wer(dataset):
    pred_strs = []
    label_strs = []
    
    for idx, example in enumerate(dataset):
        try:
        
            audio = example["audio"]

            input_features = processor.feature_extractor(
                audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
            ).input_features[0]  # Select the first feature array (consistent with training)

            input_features = input_features.to(DEVICE)  # Move input to the same device as the model

            labels = example["text"]  # Reference text

            # Generate predictions
            with torch.no_grad():
                outputs = model.generate(input_features.unsqueeze(0))  # Add batch dimension
            pred_str = tokenizer.decode(outputs[0], skip_special_tokens=True)

            pred_strs.append(pred_str)
            label_strs.append(labels)
        except Exception as e:
            print(f"Error processing sample {idx + 1}: {e}")
            continue

    # Compute WER
    try:
        wer = metric.compute(predictions=pred_strs, references=label_strs)
        print(f"WER computed successfully.")
    except Exception as e:
        print(f"Error during WER computation: {e}")
        wer = None
    
    return wer

# Evaluate on validation set
try:
    val_wer = compute_validation_wer(val_data)
    if val_wer is not None:
        print(f"Validation WER (Base Model): {val_wer:.2f}")
    else:
        print("Failed to compute WER.")
except Exception as e:
    print(f"Error during evaluation: {e}")


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


WER computed successfully.
Validation WER (Base Model): 1.28
