In [1]:
!pip install datasets
!pip install evaluate
!pip install jiwer

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
#mount frst
from google.colab import drive
drive.mount('/content/drive')
import os

file_path = '/content/drive/My Drive/Project/dev-clean/LibriSpeech/test-clean'

if os.path.exists(file_path):
    print("File exists.")
else:
    print("File does not exist.")


Mounted at /content/drive
File exists.


Tiny

In [9]:
import os
import json
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import evaluate
from sklearn.model_selection import train_test_split
from datetime import datetime
import time

#Check time
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
start = time.process_time()


# Load the Whisper model and processor
model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Define paths
output_path = "/content/drive/My Drive/Project/dev-clean/LibriSpeech/processed_audio"
mapping_file = os.path.join(output_path, "processed_audio_transcription_mapping.json")

# Load the json into a dictionary
with open(mapping_file, 'r') as f:
    audio_transcription_mapping = json.load(f)
print(f"Total number of cleaned audio-transcription pairs: {len(audio_transcription_mapping)}")

# Split the data into training and testing sets
data_pairs = list(audio_transcription_mapping.items())
train_pairs, test_pairs = train_test_split(data_pairs, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_pairs)}")
print(f"Test set size: {len(test_pairs)}")

# Initialize WER and CER metrics
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Function to process and evaluate audio files
def evaluate_audio_files(pairs):
    predictions = []
    references = []

    for audio_path, transcription in pairs:
        # Load and preprocess the audio
        waveform, sample_rate = torchaudio.load(audio_path)
        inputs = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt")
        inputs = inputs.input_features.to(model.device)

        # Decode the audio
        with torch.no_grad():
            predicted_ids = model.generate(inputs)
        predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Store predictions and references
        predictions.append(predicted_text)
        references.append(transcription)

    # Compute WER and CER
    wer = wer_metric.compute(predictions=predictions, references=references)
    cer = cer_metric.compute(predictions=predictions, references=references)

    return wer, cer

# Evaluate on the training dataset
train_wer, train_cer = evaluate_audio_files(train_pairs)
print(f"Training WER: {train_wer}")
print(f"Training CER: {train_cer}")

# Evaluate on the testing dataset
test_wer, test_cer = evaluate_audio_files(test_pairs)
print(f"Test WER: {test_wer}")
print(f"Test CER: {test_cer}")

new_time = now.strftime("%H:%M:%S")
print("Current Time =", new_time)
# your code here
elapsed_time = time.process_time() - start
hours = int(elapsed_time // 3600)
minutes = int((elapsed_time % 3600) // 60)
seconds = elapsed_time % 60

print(f"Total time is {hours} hours, {minutes} minutes, and {seconds:.2f} seconds")

Current Time = 15:24:26
Total number of cleaned audio-transcription pairs: 2620
Training set size: 2096
Test set size: 524
Training WER: 1.0113761555779086
Training CER: 0.8464181381386755
Test WER: 1.0037885788209204
Test CER: 0.847968138480371
Current Time = 15:24:26
Total time is 2 hours, 1 minutes, and 43.81 seconds


In [3]:
import os
import json
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import evaluate
from sklearn.model_selection import train_test_split
from datetime import datetime
import time

#Check time
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
start = time.process_time()


# Load the Whisper model and processor
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Define paths
output_path = "/content/drive/My Drive/Project/dev-clean/LibriSpeech/processed_audio"
mapping_file = os.path.join(output_path, "processed_audio_transcription_mapping.json")

# Load the json into a dictionary
with open(mapping_file, 'r') as f:
    audio_transcription_mapping = json.load(f)
print(f"Total number of cleaned audio-transcription pairs: {len(audio_transcription_mapping)}")

# Split the data into training and testing sets
data_pairs = list(audio_transcription_mapping.items())
train_pairs, test_pairs = train_test_split(data_pairs, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_pairs)}")
print(f"Test set size: {len(test_pairs)}")

# Initialize WER and CER metrics
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Function to process and evaluate audio files
def evaluate_audio_files(pairs):
    predictions = []
    references = []

    for audio_path, transcription in pairs:
        # Load and preprocess the audio
        waveform, sample_rate = torchaudio.load(audio_path)
        inputs = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt")
        inputs = inputs.input_features.to(model.device)

        # Decode the audio
        with torch.no_grad():
            predicted_ids = model.generate(inputs)
        predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Store predictions and references
        predictions.append(predicted_text)
        references.append(transcription)

    # Compute WER and CER
    wer = wer_metric.compute(predictions=predictions, references=references)
    cer = cer_metric.compute(predictions=predictions, references=references)

    return wer, cer

# Evaluate on the training dataset
train_wer, train_cer = evaluate_audio_files(train_pairs)
print(f"Training WER: {train_wer}")
print(f"Training CER: {train_cer}")

# Evaluate on the testing dataset
test_wer, test_cer = evaluate_audio_files(test_pairs)
print(f"Test WER: {test_wer}")
print(f"Test CER: {test_cer}")

new_time = now.strftime("%H:%M:%S")
print("Current Time =", new_time)
# your code here
elapsed_time = time.process_time() - start
hours = int(elapsed_time // 3600)
minutes = int((elapsed_time % 3600) // 60)
seconds = elapsed_time % 60

print(f"Total time is {hours} hours, {minutes} minutes, and {seconds:.2f} seconds")

Current Time = 11:32:59


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

Total number of cleaned audio-transcription pairs: 2620
Training set size: 2096
Test set size: 524


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Training WER: 0.9972457728600853
Training CER: 0.8390857562106036
Test WER: 1.0084087968952133
Test CER: 0.8453130118446234
Current Time = 11:32:59
Total time is 4 hours, 13 minutes, and 13.64 seconds
