In [1]:
import jsonlines
import torch
import torchaudio
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import Trainer, TrainingArguments, WhisperTokenizer, WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from torch.utils.data import DataLoader
import numpy as np
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

In [2]:
# Set up the device
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

True


In [3]:
whisper_model = "openai/whisper-small.en"

feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_model)
tokenizer = WhisperTokenizer.from_pretrained(whisper_model, language="English", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(whisper_model).to(device)
processor = WhisperProcessor.from_pretrained(whisper_model, language="English", task="transcribe")

# model.config.config_language = "English"
# model.generation_config.task = "transcribe"
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [4]:
common_voice = DatasetDict()

# Define the path to the directory
data_dir = "./audio/"

# Read data from a jsonl file and reformat it
data = {'key': [], 'audio': [], 'transcript': []}
with jsonlines.open("./asr.jsonl") as reader:
    for obj in reader:
        data['key'].append(obj['key'])
        data['audio'].append(obj['audio'])
        data['transcript'].append(obj['transcript'])

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data)

audio_augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_shift=-0.5, max_shift=0.5, p=0.5),
])


def augment_and_add_audio(example):
    audio_path = data_dir + example["audio"]
    speech_array, sampling_rate = torchaudio.load(audio_path)
    
    # Apply augmentation
    augmented_speech_array = audio_augment(samples=speech_array.squeeze().numpy(), sample_rate=sampling_rate)
    
    # Return a new example with the same metadata but augmented audio
    return {
        "key": example["key"],
        "audio": {
            "path": audio_path,
            "array": augmented_speech_array,
            "sampling_rate": sampling_rate
        },
        "transcript": example["transcript"]
    }


# Append string to all values of the "audio" column
def append_string_to_path(example):
    audio_path = data_dir + example["audio"]  # Adjust this line as needed
    speech_array, sampling_rate = torchaudio.load(audio_path)
    
    
    example["audio"] = {
        "path": audio_path,
        "array": speech_array.squeeze().numpy(),
        "sampling_rate": sampling_rate
    }
    return example

# Apply the transformation
aug_dataset = dataset.select(range(int(len(dataset) * 0.5))).map(augment_and_add_audio)
dataset = dataset.map(append_string_to_path)

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

In [5]:
combined_dataset = concatenate_datasets([dataset, aug_dataset])
combined_dataset = combined_dataset.shuffle(seed=42)

In [6]:
combined_dataset

Dataset({
    features: ['key', 'audio', 'transcript'],
    num_rows: 5250
})

In [7]:
train_size = int(0.8 * len(combined_dataset))
test_size = len(combined_dataset) - train_size

train_dataset = combined_dataset.select(range(train_size))
test_dataset = combined_dataset.select(range(train_size, train_size + test_size))

In [8]:
common_voice["train"] = train_dataset
common_voice["test"] = test_dataset

In [9]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['key', 'audio', 'transcript'],
        num_rows: 4200
    })
    test: Dataset({
        features: ['key', 'audio', 'transcript'],
        num_rows: 1050
    })
})

In [10]:
def prepare_dataset(batch):
    try:
        # load and resample audio data from 48 to 16kHz
        audio = batch["audio"]
        
        # compute log-Mel input features from input audio array 
        batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
        
        # encode target text to label ids 
        batch["labels"] = tokenizer(batch["transcript"]).input_ids
        return batch
    except Exception as e:
        print("Error processing batch:", e)
    
#     # load and resample audio data from 48 to 16kHz
#     audio = batch["audio"]

#     # compute log-Mel input features from input audio array 
#     batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

#     # encode target text to label ids 
#     batch["labels"] = tokenizer(batch["transcript"]).input_ids
#     return batch

# common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1)
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1, batch_size=1)

Map:   0%|          | 0/4200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1050 [00:00<?, ? examples/s]

In [11]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [12]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results-aug",  # change to a repo name of your choice
    per_device_train_batch_size=12,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)


trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

max_steps is given, it will override any value given in num_train_epochs


In [14]:
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
100,0.4993,0.364478,24.911072
200,0.0314,0.02703,33.542686
300,0.0183,0.014287,21.631836
400,0.0062,0.013823,16.318364
500,0.0052,0.011203,19.797688
600,0.0087,0.011235,35.515785
700,0.0046,0.01023,22.749
800,0.0029,0.009166,21.353935
900,0.0015,0.008882,21.192752
1000,0.0006,0.008624,21.731881


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}
Non-default generation parameters: {'max_length': 448, 'suppress

TrainOutput(global_step=1000, training_loss=0.1716730762505904, metrics={'train_runtime': 7814.691, 'train_samples_per_second': 1.536, 'train_steps_per_second': 0.128, 'total_flos': 3.46302480384e+18, 'train_loss': 0.1716730762505904, 'epoch': 2.857142857142857})

In [15]:
model_save_path = "./whispersmall-augment-norm-model-1000"
model.save_pretrained(model_save_path)
processor.save_pretrained(model_save_path)

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50256]}


[]

In [16]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa

# Load the fine-tuned model and processor
model_save_path = "./whispersmall-augment-norm-model-1000"
processor = WhisperProcessor.from_pretrained(model_save_path)
model = WhisperForConditionalGeneration.from_pretrained(model_save_path)

# Load your audio file
audio_path = "./audio/audio_2.wav"
audio_input, sample_rate = librosa.load(audio_path, sr=16000)

# Tokenize the audio
input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_features

# Perform inference
with torch.no_grad():
    generated_ids = model.generate(input_features)

# Decode the output
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Transcription:", transcription)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transcription: Heading is one zero five, target is silver, green, and yellow light aircraft, tool to deploy is anti-air artillery.
