<a href="https://colab.research.google.com/github/Nathan-Roll1/PSST/blob/main/PSST_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PSST! Prosodic Speech Segmentation with Transformers

This notebook contains the code used for training and pushing the PSST model. 


## INIT

### Installs

In [None]:
# Install the Hugging Face libraries imported in the "Imports" cell.
# NOTE(review): versions are not pinned, so a fresh run may resolve newer releases
# than the ones shown in the captured output.
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0

In [None]:
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg
!pip install librosa
!pip install evaluate>=0.3.0
!pip install jiwer
!pip install gradio
!pip install more-itertools

Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:11 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Ign:14 http://ppa.launchpad.net/jonathonf/f

### Imports

In [None]:
import re
import string
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import evaluate
import torch
from datasets import interleave_datasets, load_dataset, IterableDatasetDict, Audio
from huggingface_hub import notebook_login
from torch.utils.data import IterableDataset
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import TrainerCallback
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers.trainer_pt_utils import IterableDatasetShard

### Functions

In [None]:
def load_streaming_dataset(dataset_name, split, **kwargs):
    """Load one or several splits of a dataset in streaming mode.

    Args:
        dataset_name: Identifier forwarded to ``load_dataset``.
        split: A single split name, or several names joined with ``+``
            (for example ``"train+validation"``).
        **kwargs: Extra keyword arguments forwarded to every ``load_dataset`` call.

    Returns:
        The streamed split, or the result of ``interleave_datasets`` over the
        individual splits when ``split`` contains ``+``.
    """
    # Single split: nothing to combine.
    if "+" not in split:
        return load_dataset(dataset_name, split=split, streaming=True, **kwargs)

    # Several splits: stream each one separately, then interleave them.
    streamed_parts = []
    for split_name in split.split("+"):
        streamed_parts.append(load_dataset(dataset_name, split=split_name, streaming=True, **kwargs))
    return interleave_datasets(streamed_parts)

In [None]:
def prepare_dataset(batch):
    """Convert one raw example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["sentence"]``, then writes three new entries into the same ``batch``:

    * ``input_features``: first element of the feature extractor's ``input_features``
    * ``input_length``: clip duration in seconds (samples / sampling rate)
    * ``labels``: tokenizer ``input_ids`` for the (optionally pre-processed) text

    Relies on the module-level ``processor``, ``do_lower_case``,
    ``do_remove_punctuation`` and ``punctuation_to_remove_regex``.

    Returns:
        The mutated ``batch``.
    """
    audio = batch["audio"]
    waveform = audio["array"]
    sample_rate = audio["sampling_rate"]

    # log-Mel features for the clip, plus its duration in seconds
    extracted = processor.feature_extractor(waveform, sampling_rate=sample_rate)
    batch["input_features"] = extracted.input_features[0]
    batch["input_length"] = len(waveform) / sample_rate

    # optional transcript clean-up, controlled by the module-level flags
    text = batch["sentence"]
    if do_lower_case:
        text = text.lower()
    if do_remove_punctuation:
        text = re.sub(punctuation_to_remove_regex, " ", text).strip()

    # target text -> label ids
    batch["labels"] = processor.tokenizer(text).input_ids
    return batch

In [None]:
def is_audio_in_length_range(length):
    """Return whether ``length`` is strictly below the module-level ``max_input_length``."""
    upper_bound = max_input_length
    return length < upper_bound

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for a batch of predictions.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. ``label_ids`` is
            modified in place: every ``-100`` entry is overwritten with the
            tokenizer's pad token id so it can be decoded.

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the result of
        ``metric.compute``.

    Relies on the module-level ``processor``, ``metric``, ``normalizer`` and
    ``do_normalize_eval``.
    """
    tokenizer = processor.tokenizer
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # -100 marks ignored label positions; swap in the pad id before decoding
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # decode without special tokens
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    if do_normalize_eval:
        hypotheses = [normalizer(text) for text in hypotheses]
        references = [normalizer(text) for text in references]
        # score only the pairs whose normalised reference is non-empty
        keep = [idx for idx, ref in enumerate(references) if len(ref) > 0]
        hypotheses = [hypotheses[idx] for idx in keep]
        references = [references[idx] for idx in keep]

    score = 100 * metric.compute(predictions=hypotheses, references=references)
    return {"wer": score}

### Classes

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a padded batch of audio features and labels.

    Audio features and label ids are padded separately (through
    ``processor.feature_extractor.pad`` and ``processor.tokenizer.pad``) because
    they have different lengths and need different padding.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `.pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of examples.

        Args:
            features: Examples carrying ``"input_features"`` and ``"labels"`` entries.

        Returns:
            The padded audio batch with an added ``"labels"`` tensor in which padded
            positions are set to ``-100``.
        """
        # audio side: pad and convert to torch tensors
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # label side: pad token ids to the longest sequence in the batch
        token_inputs = [{"input_ids": example["labels"]} for example in features]
        padded_labels = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # padded positions become -100 so they are ignored by the loss
        token_ids = padded_labels["input_ids"]
        is_padding = padded_labels.attention_mask.ne(1)
        labels = token_ids.masked_fill(is_padding, -100)

        # when every sequence starts with the bos token, strip it here
        # because it gets added again later
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
class ShuffleCallback(TrainerCallback):
    """Trainer callback that advances the epoch of a streamed training dataset.

    At the start of every epoch the dataloader's dataset is inspected: an
    ``IterableDatasetShard`` is left alone (the Trainer handles ``set_epoch()`` for
    it), while any other ``IterableDataset`` has its epoch bumped by one so the
    stream is reinitialised and reshuffled.
    """

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            # set_epoch() is handled by the Trainer
            return
        if isinstance(dataset, IterableDataset):
            dataset.set_epoch(dataset._epoch + 1)

In [None]:
# Whisper size suffix; interpolated into the checkpoint name
# (f"openai/whisper-{model_size}.en"), the output directory and the model-card name.
model_size = "medium"

## Data

In [None]:
# Authenticate with the Hugging Face Hub. Paste the access token into the prompt that
# `notebook_login()` displays; never store a token in the notebook source.
# NOTE(review): an access token was previously committed in this cell. Revoke it in
# the Hub account settings, since it remains visible in the repository history.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Stream the "train" and "test" splits of the dataset from the Hub
# (authenticated access via `use_auth_token=True`).
raw_datasets = IterableDatasetDict(
    {
        split_name: load_streaming_dataset("NathanRoll/SBC_word_segmented", split=split_name, use_auth_token=True)
        for split_name in ("train", "test")
    }
)

Downloading readme:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [None]:
# Re-declare the "audio" column with a 16 kHz sampling rate so every split is decoded
# at the rate `prepare_dataset` passes on to the feature extractor.
raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# from transformers import WhisperFeatureExtractor
# feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny.en")

# from transformers import WhisperTokenizer
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en", task="transcribe")

## Prepare Processor and Pre-Process Data

In [None]:
# Load the processor for the `.en` Whisper checkpoint selected by `model_size`.
# Its `.feature_extractor` and `.tokenizer` are used by `prepare_dataset`,
# `compute_metrics` and the data collator.
processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}.en", task="transcribe", predict_timestamps=True)

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
# Optional transcript pre-processing switches read by `prepare_dataset`.
do_lower_case = False
do_remove_punctuation = False

# Character class handed to `re.sub` in `prepare_dataset` when
# `do_remove_punctuation` is enabled. The apostrophe is excluded so contractions
# are left intact. This name was previously never defined, so turning the flag on
# raised NameError.
punctuation_to_remove_regex = "[" + re.escape(string.punctuation.replace("'", "")) + "]"

# Text normalizer applied to predictions and references in `compute_metrics`
# when `do_normalize_eval` is set.
normalizer = BasicTextNormalizer()

In [None]:
# Apply `prepare_dataset` to every split. The columns to drop are taken from the
# features of the first split in `raw_datasets`, so only the entries written by
# `prepare_dataset` remain; results are returned in "torch" format.
vectorized_datasets = raw_datasets.map(prepare_dataset, remove_columns=list(next(iter(raw_datasets.values())).features)).with_format("torch")

In [None]:
# Shuffle the training stream through a 500-example buffer with a fixed seed;
# the test split is left in its original order.
vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
    buffer_size=500,
    seed=0,
)

In [None]:
# Exclusive upper bound, in seconds, checked by `is_audio_in_length_range`
# against each example's `input_length`.
max_input_length = 30.0

In [None]:
# Keep only training examples whose `input_length` passes
# `is_audio_in_length_range`; the test split is not filtered.
vectorized_datasets["train"] = vectorized_datasets["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

## Training and Evaluation

In [None]:
# Collator that pads audio features and label ids separately using `processor`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
# Word-error-rate metric used by `compute_metrics`.
metric = evaluate.load("wer")

# evaluate with the 'normalised' WER
# (when True, `compute_metrics` runs `normalizer` over predictions and references
# and drops pairs whose normalised reference is empty)
do_normalize_eval = True

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

### Load a Pre-Trained Checkpoint

In [None]:
# Load the pre-trained `.en` Whisper checkpoint selected by `model_size`.
model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{model_size}.en")
# The cache is disabled because training uses gradient checkpointing
# (`gradient_checkpointing=True` in the training arguments), and the two are incompatible.
model.config.use_cache = False

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)). Note that `use_cache` was already set to `False` in the previous cell, since we're using gradient checkpointing and the two are incompatible:

In [None]:
# Do not force any tokens as decoder outputs during generation.
model.config.forced_decoder_ids = None
# Do not suppress any tokens during generation.
model.config.suppress_tokens = []

### Define the Training Configuration

In the final step, we define all the parameters related to training. Here, you can set the `max_steps` to train for longer. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments).

In [None]:
# Hyper-parameters for the fine-tuning run. With `push_to_hub=True`, `output_dir`
# also serves as the Hub repository name.
# NOTE(review): `evaluation_strategy` and `save_strategy` are both "no", so
# `save_steps`, `load_best_model_at_end`, `metric_for_best_model` and
# `greater_is_better` presumably have no effect in this run, and `compute_metrics`
# is never invoked during training. Confirm against the Seq2SeqTrainingArguments docs.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"./psst-{model_size}-syllabic-en",  # your repo name
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=400,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="no",
    save_strategy="no",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    logging_steps=10,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [None]:
# Assemble the trainer from the model, the streamed datasets, the collator and the
# WER metric function. `ShuffleCallback` advances the training stream's epoch at
# each epoch start. The captured output below shows the Hub repository being cloned
# into `output_dir` when this cell runs.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=vectorized_datasets["train"],
    eval_dataset=vectorized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
)

Cloning https://huggingface.co/NathanRoll/psst-medium-syllabic-en into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.44k/2.85G [00:00<?, ?B/s]

Download file runs/Jan19_15-50-54_eb7e018c1412/1674143472.993101/events.out.tfevents.1674143472.eb7e018c1412.2…

Download file runs/Jan19_15-50-54_eb7e018c1412/events.out.tfevents.1674143472.eb7e018c1412.299.0: 100%|#######…

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Clean file runs/Jan19_15-50-54_eb7e018c1412/1674143472.993101/events.out.tfevents.1674143472.eb7e018c1412.299.…

Clean file runs/Jan19_15-50-54_eb7e018c1412/events.out.tfevents.1674143472.eb7e018c1412.299.0:  21%|##1       …

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/2.85G [00:00<?, ?B/s]

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [None]:
# Write the model and processor files into the output directory before training starts.
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

Configuration saved in ./psst-medium-syllabic-en/config.json
Model weights saved in ./psst-medium-syllabic-en/pytorch_model.bin
Feature extractor saved in ./psst-medium-syllabic-en/preprocessor_config.json
tokenizer config file saved in ./psst-medium-syllabic-en/tokenizer_config.json
Special tokens file saved in ./psst-medium-syllabic-en/special_tokens_map.json
added tokens file saved in ./psst-medium-syllabic-en/added_tokens.json


## Training

In [None]:
# Run fine-tuning for the `max_steps` configured in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 25600
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 400
  Number of trainable parameters = 763856896
The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss
10,4.4624
20,2.9295
30,1.6172


Step,Training Loss
10,4.4624
20,2.9295
30,1.6172
40,0.9762
50,0.8181
60,0.7595
70,0.7151
80,0.6049
90,0.5504
100,0.5094




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=400, training_loss=0.5523145267367363, metrics={'train_runtime': 8262.869, 'train_samples_per_second': 3.098, 'train_steps_per_second': 0.048, 'total_flos': 2.619790737113088e+19, 'train_loss': 0.5523145267367363, 'epoch': 3.0})

In [None]:
# Metadata forwarded as keyword arguments to `trainer.push_to_hub`.
kwargs = {
    "dataset_tags": "NathanRoll/SBC_word_segmented",
    "dataset": "Santa Barbara Corpus of Spoken American English",  # a 'pretty' name for the training dataset
    "language": "en",
    "model_name": f"PSST {model_size} Scrambled",  # name built from `model_size`
    "finetuned_from": f"openai/whisper-{model_size}.en",
    "tasks": "automatic-speech-recognition"
}

In [None]:
# Save the trained model and upload it, together with the metadata above, to the Hub.
trainer.push_to_hub(**kwargs)

Saving model checkpoint to ./psst-medium-syllabic-en
Configuration saved in ./psst-medium-syllabic-en/config.json
Model weights saved in ./psst-medium-syllabic-en/pytorch_model.bin
Feature extractor saved in ./psst-medium-syllabic-en/preprocessor_config.json
tokenizer config file saved in ./psst-medium-syllabic-en/tokenizer_config.json
Special tokens file saved in ./psst-medium-syllabic-en/special_tokens_map.json
added tokens file saved in ./psst-medium-syllabic-en/added_tokens.json


Upload file pytorch_model.bin:   0%|          | 32.0k/2.85G [00:00<?, ?B/s]

Upload file runs/Jan20_02-31-14_0124bacbd8d0/1674182576.6809433/events.out.tfevents.1674182576.0124bacbd8d0.22…

Upload file runs/Jan20_02-31-14_0124bacbd8d0/events.out.tfevents.1674182576.0124bacbd8d0.2223.0: 100%|########…

Upload file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/NathanRoll/psst-medium-syllabic-en
   b88d71b..ff25205  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/NathanRoll/psst-medium-syllabic-en
   b88d71b..ff25205  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'dataset': {'name': 'Santa Barbara Corpus of Spoken American English', 'type': 'NathanRoll/SBC_word_segmented', 'config': None, 'split': 'train'}}
To https://huggingface.co/NathanRoll/psst-medium-syllabic-en
   ff25205..c8d411b  main -> main

   ff25205..c8d411b  main -> main



'https://huggingface.co/NathanRoll/psst-medium-syllabic-en/commit/ff25205b0a0d6be30e8506eea922d8155c21e291'

In [None]:
# Colab-only teardown: the import is kept local to this cell because
# `google.colab` is only importable inside Colab.
# Asks Colab to unassign (release) the current runtime once the cells above have finished.
from google.colab import runtime
runtime.unassign()