In [None]:
!pip install transformers==4.29.2

In [None]:
# Move into the sibling src/ directory; the next cell clones DL-Art-School into it.
# The path is relative, so run this cell once, from the notebook's starting directory.
%cd ../src

In [None]:
!git clone https://github.com/152334H/DL-Art-School
%cd DL-Art-School
!wget https://huggingface.co/Gatozu35/tortoise-tts/resolve/main/dvae.pth -O experiments/dvae.pth
!wget https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth -O experiments/autoregressive.pth
!pip install -r codes/requirements.laxed.txt

In [4]:
# @markdown Check the integrity of the dVAE checkpoint
# grep prints the sha256sum line only when it contains the expected hash;
# when grep finds no match, the `||` fallback echoes the warning instead.
!sha256sum ../DL-Art-School/experiments/dvae.pth | grep a990825371506c16bcf0e8167bf24ccf82f65bb6a1dbcbfcf058d76f9b197e35 || echo "SOMETHING IS WRONG WITH THE CHECKPOINT; REPORT THIS AS A GITHUB ISSUE AND DO NOT PROCEED"
# @markdown You should see the following message when verified:

# @markdown > `a990825371506c16bcf0e8167bf24ccf82f65bb6a1dbcbfcf058d76f9b197e35  ../DL-Art-School/experiments/dvae.pth`


a990825371506c16bcf0e8167bf24ccf82f65bb6a1dbcbfcf058d76f9b197e35  ../DL-Art-School/experiments/dvae.pth


## Dataset loading and checking

In [1]:
# List the dataset directory (path is relative to the current working directory).
!ls ../../data

metadata.csv  train.txt  val.txt  wavs


In [2]:
from pathlib import Path
from math import ceil

def txt_file_lines(p: str) -> int:
    """Count the newline-separated entries in a text file.

    Leading and trailing whitespace (including blank lines at either end of
    the file) is stripped before counting.

    Args:
        p: Path to the text file.

    Returns:
        The number of lines in the stripped content, or 0 when the file is
        empty or contains only whitespace.
    """
    content = Path(p).read_text().strip()
    # ''.split('\n') yields [''], which would report one line for an empty file.
    if not content:
        return 0
    return content.count('\n') + 1

def div_spillover(n: int, bs: int) -> int: # returns new batch size
    """Suggest a batch size close to `bs` that leaves few leftover samples.

    Args:
        n: Total number of samples.
        bs: Preferred (default) batch size.

    Returns:
        `bs` unchanged when the number of full batches per epoch exceeds
        `bs / 2` or when `n` divides evenly by `bs`; `n` itself when there are
        fewer samples than one batch; otherwise a nearby batch size for which
        `n % batch_size` is small.
    """
    epoch_steps, remain = divmod(n, bs)
    if epoch_steps*2 > bs: return bs # don't bother optimising this stuff if epoch_steps are high
    if not remain: return bs # unlikely but still

    if epoch_steps == 0:
        # Fewer samples than one batch: the whole dataset is the batch.
        # Without this guard the `n//epoch_steps` branch below divides by zero.
        return n

    if remain*2 < bs: # "easier" to get rid of remainder -- should increase bs
        target_bs = n//epoch_steps
    else: # easier to increase epoch_steps by 1 -- decrease bs
        target_bs = n//(epoch_steps+1)
    assert n%target_bs < epoch_steps+2 # should be very few extra
    return target_bs

In [3]:
DEFAULT_TRAIN_BS = 64
DEFAULT_VAL_BS   = 32
#@markdown # Hyperparameter calculation
#@markdown Run this cell to obtain suggested parameters for training
Dataset_Training_Path = "../../data/train.txt" #@param {type:"string"}
ValidationDataset_Training_Path = "../../data/val.txt" #@param {type:"string"}

#@markdown ### **NOTE**: Dataset must be in the following format.

#@markdown  `dataset/`
#@markdown * ---├── `val.txt`
#@markdown * ---├── `train.txt`
#@markdown * ---├── `wavs/`

#@markdown `wavs/` directory must contain `.wav` files.

#@markdown  Example for `train.txt` and `val.txt`:

#@markdown * `wavs/A.wav|Write the transcribed audio here.`

#@markdown todo: actually check the dataset structure

# Using one list file for both splits is permitted, but the user is warned about it.
if ValidationDataset_Training_Path == Dataset_Training_Path:
    print(
        "WARNING: training dataset path == validation dataset path!!!",
        "\tThis is technically okay but will make all of the validation metrics useless. ",
        "it will also SUBSTANTIALLY slow down the rate of training, because validation datasets are supposed to be much smaller than training ones.",
        sep="\n",
    )


# One sample per line in each list file.
training_samples = txt_file_lines(Dataset_Training_Path)
val_samples      = txt_file_lines(ValidationDataset_Training_Path)

if training_samples < 128:
    print("WARNING: very small dataset! the smallest dataset tested thus far had ~200 samples.")
if val_samples < 20:
    print("WARNING: very small validation dataset! val batch size will be scaled down to account")


# Training batch size: tune around the default, unless the dataset cannot fill one batch.
if training_samples >= DEFAULT_TRAIN_BS:
    train_bs = div_spillover(training_samples, DEFAULT_TRAIN_BS)
else:
    print("WARNING: dataset is smaller than a single batch. This will almost certainly perform poorly. Trying anyway")
    train_bs = training_samples

# Validation batch size follows the same rule, without the warning.
val_bs = val_samples if val_samples < DEFAULT_VAL_BS else div_spillover(val_samples, DEFAULT_VAL_BS)

# Learning-rate decay milestones are given in epochs and converted to optimizer steps.
steps_per_epoch = training_samples // train_bs
lr_decay_epochs = [20, 40, 56, 72]
lr_decay_steps  = [epoch * steps_per_epoch for epoch in lr_decay_epochs]
# Log once per epoch, clamped to the range [20, 100] steps.
print_freq      = max(20, min(100, steps_per_epoch))
# Validate and checkpoint every third log interval.
val_freq             = print_freq * 3
save_checkpoint_freq = val_freq

print(
    "===CALCULATED SETTINGS===",
    f'{train_bs=} {val_bs=}',
    f'{val_freq=} {lr_decay_steps=}',
    f'{print_freq=} {save_checkpoint_freq=}',
    sep="\n",
)

===CALCULATED SETTINGS===
train_bs=64 val_bs=32
val_freq=300 lr_decay_steps=[5600, 11200, 15680, 20160]
print_freq=100 save_checkpoint_freq=300


## Experiment settings

In [10]:
# Make the DL-Art-School checkout the working directory; the settings cell below
# edits ./experiments/EXAMPLE_gpt.yml relative to it.
%cd  ../../src/DL-Art-School

/home/caytu/Wolof-TTS/src/DL-Art-School


In [11]:
# Form inputs for the experiment. The code further down in this cell substitutes
# these values into experiments/EXAMPLE_gpt.yml.
#@markdown ##_Settings for normal users:_
Experiment_Name         = "Tortoise_wolof" #@param {type:"string"}
Dataset_Training_Name   = "AntaTrain" #@param {type:"string"}
# NOTE(review): the "@param" marker on the next line does not directly follow "#",
# so it may not be rendered as a form field — confirm in Colab.
ValidationDataset_Name  = "AntaValidation" # this seems to be useless??? @param {type:"string"}
SaveTrainingStates      = False # @param {type:"boolean"}
Keep_Last_N_Checkpoints = 0 #@param {type:"slider", min:0, max:10, step:1}
#@markdown * **NOTE**: 0 means "keep all models saved", which could potentially cause out-of-storage issues.
#@markdown * Without training states, each model "only" takes up ~1.6GB. You should have ~50GB of free space to begin with.
#@markdown * With training states, each model (pth+state) takes up ~4.9 GB; Colab will crash around ~10 undeleted checkpoints in this case.

#@markdown ##_Other training parameters_
Fp16    = False #@param {type:"boolean"}
Use8bit = True #@param {type:"boolean"}
#@markdown * **NOTE**: for some reason, fp16 does not seem to improve vram use when combined with 8bit [citation needed]. To be verified later...
# Kept as a string: it is pasted verbatim into the yml and compared against "1e-5" below.
TrainingRate   = "1e-5" #@param {type:"string"}
TortoiseCompat = True #@param {type:"boolean"}

#@markdown * **NOTE**: TortoiseCompat introduces some breaking changes to the training process. **If you want to reproduce older models**, disable this checkbox.

#@markdown ##_Calculated settings_ override
#@markdown #####Blank entries rely on the calculated defaults from the cell above.
#@markdown ######**Leave them blank unless you want to adjust them manually**
# Overrides are entered as strings; an empty string means "use the calculated value".
TrainBS            = "" #@param {type:"string"}
ValBS              = "" #@param {type:"string"}
ValFreq            = "" #@param {type:"string"}
LRDecaySteps       = "" #@param {type:"string"}
PrintFreq          = "" #@param {type:"string"}
SaveCheckpointFreq = "" #@param {type:"string"}

def take(orig, override):
    """Return `override` converted to the type of `orig`, or `orig` when blank.

    Args:
        orig: The calculated default value; its type decides the conversion.
        override: The user-supplied override, normally a string from a form field.

    Returns:
        `orig` when `override` is an empty or whitespace-only string,
        otherwise `type(orig)(override)`.

    Raises:
        ValueError: If a non-blank override cannot be converted to `type(orig)`.
    """
    # A field containing only spaces counts as blank instead of failing in int(" ").
    if isinstance(override, str) and not override.strip():
        return orig
    return type(orig)(override)

train_bs             = take(train_bs, TrainBS)
val_bs               = take(val_bs, ValBS)
val_freq             = take(val_freq, ValFreq)
lr_decay_steps       = eval(LRDecaySteps) if LRDecaySteps else lr_decay_steps
print_freq           = take(print_freq, PrintFreq)
save_checkpoint_freq = take(save_checkpoint_freq, SaveCheckpointFreq)
assert len(lr_decay_steps) == 4 
gen_lr_steps = ', '.join(str(v) for v in lr_decay_steps)

#@markdown #Run this cell after you finish editing the settings.

!wget https://raw.githubusercontent.com/152334H/DL-Art-School/master/experiments/EXAMPLE_gpt.yml -O experiments/EXAMPLE_gpt.yml

#@markdown This will apply the settings defined above to a fresh yml config file.
import os
!sed -i 's/batch_size: 128/batch_size: '"$train_bs"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/batch_size: 64/batch_size: '"$val_bs"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/val_freq: 500/val_freq: '"$val_freq"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/500, 1000, 1400, 1800/'"$gen_lr_steps"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/print_freq: 100/print_freq: '"$print_freq"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/save_checkpoint_freq: 500/save_checkpoint_freq: '"$save_checkpoint_freq"'/g' ./experiments/EXAMPLE_gpt.yml

!sed -i 's+CHANGEME_validation_dataset_name+'"$ValidationDataset_Name"'+g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's+CHANGEME_path_to_validation_dataset+'"$ValidationDataset_Training_Path"'+g' ./experiments/EXAMPLE_gpt.yml

if(Fp16==True):
    os.system("sed -i 's+fp16: false+fp16: true+g' ./experiments/EXAMPLE_gpt.yml")
!sed -i 's/use_8bit: true/use_8bit: '"$Use8bit"'/g' ./experiments/EXAMPLE_gpt.yml

!sed -i 's/disable_state_saving: true/disable_state_saving: '"$SaveTrainingStates"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/tortoise_compat: True/tortoise_compat: '"$TortoiseCompat"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/number_of_checkpoints_to_save: 0/number_of_checkpoints_to_save: '"$Keep_Last_N_Checkpoints"'/g' ./experiments/EXAMPLE_gpt.yml


!sed -i 's/CHANGEME_training_dataset_name/'"$Dataset_Training_Name"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/CHANGEME_your_experiment_name/'"$Experiment_Name"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's+CHANGEME_path_to_training_dataset+'"$Dataset_Training_Path"'+g' ./experiments/EXAMPLE_gpt.yml


if (not TrainingRate=="1e-5"):
    os.system("sed -i 's+!!float 1e-5 # CHANGEME:+!!float '" + TrainingRate + "' #+g' ./experiments/EXAMPLE_gpt.yml")

--2024-11-12 13:43:35--  https://raw.githubusercontent.com/152334H/DL-Art-School/master/experiments/EXAMPLE_gpt.yml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6489 (6.3K) [text/plain]
Saving to: ‘experiments/EXAMPLE_gpt.yml’


2024-11-12 13:43:37 (72.1 MB/s) - ‘experiments/EXAMPLE_gpt.yml’ saved [6489/6489]



## Training

Press the stop button for this cell when you are satisfied with the results, and have seen:

`INFO:base:Saving models and training states.`

If your training run saves many models, you might exceed the storage limits of the Colab runtime. To prevent this, delete old checkpoints in `DL-Art-School/experiments/$Experiment_Name/(models|training_state)/` via the file explorer panel while the training runs. **Resuming training after a crash requires config editing,** so try not to let that happen.

TODO: implement code to automatically prune useless checkpoints later && restore training states

In [25]:
# train.py is launched from codes/ in the next cell.
# The path is relative: run this cell once, from the DL-Art-School directory.
%cd codes/

/home/caytu/Wolof-TTS/src/DL-Art-School/codes


In [32]:
# Start training with the generated config; the -opt path is relative to codes/.
!python3 train.py -opt ../experiments/EXAMPLE_gpt.yml

  from torch.distributed.optim import ZeroRedundancyOptimizer
Disabled distributed training.
Path already exists. Rename it to [/home/caytu/Wolof-TTS/src/DL-Art-School/experiments/Tortoise_wolof_archived_241029-125031]
24-10-29 12:50:31.573 - INFO:   name: Tortoise_wolof
  model: extensibletrainer
  scale: 1
  gpu_ids: [0]
  start_step: 0
  checkpointing_enabled: True
  fp16: False
  use_8bit: True
  wandb: False
  use_tb_logger: True
  datasets:[
    train:[
      name: AntaTrain
      n_workers: 8
      batch_size: 32
      mode: paired_voice_audio
      path: ../../../data/train.txt
      fetcher_mode: ['lj']
      phase: train
      max_wav_length: 255995
      max_text_length: 200
      sample_rate: 22050
      load_conditioning: True
      num_conditioning_candidates: 2
      conditioning_length: 44000
      use_bpe_tokenizer: True
      load_aligned_codes: False
      data_type: img
    ]
    val:[
      name: AntaValidation
      n_workers: 1
      batch_size: 32
      mode: pa