In [1]:
!nvidia-smi

# If this doesn't work, there's no GPU available or detected

Fri Feb 24 00:38:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10          On   | 00000000:06:00.0 Off |                    0 |
|  0%   33C    P8    20W / 150W |      0MiB / 23028MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install tensorboardX
!pip install audiolm-pytorch

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorboardX
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting audiolm-pytorch
  Downloading audiolm_pytorch-0.16.1-py3-none-any.whl (30 kB)
Collecting einops>=0.6
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hColl

In [8]:
# single cell notebook to simulate a python script, so we can just train SoundStream by running the script

# requires: audiolm-pytorch, tensorboardX

# raise AssertionError("Make sure to set num train steps and how frequently to save model. right now it's very low just to test in google colab")

import os
import urllib.request
import tarfile
from audiolm_pytorch import SoundStream, SoundStreamTrainer, HubertWithKmeans, SemanticTransformer, SemanticTransformerTrainer, HubertWithKmeans, CoarseTransformer, CoarseTransformerWrapper, CoarseTransformerTrainer, FineTransformer, FineTransformerWrapper, FineTransformerTrainer, AudioLM
from torch import nn
from torch.profiler import profile, record_function, ProfilerActivity
import torch
import torchaudio
import datetime
import shutil

# small clean dataset
filename = "dev-clean"
filename_targz = filename + ".tar.gz"
url = f"https://us.openslr.org/resources/12/{filename_targz}"
if not os.path.isfile(filename_targz):
  urllib.request.urlretrieve(url, filename_targz)
if not os.path.isdir(filename):
  # open file
  with tarfile.open(filename_targz) as t:
    t.extractall(filename)
dataset_folder = filename # update dataset_folder so we use the right dataset

# remove old results files if they're around
if os.path.isdir("results"):
  shutil.rmtree("results")

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
  with record_function("data_load"):
    soundstream = SoundStream(
        codebook_size = 1024,
        rq_num_quantizers = 12,
        attn_window_size = 128,       # local attention receptive field at bottleneck
        attn_depth = 2                # 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better
    )
    
    before_trainer_load = datetime.datetime.now()
    trainer = SoundStreamTrainer(
        soundstream,
        folder = dataset_folder,
        batch_size = 2,
        grad_accum_every = 8,
        data_max_length_seconds = 1,
        num_train_steps = 9,
        save_results_every = 4,
        save_model_every = 4,
    ).cuda()
    before_train = datetime.datetime.now()
    with record_function("train"):
      trainer.train()

      data_load_time = {(before_train - before_trainer_load).total_seconds()}
      train_time = {(datetime.datetime.now() - before_train).total_seconds()}
      print(f"data_load_time: {data_load_time}, train_time: {train_time}")

training with dataset of 2567 samples and validating with randomly splitted 136 samples


do you want to clear previous experiment checkpoints and results? (y/n)  y


0: soundstream total loss: 1028670.922, soundstream recon loss: 1.146 | discr (scale 1) loss: 2.003 | discr (scale 0.5) loss: 2.001 | discr (scale 0.25) loss: 2.000
0: saving to results
0: saving model to results
1: soundstream total loss: 982905.281, soundstream recon loss: 1.053 | discr (scale 1) loss: 1.982 | discr (scale 0.5) loss: 1.993 | discr (scale 0.25) loss: 1.996
2: soundstream total loss: 820282.883, soundstream recon loss: 0.813 | discr (scale 1) loss: 1.951 | discr (scale 0.5) loss: 1.985 | discr (scale 0.25) loss: 1.981
3: soundstream total loss: 280241.453, soundstream recon loss: 0.341 | discr (scale 1) loss: 1.931 | discr (scale 0.5) loss: 1.983 | discr (scale 0.25) loss: 1.975
4: soundstream total loss: 114009.168, soundstream recon loss: 0.154 | discr (scale 1) loss: 1.940 | discr (scale 0.5) loss: 1.990 | discr (scale 0.25) loss: 1.992
4: saving to results
4: saving model to results
5: soundstream total loss: 61593.055, soundstream recon loss: 0.072 | discr (scale 

In [11]:
# print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
# prof.key_averages().table(row_limit=10)

'-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  \n-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                                              data_load         4.81%        2.266s        71.49%       33.681s       33.681s       0.000us         0.00%        5.161s        5.161s             1  \n                                                  train        40.79%       19.217s        66.02%       31.104s       31.104s       0.000us         0.00%        5.125s        5.125s             1  \naut

In [7]:
??ProfilerActivity.CUDA

[0;31mType:[0m           ProfilerActivity
[0;31mString form:[0m    ProfilerActivity.CUDA
[0;31mDocstring:[0m     
Members:

CPU

CUDA
[0;31mInit docstring:[0m __init__(self: torch._C._profiler.ProfilerActivity, value: int) -> None
