# Initialisation and Dependencies

In [1]:
# Put the local library directory at the front of the import search path.
# Presumably this is where the project-local `utils` package imported below lives — confirm.
import sys
# NOTE(review): absolute, machine-specific path; it must exist on the host running the notebook.
lib_path = '/home/jovyan/libs'
# Index 0 means modules in lib_path take precedence over same-named installed ones.
sys.path.insert(0, lib_path)

In [2]:
%reload_ext autoreload
%autoreload 2

import gc, math, traceback, datetime

import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from datasets import load_from_disk

import whisper
from whisper.tokenizer import get_tokenizer

from utils import audio, gradient, gpu
from utils.attacks import PrepareFront, PrepareAtPosition

# GPU RAM Tracking

In [3]:
# Pick the compute device through the project's `gpu` helper; the model is moved
# onto it when loaded below. The recorded run printed "Device: cuda".
device = gpu.get_device()

Device: cuda


# Load Model

In [4]:
# Release the reference to a model left over from an earlier run of this
# notebook before a new one is loaded. On a fresh kernel `model` does not exist,
# which is the only failure we expect and tolerate here.
try:
    del model
    print("Model deleted!")
except NameError:
    # Nothing to delete: `model` has not been defined in this kernel yet.
    pass

In [5]:
# Name of the Whisper checkpoint to load.
MODEL_NAME = "tiny.en"

# Load the checkpoint, move it onto the selected device, and switch to
# inference mode. The last expression is left bare so the cell displays the
# module structure.
model = whisper.load_model(MODEL_NAME)
model = model.to(device)
model.eval()

  checkpoint = torch.load(fp, map_location=device)


Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-3): 4 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=384, out_features=384, bias=True)
          (key): Linear(in_features=384, out_features=384, bias=False)
          (value): Linear(in_features=384, out_features=384, bias=True)
          (out): Linear(in_features=384, out_features=384, bias=True)
        )
        (attn_ln): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1536, out_features=384, bias=True)
        )
        (mlp_ln): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((384,), eps=1e-05,

# Load Data

In [6]:
# Root directory of the on-disk TED-LIUM splits, relative to this notebook.
tedlium_path = "../tedlium"

# One saved dataset per split. The train/validation files carry an `_idx`
# suffix while the test file does not.
train_path = f"{tedlium_path}/train_idx.hf"
validation_path = f"{tedlium_path}/validation_idx.hf"
test_path = f"{tedlium_path}/test.hf"

In [7]:
# Number of examples kept from each split after shuffling.
TRAIN_SELECT = 500
VALID_SELECT = 150
TEST_SELECT = 250

# Shuffle seed shared by all three splits so the chosen subsets are repeatable.
SEED = 1


def _load_subset(path, count):
    """Load a saved dataset, shuffle it with SEED and keep the first `count` rows.

    The dataset is switched to torch formatting before shuffling and selecting,
    in the same order as the three inline chains this helper replaces.
    """
    dataset = load_from_disk(path).with_format("torch")
    return dataset.shuffle(seed=SEED).select(range(count))


tedlium_train = _load_subset(train_path, TRAIN_SELECT)
tedlium_validation = _load_subset(validation_path, VALID_SELECT)
tedlium_test = _load_subset(test_path, TEST_SELECT)

Loading dataset from disk:   0%|          | 0/109 [00:00<?, ?it/s]

In [8]:
# def collate(ls):
#     pad_to = max(list(map(lambda x: x["audio"].shape[0], ls)))
#     return torch.cat(list(map(lambda x: F.pad(x["audio"], (0, pad_to - x["audio"].shape[0])).unsqueeze(0).to(torch.bfloat16), ls)), dim=0)

def collate_idx(ls):
    """Collate a batch into ``(audio, idx)`` using only its first sample.

    Args:
        ls: Non-empty list of dataset rows. Each row must provide an
            ``"audio"`` tensor and an ``"idx"`` tensor holding a single value.

    Returns:
        A tuple ``(audio, idx)``: the first row's audio with a leading batch
        dimension added, and that row's ``idx`` converted to a Python number.

    Warns:
        RuntimeWarning: If ``ls`` holds more than one row. Only the first row
            is used, so a batch size above 1 would otherwise drop samples
            without any indication.
    """
    # Local import keeps this cell self-contained regardless of the import cell.
    import warnings

    if len(ls) > 1:
        warnings.warn(
            f"collate_idx only uses the first of {len(ls)} samples in the batch; "
            "the remaining samples are dropped. Use a batch size of 1.",
            RuntimeWarning,
        )

    first = ls[0]
    return first["audio"].unsqueeze(0), first["idx"].item()

# Batch sizes for the training and validation loaders. collate_idx only returns
# the first sample of whatever batch it receives, so values above 1 would
# discard data.
TRAIN_BATCH_SIZE = 1 # highly recommended to be 1
VALID_BATCH_SIZE = 1

# Train/validation loaders yield (audio, idx) pairs via collate_idx.
train_dataset = DataLoader(tedlium_train, batch_size=TRAIN_BATCH_SIZE, collate_fn=collate_idx)
validation_dataset = DataLoader(tedlium_validation, batch_size=VALID_BATCH_SIZE, collate_fn=collate_idx)
# The test loader is built with DataLoader defaults (batch_size=1, default
# collation), so it yields collated rows rather than (audio, idx) pairs.
test_dataset = DataLoader(tedlium_test)

# Training Loop

In [9]:
# Free what we can before training: collect unreachable Python objects first,
# then ask PyTorch to release its cached, unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()
# Report current CUDA usage via the project helper (the recorded run printed a value in GB).
gpu.print_cuda_usage()

0.14174842834472656 GB


In [10]:
# Settings for the training run; each is forwarded to gradient.train /
# gradient.evaluate in the cells below.
LR = 1e-3  # initial learning rate (passed as `lr`)
PATIENCE = 4  # passed as `patience`; presumably the early-stopping budget — confirm in gradient.train
MIN_LIMIT = 45  # passed as `mins_limit`; logged by the run as "Time Limit (Mins)"
ITER_LIMIT = 30  # passed as `iter_limit`; logged by the run as "Epochs Limit"
CLAMP_EP = 0.005  # passed as `clamp_epsilon`; logged by the run as "Clamp"
SNIPPET_SIZE = (1, 4_800)  # shape of the snippet tensor being trained
POSITION = 0  # position handed to PrepareAtPosition and to gradient.evaluate
# Strategy object that combines the snippet with an audio example.
PREPARE_METHOD = PrepareAtPosition(SNIPPET_SIZE, POSITION)

# Placeholder so `writer` is always defined; the next cell replaces it with a
# SummaryWriter. NOTE(review): presumably gradient.train accepts None to skip logging — confirm.
writer = None

In [11]:
# tensorboard writer
timestamp = datetime.datetime.now().strftime(f'%Y%m%d-%H%M%S_size_{SNIPPET_SIZE}_{PREPARE_METHOD.name}')
writer = SummaryWriter(log_dir=f"../runs/size_tests/{timestamp}", max_queue=5)

In [16]:
# Run the training routine. It returns four values: the best snippet, the
# collection of snippets it produced, and two further values that this notebook
# does not use. Success tracking is switched off for both splits by passing None.
best_snippet, snippets, _, _ = gradient.train(
    model,
    gradient.forward,
    train_dataset,
    validation_dataset,
    PREPARE_METHOD,
    writer,
    lr=LR,
    train_success=None,
    valid_success=None,
    iter_limit=ITER_LIMIT,
    mins_limit=MIN_LIMIT,
    patience=PATIENCE,
    clamp_epsilon=CLAMP_EP,
)

Prepare method: prepare_at_position
Snippet initialised to [2.2640283532382455e-06, 0.004999119322746992] of size (1, 4800)
Clamp: 0.005
Time Limit (Mins): 45
Epochs Limit: 30
Tracking training success: False
Tracking valid success: False


Training:   0% 0/1 [01:57<?, ?it/s, Iter 2, Training Batch 2/500]    

Trng Avg Loss: 7.245643588542938 | Valid Avg Loss: 7.174496650695801 | Patience: 4 | LR: [0.001] | Epoch Limit: 29


Training:   0% 0/1 [03:10<?, ?it/s, Iter 2, Validation Batch 150/150]

Trng Avg Loss: 6.703906090259552 | Valid Avg Loss: 6.892661094665527 | Patience: 4 | LR: [0.001] | Epoch Limit: 28


Training:   0% 0/1 [04:24<?, ?it/s, Iter 3, Validation Batch 150/150]

Trng Avg Loss: 6.638621542453766 | Valid Avg Loss: 6.827492713928223 | Patience: 4 | LR: [0.001] | Epoch Limit: 27


Training:   0% 0/1 [05:36<?, ?it/s, Iter 4, Validation Batch 150/150]

Trng Avg Loss: 6.574115832328796 | Valid Avg Loss: 6.704929351806641 | Patience: 4 | LR: [0.001] | Epoch Limit: 26


Training:   0% 0/1 [06:47<?, ?it/s, Iter 5, Validation Batch 150/150]

Trng Avg Loss: 6.577115032672882 | Valid Avg Loss: 6.965965270996094 | Patience: 3 | LR: [0.001] | Epoch Limit: 25


Training:   0% 0/1 [08:00<?, ?it/s, Iter 6, Validation Batch 150/150]

Trng Avg Loss: 6.149748334407806 | Valid Avg Loss: 6.530749797821045 | Patience: 4 | LR: [0.0005] | Epoch Limit: 24


Training:   0% 0/1 [09:13<?, ?it/s, Iter 8, Training Batch 1/500]    

Trng Avg Loss: 5.9912870688438415 | Valid Avg Loss: 6.491474628448486 | Patience: 4 | LR: [0.0005] | Epoch Limit: 23


Training:   0% 0/1 [10:25<?, ?it/s, Iter 8, Validation Batch 150/150]

Trng Avg Loss: 5.732128712177277 | Valid Avg Loss: 6.747194766998291 | Patience: 3 | LR: [0.0005] | Epoch Limit: 22


Training:   0% 0/1 [11:33<?, ?it/s, Iter 10, Training Batch 1/500]   

Trng Avg Loss: 5.641536406517028 | Valid Avg Loss: 6.556363105773926 | Patience: 2 | LR: [0.0005] | Epoch Limit: 21


Training:   0% 0/1 [12:40<?, ?it/s, Iter 10, Validation Batch 150/150]

Trng Avg Loss: 5.312358336687088 | Valid Avg Loss: 6.294861793518066 | Patience: 4 | LR: [0.0005] | Epoch Limit: 20


Training:   0% 0/1 [13:48<?, ?it/s, Iter 11, Validation Batch 150/150]

Trng Avg Loss: 4.342826099634171 | Valid Avg Loss: 5.495185852050781 | Patience: 4 | LR: [0.00025] | Epoch Limit: 19


Training:   0% 0/1 [14:57<?, ?it/s, Iter 12, Validation Batch 150/150]

Trng Avg Loss: 3.8484785668849946 | Valid Avg Loss: 5.277195453643799 | Patience: 4 | LR: [0.00025] | Epoch Limit: 18


Training:   0% 0/1 [16:07<?, ?it/s, Iter 13, Validation Batch 150/150]

Trng Avg Loss: 3.6438039082884788 | Valid Avg Loss: 5.5649189949035645 | Patience: 3 | LR: [0.00025] | Epoch Limit: 17


Training:   0% 0/1 [17:17<?, ?it/s, Iter 14, Validation Batch 150/150]

Trng Avg Loss: 1.3283408657684923 | Valid Avg Loss: 1.8384003639221191 | Patience: 4 | LR: [0.00025] | Epoch Limit: 16


Training:   0% 0/1 [18:26<?, ?it/s, Iter 15, Validation Batch 150/150]

Trng Avg Loss: 0.5405238292999566 | Valid Avg Loss: 0.8207846879959106 | Patience: 4 | LR: [0.00025] | Epoch Limit: 15


Training:   0% 0/1 [19:37<?, ?it/s, Iter 16, Validation Batch 150/150]

Trng Avg Loss: 0.2800968388337642 | Valid Avg Loss: 0.5816791653633118 | Patience: 4 | LR: [0.000125] | Epoch Limit: 14


Training:   0% 0/1 [20:46<?, ?it/s, Iter 17, Validation Batch 150/150]

Trng Avg Loss: 0.36545687531121074 | Valid Avg Loss: 0.8003106117248535 | Patience: 3 | LR: [0.000125] | Epoch Limit: 13


Training:   0% 0/1 [21:59<?, ?it/s, Iter 18, Validation Batch 150/150]

Trng Avg Loss: 0.27933081186003983 | Valid Avg Loss: 0.6341371536254883 | Patience: 2 | LR: [0.000125] | Epoch Limit: 12


Training:   0% 0/1 [23:07<?, ?it/s, Iter 19, Validation Batch 150/150]

Trng Avg Loss: 0.18440822417847813 | Valid Avg Loss: 0.4914575219154358 | Patience: 4 | LR: [0.000125] | Epoch Limit: 11


Training:   0% 0/1 [24:14<?, ?it/s, Iter 20, Validation Batch 150/150]

Trng Avg Loss: 0.2934424491226673 | Valid Avg Loss: 0.5583539605140686 | Patience: 3 | LR: [0.000125] | Epoch Limit: 10


Training:   0% 0/1 [25:22<?, ?it/s, Iter 21, Validation Batch 150/150]

Trng Avg Loss: 0.12869979793857783 | Valid Avg Loss: 0.46208587288856506 | Patience: 4 | LR: [6.25e-05] | Epoch Limit: 9


Training:   0% 0/1 [26:30<?, ?it/s, Iter 23, Training Batch 1/500]    

Trng Avg Loss: 0.11279250544402748 | Valid Avg Loss: 0.4517457187175751 | Patience: 4 | LR: [6.25e-05] | Epoch Limit: 8


Training:   0% 0/1 [27:38<?, ?it/s, Iter 24, Training Batch 1/500]    

Trng Avg Loss: 0.10427129467250779 | Valid Avg Loss: 0.4013691246509552 | Patience: 4 | LR: [6.25e-05] | Epoch Limit: 7


Training:   0% 0/1 [28:45<?, ?it/s, Iter 24, Validation Batch 150/150]

Trng Avg Loss: 0.10151601471798495 | Valid Avg Loss: 0.417500376701355 | Patience: 3 | LR: [6.25e-05] | Epoch Limit: 6


Training:   0% 0/1 [29:54<?, ?it/s, Iter 25, Validation Batch 150/150]

Trng Avg Loss: 0.17617867982480676 | Valid Avg Loss: 0.46424469351768494 | Patience: 2 | LR: [6.25e-05] | Epoch Limit: 5


Training:   0% 0/1 [31:02<?, ?it/s, Iter 26, Validation Batch 150/150]

Trng Avg Loss: 0.09682173435296863 | Valid Avg Loss: 0.3684993386268616 | Patience: 4 | LR: [3.125e-05] | Epoch Limit: 4


Training:   0% 0/1 [32:11<?, ?it/s, Iter 27, Validation Batch 150/150]

Trng Avg Loss: 0.08218016158649698 | Valid Avg Loss: 0.34555333852767944 | Patience: 4 | LR: [3.125e-05] | Epoch Limit: 3


Training:   0% 0/1 [33:17<?, ?it/s, Iter 28, Validation Batch 150/150]

Trng Avg Loss: 0.07845211572572588 | Valid Avg Loss: 0.3483119606971741 | Patience: 3 | LR: [3.125e-05] | Epoch Limit: 2


Training:   0% 0/1 [34:26<?, ?it/s, Iter 29, Validation Batch 150/150]

Trng Avg Loss: 0.07358976627187803 | Valid Avg Loss: 0.32703179121017456 | Patience: 4 | LR: [3.125e-05] | Epoch Limit: 1


Training:   0% 0/1 [34:26<?, ?it/s, Epoch limit reached! Terminating...]


Cleared buffer
Cleared loss


In [None]:
# Visualise the trained snippet with the project helper. The tensor is detached
# from autograd, moved to the CPU and stripped of singleton dimensions first.
audio.view_mel(best_snippet.detach().to("cpu").squeeze())

# Evaluation

In [17]:
# Evaluate the trained snippet on the test loader.
# NOTE: this call is live (not commented out), so inference over the whole test
# loader runs every time the cell is executed, including on "Run All".
gradient.evaluate(model, best_snippet, PREPARE_METHOD, test_dataset, CLAMP_EP, POSITION)

Clamp: 0.005
Prepare Method: prepare_at_position
Snippet Size: (1, 4800)
Position: 0


Inference: 100%|██████████| 250/250 [00:43<00:00,  5.81it/s, Valid Examples: 194 | Empty Sequences: 73 | Total SL: 15560 | Non_empty ASL: 128.59504132231405]



Total valid examples: 194
Success rate (Empty): 0.37628865979381443
Success rate (ASL): 80.20618556701031 (attacked) out of 122.16494845360825 (original)





In [18]:
# Baseline: an untrained snippet of the same shape with values drawn uniformly
# from [0, CLAMP_EP). A dedicated generator seeded with SEED makes the baseline
# identical on every run without touching the global RNG state.
random_snippet = torch.rand(SNIPPET_SIZE, generator=torch.Generator().manual_seed(SEED)) * CLAMP_EP
# NOTE: this call is live (not commented out), so inference over the whole test
# loader runs every time the cell is executed.
gradient.evaluate(model, random_snippet, PREPARE_METHOD, test_dataset, CLAMP_EP, POSITION)

Clamp: 0.005
Prepare Method: prepare_at_position
Snippet Size: (1, 4800)
Position: 0


Inference: 100%|██████████| 250/250 [00:43<00:00,  5.74it/s, Valid Examples: 194 | Empty Sequences: 0 | Total SL: 23716 | Non_empty ASL: 122.24742268041237]



Total valid examples: 194
Success rate (Empty): 0.0
Success rate (ASL): 122.24742268041237 (attacked) out of 122.16494845360825 (original)





# Save Tensors

In [None]:
# Gather every snippet returned by training plus the best one into a single CPU
# tensor, with the best snippet last.
# NOTE: this rebinds `snippets` to the stacked tensor, so running the cell a
# second time operates on that tensor instead of the original collection.
snippets = torch.stack([*(snippet.cpu() for snippet in snippets), best_snippet.cpu()])
snippets.shape

In [None]:
# torch.save(snippets.squeeze(), "snippets.pt")

In [None]:
# torch.save(torch.stack(list(map(torch.tensor, train_success.values()))), "train_success.pt")
# torch.save(torch.tensor(list(train_success.keys())), "train_ids.pt")

In [None]:
# torch.save(torch.stack(list(map(torch.tensor, valid_success.values()))), "valid_success.pt")
# torch.save(torch.tensor(list(valid_success.keys())), "valid_ids.pt")

# Save and Hear Snippet

In [None]:
def normalise(random_snippet, ep):
    """Rescale a tensor of values in [0, 1) to the symmetric range [-ep, ep).

    Args:
        random_snippet: Tensor assumed to come from ``torch.rand`` (values in [0, 1)).
        ep: Half-width of the target range.

    Returns:
        The rescaled tensor. Its minimum and maximum are printed as a sanity check.
    """
    # Scale to [0, ep), then stretch to [0, 2*ep) and shift down by ep.
    scaled = random_snippet * ep
    res = scaled * 2 - ep
    print(f"Normalised, Min {torch.min(res)}, Max {torch.max(res)}")
    return res

In [None]:
# Save snippet to wav file
# save_audio(snippet, f"./snippets/clamp_{CLAMP_EP}_{PREPARE_METHOD.name}_snippet_only.wav")

In [None]:
# save_audio(PREPARE_METHOD(snippet.to("cpu"), tedlium_test[2]["audio"].unsqueeze(0)), f"./snippets/clamp_{CLAMP_EP}_{PREPARE_METHOD.name}_combined.wav")