# Initialisation and Dependencies

In [1]:
import sys
# Put the shared library directory at the front of the import search path so
# modules found there take precedence over same-named modules further down.
# NOTE(review): absolute path tied to one environment — confirm before reuse elsewhere.
lib_path = '/home/jovyan/libs'
sys.path.insert(0, lib_path)

In [26]:
%reload_ext autoreload
%autoreload 2

import gc, math, traceback, datetime

import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from datasets import load_from_disk

import whisper
from whisper.tokenizer import get_tokenizer

from utils import audio, gradient, gpu
from utils.attacks import PrepareFront, PrepareAtPosition

# GPU RAM Tracking

In [8]:
# Pick the compute device through the project helper; the recorded run printed "Device: cuda".
device = gpu.get_device()

Device: cuda


# Load Model

In [9]:
# Drop any model left over from an earlier run of this notebook so that its
# memory can be reclaimed before a new model is loaded.
try:
    del model
    print("Model deleted!")
except NameError:
    # Fresh kernel: `model` has not been created yet, so there is nothing to
    # delete. Only NameError is ignored; anything else (e.g. an interrupt)
    # still propagates instead of being silently swallowed.
    pass

In [10]:
# Name of the Whisper checkpoint used throughout this notebook.
MODEL_NAME = "tiny.en"

# Load the checkpoint, move it to the selected device, and switch it to
# evaluation mode.
model = whisper.load_model(MODEL_NAME)
model = model.to(device)
model.eval();  # trailing semicolon stops the notebook echoing the module repr

  checkpoint = torch.load(fp, map_location=device)


# Load Data

In [11]:
# Root directory of the saved TED-LIUM splits, relative to this notebook.
tedlium_path = "../tedlium"

# One saved dataset directory per split.
train_path = f"{tedlium_path}/train_idx.hf"
validation_path = f"{tedlium_path}/validation_idx.hf"
test_path = f"{tedlium_path}/test.hf"

In [12]:
# Number of examples kept from each split after shuffling.
TRAIN_SELECT = 500
VALID_SELECT = 150
TEST_SELECT = 250

# Shuffle seed shared by all three splits so the selected subsets are repeatable.
SEED = 1


def _load_subset(path, count):
    """Load a saved dataset, shuffle it with ``SEED`` and keep the first ``count`` rows.

    The dataset is switched to the "torch" format before shuffling, exactly as
    each split was loaded individually before.
    """
    shuffled = load_from_disk(path).with_format("torch").shuffle(seed=SEED)
    return shuffled.select(range(count))


tedlium_train = _load_subset(train_path, TRAIN_SELECT)
tedlium_validation = _load_subset(validation_path, VALID_SELECT)
tedlium_test = _load_subset(test_path, TEST_SELECT)

Loading dataset from disk:   0%|          | 0/109 [00:00<?, ?it/s]

In [13]:
# def collate(ls):
#     pad_to = max(list(map(lambda x: x["audio"].shape[0], ls)))
#     return torch.cat(list(map(lambda x: F.pad(x["audio"], (0, pad_to - x["audio"].shape[0])).unsqueeze(0).to(torch.bfloat16), ls)), dim=0)

def collate_idx(ls):
    """Collate a DataLoader batch into an ``(audio, idx)`` pair for one example.

    Only the first example in ``ls`` is used. If the batch holds more than one
    example, the rest are dropped; a ``RuntimeWarning`` is emitted so that the
    loss of data is visible rather than silent.

    Args:
        ls: List of dataset examples. Each example is indexed with ``"audio"``
            (a tensor) and ``"idx"`` (a single-element tensor).

    Returns:
        A tuple ``(audio, idx)`` where ``audio`` is the first example's audio
        with a new leading dimension of size 1, and ``idx`` is the first
        example's index converted to a plain Python number.
    """
    import warnings  # local import keeps this cell independent of the import cell

    if len(ls) > 1:
        warnings.warn(
            f"collate_idx received {len(ls)} examples but only uses the first; "
            "use batch_size=1 to avoid silently dropping data.",
            RuntimeWarning,
            stacklevel=2,
        )

    first = ls[0]
    return first["audio"].unsqueeze(0), first["idx"].item()

# Batch sizes for the training and validation loaders. collate_idx only
# returns the first example of each batch, hence the recommendation below.
TRAIN_BATCH_SIZE = 1 # highly recommended to be 1
VALID_BATCH_SIZE = 1

train_dataset = DataLoader(
    tedlium_train,
    batch_size=TRAIN_BATCH_SIZE,
    collate_fn=collate_idx,
)
validation_dataset = DataLoader(
    tedlium_validation,
    batch_size=VALID_BATCH_SIZE,
    collate_fn=collate_idx,
)
# The test loader uses the DataLoader defaults (no custom collate function).
test_dataset = DataLoader(tedlium_test)

# Training Loop

In [15]:
# Collect unreachable Python objects first, then release PyTorch's cached CUDA
# memory, and finally report usage through the project helper (the recorded
# run printed a figure in GB).
gc.collect()
torch.cuda.empty_cache()
gpu.print_cuda_usage()

0.14174842834472656 GB


In [16]:
# Training configuration. Each value is forwarded to gradient.train in the
# training cell under the keyword noted beside it.
LR = 1e-3  # lr
PATIENCE = 5  # patience
MIN_LIMIT = 45  # mins_limit; logged by the run as "Time Limit (Mins)"
ITER_LIMIT = 30  # iter_limit; logged by the run as "Epochs Limit"
CLAMP_EP = 0.005  # clamp_epsilon; logged by the run as "Clamp"
SNIPPET_SIZE = (1, 6_800)  # snippet shape handed to the prepare method
POSITION = 16_000  # presumably an offset in audio samples — TODO confirm against PrepareAtPosition
PREPARE_METHOD = PrepareAtPosition(SNIPPET_SIZE, POSITION)

# Placeholder; the following cell replaces it with a TensorBoard SummaryWriter.
# NOTE(review): presumably gradient.train tolerates writer=None — confirm.
writer = None

In [19]:
# tensorboard writer
# Format the wall-clock part on its own so that strftime only ever sees real
# format directives; the run label is appended afterwards. Interpolating the
# label into the strftime pattern would misread any '%' it contained.
run_started = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
timestamp = f"{run_started}_size_{SNIPPET_SIZE}_{PREPARE_METHOD.name}"

# Each run logs to its own directory under ../runs/size_tests.
writer = SummaryWriter(log_dir=f"../runs/size_tests/{timestamp}", max_queue=5)

In [27]:
# Run the snippet optimisation with the configuration defined above. Success
# tracking is switched off (both trackers are None) and the last two return
# values are discarded.
best_snippet, snippets, _, _ = gradient.train(
    model,
    train_dataset,
    validation_dataset,
    PREPARE_METHOD,
    writer,
    lr=LR,
    train_success=None,
    valid_success=None,
    iter_limit=ITER_LIMIT,
    mins_limit=MIN_LIMIT,
    patience=PATIENCE,
    clamp_epsilon=CLAMP_EP,
)

Prepare method: prepare_at_position
Snippet initialised to [3.107941438429407e-06, 0.004998547025024891] of size (1, 6800)
Clamp: 0.005
Time Limit (Mins): 45
Epochs Limit: 30
Tracking training success: False
Tracking valid success: False


Training:   0% 0/1 [03:01<?, ?it/s, Iter 1, Validation Batch 150/150]

Trng Avg Loss: 7.948866844177246 | Valid Avg Loss: 8.048896789550781 | Patience: 5 | LR: [0.001] | Epoch Limit: 29


Training:   0% 0/1 [05:05<?, ?it/s, Iter 2, Validation Batch 150/150]

Trng Avg Loss: 7.6776909828186035 | Valid Avg Loss: 7.407385349273682 | Patience: 5 | LR: [0.001] | Epoch Limit: 28


Training:   0% 0/1 [06:56<?, ?it/s, Iter 3, Validation Batch 150/150]

Trng Avg Loss: 7.0080132484436035 | Valid Avg Loss: 7.173605918884277 | Patience: 5 | LR: [0.001] | Epoch Limit: 27


Training:   0% 0/1 [08:28<?, ?it/s, Iter 4, Validation Batch 150/150]

Trng Avg Loss: 6.775969982147217 | Valid Avg Loss: 7.016367435455322 | Patience: 5 | LR: [0.001] | Epoch Limit: 26


Training:   0% 0/1 [09:30<?, ?it/s, Iter 5, Validation Batch 150/150]

Trng Avg Loss: 6.901845455169678 | Valid Avg Loss: 7.114048004150391 | Patience: 4 | LR: [0.001] | Epoch Limit: 25


Training:   0% 0/1 [10:33<?, ?it/s, Iter 6, Validation Batch 150/150]

Trng Avg Loss: 6.0824174880981445 | Valid Avg Loss: 6.17254114151001 | Patience: 5 | LR: [0.0005] | Epoch Limit: 24


Training:   0% 0/1 [11:36<?, ?it/s, Iter 7, Validation Batch 150/150]

Trng Avg Loss: 5.6127705574035645 | Valid Avg Loss: 5.709960460662842 | Patience: 5 | LR: [0.0005] | Epoch Limit: 23


Training:   0% 0/1 [12:40<?, ?it/s, Iter 8, Validation Batch 150/150]

Trng Avg Loss: 5.325212001800537 | Valid Avg Loss: 5.983372688293457 | Patience: 4 | LR: [0.0005] | Epoch Limit: 22


Training:   0% 0/1 [13:43<?, ?it/s, Iter 10, Training Batch 2/500]   

Trng Avg Loss: 5.2271037101745605 | Valid Avg Loss: 5.970066070556641 | Patience: 3 | LR: [0.0005] | Epoch Limit: 21


Training:   0% 0/1 [14:45<?, ?it/s, Iter 10, Validation Batch 150/150]

Trng Avg Loss: 5.02964973449707 | Valid Avg Loss: 5.622575283050537 | Patience: 5 | LR: [0.0005] | Epoch Limit: 20


Training:   0% 0/1 [16:35<?, ?it/s, Iter 11, Validation Batch 150/150]

Trng Avg Loss: 3.139237642288208 | Valid Avg Loss: 3.7960007190704346 | Patience: 5 | LR: [0.00025] | Epoch Limit: 19


Training:   0% 0/1 [18:40<?, ?it/s, Iter 12, Validation Batch 150/150]

Trng Avg Loss: 2.4630630016326904 | Valid Avg Loss: 3.3148787021636963 | Patience: 5 | LR: [0.00025] | Epoch Limit: 18


Training:   0% 0/1 [20:49<?, ?it/s, Iter 14, Training Batch 2/500]    

Trng Avg Loss: 2.083742618560791 | Valid Avg Loss: 2.9950034618377686 | Patience: 5 | LR: [0.00025] | Epoch Limit: 17


Training:   0% 0/1 [23:01<?, ?it/s, Iter 14, Validation Batch 150/150]

Trng Avg Loss: 1.9601311683654785 | Valid Avg Loss: 2.6071858406066895 | Patience: 5 | LR: [0.00025] | Epoch Limit: 16


Training:   0% 0/1 [24:57<?, ?it/s, Iter 15, Validation Batch 150/150]

Trng Avg Loss: 1.6863411664962769 | Valid Avg Loss: 2.1483376026153564 | Patience: 5 | LR: [0.00025] | Epoch Limit: 15


Training:   0% 0/1 [27:04<?, ?it/s, Iter 17, Training Batch 1/500]    

Trng Avg Loss: 0.8921125531196594 | Valid Avg Loss: 1.3239494562149048 | Patience: 5 | LR: [0.000125] | Epoch Limit: 14


Training:   0% 0/1 [29:10<?, ?it/s, Iter 17, Validation Batch 150/150]

Trng Avg Loss: 0.7668418884277344 | Valid Avg Loss: 1.3484176397323608 | Patience: 4 | LR: [0.000125] | Epoch Limit: 13


Training:   0% 0/1 [30:19<?, ?it/s, Iter 18, Validation Batch 150/150]

Trng Avg Loss: 0.750175416469574 | Valid Avg Loss: 1.3023782968521118 | Patience: 5 | LR: [0.000125] | Epoch Limit: 12


Training:   0% 0/1 [31:33<?, ?it/s, Iter 19, Validation Batch 150/150]

Trng Avg Loss: 0.6418514847755432 | Valid Avg Loss: 0.8784810304641724 | Patience: 5 | LR: [0.000125] | Epoch Limit: 11


Training:   0% 0/1 [33:25<?, ?it/s, Iter 20, Validation Batch 150/150]

Trng Avg Loss: 0.49461430311203003 | Valid Avg Loss: 0.8934996724128723 | Patience: 4 | LR: [0.000125] | Epoch Limit: 10


Training:   0% 0/1 [34:48<?, ?it/s, Iter 21, Validation Batch 150/150]

Trng Avg Loss: 0.28399837017059326 | Valid Avg Loss: 0.5657040476799011 | Patience: 5 | LR: [6.25e-05] | Epoch Limit: 9


Training:   0% 0/1 [36:55<?, ?it/s, Iter 23, Training Batch 1/500]    

Trng Avg Loss: 0.2702611982822418 | Valid Avg Loss: 0.5914958715438843 | Patience: 4 | LR: [6.25e-05] | Epoch Limit: 8


Training:   0% 0/1 [38:22<?, ?it/s, Iter 23, Training Batch 468/500]


Cleared buffer
Cleared loss
0.2993779182434082 GB


In [None]:
# Detach the best snippet from the autograd graph and copy it to the CPU
# before handing it to the project's mel viewer.
best_snippet_cpu = best_snippet.detach().to("cpu")
audio.view_mel(best_snippet_cpu.squeeze())

# Evaluation

In [28]:
# Evaluate the best snippet on the test loader (the recorded run took a little
# over a minute for 250 examples).
# Arguments are passed by keyword: the signature currently on disk is
# evaluate(model, snippet, prepare_method, test_dataset, clamp_ep, position),
# which does not match the positional order this call used before
# (snippet, prepare_method, model, ...). Keywords keep the call correct
# regardless of parameter order.
gradient.evaluate(
    model=model,
    snippet=best_snippet,
    prepare_method=PREPARE_METHOD,
    test_dataset=test_dataset,
    clamp_ep=CLAMP_EP,
    position=POSITION,
)

[autoreload of utils.gradient failed: Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/extensions/autoreload.py", line 273, in check
    superreload(m, reload, self.old_objects)
  File "/usr/local/lib/python3.8/dist-packages/IPython/extensions/autoreload.py", line 471, in superreload
    module = reload(module)
  File "/usr/lib/python3.8/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 848, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/jovyan/code/utils/gradient.py", line 257, in <module>
    def evaluate(model: whisper.model.Whisper, snippet: Tensor, prepare_method: PrepareMethod, test_dataset: Dataset, clamp_ep: float, position: tuple):
NameError: name 'Dataset' is not defined
]
[autoreload of utils.gradient failed: Traceback (most recent cal

Clamp: 0.005
Prepare Method: prepare_at_position
Snippet Size: (1, 6800)
Position: 16000


Inference: 100%|██████████| 250/250 [01:10<00:00,  3.56it/s, Valid Examples: 194 | Empty Sequences: 113 | Total SL = 9166]



Total valid examples: 194
Success rate (Empty): 0.5824742268041238
Success rate (ASL): 47.24742268041237 (attacked) out of 122.16494845360825 (original)





In [None]:
# Gather every intermediate snippet on the CPU, append the best one, and stack
# them into a single tensor.
# NOTE: this rebinds `snippets`, so running the cell a second time operates on
# the already-stacked tensor rather than the original list.
snippet_history = [s.cpu() for s in snippets]
snippet_history.append(best_snippet.cpu())
snippets = torch.stack(snippet_history)
snippets.shape

In [None]:
# Write the stacked snippets to disk; squeeze() removes size-1 dimensions first.
torch.save(snippets.squeeze(), "snippets.pt")

In [None]:
# Save the per-example training success records and their ids.
# NOTE(review): `train_success` is not assigned anywhere in the visible
# notebook (the training cell passes train_success=None and discards two of
# the returned values), so an unguarded save raises NameError on a fresh
# kernel. Confirm where the mapping should come from; until then the save is
# skipped when there is nothing to write.
_train_success = globals().get("train_success")
if _train_success:
    torch.save(torch.stack(list(map(torch.tensor, _train_success.values()))), "train_success.pt")
    torch.save(torch.tensor(list(_train_success.keys())), "train_ids.pt")
else:
    print("train_success is undefined or empty; skipping save.")

In [None]:
# Save the per-example validation success records and their ids.
# NOTE(review): `valid_success` is not assigned anywhere in the visible
# notebook (the training cell passes valid_success=None and discards two of
# the returned values), so an unguarded save raises NameError on a fresh
# kernel. Confirm where the mapping should come from; until then the save is
# skipped when there is nothing to write.
_valid_success = globals().get("valid_success")
if _valid_success:
    torch.save(torch.stack(list(map(torch.tensor, _valid_success.values()))), "valid_success.pt")
    torch.save(torch.tensor(list(_valid_success.keys())), "valid_ids.pt")
else:
    print("valid_success is undefined or empty; skipping save.")

# Save and Hear Snippet

In [None]:
def normalise(random_snippet, ep):
    """Rescale a snippet from the unit interval to the range ``[-ep, ep)``.

    The input is assumed to have been drawn with ``torch.rand``, i.e. to lie
    in ``[0, 1)``. The minimum and maximum of the rescaled tensor are printed
    before it is returned.

    Args:
        random_snippet: Tensor of values assumed to lie in ``[0, 1)``.
        ep: Half-width of the target range.

    Returns:
        The rescaled tensor ``random_snippet * ep * 2 - ep``.
    """
    scaled = random_snippet * ep * 2 - ep
    lowest, highest = torch.min(scaled), torch.max(scaled)
    print(f"Normalised, Min {lowest}, Max {highest}")
    return scaled

In [None]:
# Save snippet to wav file
# NOTE(review): neither `save_audio` nor `snippet` is defined in the visible
# notebook — presumably the saver lives in `utils.audio` and `best_snippet` is
# the intended tensor; confirm before running on a fresh kernel.
save_audio(snippet, f"./snippets/clamp_{CLAMP_EP}_{PREPARE_METHOD.name}_snippet_only.wav")

In [None]:
# Apply the prepare method to the snippet and the third test example's audio
# (given a leading dimension of size 1), then save the result as a wav file.
# NOTE(review): neither `save_audio` nor `snippet` is defined in the visible
# notebook — presumably `best_snippet` is intended; confirm before running.
save_audio(PREPARE_METHOD(snippet.to("cpu"), tedlium_test[2]["audio"].unsqueeze(0)), f"./snippets/clamp_{CLAMP_EP}_{PREPARE_METHOD.name}_combined.wav")