# Initialisation and Dependencies

In [1]:
import sys
lib_path = '/home/jovyan/libs'
sys.path.insert(0, lib_path)

In [2]:
%reload_ext autoreload
%autoreload 2

import gc, math, traceback, datetime

import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from datasets import load_from_disk

import whisper
from whisper.tokenizer import get_tokenizer

from utils import audio, gradient, gpu
from utils.attacks import PrepareFront, PrepareAtPosition

# GPU RAM Tracking

In [3]:
device = gpu.get_device()
# device = "cpu"

Device: cuda


# Load Model

In [4]:
try:
    del model
    print("Model deleted!")
except:
    pass

In [5]:
from utils import attention
MODEL_NAME = "small.en"

model = whisper.load_model("tiny.en").to(device)
model.eval();

  checkpoint = torch.load(fp, map_location=device)


In [6]:
tokenizer = whisper.tokenizer.get_tokenizer(False, num_languages=1, language="en", task="transcribe")
sot_ids = torch.tensor(tokenizer.sot_sequence_including_notimestamps, requires_grad=False).to(model.device)
eos = tokenizer.eot

# Load Data

In [8]:
tedlium_path = "../tedlium"
train_path, validation_path, test_path = f"{tedlium_path}/train_idx.hf", f"{tedlium_path}/validation_idx.hf", f"{tedlium_path}/test.hf"

In [9]:
TRAIN_SELECT = 500
VALID_SELECT = 150
TEST_SELECT = 250

SEED = 1

tedlium_train = load_from_disk(train_path).with_format("torch").shuffle(seed=SEED).select(range(TRAIN_SELECT))
tedlium_validation = load_from_disk(validation_path).with_format("torch").shuffle(seed=SEED).select(range(VALID_SELECT))
tedlium_test = load_from_disk(test_path).with_format("torch").shuffle(seed=SEED).select(range(TEST_SELECT))

Loading dataset from disk:   0%|          | 0/109 [00:00<?, ?it/s]

In [10]:
example = tedlium_train[1]["audio"]

In [31]:
tedlium_train[1]

{'audio': tensor([ 0.0027,  0.0045,  0.0049,  ...,  0.0053, -0.0042, -0.0082]),
 'text': 'an(2) ipod or(2) a {UH} tape deck or(2) something that would start {UH} playback {BREATH} and the machine would maintain synchronization throughout {BREATH} and speaking of synchronization {COUGH} they wanted(2) it {SMACK} to sync to the rhythm <sil> and to hit specific beats along the way',
 'idx': tensor(19345)}

In [180]:
snippet = torch.rand((10240), requires_grad=True)

In [184]:
snippet.grad

In [185]:
attacker = torch.cat([snippet, example])

In [186]:
attacker.requires_grad

True

In [187]:
def inference(audio_tensor, model, sot_ids, skip_special_tokens=False) -> tuple:
    # """
    # Pass audio to model for inference. Audio tensor must be non-batched (single example) and 1-Dimensional
    # """
    # assert audio_tensor.dim() == 1, f"Audio tensor must be 1-Dimensional! Got {audio_tensor.dim()} dims"
    
    # Feature extraction
    input_features = gradient.audio_to_mel(audio_tensor).to(model.device).unsqueeze(0)
    loss = 0.0
    sum_logprobs = torch.tensor([0.0], device=model.device)
    decoder = d.GreedyDecoder(0.0, eot=eos)
    tokens = sot_ids.unsqueeze(0)
    completed = False
    losses = []
    sf = torch.nn.Softmax(dim=2)
    while not completed:
        logits = model.forward(input_features, tokens)
        print(logits.requires_grad)
        loss = sf(logits)[:, -1, eos]
        losses.append(loss)
        tokens, completed = decoder.update(tokens, logits[:, -1, :], sum_logprobs)
    
    loss_stack = torch.stack(losses).squeeze()
    print(loss_stack.requires_grad)
    print(torch.arange(loss_stack.size(0), device=model.device).shape)
    final_loss = torch.sum(loss_stack * torch.arange(loss_stack.size(0), device=model.device))
    return final_loss

In [188]:
import whisper.decoding as d

In [189]:
loss = inference(attacker, model, sot_ids)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
torch.Size([46])


In [190]:
loss.backward()

In [192]:
snippet.grad

tensor([0.0142, 0.0277, 0.0231,  ..., 0.0264, 0.0413, 0.0497])

In [93]:
tokens

tensor([[50257, 50264,    13,   843,  5486,   286, 42133,    11,   484,  2227,
           340,   284, 14595,   284,   262, 18662,   290,   284,  2277,  2176,
         17825,  1863,   262,   835,    13, 50256]], device='cuda:0')

In [None]:
# def collate(ls):
#     pad_to = max(list(map(lambda x: x["audio"].shape[0], ls)))
#     return torch.cat(list(map(lambda x: F.pad(x["audio"], (0, pad_to - x["audio"].shape[0])).unsqueeze(0).to(torch.bfloat16), ls)), dim=0)

def collate_idx(ls):
    return ls[0]["audio"].unsqueeze(0), ls[0]["idx"].item()

TRAIN_BATCH_SIZE = 1 # highly recommended to be 1
VALID_BATCH_SIZE = 1

train_dataset = DataLoader(tedlium_train, batch_size=TRAIN_BATCH_SIZE, collate_fn=collate_idx)
validation_dataset = DataLoader(tedlium_validation, batch_size=VALID_BATCH_SIZE, collate_fn=collate_idx)
test_dataset = DataLoader(tedlium_test)

# Training Loop

In [None]:
gc.collect()
torch.cuda.empty_cache()
gpu.print_cuda_usage()

In [None]:
# LR = 1e-3
# PATIENCE = 4
# MIN_LIMIT = 45
# ITER_LIMIT = 30
# CLAMP_EP = 0.005
# SNIPPET_SIZE = (1, 16_000)
# POSITION = 0
# PREPARE_METHOD = PrepareAtPosition(SNIPPET_SIZE, POSITION)

# writer = None

In [None]:
# # tensorboard writer
# timestamp = datetime.datetime.now().strftime(f'%Y%m%d-%H%M%S_size_{SNIPPET_SIZE}_{PREPARE_METHOD.name}')
# writer = SummaryWriter(log_dir=f"../runs/size_tests/{timestamp}", max_queue=5)

In [None]:
# best_snippet, snippets, _, _ = gradient.train(model, train_dataset, validation_dataset, 
#                                                             PREPARE_METHOD,
#                                                             writer, lr=LR, 
#                                                             train_success=None, valid_success=None,
#                                                             iter_limit=ITER_LIMIT, mins_limit=MIN_LIMIT, patience=PATIENCE, clamp_epsilon=CLAMP_EP)

In [None]:
# audio.view_mel(best_snippet.detach().to("cpu").squeeze())

# Evaluation

In [None]:
# gradient.evaluate(model, best_snippet, PREPARE_METHOD, test_dataset, CLAMP_EP, POSITION) # commented to prevent the runtime from autorunning and crashing the thing

In [None]:
# random_snippet = torch.rand(SNIPPET_SIZE) * CLAMP_EP
# gradient.evaluate(model, random_snippet, PREPARE_METHOD, test_dataset, CLAMP_EP, POSITION) # commented to prevent the runtime from autorunning and crashing the thing

# Save Tensors

In [None]:
snippets = torch.stack(list(map(lambda x: x.cpu(), snippets)) + [best_snippet.cpu()])
snippets.shape

In [None]:
# torch.save(snippets.squeeze(), "snippets.pt")

In [None]:
# torch.save(torch.stack(list(map(torch.tensor, train_success.values()))), "train_success.pt")
# torch.save(torch.tensor(list(train_success.keys())), "train_ids.pt")

In [None]:
# torch.save(torch.stack(list(map(torch.tensor, valid_success.values()))), "valid_success.pt")
# torch.save(torch.tensor(list(valid_success.keys())), "valid_ids.pt")

# Save and Hear Snippet

In [None]:
def normalise(random_snippet, ep):
    # we assume torch.rand inits to [0, 1)
    res = random_snippet * ep * 2 - ep
    print(f"Normalised, Min {torch.min(res)}, Max {torch.max(res)}")
    return res

In [None]:
# Save snippet to wav file
# save_audio(snippet, f"./snippets/clamp_{CLAMP_EP}_{PREPARE_METHOD.name}_snippet_only.wav")

In [None]:
# save_audio(PREPARE_METHOD(snippet.to("cpu"), tedlium_test[2]["audio"].unsqueeze(0)), f"./snippets/clamp_{CLAMP_EP}_{PREPARE_METHOD.name}_combined.wav")