In [1]:
import torch
import os

from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm, trange
from transformers.optimization import AdamW
from transformers import HubertModel, AutoConfig

from models.hubert_selective import HuBERTSelectiveNet
from utils.model_tools import *
from utils.selective_loss import SelectiveLoss

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
import csv

embeds_folder = 'embeds/vocal_imitation-v1.1.3-full'

# Read the `label` column of the vocabulary file, keeping the rows in file order.
# newline='' is what the csv module asks for when it is handed a file object.
with open(os.path.join(embeds_folder, 'labelvocabulary.csv'),
          mode='r', encoding='utf-8', newline='') as file:
    labels = [row['label'] for row in csv.DictReader(file)]

print(labels[:5])

# Top-level class of a label: skip the three-character prefix, then take the
# text up to the first underscore.
classes = {label[3:].split('_')[0] for label in labels}
print(classes)

def get_class(label):
    """Return the top-level class name embedded in ``label``.

    The first three characters of the label are dropped; the result is the
    part of the remainder that precedes its first underscore (the whole
    remainder when it contains no underscore).
    """
    remainder = label[3:]
    top_level, _, _ = remainder.partition('_')
    return top_level

['000Animal_Domestic animals_ pets_Cat_Growling_reference.wav', '001Animal_Domestic animals_ pets_Cat_Hiss_reference.wav', '002Animal_Domestic animals_ pets_Cat_Meow_reference.wav', '003Animal_Domestic animals_ pets_Cat_Purr_reference.wav', '004Animal_Domestic animals_ pets_Dog_Bark_reference.wav']
{'Animal', 'Sounds of things', 'Source-ambiguous sounds', 'Natural sounds', 'Channel', 'Music', 'Human sounds'}


In [4]:
import pickle

def get_percent_correct_per_class(foldname):
    """Compute, for one fold, the fraction of correct predictions per top-level class.

    Reads ``<embeds_folder>/<foldname>.target-labels.pkl`` and
    ``<embeds_folder>/<foldname>.predictions.pkl``. A prediction counts as
    correct when the argmax of its score vector equals the position of the
    sample's first target label in the module-level ``labels`` list.

    Args:
        foldname: Fold file prefix, e.g. ``'fold00'``.

    Returns:
        dict mapping every name in the module-level ``classes`` set to
        ``correct / total`` for that class (a fraction in [0, 1], not a
        percentage). A class with no samples in this fold maps to
        ``float('nan')`` instead of raising ``ZeroDivisionError``.

    Raises:
        ValueError: if a target label is not present in ``labels``.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # this at prediction files you produced yourself.
    with open(f'{embeds_folder}/{foldname}.target-labels.pkl', 'rb') as file:
        targetlabels = pickle.load(file)

    with open(f'{embeds_folder}/{foldname}.predictions.pkl', 'rb') as file:
        predictions = pickle.load(file)

    correct_counts = dict.fromkeys(classes, 0)
    total_counts = dict.fromkeys(classes, 0)

    for target, pred in zip(targetlabels, predictions['prediction']):
        # Only the first target label of each sample is scored.
        label_idx = labels.index(target[0])
        pred_idx = torch.argmax(pred).item()
        label_class = get_class(target[0])
        total_counts[label_class] += 1
        if label_idx == pred_idx:
            correct_counts[label_class] += 1

    # Build a fresh dict of ratios rather than overwriting the counts in place.
    return {
        name: (correct_counts[name] / total if total else float('nan'))
        for name, total in total_counts.items()
    }

# Per-class accuracy for each of the three folds.
fold00, fold01, fold02 = [
    get_percent_correct_per_class(fold_name)
    for fold_name in ('fold00', 'fold01', 'fold02')
]

# Unweighted mean over the three folds, one line per class.
for class_name in fold00:
    mean_accuracy = (fold00[class_name] + fold01[class_name] + fold02[class_name])/3
    print(class_name, mean_accuracy)

Animal 0.16535121101205036
Sounds of things 0.07952992817010536
Source-ambiguous sounds 0.06819154399178363
Natural sounds 0.05687169312169312
Channel 0.09629877369007804
Music 0.1090071789969681
Human sounds 0.19200464411164817


### Take a look at embeddings

In [5]:
# Load one stored embedding and look at its shape.
# allow_pickle stays at its default (False): the dataset class in this notebook
# loads the same kind of file without it, and leaving it off means np.load
# cannot unpickle arbitrary objects from disk.
fold00_embed = np.load(
    f'{embeds_folder}/fold00/000Animal_Domestic animals_ pets_Cat_Growling-4815397341626368.wav.embedding.npy'
)
print(fold00_embed.shape)


(768,)


In [6]:
# Example file names for one sample inside a fold directory:
# embeds/vocal_imitation-v1.1.3-full/fold00/000Animal_Domestic animals_ pets_Cat_Growling-4815397341626368.wav.embedding.npy
# embeds/vocal_imitation-v1.1.3-full/fold00/000Animal_Domestic animals_ pets_Cat_Growling-4815397341626368.wav.target-labels.json

# Sanity check: the first three characters of a name parse to an integer
# (leading zeros are dropped by int()).
int('032Animal_Domestic animals_ '[:3])

32

In [7]:
class EmbeddingsDataset(Dataset):
    """Dataset of precomputed embeddings stored as one ``.npy`` file per sample.

    Sample names are the keys of ``<data_dir>/<fold_name>.json``. The embedding
    of a sample is read from ``<data_dir>/<fold_name>/<sample>.embedding.npy``
    and its integer label is parsed from the first three characters of the
    sample name.
    """

    def __init__(self, data_dir, fold_name, vocab_file='labelvocabulary.csv'):
        """Index the samples of one fold.

        Args:
            data_dir: Folder holding the vocabulary csv, ``<fold_name>.json``
                and the ``<fold_name>/`` directory of embeddings.
            fold_name: Fold identifier, e.g. ``'fold00'``.
            vocab_file: Name of the vocabulary csv inside ``data_dir``.

        Raises:
            FileNotFoundError: if the vocabulary csv is missing.
        """
        vocab_path = os.path.join(data_dir, vocab_file)
        if not os.path.exists(vocab_path):
            # FileNotFoundError is still an Exception, so existing handlers keep working.
            raise FileNotFoundError("Data folder must contain a valid vocab index csv file")
        self.vocab_list = read_csv(vocab_path)

        label_path = os.path.join(data_dir, fold_name + '.json')
        with open(label_path, mode='r', encoding='utf-8') as file:
            data = json.load(file)

        # Only the keys (sample names) are used; labels come from the name prefix.
        self.samples = list(data)

        self.fold_name = fold_name
        self.data_dir = data_dir

    def __len__(self):
        """Return the number of samples listed for this fold."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``[embedding, label]`` for the sample at position ``idx``."""
        sample = self.samples[idx] + '.embedding.npy'
        embed_path = os.path.join(self.data_dir, self.fold_name, sample)
        embeddings = torch.from_numpy(np.load(embed_path))

        # Class-index targets for torch.nn.CrossEntropyLoss must be int64
        # (torch.long); int16 targets are rejected by the loss.
        label = torch.tensor(int(sample[:3]), dtype=torch.long)
        return [embeddings, label]

### Train SelectiveNet on Embeddings

In [8]:
class OneHotToCrossEntropyLoss(torch.nn.Module):
    """Cross-entropy loss for one-hot targets, returned per sample.

    ``forward`` converts each one-hot row of ``y`` to a class index with
    ``argmax`` and applies ``torch.nn.CrossEntropyLoss(reduction='none')``.
    """

    def __init__(self):
        super().__init__()
        # reduction='none' keeps one loss value per batch item instead of reducing.
        self.loss = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Return the unreduced cross-entropy of ``y_hat`` against one-hot ``y``.

        Args:
            y_hat: Scores passed to ``CrossEntropyLoss`` as its input.
            y: Targets whose rows (summed along dim 1) must each equal 1.

        Raises:
            AssertionError: if any row of ``y`` does not sum to 1.
        """
        # Exactly one active label per sample: every row must sum to 1.
        assert torch.all(
            torch.sum(y, dim=1) == torch.tensor(1., device=y.device)
        ), "each row of y must contain exactly one active label"
        class_idx = y.argmax(dim=1)
        # One loss value per row of the batch.
        return self.loss(y_hat, class_idx)

# Base criterion handed to SelectiveLoss.
# NOTE(review): the wrapper class defined above builds its criterion with
# reduction='none' (one loss per sample), while this one uses PyTorch's default
# reduction -- confirm which form SelectiveLoss expects.
loss_func = torch.nn.CrossEntropyLoss()

# One dataset per fold of precomputed embeddings.
fold00 = EmbeddingsDataset(embeds_folder, 'fold00')
fold01 = EmbeddingsDataset(embeds_folder, 'fold01')
fold02 = EmbeddingsDataset(embeds_folder, 'fold02')

# SelectiveLoss settings, passed positionally below.
coverage, alpha, lm = 0.8, 0.5, 32.0

# Model dimensions.
# NOTE(review): num_classes is hard-coded; confirm it covers every integer
# label the datasets can yield.
num_classes, feature_size = 302, 768

# Training schedule.
num_epochs, batch_size = 500, 200

model = HuBERTSelectiveNet(num_classes=num_classes, feature_size=feature_size)
loss_fn = SelectiveLoss(loss_func, coverage, alpha, lm, device=device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.0032)

In [9]:
# Where the per-epoch losses are written when this cell finishes or fails.
# NOTE(review): the file names say "10ep" while the epoch count comes from
# `num_epochs` -- confirm the names still describe this run.
train_losses_file = 'logs/selective-hubert-10ep-80c-train.txt'
test_losses_file = 'logs/selective-hubert-10ep-80c-test.txt'


def _write_losses(path, losses):
    """Write one loss value per line to ``path``, creating its folder if needed."""
    log_dir = os.path.dirname(path)
    if log_dir:
        # Without this, a missing folder would raise inside `finally` and hide
        # the exception that actually stopped training.
        os.makedirs(log_dir, exist_ok=True)
    with open(path, 'w') as fp:
        for loss in losses:
            fp.write("%s\n" % loss)


train_losses = []
test_losses = []

model.to(device)

# A list keeps the fold order deterministic; a set of dataset objects
# iterates in arbitrary order.
folds = [fold00, fold01, fold02]

# os.cpu_count() may return None, which DataLoader does not accept.
num_workers = os.cpu_count() or 0

completed = False
try:
    for fold in folds:
        off_folds = [other for other in folds if other is not fold]
        off_concat = torch.utils.data.ConcatDataset(off_folds)

        # NOTE(review): this trains on a single fold and evaluates on the other
        # two, and the same model/optimizer carry over from fold to fold --
        # confirm that both are intended.
        train_loader = DataLoader(fold, batch_size=batch_size, shuffle=True, num_workers=num_workers)
        test_loader = DataLoader(off_concat, batch_size=batch_size, shuffle=True, num_workers=num_workers)

        # Fresh progress bar per fold: a tqdm bar closes after its first full
        # iteration, so a single shared bar would go silent for later folds.
        t = trange(num_epochs)
        for epoch in t:
            print(f"Epoch {epoch+1}\n-------------------------------")
            # NOTE(review): the CUDA assertion recorded under this cell
            # (`t >= 0 && t < n_classes`) means a target label outside
            # [0, n_classes) reached the loss; re-run on CPU or with
            # CUDA_LAUNCH_BLOCKING=1 to locate the offending sample.
            train_loss = selective_train(train_loader, model, loss_fn, optimizer, device)
            test_loss = selective_test(test_loader, model, device)
            train_losses.append(train_loss)
            test_losses.append(test_loss)

            # NOTE(review): `model_file` is not defined in any visible cell; it
            # must be provided elsewhere or this line raises NameError. The
            # same path is overwritten on every epoch of every fold.
            torch.save(model.state_dict(), model_file)

    completed = True
finally:
    # Persist whatever was collected, even when training stopped early.
    _write_losses(train_losses_file, train_losses)
    _write_losses(test_losses_file, test_losses)

    # Only report success when every fold actually ran to the end.
    print("Done!" if completed else "Training did not finish; partial losses were saved.")

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1
-------------------------------
threshold 0.5
Done!


../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_f

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
