In [1]:
import torch
import os

from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm, trange
from transformers.optimization import AdamW
from transformers import HubertModel, AutoConfig

from models.hubert_selective import HuBERTSelectiveNet
from utils.model_tools import *
from utils.selective_loss import SelectiveLoss

os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # just for debugging this loss thing

  from tqdm.autonotebook import tqdm, trange


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
import csv

embeds_folder = 'embeds/vocal_imitation-v1.1.3-full'

labels = []

with open(embeds_folder + '/labelvocabulary.csv', mode='r', encoding='utf-8') as file:
    label_csv = csv.DictReader(file)
    for row in label_csv:
        labels.append(row['label'])

print(labels[:5])

classes = set([filename[3:].split('_')[0] for filename in labels])
print(classes)

def get_class(label):
    return label[3:].split('_')[0]

['000Animal_Domestic animals_ pets_Cat_Growling_reference.wav', '001Animal_Domestic animals_ pets_Cat_Hiss_reference.wav', '002Animal_Domestic animals_ pets_Cat_Meow_reference.wav', '003Animal_Domestic animals_ pets_Cat_Purr_reference.wav', '004Animal_Domestic animals_ pets_Dog_Bark_reference.wav']
{'Natural sounds', 'Human sounds', 'Sounds of things', 'Music', 'Animal', 'Channel', 'Source-ambiguous sounds'}


In [4]:
import pickle

def get_percent_correct_per_class(foldname):
    with open(f'{embeds_folder}/{foldname}.target-labels.pkl', 'rb') as file:
        targetlabels = pickle.load(file)
    
    with open(f'{embeds_folder}/{foldname}.predictions.pkl', 'rb') as file:
        predictions = pickle.load(file)

    print(predictions)
    
    correct_classes = {key: 0 for key in classes}
    total_classes = {key: 0 for key in classes}
    
    for target, pred in zip(targetlabels, predictions['prediction']):
        label_idx = labels.index(target[0])
        pred_idx = torch.argmax(pred).item()
        label_class = get_class(target[0])
        total_classes[label_class] += 1
        if label_idx == pred_idx:
            correct_classes[label_class] += 1
    
    for key, value in correct_classes.items():
        correct_classes[key] = value / total_classes[key]
    
    return correct_classes

fold00 = get_percent_correct_per_class('fold00')
fold01 = get_percent_correct_per_class('fold01')
fold02 = get_percent_correct_per_class('fold02')

for key, value in fold00.items():
    print(key, (value + fold01[key] + fold02[key])/3)

{'target': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'prediction': tensor([[7.4165e-05, 4.9898e-04, 4.2090e-04,  ..., 7.2377e-04, 2.1606e-04,
         3.6174e-02],
        [2.9739e-07, 1.8492e-05, 6.0240e-04,  ..., 8.7165e-07, 2.1620e-05,
         1.0725e-06],
        [2.2136e-02, 1.4008e-04, 1.8936e-05,  ..., 2.1327e-05, 3.8000e-02,
         1.5987e-02],
        ...,
        [1.7235e-04, 5.4822e-06, 5.7101e-06,  ..., 1.2099e-06, 2.6956e-04,
         6.9220e-07],
        [6.2079e-05, 7.5346e-05, 6.5691e-04,  ..., 2.9677e-04, 7.0809e-05,
         1.6825e-03],
        [7.8546e-06, 3.9271e-05, 2.6359e-04,  ..., 5.2309e-05, 8.7509e-06,
         4.0081e-05]]), 'prediction_logit': tensor([[-3.7811, -1.8748, -2.0450,  ..., -1.5029, -2.7118,  2.4087],
        [-7.4342, -3.3042,  0.179

### Take a look at embeddings

In [5]:
fold00_embed = np.load(f'embeds/vocal_imitation-v1.1.3-full/fold00/000Animal_Domestic animals_ pets_Cat_Growling-4815397341626368.wav.embedding.npy', allow_pickle=True)
print(fold00_embed.shape)


(768,)


In [6]:
# embeds/vocal_imitation-v1.1.3-full/fold00/000Animal_Domestic animals_ pets_Cat_Growling-4815397341626368.wav.embedding.npy
# embeds/vocal_imitation-v1.1.3-full/fold00/000Animal_Domestic animals_ pets_Cat_Growling-4815397341626368.wav.target-labels.json

int('032Animal_Domestic animals_ '[:3])

32

In [7]:
class EmbeddingsDataset(Dataset):
    def __init__(self, data_dir, fold_name, vocab_file='labelvocabulary.csv'):
        vocab_path = os.path.join(data_dir, vocab_file)
        
        if os.path.exists(vocab_path):
            self.vocab_list = read_csv(vocab_path)
        else:
            raise Exception("Data folder must contain a valid vocab index csv file")

        fold_label_file = fold_name + '.json'
        label_path = os.path.join(data_dir, fold_label_file)
    
        with open(label_path, mode='r') as file:
            data = json.load(file)
            
        self.samples = list(data.keys())
        
        self.fold_name = fold_name
        self.data_dir = data_dir

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx] + '.embedding.npy'
        embed_path = os.path.join(self.data_dir, self.fold_name, sample)
        embeddings = np.load(embed_path)
        
        embeddings = torch.from_numpy(embeddings)
        label = torch.tensor(int(sample[:3]), dtype=torch.long)
        return [embeddings, label]

### Train SelectiveNet on Embeddings

In [8]:
class OneHotToCrossEntropyLoss(torch.nn.Module):
    def __init__(self):
        super(OneHotToCrossEntropyLoss, self).__init__()
        # Set reduction to 'none' to get a loss per item in the batch
        self.loss = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        print(y_hat.shape)
        print(y.shape)
        # One and only one label per class
        assert torch.all(
            torch.sum(y, dim=1) == torch.tensor(1., device=y.device)
        )
        y = y.argmax(dim=1)
        # This will now return a tensor of shape (B,)
        return self.loss(y_hat, y)

loss_func = torch.nn.CrossEntropyLoss()#OneHotToCrossEntropyLoss()

fold00 = EmbeddingsDataset(embeds_folder, fold_name='fold00')
fold01 = EmbeddingsDataset(embeds_folder, fold_name='fold01')
fold02 = EmbeddingsDataset(embeds_folder, fold_name='fold02')

coverage = 1.0
alpha = 0.5
lm = 32.0
num_classes = 302
feature_size = 768

num_epochs = 130
batch_size = 1024

model = HuBERTSelectiveNet(num_classes=num_classes, feature_size=feature_size)
loss_fn = SelectiveLoss(loss_func, coverage, alpha, lm, device=device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0032)

In [9]:
import json

model_file = 'models/selective-hubert-10ep-100c.pt' # 80c is 80% coverage

train_losses_file = 'logs/selective-hubert-10ep-100c-train'
test_losses_file = 'logs/selective-hubert-10ep-100c-test'

train_losses = []
test_losses = []
t = trange(num_epochs)

model.to(device)

fold_set = set([fold00, fold01, fold02])
fold_models = []

for i, fold in enumerate(fold_set):
    off_folds = fold_set.difference([fold])
    off_concat = torch.utils.data.ConcatDataset(off_folds)
        
    train_loader = DataLoader(off_concat, batch_size=batch_size, shuffle=True, num_workers=os.cpu_count())
    test_loader = DataLoader(fold, batch_size=batch_size, shuffle=True, num_workers=os.cpu_count())
    
    for epoch in t:
        print(f"Epoch {epoch+1}\n-------------------------------")
        train_loss, train_loss_dict = selective_train(train_loader, model, loss_fn, optimizer, device)
        test_loss, test_loss_dict = selective_test(test_loader, model, loss_fn, device)
        train_losses.append(train_loss)
        test_losses.append(test_loss)

    torch.save(model.state_dict(), model_file)
    
    with open(f'{train_losses_file}_fold_{str(i)}.txt', 'w') as fp:
        for s in train_losses:
            fp.write("%s\n" % s)
            
    with open(f'{test_losses_file}_fold_{str(i)}.txt', 'w') as fp:
        for x in test_losses:
            fp.write("%s\n" % x)

print("Done!")

  0%|          | 0/130 [00:00<?, ?it/s]

Epoch 1
-------------------------------
loss: 8.282101  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.005512 

Epoch 2
-------------------------------
loss: 5.822941  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.023093 

Epoch 3
-------------------------------
loss: 5.214411  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.033871 

Epoch 4
-------------------------------
loss: 4.896560  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.041860 

Epoch 5
-------------------------------
loss: 4.698348  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.051488 

Epoch 6
-------------------------------
loss: 4.546929  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.070917 

Epoch 7
-------------------------------
loss: 4.395479  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.092614 

Epoch 8
-------------------------------
loss: 4.316939  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.097778 

Epoch 9
-------------------------------
loss: 4.185995  [    0/ 3734]
Test Error: 
 Avg accuracy: 0.119580 

Epoch 10
----------

In [13]:
device = 'cpu'

model = HuBERTSelectiveNet(num_classes=num_classes, feature_size=feature_size)
model.load_state_dict(torch.load(model_file))
model.eval()

loss_fn = SelectiveLoss(loss_func, coverage, alpha, lm, device=device)

model.to(device)

predictions_file = 'logs/selective-hubert-10ep-100c-predictions'
selections_file = 'logs/selective-hubert-10ep-100c-selections'
auxiliary_file = 'logs/selective-hubert-10ep-100c-auxiliary'

loss_dict_file = 'logs/selective-hubert-10ep-100c-loss-dict'

results = []

for i, fold in enumerate(fold_set):
    print(f'Fold 00{i}')
        
    test_loader = DataLoader(fold, batch_size=batch_size, shuffle=False, num_workers=os.cpu_count())

    loss_fn = SelectiveLoss(loss_func, coverage, alpha, lm, device=device)

    pred = []
    selection = []
    aux = []
    for X, y in test_loader:
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            logits, selection_logits, auxiliary_logits = model(X)
        pred.append(logits.detach().cpu().numpy())
        selection.append(selection_logits.detach().cpu().numpy())
        aux.append(auxiliary_logits.detach().cpu().numpy())

    test_loss, test_loss_dict = selective_test(test_loader, model, loss_fn, device)
    
    test_loss_dict['loss'] = test_loss_dict['loss'].detach().cpu().item()
    test_loss_dict['loss_pytorch'] = test_loss_dict['loss_pytorch'].detach().cpu().item()
    
    with open(f'{loss_dict_file}_fold_{str(i)}', 'w') as fp: 
            fp.write(json.dumps(test_loss_dict))

    np.save(f'{predictions_file}_fold_{str(i)}.npy', np.array(pred, dtype='object'), allow_pickle=True) 
    np.save(f'{selections_file}_fold_{str(i)}.npy', np.array(selection, dtype='object'), allow_pickle=True)
    np.save(f'{auxiliary_file}_fold_{str(i)}.npy', np.array(aux, dtype='object'), allow_pickle=True)
        
print("Done!")

Fold 000
Test Error: 
 Avg accuracy: 1.000000 

Fold 001
Test Error: 
 Avg accuracy: 1.000000 

Fold 002
Test Error: 
 Avg accuracy: 0.923883 

Done!
