In [1]:
import torch
import os

from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm, trange
from transformers.optimization import AdamW
from transformers import HubertModel, AutoConfig

from models.hubert_selective import HuBERTSelectiveNet
from utils.model_tools import *
from utils.selective_loss import SelectiveLoss

  from tqdm.autonotebook import tqdm, trange


In [2]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'

### Loading in Vocal Imitation Data

In [3]:
data_dir = 'data/vocal_imitation-full'
model_id = "facebook/hubert-base-ls960"

from transformers import Wav2Vec2FeatureExtractor

hubert_model = HubertModel.from_pretrained(model_id)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)

for param in hubert_model.parameters():
     param.requires_grad = False

def prepare_dataset(batch):
    waveforms, labels = zip(*batch)
    waveforms = list(waveforms)
    features = feature_extractor(waveforms, return_tensors="pt", padding=True, sampling_rate=16000)
    features = features.input_values
    labels = torch.tensor(labels)
    
    return features, labels
;
fold00 = VocalImitationDataset(data_dir, fold_name='fold00')
fold01 = VocalImitationDataset(data_dir, fold_name='fold01')
fold02 = VocalImitationDataset(data_dir, fold_name='fold02')

num_classes = len(fold00.vocab_list)
print(num_classes)
fold_set = set([fold00, fold01, fold02])

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_v', 'encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

data/vocal_imitation-full/labelvocabulary.csv
data/vocal_imitation-full/fold00.json
data/vocal_imitation-full/labelvocabulary.csv
data/vocal_imitation-full/fold01.json
data/vocal_imitation-full/labelvocabulary.csv
data/vocal_imitation-full/fold02.json
302


In [4]:
# Instantiate hubert model and make the full selectivenet

# I wonder if I will have a problem with this featuresize if I need to do padding.

inputs, labels = next(iter(fold00))
print(inputs.shape)
inputs = feature_extractor(inputs, return_tensors="pt", sampling_rate=16000)
print(inputs.input_values.shape)
outputs = hubert_model(inputs.input_values)
feature_size = outputs.last_hidden_state.shape[2]
print('features:', feature_size)

model = HuBERTSelectiveNet(hubert_model, num_classes=num_classes, feature_size=feature_size)

print(model.num_classes)

(180160,)
torch.Size([1, 180160])
features: 768
302


In [5]:
# class OneHotToCrossEntropyLoss(torch.nn.Module):
#     def __init__(self):
#         super(OneHotToCrossEntropyLoss, self).__init__()
#         self.loss = torch.nn.CrossEntropyLoss()

#     def forward(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
#         # One and only one label per class
#         assert torch.all(
#             torch.sum(y, dim=1) == torch.tensor(1., device=y.device)
#         )
#         y = y.argmax(dim=1)
#         return self.loss(y_hat, y)

class OneHotToCrossEntropyLoss(torch.nn.Module):
    def __init__(self):
        super(OneHotToCrossEntropyLoss, self).__init__()
        # Set reduction to 'none' to get a loss per item in the batch
        self.loss = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        # One and only one label per class
        assert torch.all(
            torch.sum(y, dim=1) == torch.tensor(1., device=y.device)
        )
        y = y.argmax(dim=1)
        # This will now return a tensor of shape (B,)
        return self.loss(y_hat, y)

loss_func = OneHotToCrossEntropyLoss()
coverage = 0.8
alpha = 0.5
lm = 32.0
num_epochs = 10
batch_size = 2

loss_fn = SelectiveLoss(loss_func, coverage, alpha, lm) #, device=device)

# loss=loss
# Create the optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

model_file = 'models/selective-hubert-10ep-80c.pt' # 80c is 80% coverage



In [6]:
train_losses_file = 'logs/selective-hubert-10ep-80c-train.txt'
test_losses_file = 'logs/selective-hubert-10ep-80c-test.txt'

train_losses = []
test_losses = []
t = trange(num_epochs)

model.to(device)

try:
    for fold in fold_set:
        off_folds = fold_set.difference([fold])
        off_concat = torch.utils.data.ConcatDataset(off_folds)
        
        train_loader = DataLoader(fold, batch_size=batch_size, shuffle=True, num_workers=os.cpu_count())
        test_loader = DataLoader(off_concat, batch_size=batch_size, shuffle=True, num_workers=os.cpu_count())
    
        for epoch in t:
            print(f"Epoch {epoch+1}\n-------------------------------")
            train_loss = selective_train(train_loader, model, loss_fn, optimizer, device)
            test_loss = selective_test(test_loader, model, device)
            train_losses.append(train_loss)
            test_losses.append(test_loss)
            
            torch.save(model.state_dict(), model_file)

finally:
    with open(train_losses_file, 'w') as fp:
        for s in train_losses:
            fp.write("%s\n" % s)
            
    with open(test_losses_file, 'w') as fp:
        for x in test_losses:
            fp.write("%s\n" % x)
    
    print("Done!")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1
-------------------------------
Done!


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
# ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 768])