In [1]:
from loss.loss_utils import combined_loss
import os

import torch
from torch import nn
from models.AudioTransformer import AudioTransformer
from data.data_utils import *
from models.AudioResnet import AudioResnet
from loss.FocalLoss import FocalLoss

# Augmentation pipeline handed to the training dataset as its `transform`.
# Judging by the class names, the steps add Gaussian noise and mask spans along
# the time and frequency axes -- confirm exact semantics in data.data_utils.
_augmentation_steps = [
    AddGaussianNoise(std=0.5),
    TimeMasking(max_mask_pct=0.15),
    FrequencyMasking(max_mask_pct=0.15),
]
augmentations = Compose(_augmentation_steps)

class Config:
    """Static container of hyperparameters and runtime settings for training.

    All values are class attributes evaluated once, when the class body runs.
    The class itself (not an instance) is passed to the training entry point.
    """

    # === General ===

    # Used to build `save_path`; note the trailing dash is part of the name.
    model_name = "Audio-Transformer-"
    # Falls back to CPU when no CUDA device is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32
    # Windows-style relative path with a trailing separator.
    save_path = f"trained_models\\{model_name}\\"
    seed = 42

    # === Training ===
    num_classes = 50
    num_epochs = 100
    batch_size = 1
    max_batch_size = 64
    learning_rate = 5e-5
    # NOTE(review): this "minimum" (1e-4) is larger than `learning_rate` (5e-5).
    # Confirm how the scheduler interprets it before relying on either value.
    min_learning_rate = 1e-4
    weight_decay = 1e-4

    # Both values are fractions (1% and 25%); presumably of the total number of
    # training steps -- TODO confirm against the training loop.
    warmup_threshold = 1.0 / 100.0
    step_coefficient = 25.0 / 100.0

    # Presumably the focal-loss focusing parameter -- TODO confirm.
    gamma = 2.0
    save_checkpoints = True

    # === Dataset ===
    # NOTE(review): the visible cells pass augmentations straight to the dataset
    # and hard-code shuffle=True on the loaders; `transforms` and `shuffle` here
    # are not read by any code shown in this notebook.
    transforms = None
    use_masks = True
    num_workers = 1
    prefetch_factor = 3
    val_split = 0.1
    shuffle = True
    # One weight of 10 per class. Placed on `device` rather than a hard-coded
    # "cuda" so that defining this class does not raise on CPU-only machines.
    pos_weight = (torch.ones(num_classes) * 10).to(device)
    criterion = combined_loss

In [None]:
from data.processing import ParseBalanced

# Inputs for the dataset-building step. The output lands under
# E:/SongsDataset/<directory>; what ParseBalanced does with the other arguments
# is defined in data.processing and not visible here.
directory = "large-melspec-dataset-top-50-LIBROSA"
data_directory = "E:/mtg-jamendo/"
subset_file_name = "autotagging_top50tags"

ParseBalanced(
    subset_file_name,
    data_directory,
    f"E:/SongsDataset/{directory}",
    convert=True,
    target_per_genre=1300,
)

In [2]:
from torch.utils.data import DataLoader

# Dataset folder name; each split below has a `data` and a `genre_labels` subfolder.
large_directory = "large-melspec-dataset-top-50-LIBROSA"
_dataset_root = f"E:\\SongsDataset\\{large_directory}"

# Only the training split receives the augmentation pipeline.
train_dataset = StreamingSongDataset(
    f"{_dataset_root}\\train_set\\data",
    f"{_dataset_root}\\train_set\\genre_labels",
    transform=augmentations,
)
test_dataset = StreamingSongDataset(
    f"{_dataset_root}\\test_set\\data",
    f"{_dataset_root}\\test_set\\genre_labels",
)

# Both loaders share the same settings. Shuffling is hard-coded to True here
# (Config.shuffle is not consulted), and it applies to the test loader as well.
_loader_options = dict(
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=Config.num_workers,
    prefetch_factor=Config.prefetch_factor,
)

train_dataloader = DataLoader(train_dataset, **_loader_options)
test_dataloader = DataLoader(test_dataset, **_loader_options)

In [3]:
from models.ShortChunkCNN import ShortChunkCNN
from utils import misc

# Instantiate the transformer autoencoder and report its size as computed by
# misc.model_size.
model = AudioTransformer(
    latent_space=512,
    input_dim=128,
    length=256,
    num_heads=8,
    encoder_layers=8,
    decoder_layers=8,
    d_model=256,
    dropout=0.1,
)
_parameter_count = misc.model_size(model)
print(f"{_parameter_count} Parameters")

77884800 Parameters


In [None]:
from training.autoencoding_training import train_autoencode
# Start training. NOTE(review): the test loader is passed before the train
# loader -- confirm this matches the positional order train_autoencode expects.
train_autoencode(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    show_graph=False,
)