In [None]:
from data.processing import ParseBalanced

# Identifiers for the dataset being built: output folder name, raw-data root
# and the name of the subset file handed to the parser.
directory = "melspec-dataset-top-50-LIBROSA-256-Triplet"
data_directory = "E:/mtg-jamendo/"
subset_file_name = "autotagging_top50tags"

# One-off preprocessing run; results are written below D:/SongsDataset/<directory>.
ParseBalanced(
    subset_file_name,
    data_directory,
    "D:/SongsDataset/" + directory,
    convert=True,
    target_per_genre=1300,
    chunk_size=256,
    chunks_per_batch=1,
    write_individually=True,
)

In [1]:
from torchaudio.transforms import TimeMasking, FrequencyMasking
from info_nce import InfoNCE
from data.data_utils import *

# Dataset identifiers. These repeat the values assigned in the preprocessing
# cell, so they are available even when that cell has not been executed.
directory = "melspec-dataset-top-50-LIBROSA-256-Triplet"
data_directory = "E:/mtg-jamendo/"
subset_file_name = "autotagging_top50tags"

# Augmentation pipeline, currently disabled (Config.transforms is None below).
# augmentations = Compose([
#     AddGaussianNoise(std=0.25),
# ])

class Config:
    """Run settings, stored as class attributes and used without instantiation.

    The class object itself is handed to the training entry points
    (``train_contrastive`` / ``train_autoencode``), and the DataLoader cells read
    ``batch_size``, ``num_workers`` and ``prefetch_factor`` from it.

    NOTE(review): ``torch`` is not imported explicitly in this cell; presumably
    it arrives through ``from data.data_utils import *`` -- verify.
    """

    # === General ===
    # Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32
    model_name = "ViT-Contrastive-Embeddings-Masking-0.9"
    # Evaluated once, when the class body runs. Changing model_name later does
    # NOT update save_path; cells that rename the run must reassign both.
    save_path = f"trained_models\\{model_name}\\"
    # NOTE(review): nothing in this notebook applies the seed; presumably the
    # training code does -- confirm.
    seed = 42

    # === Training ===
    num_classes = 50
    num_epochs = 128
    batch_size = 128
    learning_rate = 1e-4
    weight_decay = 1e-4

    # The four values below are consumed by the training code; their exact
    # meaning is not visible from this notebook -- TODO document.
    coef = 1
    cycles = 42
    warmup = 0
    gamma = 2.0

    # === Dataset ===
    # No augmentation pipeline is configured.
    transforms = None
    use_masks = True
    # Forwarded to torch.utils.data.DataLoader by the loader cells.
    num_workers = 2
    prefetch_factor = 1
    # Presumably the fraction of data held out for validation -- confirm.
    val_split = 0.1
    #pos_weight = (torch.ones(num_classes) * 50).to("cuda")
    # A single InfoNCE instance with default arguments, created when the class
    # body runs and shared by everything that reads Config.criterion.
    criterion = InfoNCE()

In [2]:
from torch.utils.data import DataLoader

# Both splits sit under the same dataset root and share one on-disk layout:
# <root>\<split>\data and <root>\<split>\genre_labels.
large_directory = directory
train_root = f"D:\\SongsDataset\\{large_directory}\\train_set"
test_root = f"D:\\SongsDataset\\{large_directory}\\test_set"

train_dataset = StreamViewDataset(train_root + "\\data", train_root + "\\genre_labels", pair_album=True)
test_dataset = StreamViewDataset(test_root + "\\data", test_root + "\\genre_labels", pair_album=True)

# The two loaders use identical settings, all of them read from Config.
loader_kwargs = dict(
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=Config.num_workers,
    prefetch_factor=Config.prefetch_factor,
)

train_dataloader = DataLoader(train_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)

Reading: 54380 tracks, 11107 albums, 3517 artists
Reading: 54380 tracks, 11107 albums, 3517 artists


In [None]:
from models.AudioPreCNNTransformer import AudioPreCNNTransformer
from utils import misc

# Candidate model: AudioPreCNNTransformer. Running this cell replaces whatever
# `model` was previously bound to.
model = AudioPreCNNTransformer(
    latent_space=512,
    input_dim=128,
    length=1024,
    num_heads=8,
    transformer_layers=8,
    d_model=256,
    dropout=0.1,
)

# Report the size returned by misc.model_size for this model.
parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

In [None]:
from models.AudioTransformer import AudioTransformer
from utils import misc

# Candidate model: AudioTransformer. Running this cell replaces whatever
# `model` was previously bound to.
model = AudioTransformer(
    latent_space=32,
    input_dim=128,
    d_model=256,
    dim_feedforward=512,
    length=256,
    num_heads=8,
    encoder_layers=16,
    decoder_layers=16,
    dropout=0.1,
    use_alibi=True,
    custom_slopes=True,
)

# Report the size returned by misc.model_size for this model.
parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

In [None]:
from models.AudioTransformerWeaved import AudioTransformerWeaved
from utils import misc

# Candidate model: AudioTransformerWeaved. Running this cell replaces whatever
# `model` was previously bound to.
model = AudioTransformerWeaved(
    latent_space=128,
    input_dim=128,
    d_model=256,
    dim_feedforward=512,
    length=1024,
    num_heads=8,
    encoder_layers=8,
    decoder_layers=8,
    dropout=0.1,
    use_alibi=True,
)

# Report the size returned by misc.model_size for this model.
parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

In [None]:
from training.contrastive_training import train_contrastive
from models.AudioViTEncoder import AudioViTEncoder
from utils import misc

# Rename the run. save_path was computed when Config was defined, so it has to
# be rebuilt by hand whenever model_name changes.
run_name = "ViT-Contrastive-Embeddings-Masking-0.9-Variational-Full-Album"
Config.model_name = run_name
Config.save_path = f"trained_models\\{run_name}\\"

# Resume from a fully pickled model object. weights_only=False unpickles
# arbitrary Python objects, so only load checkpoints from a trusted source.
checkpoint_path = "E:/Coding/SongAnalyzer/Analyzer/src/trained_models/ViT-Contrastive-Embeddings-Masking-0.9-Variational-Full/Classifier-Epoch-63.pt"
model = torch.load(checkpoint_path, weights_only=False)

parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

# Masking is enabled for the training pass only.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=True,
    train_masked=True,
    test_masked=False,
)

8123008 Parameters


  0%|          | 0/79 [00:11<?, ?it/s]

In [None]:
from training.contrastive_training import train_contrastive
from models.AudioViTEncoder import AudioViTEncoder
from utils import misc

from torch.utils.data import DataLoader

# Build the album-paired datasets and their loaders. Both splits share one
# on-disk layout: <root>\<split>\data and <root>\<split>\genre_labels.
large_directory = directory
train_root = f"D:\\SongsDataset\\{large_directory}\\train_set"
test_root = f"D:\\SongsDataset\\{large_directory}\\test_set"

train_dataset = StreamViewDataset(train_root + "\\data", train_root + "\\genre_labels", pair_album=True)
test_dataset = StreamViewDataset(test_root + "\\data", test_root + "\\genre_labels", pair_album=True)

# The two loaders use identical settings, all of them read from Config.
loader_kwargs = dict(
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=Config.num_workers,
    prefetch_factor=Config.prefetch_factor,
)

train_dataloader = DataLoader(train_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)

# Rename the run. save_path was computed when Config was defined, so it has to
# be rebuilt by hand whenever model_name changes.
run_name = "ViT-Contrastive-Embeddings-Masking-0.9-Variational-Full-Album"
Config.model_name = run_name
Config.save_path = f"trained_models\\{run_name}\\"

# Resume from a fully pickled model object. weights_only=False unpickles
# arbitrary Python objects, so only load checkpoints from a trusted source.
checkpoint_path = "E:/Coding/SongAnalyzer/Analyzer/src/trained_models/ViT-Contrastive-Embeddings-Masking-0.9-Variational/Classifier-Epoch-36.pt"
model = torch.load(checkpoint_path, weights_only=False)

parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

# Masking is enabled for the training pass only.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=True,
    train_masked=True,
    test_masked=False,
)

In [None]:
from training.autoencoding_training import train_autoencode
# Autoencoder training on whichever `model` and data loaders the previously
# executed cells left in scope; this cell defines none of them itself.
# NOTE(review): test_dataloader is passed before train_dataloader, the same
# order used in the train_contrastive calls -- confirm it matches the parameter
# order of train_autoencode.
train_autoencode(model, test_dataloader, train_dataloader, Config, show_graph=False)

In [None]:
import librosa
import IPython
import numpy as np
import torch
import os

from datasets import tqdm
from training.inference import load_and_parse_audio

def test(
    model,
    path="E:\\SongsDataset\\songs\\",
    start=100,
    count=10,
    chunk_size=1024,
    device="cuda",
    sample_rate=44100,
    preview_frames=512,
):
    """Reconstruct a few songs with ``model`` and show/play input vs. output.

    For every selected entry of ``path`` the audio is loaded as chunks, each
    chunk is standardized on its own, and the chunks are passed through the
    model. The leading ``preview_frames`` columns of the concatenated input and
    reconstruction are plotted with ``graph`` and converted back to audio with
    librosa (mel -> STFT -> Griffin-Lim) for listening.

    Args:
        model: Module called as ``model(batch)`` and expected to return a
            ``(reconstruction, latent)`` pair.
        path: Directory whose entries are the songs to load.
        start: Index of the first entry (in sorted order) to process.
        count: Number of consecutive entries to process.
        chunk_size: Forwarded to ``load_and_parse_audio``.
        device: Device the chunks are moved to before the forward pass; it
            must match the device the model lives on.
        sample_rate: Playback rate given to ``IPython.display.Audio``.
        preview_frames: Number of leading columns kept for plotting/playback.

    The defaults reproduce the values that used to be hard-coded.

    NOTE(review): the mel inversion runs on the standardized values and with
    librosa's default ``sr``, while playback uses ``sample_rate`` -- confirm
    this matches how the spectrograms were produced.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # [start:start + count] selection the same on every run.
    all_folders = sorted(os.listdir(path))
    selected = all_folders[start:start + count]

    # torch.no_grad() only disables gradient tracking. Dropout and similar
    # layers are switched off by eval(), so enable it for the duration of the
    # test and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for each_song in tqdm(selected):
                song_path = os.path.join(path, each_song)
                chunks = load_and_parse_audio(song_path, convert=True, chunk_size=chunk_size).to(device)

                # Nothing to reconstruct for audio that yields no chunks.
                if len(chunks) == 0:
                    continue

                # Standardize each chunk independently over its last two dims.
                mean = chunks.mean(dim=[1, 2], keepdim=True)
                std = chunks.std(dim=[1, 2], keepdim=True)
                normalized = (chunks - mean) / (std + 1e-6)

                reconstructed, _latent = model(normalized)

                # Lay the chunks side by side along axis 1, then keep only the
                # leading preview window.
                input_tensor = np.concatenate(normalized.cpu().detach().numpy(), axis=1)
                reconstructed = np.concatenate(reconstructed.cpu().detach().numpy(), axis=1)

                input_tensor = input_tensor[:, :preview_frames]
                reconstructed = reconstructed[:, :preview_frames]

                graph(input_tensor, reconstructed)

                S_recon = librosa.feature.inverse.mel_to_stft(reconstructed)
                Y_recon = librosa.griffinlim(S_recon)

                S_orig = librosa.feature.inverse.mel_to_stft(input_tensor)
                Y_orig = librosa.griffinlim(S_orig)

                IPython.display.display(IPython.display.Audio(Y_orig, rate=sample_rate))
                IPython.display.display(IPython.display.Audio(Y_recon, rate=sample_rate))
    finally:
        model.train(was_training)

In [None]:
# Run the reconstruction listening test on whichever `model` is currently in scope.
test(model)