In [None]:
from data.processing import ParseBalanced

# Identifiers for this preprocessing run.
subset_file_name = "autotagging_top50tags"
data_directory = "E:/mtg-jamendo/"
directory = "melspec-dataset-top-50-LIBROSA-256-Triplet"

# Run ParseBalanced for the chosen subset. The third positional argument is
# presumably the output directory — confirm against ParseBalanced's signature.
ParseBalanced(
    subset_file_name,
    data_directory,  # already a str, so it is passed directly
    f"D:/SongsDataset/{directory}",
    convert=True,
    target_per_genre=1300,
    chunk_size=256,
    chunks_per_batch=1,
    write_individually=True,
)

In [3]:
from info_nce import InfoNCE
from data.data_utils import *

from libauc.losses.contrastive import GCLoss_v1

# Dataset identifiers. They repeat the assignments made in the preprocessing
# cell at the top of the notebook.
subset_file_name = "autotagging_top50tags"
data_directory = "E:/mtg-jamendo/"
directory = "melspec-dataset-top-50-LIBROSA-256-Triplet"

# Augmentation pipeline, currently commented out:
# augmentations = Compose([
#     AddGaussianNoise(std=0.25),
# ])

class Config:
    """Shared hyperparameters and runtime settings for this notebook.

    Everything is a class attribute; the cells below pass the class object
    itself (``Config``) to the training and evaluation helpers.
    """

    # --- Runtime ---
    # `torch` is not imported in this cell; presumably it arrives through the
    # star import from data.data_utils — confirm.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    dtype = torch.float32
    model_name = "Myna-CLS-ALIBI-Convex"
    # Backslash-separated relative directory, trailing separator included.
    save_path = f"trained_models\\{model_name}\\"
    seed = 42

    # --- Optimisation ---
    num_epochs = 256
    batch_size = 128
    learning_rate = 2e-4
    weight_decay = 1e-5

    # NOTE(review): the meaning of these four knobs is not visible in this
    # notebook; check the training helpers before changing them.
    coef = 1
    cycles = 42
    warmup = 0
    gamma = 2.0

    # Settings named after SogCLR; presumably consumed by libauc's GCLoss_v1,
    # which is imported above but not used in this cell — confirm.
    sogclr_tau = 0.1
    sogclr_gamma = 0.9
    gamma_schedule = "constant"  # the author also noted 'cosine'
    epochs = 0
    sogclr_eps = 1e-8
    isogclr = False
    rank = 0
    lr_schedule = "constant"

    # --- Data loading ---
    transforms = None
    use_masks = True
    num_workers = 1
    prefetch_factor = 1
    val_split = 0.1
    #pos_weight = (torch.ones(num_classes) * 50).to("cuda")
    # A single InfoNCE instance, created when the class body executes.
    criterion = InfoNCE()

In [4]:
from torch.utils.data import DataLoader

large_directory = directory

# One StreamViewDataset per split; each gets a data path and a genre-label path.
train_dataset = StreamViewDataset(
    f"D:\\SongsDataset\\{large_directory}\\train_set\\data",
    f"D:\\SongsDataset\\{large_directory}\\train_set\\genre_labels",
    pair_album=False,
    views=4,
)
test_dataset = StreamViewDataset(
    f"D:\\SongsDataset\\{large_directory}\\test_set\\data",
    f"D:\\SongsDataset\\{large_directory}\\test_set\\genre_labels",
    pair_album=False,
    views=4,
)

# Both loaders use identical settings taken from Config, so they are built by
# the same expression (train first, then test).
# NOTE(review): the test loader is shuffled as well. If the loss uses other
# items of the batch as negatives, evaluation batches will differ between
# passes — confirm this is intended.
train_dataloader, test_dataloader = (
    DataLoader(
        split_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        prefetch_factor=Config.prefetch_factor,
    )
    for split_dataset in (train_dataset, test_dataset)
)

In [None]:
from models.AudioPreCNNTransformer import AudioPreCNNTransformer
from utils import misc

# Instantiate AudioPreCNNTransformer and print what misc.model_size reports.
# NOTE(review): length=1024 here, while the preprocessing cell builds the
# dataset with chunk_size=256 — confirm the two are meant to differ.
model = AudioPreCNNTransformer(
    latent_space=512,
    input_dim=128,
    length=1024,
    num_heads=8,
    transformer_layers=8,
    d_model=256,
    dropout=0.1,
)
print(f"{misc.model_size(model)} Parameters")

In [None]:
from models.AudioTransformer import AudioTransformer
from utils import misc

# Instantiate AudioTransformer (16 encoder and 16 decoder layers, ALiBi enabled
# with custom slopes) and print what misc.model_size reports for it.
model = AudioTransformer(
    latent_space=32,
    input_dim=128,
    d_model=256,
    dim_feedforward=512,
    length=256,
    num_heads=8,
    encoder_layers=16,
    decoder_layers=16,
    dropout=0.1,
    use_alibi=True,
    custom_slopes=True,
)

print(f"{misc.model_size(model)} Parameters")

In [None]:
from models.AudioTransformerWeaved import AudioTransformerWeaved
from utils import misc

# Instantiate AudioTransformerWeaved and print what misc.model_size reports.
# NOTE(review): length=1024 here, while the preprocessing cell builds the
# dataset with chunk_size=256 — confirm the two are meant to differ.
model = AudioTransformerWeaved(
    latent_space=128,
    input_dim=128,
    d_model=256,
    dim_feedforward=512,
    length=1024,
    num_heads=8,
    encoder_layers=8,
    decoder_layers=8,
    dropout=0.1,
    use_alibi=True,
)
print(f"{misc.model_size(model)} Parameters")

In [None]:
from training.contrastive_training import train_contrastive
from models.AudioViTEncoder import AudioViTEncoder
from utils import misc

# Rename the run on the shared Config class. This mutates Config itself, so
# the new name and save path persist for every cell executed afterwards.
Config.model_name = "ViT-Contrastive-Embeddings-Masking-0.9"
Config.save_path = f"trained_models\\{Config.model_name}\\"

# NOTE(review): the run name says "Masking-0.9" but masking_percent is 0.0
# below — confirm which one is intended.
model = AudioViTEncoder(
    patch_size=8,
    input_dim=128,
    num_heads=8,
    encoder_layers=8,
    length=256,
    d_model=256,
    dim_feedforward=512,
    dropout=0.1,
    latent_space=128,
    use_alibi=True,
    use_pooling=False,
    CLS=True,
    use_rope=False,
    masking_percent=0.0,
    variational=False,
)

print(f"{misc.model_size(model)} Parameters")

# The test loader is passed before the train loader, as in the original call.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=False,
    train_masked=True,
    test_masked=False,
    album=False,
)

In [None]:
from utils import misc
from models.Myna import Myna
from training.contrastive_training import train_contrastive

# Pin the run name and save path for this experiment. The ViT cell above
# reassigns Config.model_name / Config.save_path on the shared class, so
# without this a Myna run started after it would inherit the ViT values
# (save_path is presumably where train_contrastive writes checkpoints).
# These are the same values Config declares by default.
Config.model_name = "Myna-CLS-ALIBI-Convex"
Config.save_path = f"trained_models\\{Config.model_name}\\"

# Myna encoder over single-channel 128x256 inputs split into 16x16 patches.
model = Myna(
    image_size=(128, 256),
    channels=1,
    patch_size=(16, 16),
    latent_space=128,
    d_model=384,
    depth=12,
    heads=6,
    mlp_dim=1536,
    mask_ratio=0.9,
    use_cls=True
)

# To continue from a saved checkpoint instead, load it here (start_epoch below
# would presumably need to match — confirm):
#model = torch.load("E:\\Coding\\SongAnalyzer\\Analyzer\\src\\trained_models\\Myna-CLS-ASlbum\\Epoch-89.pt", weights_only=False)

print(f"{misc.model_size(model)} Parameters")

# The test loader is passed before the train loader, as in the original call.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=False,
    train_masked=True,
    test_masked=False,
    album=False,
    convex=True,
    start_epoch=0,
    views=4,
)

42719872 Parameters


  0%|          | 0/79 [00:05<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 0] Train: Same Song Contrastive Loss = 4.7767	|	Convex Loss = 0.0081
Test: Same Song Contrastive Loss = 4.5575



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 1] Train: Same Song Contrastive Loss = 4.7633	|	Convex Loss = 0.0104
Test: Same Song Contrastive Loss = 4.4086



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 2] Train: Same Song Contrastive Loss = 4.5159	|	Convex Loss = 0.0091
Test: Same Song Contrastive Loss = 4.2246



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 3] Train: Same Song Contrastive Loss = 4.3460	|	Convex Loss = 0.0121
Test: Same Song Contrastive Loss = 4.0049



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 4] Train: Same Song Contrastive Loss = 4.2404	|	Convex Loss = 0.0130
Test: Same Song Contrastive Loss = 3.9485



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 5] Train: Same Song Contrastive Loss = 4.1484	|	Convex Loss = 0.0142
Test: Same Song Contrastive Loss = 3.8786



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 6] Train: Same Song Contrastive Loss = 4.0562	|	Convex Loss = 0.0161
Test: Same Song Contrastive Loss = 3.8346



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 7] Train: Same Song Contrastive Loss = 3.9950	|	Convex Loss = 0.0164
Test: Same Song Contrastive Loss = 3.7889



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 8] Train: Same Song Contrastive Loss = 3.9331	|	Convex Loss = 0.0168
Test: Same Song Contrastive Loss = 3.6831



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 9] Train: Same Song Contrastive Loss = 3.8947	|	Convex Loss = 0.0185
Test: Same Song Contrastive Loss = 3.7349



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 10] Train: Same Song Contrastive Loss = 3.8473	|	Convex Loss = 0.0182
Test: Same Song Contrastive Loss = 3.6189



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 11] Train: Same Song Contrastive Loss = 3.7842	|	Convex Loss = 0.0184
Test: Same Song Contrastive Loss = 3.5189



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 12] Train: Same Song Contrastive Loss = 3.7285	|	Convex Loss = 0.0190
Test: Same Song Contrastive Loss = 3.5077



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 13] Train: Same Song Contrastive Loss = 3.6984	|	Convex Loss = 0.0197
Test: Same Song Contrastive Loss = 3.4208



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 14] Train: Same Song Contrastive Loss = 3.6330	|	Convex Loss = 0.0186
Test: Same Song Contrastive Loss = 3.4065



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 15] Train: Same Song Contrastive Loss = 3.6200	|	Convex Loss = 0.0184
Test: Same Song Contrastive Loss = 3.4683



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 16] Train: Same Song Contrastive Loss = 3.5992	|	Convex Loss = 0.0182
Test: Same Song Contrastive Loss = 3.4608



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 17] Train: Same Song Contrastive Loss = 3.5810	|	Convex Loss = 0.0196
Test: Same Song Contrastive Loss = 3.4235



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 18] Train: Same Song Contrastive Loss = 3.5725	|	Convex Loss = 0.0198
Test: Same Song Contrastive Loss = 3.3648



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 19] Train: Same Song Contrastive Loss = 3.4976	|	Convex Loss = 0.0185
Test: Same Song Contrastive Loss = 3.2795



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 20] Train: Same Song Contrastive Loss = 3.5154	|	Convex Loss = 0.0182
Test: Same Song Contrastive Loss = 3.4076



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 21] Train: Same Song Contrastive Loss = 3.4536	|	Convex Loss = 0.0181
Test: Same Song Contrastive Loss = 3.3897



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 22] Train: Same Song Contrastive Loss = 3.4671	|	Convex Loss = 0.0187
Test: Same Song Contrastive Loss = 3.3301



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 23] Train: Same Song Contrastive Loss = 3.4185	|	Convex Loss = 0.0177
Test: Same Song Contrastive Loss = 3.2588



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 24] Train: Same Song Contrastive Loss = 3.4013	|	Convex Loss = 0.0201
Test: Same Song Contrastive Loss = 3.2063



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 25] Train: Same Song Contrastive Loss = 3.4044	|	Convex Loss = 0.0190
Test: Same Song Contrastive Loss = 3.2696



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 26] Train: Same Song Contrastive Loss = 3.3803	|	Convex Loss = 0.0189
Test: Same Song Contrastive Loss = 3.2042



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 27] Train: Same Song Contrastive Loss = 3.3471	|	Convex Loss = 0.0179
Test: Same Song Contrastive Loss = 3.2097



  0%|          | 0/79 [00:05<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 28] Train: Same Song Contrastive Loss = 3.3544	|	Convex Loss = 0.0195
Test: Same Song Contrastive Loss = 3.1640



  0%|          | 0/79 [00:05<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 29] Train: Same Song Contrastive Loss = 3.3444	|	Convex Loss = 0.0185
Test: Same Song Contrastive Loss = 3.1876



  0%|          | 0/79 [00:05<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 30] Train: Same Song Contrastive Loss = 3.3271	|	Convex Loss = 0.0187
Test: Same Song Contrastive Loss = 3.1781



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 31] Train: Same Song Contrastive Loss = 3.3099	|	Convex Loss = 0.0189
Test: Same Song Contrastive Loss = 3.1334



  0%|          | 0/79 [00:17<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 32] Train: Same Song Contrastive Loss = 3.3109	|	Convex Loss = 0.0186
Test: Same Song Contrastive Loss = 3.2298



  0%|          | 0/79 [00:07<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 33] Train: Same Song Contrastive Loss = 3.2747	|	Convex Loss = 0.0186
Test: Same Song Contrastive Loss = 3.1598



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 34] Train: Same Song Contrastive Loss = 3.2727	|	Convex Loss = 0.0178
Test: Same Song Contrastive Loss = 3.1076



  0%|          | 0/79 [00:04<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

[Epoch 35] Train: Same Song Contrastive Loss = 3.2781	|	Convex Loss = 0.0184
Test: Same Song Contrastive Loss = 3.1980



  0%|          | 0/79 [00:05<?, ?it/s]

In [None]:
from training.contrastive_training import evaluate_contrastive

# Evaluate the current model on the test loader at several mask ratios.
# This replaces four copy-pasted cells that differed only in the ratio.
# As before, model.mask_ratio is left at the last value (0.0) and
# same_song_contrastive_loss holds the last result once the loop finishes.
# NOTE(review): test_masked=False is passed for every ratio. If that flag
# turns masking off inside evaluate_contrastive, mask_ratio has no effect
# here — confirm.
mask_ratio_losses = {}
for mask_ratio in (0.9, 0.5, 0.25, 0.0):
    model.mask_ratio = mask_ratio
    same_song_contrastive_loss = evaluate_contrastive(model, test_dataloader, Config, test_masked=False)
    mask_ratio_losses[mask_ratio] = same_song_contrastive_loss
    # Label each value so the results can be told apart in a single output.
    print(f"mask_ratio={mask_ratio}: {same_song_contrastive_loss}")

In [None]:
from training.autoencoding_training import train_autoencode
# NOTE(review): `model` is whichever model an earlier cell left in the kernel;
# confirm it is the one intended for autoencoder training before running.
# The test loader is passed before the train loader, as in the original call.
train_autoencode(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    show_graph=False,
)

In [None]:
import librosa
import IPython
import numpy as np
import torch
import os

from datasets import tqdm
from training.inference import load_and_parse_audio

def test(model, path="E:\\SongsDataset\\songs\\", start=100, stop=110, rate=44100):
    """Reconstruct a handful of songs with ``model`` and show input vs. output.

    For every selected entry under ``path`` the function loads chunks with
    ``load_and_parse_audio``, standardises each chunk over its last two
    dimensions, runs the model, and then:

    * plots the first 512 columns of the concatenated input and reconstruction
      with ``graph``;
    * inverts both with ``librosa.feature.inverse.mel_to_stft`` followed by
      ``librosa.griffinlim`` and displays them as audio widgets.

    Args:
        model: Callable returning ``(reconstruction, latent)`` for a batch of
            chunks. When it is a ``torch.nn.Module`` it is switched to eval
            mode for the duration of the call and restored afterwards.
        path: Directory whose entries are the songs to load.
        start: First index (inclusive) into the sorted directory listing.
        stop: Last index (exclusive) into the sorted directory listing.
        rate: Sample rate handed to the audio widgets.

    Returns:
        None. Results are shown through ``graph`` and ``IPython.display``.
    """
    # os.listdir returns entries in arbitrary order; sort so that the
    # [start:stop] slice selects the same songs on every run.
    selected = sorted(os.listdir(path))[start:stop]

    # Follow the model's own device instead of assuming CUDA. Fall back to the
    # previous hard-coded "cuda" when the model exposes no parameters.
    is_module = isinstance(model, torch.nn.Module)
    first_param = next(model.parameters(), None) if is_module else None
    device = first_param.device if first_param is not None else "cuda"

    # Inference should not run with dropout active; remember the current mode
    # so the caller's model is handed back unchanged.
    was_training = is_module and model.training
    if is_module:
        model.eval()

    try:
        with torch.no_grad():
            for each_song in tqdm(selected):
                song_path = os.path.join(path, each_song)
                chunks = load_and_parse_audio(song_path, convert=True, chunk_size=1024).to(device)

                # Nothing to reconstruct for a song that produced no chunks.
                if chunks.shape[0] == 0:
                    continue

                # Standardise every chunk independently (statistics over dims 1 and 2).
                mean = chunks.mean(dim=[1, 2], keepdim=True)
                std = chunks.std(dim=[1, 2], keepdim=True)
                normalized = (chunks - mean) / (std + 1e-6)

                reconstructed, _ = model(normalized)

                # Join the chunks side by side along axis 1, then keep the first 512 columns.
                input_tensor = np.concatenate(normalized.cpu().detach().numpy(), axis=1)
                reconstructed = np.concatenate(reconstructed.cpu().detach().numpy(), axis=1)

                input_tensor = input_tensor[:, :512]
                reconstructed = reconstructed[:, :512]

                graph(input_tensor, reconstructed)

                # NOTE(review): both arrays are standardised values, and mel_to_stft is
                # called with its default sample rate while playback uses `rate` —
                # confirm whether de-normalisation or a matching `sr` is required.
                S_recon = librosa.feature.inverse.mel_to_stft(reconstructed)
                Y_recon = librosa.griffinlim(S_recon)

                S_orig = librosa.feature.inverse.mel_to_stft(input_tensor)
                Y_orig = librosa.griffinlim(S_orig)

                IPython.display.display(IPython.display.Audio(Y_orig, rate=rate))
                IPython.display.display(IPython.display.Audio(Y_recon, rate=rate))
    finally:
        if is_module:
            model.train(was_training)