In [None]:
from data.processing import ParseBalanced

# Inputs for the one-off preprocessing run: source folder, tag-subset name and
# the name of the dataset folder that is written under D:/SongsDataset/.
data_directory = "E:/mtg-jamendo/"
subset_file_name = "autotagging_top50tags"
directory = "melspec-dataset-top-50-LIBROSA-256-Triplet"

output_directory = f"D:/SongsDataset/{directory}"

ParseBalanced(
    subset_file_name,
    data_directory,
    output_directory,
    convert=True,
    target_per_genre=1300,
    chunk_size=256,
    chunks_per_batch=1,
    write_individually=True,
)

In [1]:
from info_nce import InfoNCE
from data.data_utils import *

from libauc.losses.contrastive import GCLoss_v1

# Dataset identifiers. These repeat the values assigned in the preprocessing
# cell so that this cell can be run without executing that one first.
# `directory` is later interpolated into the D:\SongsDataset\... paths used to
# open the train/test splits.
directory = "melspec-dataset-top-50-LIBROSA-256-Triplet"
data_directory = "E:/mtg-jamendo/"
subset_file_name = "autotagging_top50tags"

# Disabled augmentation pipeline, kept for reference (Config.transforms is None).
# augmentations = Compose([
#     AddGaussianNoise(std=0.25),
# ])

class Config:
    """Settings namespace passed as a whole to the training and evaluation helpers.

    All values are class attributes evaluated once, when this cell runs. The
    cells shown in this notebook use the class object itself (``Config``) and
    reassign attributes on it in place rather than creating instances.
    """

    # === General ===
    # Prefer the GPU when torch reports CUDA as available; otherwise run on CPU.
    # NOTE(review): `torch` is not imported explicitly in this cell; presumably it
    # arrives through `from data.data_utils import *` -- confirm.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32
    model_name = "Myna-CLS-Sinusoidal"
    # Formatted from model_name right here, at class-definition time. It is NOT
    # recomputed when model_name is reassigned later, so update both together.
    save_path = f"trained_models\\{model_name}\\"
    seed = 42

    # === Training ===
    num_epochs = 128
    batch_size = 484
    learning_rate = 2e-4
    weight_decay = 1e-5

    # NOTE(review): the meaning of the next four values is defined by the
    # training code, which is not visible from this notebook.
    coef = 1
    cycles = 42
    warmup = 0
    gamma = 2.0

    # Presumably consumed by a SogCLR-style loss (cf. the GCLoss_v1 import) --
    # TODO confirm; the criterion actually configured below is InfoNCE.
    sogclr_tau = 0.1
    sogclr_gamma = 0.9
    gamma_schedule = 'constant' #'cosine'
    # NOTE(review): separate from num_epochs above; its purpose is not visible here.
    epochs = 0
    sogclr_eps = 1e-8
    isogclr = False
    rank = 0
    lr_schedule = 'constant'

    # === Dataset ===
    transforms = None
    use_masks = True
    # num_workers and prefetch_factor are forwarded to the DataLoader constructors.
    num_workers = 1
    prefetch_factor = 1
    val_split = 0.1
    #pos_weight = (torch.ones(num_classes) * 50).to("cuda")
    # One InfoNCE instance, built with its default arguments when the class body
    # executes; everything that reads Config.criterion shares this object.
    criterion = InfoNCE()

In [2]:
from torch.utils.data import DataLoader

large_directory = directory
dataset_root = f"D:\\SongsDataset\\{large_directory}"


def _open_split(split_name):
    """Open the two-view StreamViewDataset stored under ``<dataset_root>\\<split_name>``.

    Args:
        split_name: Name of the split folder, e.g. ``"train_set"`` or ``"test_set"``.

    Returns:
        A ``StreamViewDataset`` reading the split's ``data`` and ``genre_labels``
        folders with ``pair_album=False`` and ``views=2``.
    """
    return StreamViewDataset(
        f"{dataset_root}\\{split_name}\\data",
        f"{dataset_root}\\{split_name}\\genre_labels",
        pair_album=False,
        views=2,
    )


def _make_loader(dataset):
    """Wrap ``dataset`` in a shuffling DataLoader configured from ``Config``.

    ``prefetch_factor`` is only forwarded when worker processes are used:
    DataLoader raises ``ValueError`` if it is specified together with
    ``num_workers == 0``, which would otherwise make ``Config.num_workers = 0``
    (a common setting when debugging) unusable.
    """
    loader_options = dict(
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
    )
    if Config.num_workers > 0:
        loader_options["prefetch_factor"] = Config.prefetch_factor
    return DataLoader(dataset, **loader_options)


train_dataset = _open_split("train_set")
test_dataset = _open_split("test_set")

train_dataloader = _make_loader(train_dataset)
# NOTE(review): the test loader shuffles as well (kept from the original), so
# batch composition differs between evaluation runs.
test_dataloader = _make_loader(test_dataset)

In [None]:
from training.contrastive_training import train_contrastive
from models.AudioViTEncoder import AudioViTEncoder
from utils import misc

# Point the shared Config at this run. save_path is a plain string, so it has
# to be rebuilt by hand whenever model_name changes.
run_name = "ViT-Contrastive-Embeddings-Masking-0.9"
Config.model_name = run_name
Config.save_path = f"trained_models\\{run_name}\\"

model = AudioViTEncoder(
    patch_size=8,
    input_dim=128,
    num_heads=8,
    encoder_layers=8,
    length=256,
    d_model=256,
    dim_feedforward=512,
    dropout=0.1,
    latent_space=128,
    use_alibi=True,
    use_pooling=False,
    CLS=True,
    use_rope=False,
    masking_percent=0.0,
    variational=False,
)

parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

# NOTE(review): the test loader is passed before the train loader, as in every
# other train_contrastive call in this notebook -- confirm against its signature.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=False,
    train_masked=True,
    test_masked=False,
    album=False,
)

In [None]:
from utils import misc
from models.Myna import Myna
from training.contrastive_training import train_contrastive

# Architecture of the Myna encoder for this run, gathered in one mapping so the
# constructor call itself stays short.
myna_settings = dict(
    image_size=(128, 256),
    channels=1,
    patch_size=(16, 16),
    latent_space=128,
    d_model=384,
    depth=12,
    heads=6,
    mlp_dim=1536,
    mask_ratio=0.9,
    use_cls=True,
    alibi=False,
)
model = Myna(**myna_settings)

parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

# Contrastive training starting at epoch 0, with two views per sample.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=False,
    train_masked=True,
    test_masked=False,
    album=False,
    convex=False,
    start_epoch=0,
    views=2,
)

21425536 Parameters


  0%|          | 0/21 [00:03<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[Epoch 0] Train: Same Song Contrastive Loss = 6.1726
Test: Same Song Contrastive Loss = 5.5328



  0%|          | 0/21 [00:04<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[Epoch 1] Train: Same Song Contrastive Loss = 6.1222
Test: Same Song Contrastive Loss = 5.4503



  0%|          | 0/21 [00:04<?, ?it/s]

In [3]:
from training.contrastive_training import evaluate_contrastive

# Evaluate the epoch-64 checkpoint of the "Myna-CLS-ALIBI-2" run on the test loader.
checkpoint_path = "E:\\Coding\\SongAnalyzer\\Analyzer\\src\\trained_models\\Myna-CLS-ALIBI-2\\Epoch-64.pt"

# weights_only=False unpickles the whole model object, which can execute
# arbitrary code -- only load checkpoints you produced yourself.
# map_location keeps the load consistent with Config.device, so a checkpoint
# saved from a GPU can still be opened when Config has fallen back to "cpu".
model = torch.load(checkpoint_path, map_location=Config.device, weights_only=False)

# Turn off patch masking on the loaded model before measuring the loss.
model.mask_ratio = 0.0
same_song_contrastive_loss = evaluate_contrastive(model, test_dataloader, Config, test_masked=False)
print(same_song_contrastive_loss)

  0%|          | 0/3 [00:00<?, ?it/s]

3.3003698190053306


In [4]:
from training.contrastive_training import evaluate_contrastive

# Evaluate the epoch-64 checkpoint of the "Myna-CLS" run on the test loader.
checkpoint_path = "E:\\Coding\\SongAnalyzer\\Analyzer\\src\\trained_models\\Myna-CLS\\Epoch-64.pt"

# weights_only=False unpickles the whole model object, which can execute
# arbitrary code -- only load checkpoints you produced yourself.
# map_location keeps the load consistent with Config.device, so a checkpoint
# saved from a GPU can still be opened when Config has fallen back to "cpu".
model = torch.load(checkpoint_path, map_location=Config.device, weights_only=False)

# Turn off patch masking on the loaded model before measuring the loss.
model.mask_ratio = 0.0
same_song_contrastive_loss = evaluate_contrastive(model, test_dataloader, Config, test_masked=False)
print(same_song_contrastive_loss)

  0%|          | 0/3 [00:00<?, ?it/s]

3.2113026777903237


In [7]:
from utils import misc
from models.Myna import Myna
from training.contrastive_training import train_contrastive

# Same Myna architecture as the from-scratch run; only start_epoch differs.
myna_settings = {
    "image_size": (128, 256),
    "channels": 1,
    "patch_size": (16, 16),
    "latent_space": 128,
    "d_model": 384,
    "depth": 12,
    "heads": 6,
    "mlp_dim": 1536,
    "mask_ratio": 0.9,
    "use_cls": True,
    "alibi": False,
}
model = Myna(**myna_settings)

parameter_count = misc.model_size(model)
print(f"{parameter_count} Parameters")

# NOTE(review): the saved output of this cell is
# "AttributeError: 'NoneType' object has no attribute 'shape'". A freshly
# constructed model is handed over with start_epoch=14; whether
# train_contrastive restores a checkpoint for that epoch is not visible here.
train_contrastive(
    model,
    test_dataloader,
    train_dataloader,
    Config,
    variational=False,
    train_masked=True,
    test_masked=False,
    album=False,
    convex=False,
    start_epoch=14,
    views=2,
)

21425536 Parameters


  0%|          | 0/21 [00:04<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
from training.contrastive_training import evaluate_contrastive

# Re-evaluate whichever `model` is currently bound in the kernel (this cell
# neither builds nor loads one) after setting its mask_ratio to 0.25.
# NOTE(review): test_masked=False is passed while mask_ratio is non-zero;
# whether mask_ratio still takes effect depends on code not shown here.
model.mask_ratio = 0.25
same_song_contrastive_loss = evaluate_contrastive(model, test_dataloader, Config, test_masked=False)
print(same_song_contrastive_loss)

In [None]:
from training.contrastive_training import evaluate_contrastive

# Re-evaluate whichever `model` is currently bound in the kernel (this cell
# neither builds nor loads one) with mask_ratio reset to 0.0.
model.mask_ratio = 0.0
same_song_contrastive_loss = evaluate_contrastive(model, test_dataloader, Config, test_masked=False)
print(same_song_contrastive_loss)

In [None]:
from training.autoencoding_training import train_autoencode
# Runs autoencoder training on whichever `model` is currently bound in the
# kernel, with the same loaders and Config as the contrastive cells.
# NOTE(review): the test loader is passed before the train loader, as in the
# train_contrastive calls -- confirm against train_autoencode's signature.
train_autoencode(model, test_dataloader, train_dataloader, Config, show_graph=False)

In [None]:
import librosa
import IPython
import numpy as np
import torch
import os

from datasets import tqdm
from training.inference import load_and_parse_audio

def test(model, songs_dir="E:\\SongsDataset\\songs\\", first_song=100, last_song=110,
         chunk_size=1024, preview_frames=512, sample_rate=44100):
    """Reconstruct a handful of songs with ``model`` and show/play them next to the input.

    For every selected entry of ``songs_dir`` the audio is loaded through
    ``load_and_parse_audio``, each chunk is standardised with its own mean and
    standard deviation, and the whole batch is passed through ``model``. The
    first ``preview_frames`` columns of the concatenated input and of the
    concatenated reconstruction are plotted with ``graph`` and turned back into
    audio (``mel_to_stft`` followed by Griffin-Lim) for playback.

    Args:
        model: Callable returning ``(reconstruction, latent)`` for a batch of
            chunks. If it is a ``torch.nn.Module`` it is switched to eval mode
            for the duration of the call and restored afterwards.
        songs_dir: Folder whose entries are handed to ``load_and_parse_audio``.
        first_song: Start index (inclusive) into ``os.listdir(songs_dir)``.
        last_song: Stop index (exclusive) into ``os.listdir(songs_dir)``.
        chunk_size: Forwarded to ``load_and_parse_audio``.
        preview_frames: Number of leading columns kept for plotting/playback.
        sample_rate: Playback rate given to the audio widgets.

    Returns:
        None. Results are displayed in the notebook.
    """
    from IPython.display import Audio, display

    # Follow the model's own device instead of assuming a GPU; "cuda" remains
    # the fallback when the device cannot be determined from parameters.
    is_module = isinstance(model, torch.nn.Module)
    first_param = next(model.parameters(), None) if is_module else None
    device = first_param.device if first_param is not None else torch.device("cuda")

    # torch.no_grad() does not disable dropout / batch-norm updates, so switch
    # to eval mode explicitly and put the previous mode back when done.
    was_training = is_module and model.training
    if is_module:
        model.eval()

    try:
        with torch.no_grad():
            # os.listdir returns entries in arbitrary order, so the selected
            # slice is only stable as long as the listing order is.
            for song_name in tqdm(os.listdir(songs_dir)[first_song:last_song]):
                song_path = os.path.join(songs_dir, song_name)
                chunks = load_and_parse_audio(song_path, convert=True, chunk_size=chunk_size).to(device)

                # Nothing to reconstruct for a song that produced no chunks.
                if chunks.shape[0] == 0:
                    continue

                # Per-chunk standardisation over the last two dimensions.
                # assumes chunks is (num_chunks, n_mels, frames) -- TODO confirm
                mean = chunks.mean(dim=[1, 2], keepdim=True)
                std = chunks.std(dim=[1, 2], keepdim=True)
                normalized = (chunks - mean) / (std + 1e-6)

                reconstructed, _ = model(normalized)

                # Lay the chunks side by side along axis 1, then keep the first
                # preview_frames columns.
                input_tensor = np.concatenate(normalized.cpu().detach().numpy(), axis=1)[:, :preview_frames]
                reconstructed = np.concatenate(reconstructed.cpu().detach().numpy(), axis=1)[:, :preview_frames]

                graph(input_tensor, reconstructed)

                # NOTE(review): mel_to_stft runs with librosa's default settings
                # on the standardised values, while playback uses sample_rate --
                # confirm both match the preprocessing.
                S_recon = librosa.feature.inverse.mel_to_stft(reconstructed)
                Y_recon = librosa.griffinlim(S_recon)

                S_orig = librosa.feature.inverse.mel_to_stft(input_tensor)
                Y_orig = librosa.griffinlim(S_orig)

                display(Audio(Y_orig, rate=sample_rate))
                display(Audio(Y_recon, rate=sample_rate))
    finally:
        if was_training:
            model.train()