# Initialize Model

In [1]:
from loss.loss_utils import combined_loss
import os

import torch
from torch import nn
from data.data import *
from models.AudioResnet import AudioResnet
from loss.FocalLoss import FocalLoss
from models.AudioTransformer import AudioTransformer

augmentations = Compose([
    AddGaussianNoise(std=0.5),
    TimeMasking(max_mask_pct=0.05),
    FrequencyMasking(max_mask_pct=0.05),
])


class Config:
    # === General ===

    model_name = "Audio-Transformer-"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32
    save_path = f"trained_models\\{model_name}\\"
    seed = 42

    # === Training ===
    num_classes = 50
    num_epochs = 100
    batch_size = 1
    max_batch_size = 64
    learning_rate = 5e-5
    min_learning_rate = 1e-4
    weight_decay = 1e-4

    warmup_threshold = 1.0 / 100.0
    step_coefficient = 25.0 / 100.0

    gamma = 2.0
    save_checkpoints = True

    # === Dataset ===
    transforms = None
    use_masks = True
    num_workers = 1
    prefetch_factor = 3
    val_split = 0.1
    shuffle = True
    #pos_weight = (torch.ones(num_classes) * 10).to("cuda")
    criterion = combined_loss

ModuleNotFoundError: No module named 'flash_attn'

In [3]:
from torch.utils.data import DataLoader

large_directory = "large-melspec-dataset-top-50-LIBROSA"

train_dataset = StreamingSongDataset(f"E:\\SongsDataset\\{large_directory}\\train_set\\data",
                                     f"E:\\SongsDataset\\{large_directory}\\train_set\\genre_labels",
                                     transform=augmentations)
test_dataset = StreamingSongDataset(f"E:\\SongsDataset\\{large_directory}\\test_set\\data",
                                    f"E:\\SongsDataset\\{large_directory}\\test_set\\genre_labels")

train_dataloader = DataLoader(
    train_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=Config.num_workers,
    prefetch_factor=Config.prefetch_factor,
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=Config.num_workers,
    prefetch_factor=Config.prefetch_factor,
)

NameError: name 'StreamingSongDataset' is not defined

In [None]:
from data.processing import ParseBalanced

directory = "large-melspec-dataset-top-50-LIBROSA"
data_directory = "E:/mtg-jamendo/"
subset_file_name = "autotagging_top50tags"
ParseBalanced(subset_file_name, f"{data_directory}", f"E:/SongsDataset/{directory}", convert=True,
              target_per_genre=1300)

In [1]:
import subprocess

def cluster_elki(name, num_clusters):
    # Define parameters
    elki_jar = "elki-bundle-0.8.0.jar"
    data_file = f"output_analysis/output-{name}.csv"

    # Construct the ELKI command
    cmd = [
        "java", "-jar", elki_jar,
        "KDDCLIApplication",
        "-dbc.in", data_file,
        "-algorithm", "clustering.hierarchical.extraction.CutDendrogramByNumberOfClusters",
        "-algorithm", "Anderberg",
        "-algorithm.distancefunction", "CosineDistance",
        "-hierarchical.minclusters", str(num_clusters),
        "-resulthandler", "ResultWriter",
        "-out.gzip", "false",
        "-out", f"output_analysis/elki-TEST-{name}-{num_clusters}",
    ]

    # Execute the command
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print("An error occurred:\n", e.stderr)

In [5]:
from datasets import tqdm
from training.inference import load_and_parse_audio
import torch
import os

def compute(model, name):
    path = "E:\\SongsDataset\\songs\\"
    all_folders = os.listdir(path)
    length = 256
    with open(name, 'w', encoding="utf-8") as f:
        with torch.no_grad():
            for each_song in tqdm(all_folders):
                song_path = os.path.join(path, each_song)

                # audio, sr = librosa.load(song_path, sr=44100, mono=True)
                # data = librosa.feature.melspectrogram(y=audio, sr=sr)
                # data = librosa.amplitude_to_db(data, ref=np.max)

                chunks = load_and_parse_audio(song_path, convert=True, chunk_size=length).to("cuda")
                permuted_chunks = torch.stack([c for c in chunks])

                # mean = permuted_chunks.mean(dim=[1, 2], keepdim=True)
                # std = permuted_chunks.std(dim=[1, 2], keepdim=True)
                # permuted_chunks = (permuted_chunks - mean) / (std + 1e-6)

                # num_chunks = int(permuted_chunks.shape[0] / 64) + 1
                # data_minibatches = torch.chunk(permuted_chunks, num_chunks, dim=0)
                #
                # latents = []
                # for i, data_minibatch in enumerate(data_minibatches):
                #     latent = model(data_minibatch)
                #     latents.append(latent)

                B, T, F = permuted_chunks.shape

                if T > length:
                    continue

                _, latents, _ = model(permuted_chunks, masked=False)

                #averages = torch.cat(latents, dim=0).mean(dim=0).cpu().detach().numpy()
                averages = latents.mean(dim=0).cpu().detach().numpy()

                line = " ".join([str(x) for x in averages]) + f" {each_song}\n"
                f.write(line)

In [2]:
import os
import torch
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from training.inference import load_and_parse_audio

def process_song(song_path, length=256):
    """Preprocess one song (CPU only)."""
    chunks = load_and_parse_audio(song_path, convert=True, chunk_size=length)
    return song_path, chunks

def compute(model, name, batch_size=16, num_workers=4, length=256, variational=False):
    path = "E:\\SongsDataset\\songs\\"
    all_songs = os.listdir(path)

    with open(name, 'w', encoding="utf-8") as f, torch.no_grad(), ThreadPoolExecutor(max_workers=num_workers) as executor:
        # submit all preprocessing jobs
        partitions = 8
        num_songs = len(all_songs) // partitions
        for index in range(1, partitions):
            futures = [executor.submit(process_song, os.path.join(path, song), length) for song in all_songs[(index - 1) * num_songs:index * num_songs]]

            # process results in batches of X
            batch = []
            for future in tqdm(as_completed(futures), total=len(futures)):
                song_path, chunks = future.result()
                batch.append((song_path, chunks))

                if len(batch) >= batch_size:
                    run_batch(model, batch, f, length, variational)
                    batch = []

            # handle last small batch
            if batch:
                run_batch(model, batch, f, length)

def run_batch(model, batch, file_handle, length=256, variational=False):
    """Run model on a batch of preprocessed songs and write results."""
    for song_path, chunks in batch:
        chunks = chunks.to("cuda")
        B, T, F = chunks.shape

        if B > length or T > length or F > length:
            continue

        chunks = chunks.unsqueeze(1)
        latents = model(chunks)

        averages = latents.mean(dim=0).cpu().numpy()
        torch.cuda.empty_cache()
        line = " ".join([str(x) for x in averages]) + f" \"{os.path.basename(song_path)}\"\n"
        file_handle.write(line)

In [3]:
model = torch.load("E:/Coding/SongAnalyzer/Analyzer/src/trained_models/Myna-CLS-Album/Epoch-131.pt", weights_only=False)

model.mask_ratio= 0.0

compute(model, "E:/Coding/SongAnalyzer/Analyzer/src/output_analysis/output-Myna-CLS-Album.csv", variational=False)

100%|██████████| 496/496 [03:20<00:00,  2.48it/s]
100%|██████████| 496/496 [02:51<00:00,  2.88it/s]
100%|██████████| 496/496 [03:02<00:00,  2.71it/s]
100%|██████████| 496/496 [04:10<00:00,  1.98it/s]
100%|██████████| 496/496 [03:12<00:00,  2.57it/s]
100%|██████████| 496/496 [03:08<00:00,  2.64it/s]
100%|██████████| 496/496 [02:45<00:00,  2.99it/s]


In [4]:
cluster_elki("Myna-CLS-Album", 64)
cluster_elki("Myna-CLS-Album", 256)

In [4]:
model = torch.load("E:/Coding/SongAnalyzer/Analyzer/src/trained_models/ViT-Contrastive-Embeddings-Masking-0.9-Variational/Classifier-Epoch-36.pt", weights_only=False)
compute(model, "E:/Coding/SongAnalyzer/Analyzer/src/output_analysis/output-contrastive-masking-09-Variational.csv", variational=True)

100%|██████████| 496/496 [03:57<00:00,  2.09it/s]
100%|██████████| 496/496 [02:39<00:00,  3.12it/s]
100%|██████████| 496/496 [02:31<00:00,  3.26it/s]
100%|██████████| 496/496 [02:37<00:00,  3.16it/s]
100%|██████████| 496/496 [02:55<00:00,  2.82it/s]
100%|██████████| 496/496 [02:41<00:00,  3.07it/s]
100%|██████████| 496/496 [02:30<00:00,  3.29it/s]


In [6]:
model = torch.load("E:/Coding/SongAnalyzer/Analyzer/src/trained_models/ViT-Contrastive-Embeddings-Masking-0.9-ALIBI/Classifier-Epoch-15.pt", weights_only=False)
compute(model, "E:/Coding/SongAnalyzer/Analyzer/src/output_analysis/output-contrastive-masking-09-alibi.csv", variational=False)

100%|██████████| 496/496 [02:34<00:00,  3.22it/s]
100%|██████████| 496/496 [02:36<00:00,  3.18it/s]
100%|██████████| 496/496 [02:28<00:00,  3.34it/s]
100%|██████████| 496/496 [02:35<00:00,  3.18it/s]
100%|██████████| 496/496 [02:40<00:00,  3.08it/s]
100%|██████████| 496/496 [02:43<00:00,  3.03it/s]
100%|██████████| 496/496 [02:29<00:00,  3.32it/s]


In [9]:
cluster_elki("contrastive-masking-09", 64)
cluster_elki("contrastive-masking-09", 256)
cluster_elki("contrastive-masking-09-variational", 64)
cluster_elki("contrastive-masking-09-variational", 256)
cluster_elki("contrastive-masking-09-alibi", 64)
cluster_elki("contrastive-masking-09-alibi", 256)

In [3]:
cluster_elki("Classification-With-Reconstruction", 100)