# Prepare Data

In [None]:
# seaborn is the library bound to the name `sns` elsewhere in this notebook
# (plotting cell); a bare `import sns` does not import it.
import seaborn as sns

from utils.data import ParseTaggedDataset

# Parse the tagged dataset into its data, masks and keys.
# NOTE(review): an empty string is passed as the only argument — presumably a
# path/root that falls back to a default inside utils.data; confirm.
full_dataset, full_masks, full_keys = ParseTaggedDataset("")

In [1]:
from utils.data import MakeTripletDataset, MakeDataset

# Build the train/test dataloaders. batch_size=24 and sample_length=256 repeat
# the values used by Config.batch_size and the model's sample_length below.
train_dataloader, test_dataloader = MakeDataset(
    tags=True,
    batch_size=24,
    sample_length=256,
)

In [2]:
import torch

class Config:
    """Namespace of run settings, handed to the training/evaluation helpers as a class.

    Every setting is a plain class attribute, grouped by concern below.
    """

    # --- General ---
    model_name = "Transformer-Classifier"
    # Use the GPU whenever torch reports one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    dtype = torch.float32
    # Directory for trained models; embeds model_name and ends with a separator.
    save_path = "E:\\Coding\\SongAnalyzer\\Analyzer\\src\\trained_models\\" + model_name + "\\"
    seed = 42

    # --- Training ---
    num_epochs = 30
    batch_size = 24
    learning_rate = 5e-5
    weight_decay = 1e-5
    warmup_percent = 0.15
    max_grad_norm = 1.0
    log_every = 10  # number of steps between log lines (optional)
    save_checkpoints = True

    # --- Dataset ---
    use_masks = True
    num_workers = 4
    val_split = 0.2
    shuffle = True

    # --- Model behavior ---
    variational = False
    autoregressive = False

    # --- Loss coefficients ---
    beta_schedule = "log"   # schedule name, e.g. "log" or "linear" (for getBetaLog)
    beta_max = 1.0
    cycle_length = 2
    contrastive_coeff = 0.1  # only relevant when a contrastive loss is used
    margin = 0.1

# Initialize Model

In [3]:
from models.AudioTransformer import AudioTransformer
from utils.misc import model_size

# ==== Model hyperparameters ====
num_heads = num_layers = encoder_layers = 16
decoder_layers = 5
d_model, latent_space, dim_feedforward = 256, 512, 1024
sample_length = 256
projection_dim = 128
dropout = 0.1

name_extension = ""

# Note: num_layers and projection_dim are defined above but are not passed to
# the constructor; the encoder/decoder depths are given separately.
model = AudioTransformer(
    d_model=d_model,
    num_heads=num_heads,
    encoder_layers=encoder_layers,
    decoder_layers=decoder_layers,
    dim_feedforward=dim_feedforward,
    latent_space=latent_space,
    length=sample_length,
    dropout=dropout,
    name_extension=name_extension,
    genre_count=249,
    mood_count=249,
)
print(f"Parameters: {model_size(model)}")

Parameters: 85451314


In [None]:
from torch import optim
from training.training import trainTriplet, trainHybrid

# The learning rate is read from Config so the optimizer cannot drift from the
# shared settings (Config.learning_rate is 5e-5, the value previously hard-coded here).
# NOTE(review): weight_decay is kept at 1e-6 to leave training behavior unchanged,
# but Config.weight_decay is 1e-5 — confirm which value is intended.
optimizer = optim.AdamW(model.parameters(), lr=Config.learning_rate, weight_decay=1e-6)
trainHybrid(model, train_dataloader, test_dataloader, optimizer, Config, device=Config.device)

Epoch 1/30:  37%|███▋      | 2337/6330 [03:02<05:10, 12.87it/s]

In [8]:
# Serialize the whole model object (not just its state_dict) to a fixed file.
# NOTE(review): the directory is "Transformer-Classification" while
# Config.model_name is "Transformer-Classifier", and the loading cell in this
# notebook uses load_state_dict on a different checkpoint — confirm both are intended.
checkpoint_file = "E:\\Coding\\SongAnalyzer\\Analyzer\\src\\trained_models\\Transformer-Classification\\Epoch-20.pt"
torch.save(model, checkpoint_file)

In [4]:
from training.evaluation import evaluate
import torch

# The checkpoint lives under Config.save_path, in a run directory whose name is
# repeated in the file name. Building the path from those parts yields exactly the
# string that was previously written as one literal, without its "\S" sequence
# (an invalid escape that newer Python versions warn about).
run_name = "AudioTransformer-LatentSpace512-Heads16-EncoderLayers16-DecoderLayers5-DModel256-Dropout0.1-AutoRegressiveFalse"
checkpoint_path = f"{Config.save_path}{run_name}\\{run_name}-epoch8.pt"

# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
# NOTE(review): weights_only=False unpickles arbitrary objects — only load
# checkpoints from a trusted source, or switch to weights_only=True if the file
# holds a plain state_dict.
state_dict = torch.load(checkpoint_path, map_location=Config.device, weights_only=False)
model.load_state_dict(state_dict)

# Config.device is "cuda" when available, so this matches the previous
# hard-coded "cuda" on a GPU machine and no longer crashes without one.
model = model.to(Config.device)

In [5]:
# Run the evaluation pass once and keep the raw 8-item result around.
evaluation_metrics = evaluate(model, test_dataloader, Config, device=Config.device)

# Four summary values followed by the collected genre/mood predictions and targets.
(
    cosine,
    mse,
    genre,
    mood,
    all_genre_predictions,
    all_genre_targets,
    all_mood_predictions,
    all_mood_targets,
) = evaluation_metrics

100%|██████████| 704/704 [00:25<00:00, 27.31it/s]


In [6]:
def _concat_batches(batches):
    """Detach each per-batch tensor, move it to the CPU and concatenate along dim 0."""
    return torch.cat([batch.detach().cpu() for batch in batches], dim=0)


# Take the per-batch collections from the evaluation_metrics tuple rather than from
# the all_*_targets names: this cell rebinds those names to tensors, so re-running it
# used to iterate over tensor rows instead of the original batches.
(
    *_scalar_metrics,
    _genre_pred_batches,
    _genre_target_batches,
    _mood_pred_batches,
    _mood_target_batches,
) = evaluation_metrics

all_genre_preds = _concat_batches(_genre_pred_batches)
all_genre_targets = _concat_batches(_genre_target_batches)

all_mood_preds = _concat_batches(_mood_pred_batches)
all_mood_targets = _concat_batches(_mood_target_batches)

In [8]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

# Two candidate activations of the raw mood outputs: softmax across the class
# dimension (rows sum to 1) and an independent per-class sigmoid.
probs1 = all_mood_preds.softmax(dim=1)
probs2 = all_mood_preds.sigmoid()

# NOTE(review): the threshold is applied to the softmax output, where at most one
# class per row can exceed 0.5 — confirm this is intended for multi-label targets.
binary_preds = probs1.gt(0.5).int()

In [9]:
# One summary value per line, in the order returned by evaluate().
print("\n".join(f"{metric}" for metric in (cosine, mse, genre, mood)))

0.42278745770454407
0.7207701802253723
-9.089376449584961
-0.7990401983261108


In [22]:
# Softmax over the class dimension of the first sample's genre outputs.
all_genre_preds[0].softmax(dim=0)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [18]:
# Genre target row of the first sample, for comparison with its softmax output.
all_genre_targets[0]

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [8]:
# Largest softmax probability of sample 2. Tensor.max() reduces inside torch
# instead of iterating over the elements one by one with the builtin max().
probs1[2].max()

tensor(0.0809)

In [None]:
# Convert to numpy once so every sklearn call receives the same array types.
mood_true = all_mood_targets.numpy()
mood_pred = binary_preds.numpy()

# One 2x2 confusion matrix per mood class.
conf_matrices = multilabel_confusion_matrix(mood_true, mood_pred)

# zero_division=0 yields the same scores as the default ("warn" also scores 0)
# but without an UndefinedMetricWarning for classes lacking predictions/labels.
precision = precision_score(mood_true, mood_pred, average='macro', zero_division=0)  # or 'micro', 'weighted'
recall = recall_score(mood_true, mood_pred, average='macro', zero_division=0)
f1 = f1_score(mood_true, mood_pred, average='macro', zero_division=0)

print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

In [33]:
# Mood target row of sample 104 as 32-bit integers for a compact display.
all_mood_targets[104].to(torch.int32)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [36]:
# Threshold the per-class sigmoid probabilities (probs2). The name `probs` used
# here before is only assigned further down in the notebook (KL-divergence cell),
# so a top-to-bottom run failed with NameError at this point.
# NOTE(review): assumes the sigmoid output is what was meant — confirm.
binary_preds = (probs2 > 0.2).int()
binary_preds[104]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
num_classes = conf_matrices.shape[0]

# Show at most the first 20 classes; min() avoids an IndexError when fewer
# confusion matrices are available.
for i in range(min(20, num_classes)):
    fig = plt.figure(figsize=(3, 3))
    sns.heatmap(conf_matrices[i], annot=True, fmt='d', cmap='Blues', xticklabels=['Pred 0', 'Pred 1'], yticklabels=['True 0', 'True 1'])
    plt.title(f"Confusion Matrix - Class {i}")
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.tight_layout()
    plt.show()
    # Release the figure so a long loop does not keep every plot in memory.
    plt.close(fig)

In [54]:
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd

# Decision threshold applied to the per-class sigmoid probabilities.
MOOD_THRESHOLD = 0.1
mood_pred_mask = (torch.sigmoid(all_mood_preds) > MOOD_THRESHOLD).numpy()

# average=None returns one value per class. zero_division=0 keeps the scores of
# classes without predictions or labels at 0 (as before) without the
# UndefinedMetricWarning noise.
prec, rec, f1, support = precision_recall_fscore_support(
    all_mood_targets.numpy(),
    mood_pred_mask,
    average=None,
    zero_division=0,
)

# One row per class index.
df = pd.DataFrame({
    "Class": list(range(len(prec))),
    "Precision": prec,
    "Recall": rec,
    "F1 Score": f1,
    "Support": support
})

print(df.round(3))

     Class  Precision  Recall  F1 Score  Support
0        0      0.269   0.657     0.381     3189
1        1      0.331   0.886     0.482     2178
2        2      0.268   0.756     0.396     2921
3        3      0.322   0.820     0.462     2732
4        4      0.274   0.787     0.406     2699
..     ...        ...     ...       ...      ...
244    244      0.000   0.000     0.000        0
245    245      0.000   0.000     0.000        0
246    246      0.000   0.000     0.000        0
247    247      0.000   0.000     0.000        0
248    248      0.000   0.000     0.000        0

[249 rows x 5 columns]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# Rich display of the per-class metrics table built in the previous cell.
df

Unnamed: 0,Class,Precision,Recall,F1 Score,Support
0,0,0.268521,0.656946,0.381221,3189
1,1,0.331216,0.885675,0.482129,2178
2,2,0.268165,0.755563,0.395839,2921
3,3,0.321937,0.820278,0.462396,2732
4,4,0.273958,0.786958,0.406429,2699
...,...,...,...,...,...
244,244,0.000000,0.000000,0.000000,0
245,245,0.000000,0.000000,0.000000,0
246,246,0.000000,0.000000,0.000000,0
247,247,0.000000,0.000000,0.000000,0


In [53]:
# NOTE(review): same expression as the cell above, but the saved output differs
# (e.g. class 1 support 20446 vs 2178), so it presumably stems from an earlier
# run on different predictions — confirm or remove this cell.
df

Unnamed: 0,Class,Precision,Recall,F1 Score,Support
0,0,0.315741,0.834015,0.458067,2446
1,1,0.413159,1.000000,0.584731,20446
2,2,0.267866,0.730917,0.392052,3118
3,3,0.305743,0.641902,0.414199,3890
4,4,0.167541,0.773504,0.275425,5616
...,...,...,...,...,...
244,244,0.000000,0.000000,0.000000,0
245,245,0.000000,0.000000,0.000000,0
246,246,0.000000,0.000000,0.000000,0
247,247,0.000000,0.000000,0.000000,48


In [44]:
import torch.nn.functional as F
# Sigmoid activations, rescaled so every row sums to 1  -> [N, C]
probs = all_mood_preds.sigmoid()
probs = probs.div(probs.sum(dim=1, keepdim=True))

# Targets get a tiny offset before the same row-wise rescaling  -> [N, C]
targets = all_mood_targets.float().add(1e-10)
targets = targets.div(targets.sum(dim=1, keepdim=True))

# F.kl_div receives the log of the predicted distribution and the target
# distribution; 'batchmean' averages over the rows.
kl = F.kl_div(torch.log(probs), targets, reduction='batchmean')
print(f"KL Divergence: {kl.item():.4f}")

KL Divergence: 2.7547


In [56]:
from tqdm import tqdm
import torch


def compress_song_average(song, model, mask=None):
    with torch.no_grad():
        mean = model.to_latent(song, mask)
        return torch.sum(mean.to('cpu'), dim=0) / len(song)


from data.Data import chunk_song
import os


In [57]:
def compute(model, name, length=256, device="cuda"):
    """Write one averaged latent vector per song to a text file.

    Every entry of the latents directory is chunked with ``chunk_song``, encoded
    via ``compress_song_average`` and written as one line: the latent values
    separated by spaces, followed by the quoted entry name. The file is
    ``output-{name}.csv`` inside the ``output_analysis`` directory.

    Args:
        model: Model exposing ``to_latent``; it is moved to ``device`` and put
            into eval mode.
        name: Suffix used in the output file name.
        length: Chunk length passed to ``chunk_song`` and used for the masks.
        device: Device for the model and tensors. Defaults to "cuda", the value
            that was previously hard-coded.
    """
    model.to(device)
    model.eval()

    path = "E:\\SongsDataset\\latents\\"
    all_folders = os.listdir(path)

    # The context manager flushes and closes the file even if a song fails
    # midway; the handle was previously never closed.
    with open(f"output_analysis\\output-{name}.csv", "w", encoding='utf-8') as file:
        for each_song in tqdm(all_folders):
            song_path = os.path.join(path, each_song)

            # NOTE(review): `zeros` is presumably the number of padded positions
            # in the final chunk — confirm against data.Data.chunk_song.
            padded_data, zeros = chunk_song(song_path, length)
            input_tensor = torch.Tensor(padded_data).reshape(-1, length, 64).to(device)

            # One padding count per chunk: 0 for all but the last, `zeros` for the last.
            mask = [0 for _ in range(padded_data.shape[0] // length - 1)]
            mask.append(zeros)
            # A position is masked (True) when it lies within the last x positions.
            bool_masks = [[0 if (length - i) > x else 1 for i in range(length)] for x in mask]
            bool_masks = torch.tensor(bool_masks).bool().to(device)

            latent = compress_song_average(input_tensor, model, mask=bool_masks)

            # Same line format as before: every value is followed by one space.
            output = "".join(f"{value.item()} " for value in latent)
            file.write(output + f"\"{each_song}\"\n")

In [2]:
import subprocess

def cluster_elki(name, num_clusters):
    """Run the ELKI command-line application on a previously written latent file.

    Reads ``output_analysis/output-{name}.csv`` and writes the results to
    ``output_analysis/elki-TEST-{name}-{num_clusters}``. If the Java process
    exits with a non-zero status, its stderr is printed and nothing is raised.
    """
    elki_jar = "elki-bundle-0.8.0.jar"
    data_file = f"output_analysis/output-{name}.csv"
    out_dir = f"output_analysis/elki-TEST-{name}-{num_clusters}"

    # Launcher prefix followed by (flag, value) pairs, flattened in order.
    launcher = ["java", "-jar", elki_jar, "KDDCLIApplication"]
    options = [
        ("-dbc.in", data_file),
        ("-algorithm", "clustering.hierarchical.extraction.CutDendrogramByNumberOfClusters"),
        ("-algorithm", "Anderberg"),
        ("-algorithm.distancefunction", "CosineDistance"),
        ("-hierarchical.minclusters", str(num_clusters)),
        ("-resulthandler", "ResultWriter"),
        ("-out.gzip", "false"),
        ("-out", out_dir),
    ]
    cmd = launcher + [token for pair in options for token in pair]

    # check=True turns a failing exit status into CalledProcessError.
    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print("An error occurred:\n", e.stderr)

In [59]:
# Encode every song with the current model and write the averaged latents to
# the output_analysis file for "Classification-With-Reconstruction".
compute(model, "Classification-With-Reconstruction")

100%|██████████| 3973/3973 [03:07<00:00, 21.22it/s]


In [2]:
# Cluster the exported latents at two granularities, in this order.
for cluster_count in (30, 252):
    cluster_elki("Classification-With-Reconstruction", cluster_count)

In [3]:
# Additional clustering run over the same exported latents with 100 clusters.
cluster_elki("Classification-With-Reconstruction", 100)