In [1]:
from copy import deepcopy

import sys
sys.path.append("..")

import torch
from tqdm import tqdm

from src.data.utils.spectrogram_normalizer import SpectrogramNormalizer
from src.data.utils.label_normalizer import LabelNormalizer
from src.data.datasets.melody_dataset import MelodyDataset
from src.data.datasets.audio_dataset import AudioDataset

In [2]:
# Build the training AudioDataset from the processed melody-extraction
# directory and keep a handle on its pipeline for the cells that follow.
train_dir = "../datasets/melody_extraction/processed/train"
train_dataset = AudioDataset.from_path(train_dir)
pipeline = train_dataset.pipeline

Slicing audio: 100%|██████████| 103/103 [00:20<00:00,  4.99it/s]
  conv1d(
Preprocessing audio: 100%|██████████| 1179/1179 [00:14<00:00, 79.11it/s]


In [3]:
# Build one fixed-size dB spectrogram per training recording. The resulting
# `spectrograms` list is the input for the normalizer statistics.

# Target size passed to interpolate(): every spectrogram is resampled to this
# size so that all recordings yield tensors of identical shape.
SPECTROGRAM_SIZE = (128, 256)

spectrograms = []

for source_audio in tqdm(train_dataset.audio):
    # trim_silence() is called for its effect on the object, so operate on a
    # deep copy -- presumably to leave the dataset's own audio untouched.
    trimmed_audio = deepcopy(source_audio)
    trimmed_audio.trim_silence()
    prepared_audio = pipeline._preprocess_audio(trimmed_audio)

    # _get_spectrogram() returns a wrapper object; its `.spectrogram`
    # attribute is what gets converted to decibels.
    spectrogram_db = pipeline.amplitude_to_db(
        pipeline._get_spectrogram(prepared_audio).spectrogram
    )

    # Bilinear interpolate() expects a 4-D (N, C, H, W) input, so add a
    # leading batch dimension for the call and drop it again afterwards.
    resized_db = torch.nn.functional.interpolate(
        spectrogram_db.unsqueeze(0),
        size=SPECTROGRAM_SIZE,
        mode='bilinear',
        align_corners=True
    ).squeeze(0)

    spectrograms.append(resized_db)

100%|██████████| 103/103 [00:08<00:00, 11.66it/s]


In [4]:
# Step 1: estimate the normalization statistics from the raw dB spectrograms.
normalizer1 = SpectrogramNormalizer()
normalizer1.fit(spectrograms)

print(f"Вычисленное среднее: {normalizer1.mean}")
print(f"Вычисленное стандартное отклонение: {normalizer1.std}")

# Step 2 (sanity check): normalize every spectrogram with the fitted
# statistics and fit a second normalizer on the result. If step 1 is right,
# the second fit should report a mean close to 0 and a std close to 1.
# normalizer2 is seeded with normalizer1's statistics, as in the original;
# whether fit() uses or replaces them is not visible from this notebook.
normalizer2 = SpectrogramNormalizer(mean=normalizer1.mean, std=normalizer1.std)
normalized_spectrograms = list(map(normalizer1.transform, spectrograms))
normalizer2.fit(spectrograms=normalized_spectrograms)

print(f"Среднее после нормализации: {normalizer2.mean}")
print(f"Стандартное отклонение после нормализации: {normalizer2.std}")

Calculating mean and std: 100%|██████████| 4/4 [00:02<00:00,  1.85it/s]


Вычисленное среднее: 8.240909576416016
Вычисленное стандартное отклонение: 15.86678615808288


Calculating mean and std: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]

Среднее после нормализации: 2.307961999292729e-08
Стандартное отклонение после нормализации: 1.0000000096663868





In [5]:
# Reload the same training directory as a MelodyDataset. Note that this
# rebinds `train_dataset` and `pipeline`: from here on they refer to the
# melody dataset and its pipeline, not the AudioDataset loaded earlier.
melody_train_dir = "../datasets/melody_extraction/processed/train"
train_dataset = MelodyDataset.from_path(melody_train_dir)
pipeline = train_dataset.pipeline

Slicing audio and melody: 100%|██████████| 103/103 [00:08<00:00, 11.73it/s]


In [6]:
# Convert every sliced melody into a label object via the dataset pipeline.
labels = list(map(pipeline._get_label, train_dataset.sliced_melody))

In [7]:
# Fit the label normalizer on the training labels, then list every fitted
# bound (frequency, duration, sequence length) so it can be checked by eye.
label_normalizer = LabelNormalizer()
label_normalizer.fit_from_labels(labels)

fitted_ranges = [
    f"freqs_min = {label_normalizer.freq_min}",
    f"freqs_max = {label_normalizer.freq_max}",
    f"durations_min = {label_normalizer.dur_min}",
    f"durations_max = {label_normalizer.dur_max}",
    f"seq_len_min = {label_normalizer.seq_len_min}",
    f"seq_len_max = {label_normalizer.seq_len_max}",
]
# One line per entry, exactly as separate print() calls would emit them.
print(*fitted_ranges, sep="\n")

freqs_min = 0.0
freqs_max = 1567.981689453125
durations_min = 0.25
durations_max = 10.0
seq_len_min = 2
seq_len_max = 58


In [8]:
# Show the raw (un-normalized) frequencies of one arbitrarily chosen label;
# index 42 is the same example used by the transform cells below.
example_label = labels[42]
example_label.freqs

tensor([184.9972, 184.9972, 184.9972, 184.9972,   0.0000])

In [9]:
# Normalize the example label with the fitted normalizer and display the
# resulting frequencies.
raw_label = labels[42]
label = label_normalizer.transform_label(raw_label)
label.freqs

tensor([0.7102, 0.7102, 0.7102, 0.7102, 0.0000])

In [10]:
# Round-trip check: undo the normalization applied to `label`. The result is
# bound to a new name instead of overwriting `label`, so re-running this cell
# never applies the inverse transform to already-denormalized data.
label_restored = label_normalizer.inverse_transform_label(label)

# The restored frequencies should match the raw ones for the same example.
assert torch.allclose(label_restored.freqs, labels[42].freqs, atol=1e-3), (
    "inverse_transform_label did not recover the original frequencies"
)

label_restored.freqs

tensor([184.9972, 184.9972, 184.9972, 184.9972,   0.0000])