In [46]:
import torch.utils.data as data
import os
import numpy as np
import json

class Dataset(data.Dataset):
    """Paired tablature / audio-feature examples read from a preprocessed directory.

    The directory ``data_dir_path`` must contain a ``tab`` sub-directory with
    ``.npz`` archives (keys ``"tempo"`` and ``"tab"``) and a sub-directory
    named after ``mode`` with 2-D ``.npy`` arrays.  The i-th tab file is paired
    with the i-th audio file; both listings are sorted by name so that the
    pairing is deterministic (``os.listdir`` alone returns entries in arbitrary
    order).  Files are presumably named with a shared stem per example (e.g.
    ``test2.npz`` / ``test2.npy``) -- the pairing relies on the two sorted
    listings lining up.

    Args:
        data_dir_path: Root directory of the preprocessed data.
        mode: Name of the audio-feature sub-directory (e.g. ``"cqt"`` or
            ``"melspec"``).
        context_window: Number of consecutive frames in each window.
        full_audio_length: Number of frames every example is zero-padded to
            along axis 1.  Defaults to 2000, the value previously hard-coded
            for both ``"cqt"`` and ``"melspec"``.
    """

    def __init__(self, data_dir_path="preprocessed/", mode="cqt", context_window=9,
                 full_audio_length=2000):
        self.data_dir_path = data_dir_path
        self.mode = mode
        self.context_window = context_window
        self.half_context_window = self.context_window // 2
        # Set for every mode, so __getitem__ never hits a missing attribute.
        self.full_audio_length = full_audio_length
        self.tab_data_paths = self.get_tab_data_paths()
        self.audio_data_paths = self.get_audio_data_paths()

    @staticmethod
    def _list_files(dir_path):
        """Return the regular files directly inside ``dir_path``, sorted by name."""
        candidates = (os.path.join(dir_path, name) for name in sorted(os.listdir(dir_path)))
        # Skip sub-directories (e.g. ``.ipynb_checkpoints``) that np.load cannot read.
        return [path for path in candidates if os.path.isfile(path)]

    def get_tab_data_paths(self):
        """Return the sorted paths of the files in ``<data_dir_path>/tab``."""
        return self._list_files(os.path.join(self.data_dir_path, "tab"))

    def get_audio_data_paths(self):
        """Return the sorted paths of the files in ``<data_dir_path>/<mode>``."""
        return self._list_files(os.path.join(self.data_dir_path, self.mode))

    def __len__(self):
        """Number of examples, i.e. the number of tab files found."""
        return len(self.tab_data_paths)

    def __getitem__(self, index):
        """Load one example.

        Returns:
            A tuple ``(tempo, tab_data, audio_data)``.  ``tempo`` and
            ``tab_data`` are the ``"tempo"`` and ``"tab"`` arrays of the tab
            archive.  ``audio_data`` is a float64 array of shape
            ``(full_audio_length, n_rows, context_window, 1)`` where
            ``n_rows`` is axis 0 of the stored audio array; entry ``i`` holds
            the ``context_window`` frames starting ``half_context_window``
            frames before frame ``i`` of the zero-padded audio.

        Raises:
            ValueError: If the stored audio has more than
                ``full_audio_length`` frames.
        """
        tab_data_path = self.tab_data_paths[index]
        audio_data_path = self.audio_data_paths[index]

        # Reading a key materialises the array, so the archive can be closed
        # right away instead of leaking an open file handle per item.
        with np.load(tab_data_path) as loaded_tab_data:
            tempo = loaded_tab_data["tempo"]
            tab_data = loaded_tab_data["tab"]
        loaded_audio_data = np.load(audio_data_path)

        n_frames = loaded_audio_data.shape[1]
        if n_frames > self.full_audio_length:
            raise ValueError(
                "audio file {!r} has {} frames, more than full_audio_length={}".format(
                    audio_data_path, n_frames, self.full_audio_length
                )
            )

        # Zero-pad on the right up to the fixed length, then on both sides so
        # that a window can be taken around every frame.
        full_audio_data = np.pad(
            loaded_audio_data, ((0, 0), (0, self.full_audio_length - n_frames))
        )
        padd_audio_data = np.pad(
            full_audio_data,
            ((0, 0), (self.half_context_window, self.half_context_window)),
        )

        # window_index[i, j] = i + j, so one gather extracts every window.
        window_index = (
            np.arange(self.full_audio_length)[:, None]
            + np.arange(self.context_window)[None, :]
        )
        # Gather gives (n_rows, frames, window); reorder to (frames, n_rows, window)
        # and keep the float64, C-contiguous layout of the former np.empty buffer.
        audio_data = (
            padd_audio_data[:, window_index]
            .transpose(1, 0, 2)
            .astype(np.float64, order="C")
        )
        # Trailing singleton axis.
        audio_data = np.expand_dims(audio_data, -1)
        return tempo, tab_data, audio_data

In [48]:
# Smoke test: build the dataset with its default arguments, fetch the first
# example, and print the audio and tab array shapes (one per line).
dataset = Dataset()
tempo, tab_data, audio_data = dataset[0]
print(audio_data.shape, tab_data.shape, sep="\n")

(2000, 192, 9, 1)
(75, 21)


In [12]:
# Inspect one tablature archive directly.  The archive gets its own name
# rather than `data`, which would overwrite the `torch.utils.data as data`
# alias imported for the Dataset class.  The context manager closes the
# .npz file once both arrays have been read.
with np.load("preprocessed/tab/test2.npz") as tab_archive:
    tempo = tab_archive["tempo"]
    tab = tab_archive["tab"]
print(tab.shape)
print(tempo, tab)

(75, 21)
[120] [[5. 0. 0. ... 2. 0. 1.]
 [0. 6. 0. ... 2. 0. 1.]
 [0. 0. 7. ... 2. 0. 1.]
 ...
 [0. 0. 0. ... 2. 0. 1.]
 [0. 0. 0. ... 1. 0. 1.]
 [0. 0. 0. ... 1. 1. 1.]]


In [23]:
# Load the stored array for test2 from the cqt directory and print its shape
# followed by its contents.
audio_data = np.load("preprocessed/cqt/test2.npy")
print(audio_data.shape, audio_data, sep="\n")

(192, 1322)
[[1.03439415e-04 1.02625061e-04 9.77542149e-05 ... 6.76293275e-04
  6.87648659e-04 6.91265857e-04]
 [1.73491731e-04 1.71410124e-04 1.71554449e-04 ... 7.12694658e-04
  7.04140344e-04 6.99724362e-04]
 [3.00559797e-04 3.00102198e-04 2.90700031e-04 ... 3.17597995e-04
  3.36186873e-04 3.50715418e-04]
 ...
 [1.72285014e-04 3.68449801e-05 1.67578954e-04 ... 1.41001670e-04
  1.17982323e-04 7.07401196e-05]
 [8.88005015e-05 5.41087138e-05 4.49347899e-05 ... 5.49771648e-05
  1.49864354e-04 8.48159543e-05]
 [5.98453662e-05 1.45750309e-04 1.00274119e-04 ... 9.79508332e-05
  1.27555453e-04 8.62547167e-05]]
