# Dataset, DataLoaders

In [18]:
import sys
# Put the parent directory first on the import path so the `audiocraft` imports
# below resolve against it (presumably the local repository checkout — TODO confirm).
# This must run before the audiocraft imports.
sys.path.insert(0, '../')
import torch
from audiocraft.solvers import builders
from audiocraft.models.musicgen import MusicGen
from audiocraft.environment import AudioCraftEnvironment
from audiocraft.utils.utils import get_loader
from audiocraft.modules.conditioners import ClassifierFreeGuidanceDropout
from omegaconf import OmegaConf
import os
# Environment variable naming the audiocraft team/cluster config file; the path is
# relative to the notebook's working directory.
# NOTE(review): presumably read lazily by AudioCraftEnvironment — confirm that
# setting it after the imports is early enough.
os.environ["AUDIOCRAFT_CONFIG"] = "../config/teams/default.yaml"

In [2]:
# Minimal hand-written config for building the audio dataloaders in this notebook.
cfg_raw = {
    # Top-level audio / run settings.
    'sample_rate': 32000,
    'seed': 2036,
    'max_sample_rate': 32000,
    'max_channels': 1,
    'channels': 1,
    # Dataset options: common values first, then one sub-dict per split
    # (presumably per-split overrides of the common values — TODO confirm).
    'dataset': {
        'batch_size': 1,
        'num_workers': 10,
        'segment_duration': 30,
        'num_samples': None,
        'return_info': True,
        'shuffle': False,
        'sample_on_duration': False,
        'sample_on_weight': False,
        'min_segment_ratio': 0.8,
        'train': {'num_samples': 1000000, 'shuffle': True},
        'valid': {'num_samples': 10000},
        'evaluate': {'num_samples': 10000},
        'generate': {'num_samples': 50, 'return_info': True, 'batch_size': 1},
    },
    'execute_only': None,
    'datasource': {
        'max_sample_rate': 44100,
        'max_channels': 2,
        # All four splits point at the same example directory.
        **dict.fromkeys(
            ('train', 'valid', 'evaluate', 'generate'),
            '../egs/example_notebook_1',
        ),
    },
    'optim': {
        'updates_per_epoch': 2000,
        # The remaining optimizer settings are left commented out:
        # 'epochs': 500,
        # 'lr': 1,
        # 'optimizer': 'dadam',
        # 'adam': {'betas': [0.9, 0.95],
        #          'weight_decay': 0.1,
        #          'eps': 1e-08},
        # 'ema': {'use': True,
        #         'updates': 10,
        #         'device': 'cpu',
        #         'decay': 0.99},
        # 'max_norm': 1.0,
        # 'eager_sync': True
    },
    'return_info': True,
}



In [3]:
# Wrap the plain nested dict in an OmegaConf config object; this is what gets
# passed to builders.get_audio_datasets in the next cell.
# (Earlier alternative, kept for reference: cfg = AttrDict.from_nested_dicts(cfg_raw))
cfg = OmegaConf.create(cfg_raw)

In [4]:
# Build the music dataloaders from the config. Per the displayed output, the result
# is a dict of torch DataLoaders keyed by 'train', 'valid', 'evaluate' and 'generate'.
dataloaders = builders.get_audio_datasets(cfg, dataset_type=builders.DatasetType.MUSIC)

In [5]:
dataloaders

{'train': <torch.utils.data.dataloader.DataLoader at 0x733f2d4d6f50>,
 'valid': <torch.utils.data.dataloader.DataLoader at 0x733d8545cc10>,
 'evaluate': <torch.utils.data.dataloader.DataLoader at 0x733d85392ad0>,
 'generate': <torch.utils.data.dataloader.DataLoader at 0x733d853912a0>}

In [6]:
# Pull a single batch from a fresh iterator over the training dataloader.
batch = next(iter(dataloaders['train']))

In [7]:
len(batch)

2

In [8]:
# A batch is a pair: the audio tensor and a sequence of per-item metadata objects
# (MusicInfo instances, per the output of the next cell).
music, segment_with_attributes = batch

In [9]:
segment_with_attributes[0]

MusicInfo(meta=AudioMeta(path='/home/karlos/Documents/workspace/proj/music/MusicGeneration/NOTEBOOKS/../dataset/example/electro_3.wav', duration=180.0, sample_rate=32000, amplitude=None, weight=None, info_path=None), seek_time=99.24656939506531, n_frames=960000, total_frames=960000, sample_rate=32000, channels=1, audio_tokens=None, title='Untitled song', artist='Unknown', key=None, bpm=None, genre='electronic', moods=None, keywords=None, description='Happy Song', name='electro_2', instrument='mix', self_wav=WavCondition(wav=tensor([[[-0.0114, -0.0021,  0.0094,  ..., -0.0494, -0.0476, -0.0621]]]), length=tensor([960000]), sample_rate=[32000], path=['/home/karlos/Documents/workspace/proj/music/MusicGeneration/NOTEBOOKS/../dataset/example/electro_3.wav'], seek_time=[99.24656939506531]), joint_embed={})

In [10]:
# Unpack the batch into the audio tensor and the per-item metadata list.
audio, infos = batch
audio_tokens = None
# Sanity check: one metadata entry per audio item in the batch.
# The two f-strings rely on implicit concatenation; a comma between them would
# turn the assertion message into a 2-tuple instead of a single string.
assert audio.size(0) == len(infos), (
    f"Mismatch between number of items in audio batch ({audio.size(0)})"
    f" and in metadata ({len(infos)})"
)

In [229]:
# Overwrite the first item's description in place with a sentence repeated 20 times
# (presumably to see how a long description is tokenized in the cells below).
# NOTE(review): this mutates the metadata object held in `batch`, so re-running the
# earlier display cells will show the modified description.
infos[0].description = 'This is very long sentence '*20
infos

[MusicInfo(meta=AudioMeta(path='/home/karlos/Documents/workspace/proj/music/MusicGeneration/NOTEBOOKS/../dataset/example/electro_3.wav', duration=180.0, sample_rate=32000, amplitude=None, weight=None, info_path=None), seek_time=99.24656939506531, n_frames=960000, total_frames=960000, sample_rate=32000, channels=1, audio_tokens=None, title='Untitled song', artist='Unknown', key=None, bpm=None, genre='electronic', moods=None, keywords=None, description='This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence ', n

In [230]:
# Prepare attributes: convert each item's metadata into a ConditioningAttributes
# object (text / wav / joint_embed groups, as shown in the outputs below).
attributes = [info.to_condition_attributes() for info in infos]
attributes

[ConditioningAttributes(text={'meta': AudioMeta(path='/home/karlos/Documents/workspace/proj/music/MusicGeneration/NOTEBOOKS/../dataset/example/electro_3.wav', duration=180.0, sample_rate=32000, amplitude=None, weight=None, info_path=None), 'seek_time': 99.24656939506531, 'n_frames': 960000, 'total_frames': 960000, 'sample_rate': 32000, 'channels': 1, 'audio_tokens': None, 'title': 'Untitled song', 'artist': 'Unknown', 'key': None, 'bpm': None, 'genre': 'electronic', 'moods': None, 'keywords': None, 'description': 'This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sentence This is very long sen

In [231]:
# Apply classifier-free-guidance dropout constructed with 1.0, so every condition is
# dropped: in the output below all text fields are None and the wav condition is an
# empty placeholder.
attributes_cfg = ClassifierFreeGuidanceDropout(1.0)(attributes)
attributes_cfg

[ConditioningAttributes(text={'meta': None, 'seek_time': None, 'n_frames': None, 'total_frames': None, 'sample_rate': None, 'channels': None, 'audio_tokens': None, 'title': None, 'artist': None, 'key': None, 'bpm': None, 'genre': None, 'moods': None, 'keywords': None, 'description': None, 'name': None, 'instrument': None}, wav={'self_wav': WavCondition(wav=tensor([[[-0.]]]), length=tensor([0]), sample_rate=[32000], path=[None], seek_time=[None])}, joint_embed={})]

### Load MusicGen (for its T5 text conditioner)

In [19]:
# Load the pretrained MusicGen small checkpoint. The cells below only use its
# `lm.condition_provider` to tokenize and encode the conditioning attributes.
model = MusicGen.get_pretrained('facebook/musicgen-small')



In [232]:
# Tokenize the (long) description. The output shows 55 token ids ending with id 1
# and an attention mask of all ones, i.e. no padding positions.
model.lm.condition_provider.tokenize(attributes)

{'description': {'input_ids': tensor([[ 100,   19,  182,  307, 7142,  100,   19,  182,  307, 7142,  100,   19,
           182,  307, 7142,  100,   19,  182,  307, 7142,  100,   19,  182,  307,
          7142,  100,   19,  182,  307, 7142,  100,   19,  182,  307, 7142,  100,
            19,  182,  307, 7142,  100,   19,  182,  307, 7142,  100,   19,  182,
           307, 7142,  100,   19,  182,  307,    1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}}

In [233]:
# Tokenize the fully-dropped attributes. The output shows id 1 in the first position,
# 0 everywhere else, and an attention mask of all zeros.
model.lm.condition_provider.tokenize(attributes_cfg)

{'description': {'input_ids': tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}}

In [234]:
# Encode the tokenized, fully-dropped attributes. Per the output, the result for
# 'description' is an (embedding, mask) pair in which both tensors are all zeros.
model.lm.condition_provider(model.lm.condition_provider.tokenize(attributes_cfg))

{'description': (tensor([[[-0., 0., -0.,  ..., -0., -0., 0.],
           [-0., 0., -0.,  ..., 0., -0., -0.],
           [-0., 0., -0.,  ..., 0., -0., -0.],
           ...,
           [-0., 0., -0.,  ..., 0., -0., -0.],
           [-0., 0., -0.,  ..., 0., -0., -0.],
           [-0., 0., -0.,  ..., 0., -0., -0.]]], device='cuda:0',
         grad_fn=<MulBackward0>),
  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0]], device='cuda:0'))}

## Full CFG dropout sets every description token id to 0 except the first one (id 1), with an attention mask of all 0s

## WORD DROPOUT BEHAVIOR

In the original code, words are dropped at random by replacing them with an empty string. To reproduce this at the token level, randomly selected tokens should be removed together with their corresponding attention-mask entries, shifting the remaining tokens to the left.

In [242]:
import torch.nn.functional as F


In [243]:
# Synthetic stand-in for a tokenized description: 55 positions, each carrying a
# random 1024-dim float row, and an attention mask that marks every position as valid.
atts = {
    'description': {
        'input_ids': torch.rand(55, 1024),
        'attention_mask': torch.ones(55, dtype=torch.int64),
        # Variant with 3 trailing padded positions:
        # 'attention_mask': torch.cat([torch.ones((52), dtype=int), torch.zeros((3), dtype=int)])
    }
}

In [244]:
atts

{'description': {'input_ids': tensor([[0.6371, 0.1715, 0.8561,  ..., 0.0298, 0.2770, 0.6483],
          [0.5234, 0.3489, 0.8574,  ..., 0.3117, 0.1708, 0.2470],
          [0.3904, 0.4157, 0.8487,  ..., 0.5094, 0.7890, 0.9687],
          ...,
          [0.0982, 0.8166, 0.2535,  ..., 0.8962, 0.9450, 0.4499],
          [0.7777, 0.0501, 0.0100,  ..., 0.5549, 0.4836, 0.4864],
          [0.5826, 0.7799, 0.2680,  ..., 0.9577, 0.1493, 0.4358]]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1])}}

In [245]:
def drop_description(inp):
    """
    Zero out the whole description condition, in place.

    Both ``inp['description']['input_ids']`` and
    ``inp['description']['attention_mask']`` are overwritten with zeros
    (the existing tensors are modified, not replaced).

    Returns the same ``inp`` dict for convenience.
    """
    description = inp['description']
    for field in ('input_ids', 'attention_mask'):
        # Ellipsis assignment writes into the existing storage.
        description[field][...] = 0
    return inp
    

In [246]:
def token_dropout(inp, index):
    """
    Drop the positions selected by ``index`` from an unbatched description condition.

    The surviving entries are shifted to the front and both tensors are padded
    with zeros at the end so that they keep their original length.

    Args:
        inp: dict with ``inp['description']['input_ids']`` and
            ``inp['description']['attention_mask']``. ``attention_mask`` must be
            1-D with one entry per position; ``input_ids`` has the same leading
            dimension and may be 1-D (token ids) or carry trailing feature
            dimensions (e.g. ``(seq, dim)``).
        index: anything usable to index the leading dimension (boolean mask or
            integer indices) that marks the positions to drop.

    Returns:
        The same ``inp`` dict, with its two entries rebound to the new tensors.
        The tensors originally stored in ``inp`` are not modified.
    """
    description = inp['description']
    inp_ids = description['input_ids']
    attention_mask = description['attention_mask']

    original_length = attention_mask.shape[0]

    # Mark dropped positions on a copy of the mask. A scalar assignment works on
    # any device/dtype, and the caller's tensors are not left half-modified.
    keep_mask = attention_mask.clone()
    keep_mask[index] = 0

    # Positions that were attended before and are not dropped now.
    kept_positions = keep_mask.nonzero(as_tuple=True)
    kept_ids = inp_ids[kept_positions]
    kept_mask = keep_mask[kept_positions]

    # F.pad lists (left, right) pairs starting from the LAST dimension: leave every
    # trailing feature dimension alone and append zeros along the leading one.
    n_pad = original_length - kept_ids.shape[0]
    ids_padding = (0, 0) * (inp_ids.dim() - 1) + (0, n_pad)
    kept_ids = F.pad(kept_ids, ids_padding, value=0)
    kept_mask = F.pad(kept_mask, (0, n_pad), value=0)

    description['input_ids'] = kept_ids
    description['attention_mask'] = kept_mask

    return inp

In [247]:
# Boolean drop mask over the 55 positions: each entry is True (= drop this position)
# with probability 0.3.
# NOTE(review): no torch seed is set in the visible cells, so the mask differs per run.
indices = torch.rand(55) < 0.3

In [248]:
indices

tensor([ True, False, False, False, False,  True,  True, False,  True,  True,
         True,  True, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False, False,
        False, False,  True, False, False, False,  True, False,  True, False,
        False, False, False, False,  True, False, False, False, False, False,
        False,  True, False, False, False])

In [249]:
token_dropout(atts, indices)

{'description': {'input_ids': tensor([[0.5234, 0.3489, 0.8574,  ..., 0.3117, 0.1708, 0.2470],
          [0.3904, 0.4157, 0.8487,  ..., 0.5094, 0.7890, 0.9687],
          [0.2988, 0.4120, 0.6921,  ..., 0.1899, 0.9186, 0.1255],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0])}}

In [250]:
drop_description(atts)

{'description': {'input_ids': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  'attention_mask': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0])}}

In [213]:
# Step-by-step replay of the first half of the token_dropout body, for inspection.
# NOTE(review): `inp` and `index` are only parameter names of token_dropout; no
# visible cell binds them at notebook level, so this cell presumably relied on
# leftover kernel state — bind them (e.g. from `atts` / `indices`) before running.
inp_ids = inp['description']['input_ids']
attention_mask = inp['description']['attention_mask']

original_length = attention_mask.shape[0]

# Zero the selected rows and mark those positions as not attended (both in place).
inp_ids[index] = torch.zeros(inp_ids.shape[1], dtype=inp_ids.dtype)
attention_mask[index] = 0


In [225]:
inp_ids[-15:-12]

tensor([[0.5171, 0.8077, 0.6169,  ..., 0.3838, 0.4154, 0.5786],
        [0.9120, 0.6587, 0.4010,  ..., 0.0647, 0.3351, 0.0739],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [222]:
attention_mask

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0])

In [216]:
inp_ids[attention_mask.nonzero(as_tuple=True)]

tensor([[0.9780, 0.4434, 0.2783,  ..., 0.2558, 0.9914, 0.2663],
        [0.3825, 0.8021, 0.8806,  ..., 0.3276, 0.3776, 0.0110],
        [0.3789, 0.7601, 0.0995,  ..., 0.5196, 0.0649, 0.8234],
        ...,
        [0.8508, 0.4552, 0.9451,  ..., 0.6921, 0.3860, 0.2226],
        [0.5171, 0.8077, 0.6169,  ..., 0.3838, 0.4154, 0.5786],
        [0.9120, 0.6587, 0.4010,  ..., 0.0647, 0.3351, 0.0739]])

In [217]:

# Keep only the entries whose attention-mask value is still non-zero; this removes
# the dropped positions and moves the survivors to the front.
inp_ids = inp_ids[attention_mask.nonzero(as_tuple=True)]
attention_mask = attention_mask[attention_mask.nonzero(as_tuple=True)]


In [218]:

# Pad both tensors with zeros at the end so they are `original_length` long again.
# For inp_ids the pad tuple leaves the last dimension alone and extends dimension 0.
inp_ids = F.pad(inp_ids, (0, 0, 0, original_length - inp_ids.shape[0]), value = 0.0)
attention_mask = F.pad(attention_mask, (0, original_length - attention_mask.shape[-1]), value = 0)

# Store the compacted, re-padded tensors back into the input dict.
inp['description']['input_ids'] = inp_ids
inp['description']['attention_mask'] = attention_mask


In [221]:
inp_ids[-15:-10]

tensor([[0.5171, 0.8077, 0.6169,  ..., 0.3838, 0.4154, 0.5786],
        [0.9120, 0.6587, 0.4010,  ..., 0.0647, 0.3351, 0.0739],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [132]:
# Select every position for dropout: torch.rand samples from [0, 1), so `< 1` is
# always True.
indices = torch.rand(55) < 1

# Exclude the last attended token from the drop set (presumably the tokenizer's
# end-of-sequence token). Assumes a 1-D mask whose valid positions come first:
# the token sits right before the first padded position, or at the very end when
# the mask contains no padding at all.
first_zero_element = (atts['description']['attention_mask'] == 0).nonzero()
if len(first_zero_element) > 0:
    special_token_pos = first_zero_element[0].item() - 1
else:
    special_token_pos = len(indices) - 1
# -1 means the mask is all padding: there is no token to protect, and indexing
# with -1 would wrongly touch the last position instead.
if special_token_pos >= 0:
    indices[special_token_pos] = False # Handling Special Token

In [133]:
indices

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True])

In [107]:
token_dropout(atts, indices)

{'description': {'input_ids': tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0]),
  'attention_mask': tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0])}}

In [236]:
drop_description(atts)

{'description': {'input_ids': tensor([[1., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  'attention_mask': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0])}}

In [37]:
from copy import deepcopy

In [38]:
# Tokenize the attributes and keep an independent deep copy for inspection.
# NOTE(review): the output below shows only two token ids before the id-1 token and
# padding, so this cell was evidently executed with a different (short) description
# than the long sentence assigned earlier in the file — the cells were run out of order.
a = deepcopy(model.lm.condition_provider.tokenize(attributes))
a

{'description': {'input_ids': tensor([[ 5574, 11263,     1,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}}

In [41]:
# Real tokenizer output is batched: both tensors are (1, 55) per the output below.
# NOTE(review): token_dropout takes the sequence length from attention_mask.shape[0],
# i.e. it expects unbatched input — select a single item (e.g. index [0]) before
# passing these tensors to it.
a['description']['input_ids'].shape, a['description']['attention_mask'].shape

(torch.Size([1, 55]), torch.Size([1, 55]))

In [None]:
de