In [1]:
import sys
sys.path.insert(0, '../')
from audiocraft.data.process_data import PreprocessData 
from audiocraft.models.loaders import load_lm_model_ckpt
from omegaconf import OmegaConf, DictConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the pretrained MusicGen checkpoint used in this section.
checkpoint = 'facebook/musicgen-small'
# Fetch the LM checkpoint package; its 'xp.cfg' entry is turned into an OmegaConf config.
lm_model_ckpt = load_lm_model_ckpt(checkpoint)
cfg = OmegaConf.create(lm_model_ckpt['xp.cfg'])

In [5]:
# All four dataset splits are pointed at the same example folder.
for split_name in ('train', 'valid', 'evaluate', 'generate'):
    setattr(cfg.datasource, split_name, '../egs/train')

In [6]:
# Build the preprocessing helper from the patched config and the checkpoint id.
preprocessor = PreprocessData(cfg, checkpoint)



In [7]:
preprocessor.condition_provider.conditioners['description']

T5Conditioner(
  (output_proj): Linear(in_features=768, out_features=1024, bias=True)
)

In [16]:
preprocessor.condition_provider.conditioners['description'].t5

T5EncoderModel(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dropout(p=0.1, 

## Finding: the preprocessed condition files contain the T5 encoder output passed through `output_proj`

In [1]:
import sys
sys.path.insert(0, '../')
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from audiocraft.models.loaders import _get_state_dict, _delete_param, load_compression_model, load_lm_model_ckpt
import torch
from omegaconf import OmegaConf, DictConfig
from audiocraft.models import builders

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Checkpoint file of the trained LM (presumably the fine-tuning run; path is relative to this notebook).
lm_trained_checkpoint = '../../9a4ede9c/checkpoint.th'
# Pretrained checkpoint that supplies the reference config and the output projection weights.
default_model_checkpoint = 'facebook/musicgen-small'

In [6]:
# --- Trained checkpoint and its config -------------------------------------
pkg = _get_state_dict(lm_trained_checkpoint)
cfg = OmegaConf.create(pkg['xp.cfg'])
cfg.device = str(device)
cfg.dtype = 'float32' if cfg.device == 'cpu' else 'float16'
# NOTE(review): device/dtype are set on `cfg`, but the model below is built
# from `cfg_def`, whose device/dtype are left as stored in the pretrained
# config -- confirm this is intended.

# --- Pretrained reference package ------------------------------------------
cache_dir = None
pkg_def = load_lm_model_ckpt(default_model_checkpoint, cache_dir=cache_dir)
cfg_def = OmegaConf.create(pkg_def['xp.cfg'])
cfg_def.memory_saver = {}
cfg_def.memory_saver.enable = False

# Take the description conditioner's output projection from the pretrained
# weights (presumably the trained checkpoint does not carry it -- confirm).
trained_state = pkg['best_state']['model']
pretrained_state = pkg_def['best_state']
for param_key in (
    'condition_provider.conditioners.description.output_proj.weight',
    'condition_provider.conditioners.description.output_proj.bias',
):
    trained_state[param_key] = pretrained_state[param_key]

# Disable classifier-free-guidance dropout and T5 word dropout in both configs.
for run_cfg in (cfg_def, cfg):
    run_cfg['classifier_free_guidance']['training_dropout'] = 0.0
    run_cfg['conditioners']['description']['t5']['word_dropout'] = 0.0

# Architecture comes from the pretrained config, weights from the trained run.
lm_model = builders.get_lm_model(cfg_def)
lm_model.load_state_dict(trained_state)
lm_model.eval()
lm_model.cfg = cfg

In [7]:
lm_model

LMModel(
  (cfg_dropout): ClassifierFreeGuidanceDropout(p=0.0)
  (att_dropout): AttributeDropout({})
  (condition_provider): ConditioningProvider(
    (conditioners): ModuleDict(
      (description): T5Conditioner(
        (output_proj): Linear(in_features=768, out_features=1024, bias=True)
      )
    )
  )
  (fuser): ConditionFuser()
  (emb): ModuleList(
    (0-3): 4 x ScaledEmbedding(2049, 1024)
  )
  (transformer): StreamingTransformer(
    (layers): ModuleList(
      (0-23): 24 x StreamingTransformerLayer(
        (self_attn): StreamingMultiheadAttention(
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (linear1): Linear(in_features=1024, out_features=4096, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
        (linear2): Linear(in_features=4096, out_features=1024, bias=False)
        (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

In [9]:
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# author's machine; consider a path relative to a configurable data directory.
attributes_file = '/home/karlos/Documents/workspace/projects/music/MusicGeneration/dataset/data_creator_apr_6/train/Ludovico Einaudi Nuvole Bianche Piano Cover  Relaxing Instrumental Music_5/attributes.pt'
# Load the saved conditioning data; the next cell reads its
# 'condition_tensors' and 'padding_mask' entries.
conditions = torch.load(attributes_file)

In [28]:
def _to_device_with_batch_dim(value):
    """Move a tensor (or each tensor of a list/tuple) to `device` and prepend a batch axis.

    Values of any other type are returned unchanged.
    """
    if isinstance(value, torch.Tensor):
        return value.to(device).unsqueeze(0)
    if isinstance(value, (list, tuple)):
        return tuple(item.to(device).unsqueeze(0) for item in value)
    return value


padding_mask = conditions['padding_mask']
# Build a fresh dict instead of editing `conditions` in place, so re-running
# this cell does not stack an extra batch dimension on every run.
# (The original tensor branch used `self.device`, which does not exist at
# notebook scope; the notebook-level `device` is used for both branches.)
condition_tensors = {
    name: _to_device_with_batch_dim(value)
    for name, value in conditions['condition_tensors'].items()
}

In [29]:
# Sampling settings passed as keyword arguments to the LM generation call.
# Other knobs that were considered but are left at their defaults:
# cfg_coef, two_step_cfg.
generation_params = dict(
    use_sampling=True,
    temp=1.0,
    top_k=250,
    top_p=0.0,
)

In [30]:
import math
# Frames per second used to convert the requested duration into a token count.
lm_model.compression_frame_rate = 50
duration_s = 20
num_samples = 1

# Number of frames to generate for the requested duration.
total_gen_len = math.ceil(duration_s * lm_model.compression_frame_rate)

# Sample tokens from the LM without tracking gradients.
with torch.no_grad():
    gen_tokens = lm_model.generate(
        None,
        None,
        condition_tensors,
        max_gen_len=total_gen_len,
        num_samples=num_samples,
        **generation_params,
    )

In [32]:
# Store the generated tokens on CPU so the "Listen" section can reload them from disk.
torch.save(gen_tokens.cpu().detach(), '../../temp.pt')

## Listen

In [3]:
encoded_generation.shape

torch.Size([1, 4, 1000])

In [4]:
import IPython.display as ipd

from audiocraft.models import encodec, loaders, builders
from audiocraft.utils import utils
import omegaconf
import torch

musicgen_model_name = 'facebook/musicgen-small'

# Tokens written by the generation section.
encoded_generation = torch.load('../../temp.pt')

# Rebuild the EnCodec compression model from the checkpoint's own config.
pkg = loaders.load_compression_model_ckpt(musicgen_model_name)
cfg = omegaconf.OmegaConf.create(pkg['xp.cfg'])
kwargs = utils.dict_from_config(getattr(cfg, 'encodec'))

# These entries select the sub-modules and must not be forwarded as kwargs.
encoder_name = kwargs.pop('autoencoder')
quantizer_name = kwargs.pop('quantizer')

encoder, decoder = builders.get_encodec_autoencoder(encoder_name, cfg)
quantizer = builders.get_quantizer(quantizer_name, cfg, encoder.dimension)
frame_rate = kwargs['sample_rate'] // encoder.hop_length
renormalize = kwargs.pop('renormalize', False)
# Drop 'renorm' so it is not forwarded to EncodecModel through **kwargs.
kwargs.pop('renorm', None)

model = encodec.EncodecModel(
    encoder, decoder, quantizer,
    frame_rate=frame_rate, renormalize=renormalize, **kwargs)
model.load_state_dict(pkg['best_state'])
model = model.eval()

# Decode without autograd bookkeeping, then play the first item of the batch.
with torch.no_grad():
    melody_waveform_reconstructed = model.decode(encoded_generation, None)
ipd.display(ipd.Audio(melody_waveform_reconstructed[0].detach().cpu().numpy(), rate=kwargs['sample_rate']))

## It worked! Next: generate from text descriptions

### Load the T5 conditioner (tokenizer, encoder and output projection)

In [4]:
import sys
sys.path.insert(0, '../')
from audiocraft.modules.conditioners import T5Conditioner
from audiocraft.utils.utils import dict_from_config
from audiocraft.models.loaders import load_lm_model_ckpt
from omegaconf import OmegaConf
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Identifier of the pretrained MusicGen checkpoint whose conditioner weights are reused.
checkpoint = 'facebook/musicgen-small'
# Checkpoint package: 'xp.cfg' gives the config, 'best_state' is read by the loader function below.
lm_model_ckpt = load_lm_model_ckpt(checkpoint)
cfg = OmegaConf.create(lm_model_ckpt['xp.cfg'])

In [8]:

def load_conditioner_and_load_state_dict(cfg):
    """Build the 'description' `T5Conditioner` from `cfg` and load its pretrained output projection.

    Besides `cfg`, two notebook-level names are read at call time:
    `lm_model_ckpt` (checkpoint package whose 'best_state' holds the projection
    parameters) and `device` (where the conditioner is created).

    Args:
        cfg: experiment config exposing `conditioners` and `transformer_lm['dim']`.

    Returns:
        The `T5Conditioner` with `output_proj` loaded from the checkpoint.

    Raises:
        KeyError: if the config has no 'description' conditioner, or the
            checkpoint lacks the output projection parameters.
    """
    conditioner_cfg = getattr(cfg, 'conditioners')
    dict_cfg = {} if conditioner_cfg is None else dict_from_config(conditioner_cfg)

    cond_cfg = dict_cfg['description']
    model_type = cond_cfg['model']
    model_args = cond_cfg[model_type]
    t5 = T5Conditioner(output_dim=cfg.transformer_lm['dim'], device=device, **model_args)

    # Read the parameters instead of popping them: the checkpoint dict is shared
    # notebook state, and removing keys made a second call fail with KeyError.
    model_state = lm_model_ckpt['best_state']
    prefix = 'condition_provider.conditioners.description.output_proj.'
    t5.output_proj.load_state_dict({
        'weight': model_state[prefix + 'weight'],
        'bias': model_state[prefix + 'bias'],
    })

    return t5

In [9]:

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Despite its name, `tokenizer` holds the T5Conditioner returned by the loader.
tokenizer = load_conditioner_and_load_state_dict(cfg)

In [10]:
tokenizer.to(device)

T5Conditioner(
  (output_proj): Linear(in_features=768, out_features=1024, bias=True)
)

In [11]:
# The conditioner's repr lists only `output_proj` as a submodule, so the wrapped
# T5 model is switched to eval mode explicitly before the conditioner itself.
tokenizer.t5.eval()
tokenizer.eval()

T5Conditioner(
  (output_proj): Linear(in_features=768, out_features=1024, bias=True)
)

In [19]:
# Tokenize a single text description (a batch of one) without tracking gradients.
with torch.no_grad():
    input_tokenized = tokenizer.tokenize(["A music that has been described as classical, impressionist, modern, and contemporary can be characterized by its use of piano, moods such as relaxing, emotional, and cinematic, and it is often accompanied by a sense of nostalgia or reflection. as we are here"])

In [20]:
input_tokenized['input_ids'].shape

torch.Size([1, 55])

In [29]:
input

(tensor([[[-0.3420, -0.2097, -0.2832,  ...,  0.2556,  0.2116, -0.2758],
          [-0.2869,  0.5394,  0.1463,  ...,  0.0600, -0.0473,  0.0645],
          [-0.5730,  0.1519,  0.2257,  ...,  0.5498, -0.0755, -0.6174],
          ...,
          [ 0.1237,  0.2914, -0.2088,  ...,  0.1451, -0.3533, -0.1596],
          [ 0.3678,  0.2220, -0.5175,  ..., -0.2070, -0.2863,  0.1573],
          [ 0.0044,  0.0121,  0.0027,  ..., -0.0018, -0.0135, -0.0081]]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1]]))

In [30]:
# NOTE: `input` shadows the Python builtin; the name is kept because other
# cells in this notebook read it.
# Run the conditioner on the tokenized text, then detach every returned tensor
# and bring it to the CPU so the result can be saved.
input = tokenizer(input_tokenized)
input = tuple(part.to('cpu').detach() for part in input)
input

(tensor([[[-0.2971,  0.2297,  0.1465,  ...,  0.2773, -0.0791,  0.1852],
          [ 0.1175,  0.5913,  0.0731,  ..., -0.0547, -0.1188,  0.3828],
          [ 0.3193,  0.0639,  0.2037,  ...,  0.2922,  0.3155, -0.0428],
          [-0.3401,  0.4772,  0.2584,  ...,  0.1217,  0.0398, -0.0651],
          [ 0.0374, -0.0050, -0.0148,  ...,  0.0302, -0.0184, -0.0186]]]),
 tensor([[1, 1, 1, 1, 1]]))

In [31]:
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# author's machine. Loaded here to compare a preprocessed attributes file with
# the tensors computed above.
tt = '/home/karlos/Documents/workspace/projects/music/MusicGeneration/dataset/data_creator_apr_6/train/Ludovico Einaudi Nuvole Bianche Piano Cover  Relaxing Instrumental Music_5/attributes.pt'
t = torch.load(tt)
t

{'condition_tensors': {'description': (tensor([[-6.7678e-01,  2.7852e-01,  9.2402e-02,  ..., -2.4481e-03,
             1.7407e-02, -1.0254e-01],
           [-6.6206e-01, -4.6321e-01,  3.4895e-01,  ..., -8.5354e-02,
            -2.1787e-02, -5.3650e-01],
           [ 4.0586e-01,  8.0539e-02, -1.9450e-01,  ..., -3.3040e-01,
            -1.6539e-01, -1.7100e-01],
           ...,
           [ 1.5044e-01,  2.1617e-02,  5.5314e-01,  ...,  1.0531e-01,
            -1.1475e+00, -1.6050e-01],
           [-3.1950e-02, -3.2770e-01, -2.7889e-01,  ..., -1.5583e-01,
            -1.0872e-01, -3.2523e-01],
           [ 6.2193e-03,  1.3666e-02, -3.9386e-05,  ..., -1.3862e-03,
            -8.1567e-03, -1.1556e-02]]),
   tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))},
 'padding_mask': tensor([[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  .

In [32]:
t['condition_tensors']['description'][0].shape

torch.Size([38, 1024])

In [33]:
# Store the computed condition tensors so the "Generate" section can reload them from disk.
torch.save(input, '../../temp_condition_tensors.pt')

## Generate

In [1]:
import sys
sys.path.insert(0, '../')
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch

from audiocraft.models.loaders import _get_state_dict, _delete_param, load_compression_model, load_lm_model_ckpt
import torch
from omegaconf import OmegaConf, DictConfig
from audiocraft.models import builders

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint of the trained LM and the pretrained reference model.
lm_trained_checkpoint = '../../9a4ede9c/checkpoint.th'
default_model_checkpoint = 'facebook/musicgen-small'

# --- Trained checkpoint and its config -------------------------------------
pkg = _get_state_dict(lm_trained_checkpoint)
cfg = OmegaConf.create(pkg['xp.cfg'])
cfg.device = str(device)
cfg.dtype = 'float32' if cfg.device == 'cpu' else 'float16'
# NOTE(review): device/dtype are set on `cfg`, but the model below is built
# from `cfg_def`, whose device/dtype are left as stored in the pretrained
# config -- confirm this is intended.

# --- Pretrained reference package ------------------------------------------
cache_dir = None
pkg_def = load_lm_model_ckpt(default_model_checkpoint, cache_dir=cache_dir)
cfg_def = OmegaConf.create(pkg_def['xp.cfg'])
cfg_def.memory_saver = {}
cfg_def.memory_saver.enable = False

# Take the description conditioner's output projection from the pretrained
# weights (presumably the trained checkpoint does not carry it -- confirm).
trained_state = pkg['best_state']['model']
pretrained_state = pkg_def['best_state']
for param_key in (
    'condition_provider.conditioners.description.output_proj.weight',
    'condition_provider.conditioners.description.output_proj.bias',
):
    trained_state[param_key] = pretrained_state[param_key]

# Disable classifier-free-guidance dropout and T5 word dropout in both configs.
for run_cfg in (cfg_def, cfg):
    run_cfg['classifier_free_guidance']['training_dropout'] = 0.0
    run_cfg['conditioners']['description']['t5']['word_dropout'] = 0.0

# Architecture comes from the pretrained config, weights from the trained run.
lm_model = builders.get_lm_model(cfg_def)
lm_model.load_state_dict(trained_state)
lm_model.eval()
lm_model.cfg = cfg

In [3]:
# Condition tensors written by the "Load T5" section.
attributes_file = '../../temp_condition_tensors.pt'
conditions = torch.load(attributes_file)

# This file stores only the condition tensors, so there is no padding mask.
padding_mask = None

# Move each tensor of the saved tuple onto the compute device. They already
# carry a batch dimension (see the shape displayed below), so none is added.
condition_tensors = tuple(part.to(device) for part in conditions)

condition_tensors

(tensor([[[-0.2971,  0.2297,  0.1465,  ...,  0.2773, -0.0791,  0.1852],
          [ 0.1175,  0.5913,  0.0731,  ..., -0.0547, -0.1188,  0.3828],
          [ 0.3193,  0.0639,  0.2037,  ...,  0.2922,  0.3155, -0.0428],
          [-0.3401,  0.4772,  0.2584,  ...,  0.1217,  0.0398, -0.0651],
          [ 0.0374, -0.0050, -0.0148,  ...,  0.0302, -0.0184, -0.0186]]],
        device='cuda:0'),
 tensor([[1, 1, 1, 1, 1]], device='cuda:0'))

In [4]:
condition_tensors[0].shape

torch.Size([1, 5, 1024])

In [5]:
import math
# Sampling settings passed as keyword arguments to the LM generation call.
generation_params = dict(
    use_sampling=True,
    temp=1.0,
    top_k=250,
    top_p=0.0,
)

# Frames per second used to convert the requested duration into a token count.
lm_model.compression_frame_rate = 50
duration_s = 30
num_samples = 1

# Number of frames to generate for the requested duration.
total_gen_len = math.ceil(duration_s * lm_model.compression_frame_rate)

# Sample tokens from the LM without tracking gradients; the saved tensors are
# passed as the 'description' condition.
with torch.no_grad():
    gen_tokens = lm_model.generate(
        None,
        None,
        {'description': condition_tensors},
        max_gen_len=total_gen_len,
        num_samples=num_samples,
        **generation_params,
    )

In [6]:
# Store the generated tokens on CPU so the final "Listen" section can reload them from disk.
torch.save(gen_tokens.cpu().detach(), '../../temp_transformer_output.pt')

## Listen

In [1]:
import IPython.display as ipd

from audiocraft.models import encodec, loaders, builders
from audiocraft.utils import utils
import omegaconf
import torch

musicgen_model_name = 'facebook/musicgen-small'

# Tokens written by the "Generate" section.
encoded_generation = torch.load('../../temp_transformer_output.pt')

# Rebuild the EnCodec compression model from the checkpoint's own config.
pkg = loaders.load_compression_model_ckpt(musicgen_model_name)
cfg = omegaconf.OmegaConf.create(pkg['xp.cfg'])
kwargs = utils.dict_from_config(getattr(cfg, 'encodec'))

# These entries select the sub-modules and must not be forwarded as kwargs.
encoder_name = kwargs.pop('autoencoder')
quantizer_name = kwargs.pop('quantizer')

encoder, decoder = builders.get_encodec_autoencoder(encoder_name, cfg)
quantizer = builders.get_quantizer(quantizer_name, cfg, encoder.dimension)
frame_rate = kwargs['sample_rate'] // encoder.hop_length
renormalize = kwargs.pop('renormalize', False)
# Drop 'renorm' so it is not forwarded to EncodecModel through **kwargs.
kwargs.pop('renorm', None)

model = encodec.EncodecModel(
    encoder, decoder, quantizer,
    frame_rate=frame_rate, renormalize=renormalize, **kwargs)
model.load_state_dict(pkg['best_state'])
model = model.eval()

# Decode without autograd bookkeeping, then play the first item of the batch.
with torch.no_grad():
    melody_waveform_reconstructed = model.decode(encoded_generation, None)
ipd.display(ipd.Audio(melody_waveform_reconstructed[0].detach().cpu().numpy(), rate=kwargs['sample_rate']))

  from .autonotebook import tqdm as notebook_tqdm
