In [1]:
from audiocraft.models import MusicGen
from audiocraft.models import MultiBandDiffusion

# Toggle for additionally loading the MultiBandDiffusion decoder below.
# NOTE: when this is False, the name `mbd` is never defined in this cell.
USE_DIFFUSION_DECODER = False
# Using small model, better results would be obtained with `medium` or `large`.
model = MusicGen.get_pretrained('facebook/musicgen-small')
if USE_DIFFUSION_DECODER:
    # Only loaded on demand, since it is an extra model on top of `model`.
    mbd = MultiBandDiffusion.get_mbd_musicgen()



In [2]:
# Generation settings applied to every subsequent `model.generate*` call:
# stochastic sampling restricted to the 250 most likely tokens, with a
# `duration` of 30 (presumably seconds of audio -- confirm in audiocraft docs).
model.set_generation_params(use_sampling=True, top_k=250, duration=30)

In [3]:
import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio

def get_bip_bip(bip_duration=0.125, frequency=440,
                duration=0.5, sample_rate=32000, device="cuda"):
    """Generates a series of bip bip at the given frequency.

    The signal is a cosine tone gated by a square envelope: every period of
    ``2 * bip_duration`` seconds is silent for its first half and carries the
    tone for its second half.

    Args:
        bip_duration: Length in seconds of each bip (and of each gap).
        frequency: Frequency of the tone in Hz.
        duration: Total length of the signal in seconds.
        sample_rate: Number of samples per second.
        device: Torch device on which the signal is created.

    Returns:
        A float tensor of shape ``(1, int(duration * sample_rate))``.
    """
    num_samples = int(duration * sample_rate)
    # Time axis in seconds. Built on the requested device (previously this
    # was hard-coded to "cuda", which broke CPU-only runs).
    t = torch.arange(num_samples, device=device, dtype=torch.float) / sample_rate
    # Carrier tone at the requested frequency (previously always 440 Hz).
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Position inside the current on/off period, normalised to [0, 1).
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    # Tone is audible only during the second half of each period.
    envelope = (tp >= 0.5).float()
    return wav * envelope

In [22]:
# Here we use a synthetic signal to prompt both the tonality and the BPM
# of the generated audio.
# The prompt is created at the model's own sample rate and on the model's
# device instead of repeating the magic number 32000, so the cell keeps
# working if a different checkpoint is loaded.
res = model.generate_continuation(
    # One identical prompt per text description (batch of 2).
    get_bip_bip(0.125, sample_rate=model.sample_rate,
                device=model.device).expand(2, -1, -1),
    model.sample_rate,
    ['Epic and emotional music for fighting scene', 'funny reggae song'],
    progress=True)
display_audio(res, model.sample_rate)

  1478 /   1500

In [21]:
# Importing the submodule explicitly still binds the name `scipy`, and also
# guarantees `scipy.io.wavfile` is loaded (a bare `import scipy` does not
# do so on older SciPy releases).
import scipy.io.wavfile

import numpy as np

# Take the first generated item / first channel
# (assumes `res` is laid out as (batch, channels, samples) -- TODO confirm).
audio_np = res[0, 0].cpu().numpy()

# Peak-normalise to the full int16 range. `initial=0.0` makes the reduction
# safe on an empty array, and the guard avoids a 0/0 division (NaN -> garbage
# int16 samples) when the generated audio is completely silent.
peak = np.max(np.abs(audio_np), initial=0.0)
if peak > 0:
    music = np.int16((audio_np / peak) * 32767)
else:
    music = np.zeros(audio_np.shape, dtype=np.int16)

sampling_rate = model.sample_rate
scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=music)

array([  -7.257639 ,   -2.9611948,   21.245975 , ..., -682.93506  ,
       -575.6274   , -757.1973   ], dtype=float32)