In [1]:
# !pip install numpy
# !pip install soundfile
# !pip install av
# !pip install julius
# !pip install torchaudio
# !pip install omegaconf
# !pip install ipywidgets
# !pip install openai
# !pip install --upgrade typing-extensions
# !pip install einops
# !pip install xformers

# !pip install --upgrade soundfile
# !pip install -r requirements.txt
# !pip install --upgrade pip setuptools wheel
# !pip install flashy
# !pip install num2words 
# !pip install spacy
# !pip install transformers 
# !pip install librosa
# !pip install torchmetrics
# !pip install sentencepiece

# MusicGen
Welcome to MusicGen's demo jupyter notebook. Here you will find a series of self-contained examples of how to use MusicGen in different settings.

First, we start by initializing MusicGen, you can choose a model from the following selection:
1. `facebook/musicgen-small` - 300M transformer decoder.
2. `facebook/musicgen-medium` - 1.5B transformer decoder.
3. `facebook/musicgen-melody` - 1.5B transformer decoder also supporting melody conditioning.
4. `facebook/musicgen-large` - 3.3B transformer decoder.

We will use the `facebook/musicgen-small` variant for the purpose of this demonstration.

In [2]:
from audiocraft.models import MusicGen
from audiocraft.models import MultiBandDiffusion

USE_DIFFUSION_DECODER = False
# Using small model, better results would be obtained with `medium` or `large`.
model = MusicGen.get_pretrained('facebook/musicgen-small')
if USE_DIFFUSION_DECODER:
    mbd = MultiBandDiffusion.get_mbd_musicgen()

    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.6.0+cpu)
    Python  3.9.13 (you have 3.9.21)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
  WeightNorm.apply(module, name, dim)


Next, let us configure the generation parameters. Specifically, you can control the following:
* `use_sampling` (bool, optional): use sampling if True, else do argmax decoding. Defaults to True.
* `top_k` (int, optional): top_k used for sampling. Defaults to 250.
* `top_p` (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
* `temperature` (float, optional): softmax temperature parameter. Defaults to 1.0.
* `duration` (float, optional): duration of the generated waveform. Defaults to 30.0.
* `cfg_coef` (float, optional): coefficient used for classifier free guidance. Defaults to 3.0.

When left unchanged, MusicGen will revert to its default parameters.

In [3]:
model.set_generation_params(
    use_sampling=True,
    top_k=250,
    duration=10
)
"""Set the generation parameters for MusicGen.

        Args:
            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
            top_k (int, optional): top_k used for sampling. Defaults to 250.
            top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
            temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
            duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
            cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
            cfg_coef_beta (float, optional): beta coefficient in double classifier free guidance.
                Should be only used for MusicGen melody if we want to push the text condition more than
                the audio conditioning. See paragraph 4.3 in https://arxiv.org/pdf/2407.12563 to understand
                double CFG.
            two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
                instead of batching together the two. This has some impact on how things
                are padded but seems to have little impact in practice.
            extend_stride: when doing extended generation (i.e. more than 30 seconds), by how much
                should we extend the audio each time. Larger values will mean less context is
                preserved, and shorter value will require extra computations.
        """

'Set the generation parameters for MusicGen.\n\n        Args:\n            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.\n            top_k (int, optional): top_k used for sampling. Defaults to 250.\n            top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.\n            temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.\n            duration (float, optional): Duration of the generated waveform. Defaults to 30.0.\n            cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.\n            cfg_coef_beta (float, optional): beta coefficient in double classifier free guidance.\n                Should be only used for MusicGen melody if we want to push the text condition more than\n                the audio conditioning. See paragraph 4.3 in https://arxiv.org/pdf/2407.12563 to understand\n                double CFG.\n       

Next, we can go ahead and start generating music using one of the following modes:
* Unconditional samples using `model.generate_unconditional`
* Music continuation using `model.generate_continuation`
* Text-conditional samples using `model.generate`
* Melody-conditional samples using `model.generate_with_chroma`

### Music Continuation

In [4]:
import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio

# def get_bip_bip(bip_duration=0.125, frequency=440,
#                 duration=0.5, sample_rate=32000, device="cpu"):
#     """Generates a series of bip bip at the given frequency."""
#     t = torch.arange(
#         int(duration * sample_rate), device="cpu", dtype=torch.float) / sample_rate
#     wav = torch.cos(2 * math.pi * 440 * t)[None]
#     tp = (t % (2 * bip_duration)) / (2 * bip_duration)
#     envelope = (tp >= 0.5).float()
#     return wav * envelope

In [5]:
# # Here we use a synthetic signal to prompt both the tonality and the BPM
# # of the generated audio.
# res = model.generate_continuation(
#     get_bip_bip(0.125).expand(2, -1, -1), 
#     32000, ['Jazz jazz and only jazz', 
#             'Heartful EDM with beautiful synths and chords'], 
#     progress=True)
# display_audio(res, 32000)

In [6]:
# # You can also use any audio from a file. Make sure to trim the file if it is too long!
# prompt_waveform, prompt_sr = torchaudio.load("../assets/bach.mp3")
# prompt_duration = 2
# prompt_waveform = prompt_waveform[..., :int(prompt_duration * prompt_sr)]
# output = model.generate_continuation(prompt_waveform, prompt_sample_rate=prompt_sr, progress=True, return_tokens=True)
# display_audio(output[0], sample_rate=32000)
# if USE_DIFFUSION_DECODER:
#     out_diffusion = mbd.tokens_to_wav(output[1])
#     display_audio(out_diffusion, sample_rate=32000)

### Text-conditional Generation

In [7]:
ARK_API_KEY="9e5d6644-81ee-4360-baf1-db1816b2c344"

In [8]:
import os
from openai import OpenAI

# 请确保 ARK_API_KEY 已经在环境变量或其他地方正确设置
# ARK_API_KEY = os.getenv("ARK_API_KEY")26
if ARK_API_KEY is None:
    print("请设置 ARK_API_KEY 环境变量以继续。")
    exit(1)

# 初始化豆包API客户端
client = OpenAI(
    api_key=ARK_API_KEY,
    base_url="https://ark.cn-beijing.volces.com/api/v3",
)

def create_music_prompt(user_input,api_key=ARK_API_KEY):
    """
    接收用户输入，调用 API 生成仅包含音乐提示词的简短英文 prompt。
    """
    if api_key is None:
        print("请设置 api_key 环境变量以继续。")
    exit(1)

    # 初始化豆包API客户端
    client = OpenAI(
        api_key=api_key,
        base_url="https://ark.cn-beijing.volces.com/api/v3",
    )
    try:
        completion = client.chat.completions.create(
            model="ep-20250220214537-p7622",  # 替换为实际使用的模型ID
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a music prompt generator for a music generation model called 'musicgen'. "
                        "Your task is to analyze the user's input, which may express a specific emotional state, "
                        "and then generate a concise, clear, detailed, and creative English music prompt that "
                        "incorporates elements to regulate or transform the stated emotion. For example, if the input "
                        "indicates sadness, include musical elements that transition from melancholy to hope. If the input "
                        "indicates anxiety, include calming, soothing elements to ease tension. Keep the output as short and clear as possible, "
                        "without any extra commentary."
                    )
                },
                {"role": "user", "content": user_input}
            ])
        final_prompt = completion.choices[0].message.content.strip()
        return final_prompt
    except Exception as e:
        print(f"Music prompt generation error: {e}")
        # 出现异常时返回一个默认的 prompt
        return "a gentle ambient piece with soft pads that gradually builds a sense of tranquility"

# def main():
#     # 接受用户输入
#     user_input = input("请输入情绪描述（例如：今天工作压力很大，需要放松）：\n")
    
#     # 生成适用于 musicgen 的英文提示词
#     prompt = create_music_prompt(user_input)
    
#     # 格式化输出提示词（仅输出生成的 prompt）
#     print("\n" + prompt)

# if __name__ == '__main__':
#     main()

In [9]:
# user_input=input("请输入情绪描述（例如：今天工作压力很大，需要放松）：\n")
# prompt=create_music_prompt(user_input)
# print(prompt)

# from audiocraft.utils.notebook import display_audio

# output = model.generate(
#     descriptions=[
#         prompt
#     ],
#     progress=True, return_tokens=True
# )
# display_audio(output[0], sample_rate=32000)
# if USE_DIFFUSION_DECODER:
#     out_diffusion = mbd.tokens_to_wav(output[1])
#     display_audio(out_diffusion, sample_rate=32000)


In [10]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
from audiocraft.utils.notebook import display_audio
import sys
import traceback

# 创建界面组件
title = widgets.HTML(
    value="<h2>情绪调节音乐生成器</h2>"
)

input_text = widgets.Textarea(
    placeholder='请输入情绪描述（例如：今天工作压力很大，需要放松）',
    description='情绪描述:',
    layout=widgets.Layout(width='90%', height='80px')
)

generate_button = widgets.Button(
    description='开始生成',
    button_style='primary',
    icon='music'
)

debug_output = widgets.Output()  # 添加调试输出区域
prompt_display = widgets.HTML(value="")
status_label = widgets.HTML(value="")
audio_output = widgets.Output()

# 生成音乐的函数
def generate_music(b):
    # 先清除之前的输出
    with debug_output:
        clear_output()
        print("按钮被点击，开始处理...")
    
    status_label.value = "正在处理..."
    
    try:
        # 获取用户输入
        user_input = input_text.value
        
        with debug_output:
            print(f"用户输入: {user_input}")
        
        # 验证模型是否已加载
        if 'model' not in globals() or model is None:
            status_label.value = "<span style='color:red;'>错误: 模型未加载，请确保已运行加载模型的代码</span>"
            with debug_output:
                print("错误：模型未加载")
            return
        
        # 调用已定义的函数生成提示词
        prompt = create_music_prompt(user_input)
        prompt_display.value = f"<b>生成的提示词:</b> {prompt}"
        
        with debug_output:
            print(f"生成的提示词: {prompt}")
            print("开始生成音乐...")
        
        # 生成音乐
        with audio_output:
            clear_output()
            output = model.generate(
                descriptions=[prompt],
                progress=True, return_tokens=True
            )
            
            with debug_output:
                print("音乐生成完成，显示音频...")
            
            # 显示生成的音乐
            display_audio(output[0], sample_rate=32000)
            
            # 如果使用扩散解码器
            if 'USE_DIFFUSION_DECODER' in globals() and USE_DIFFUSION_DECODER:
                if 'mbd' in globals() and mbd is not None:
                    out_diffusion = mbd.tokens_to_wav(output[1])
                    display_audio(out_diffusion, sample_rate=32000)
                
        status_label.value = "音乐生成完成！"
    
    except Exception as e:
        # 捕获并显示详细错误信息
        error_details = traceback.format_exc()
        status_label.value = f"<span style='color:red;'>生成过程中出错: {str(e)}</span>"
        
        with debug_output:
            print("发生错误:")
            print(error_details)

# 确保清晰地绑定事件
generate_button.on_click(generate_music)
print("事件已绑定")  # 确认事件绑定

# 布局所有组件
input_box = widgets.VBox([input_text, generate_button])
output_box = widgets.VBox([status_label, prompt_display, audio_output, debug_output])

# 显示整个界面
main_layout = widgets.VBox([
    title,
    widgets.HTML("<hr>"),
    input_box,
    widgets.HTML("<hr>"),
    output_box
])

# 添加一些简单的样式
display(HTML("""
<style>
    .widget-textarea {
        width: 100%;
    }
    .widget-button {
        margin: 10px 0;
    }
</style>
"""))

display(main_layout)

事件已绑定


VBox(children=(HTML(value='<h2>情绪调节音乐生成器</h2>'), HTML(value='<hr>'), VBox(children=(Textarea(value='', descrip…

In [None]:
input()

In [None]:
import time
from pathlib import Path
import soundfile as sf  # 需要安装soundfile库
import numpy as np

In [None]:
# 测试用例集合
test_cases = [
    # 悲伤/抑郁系列
    # {
    #     "case_name": "depression_case_1",
    #     "input": "持续三个月的抑郁状态，感觉生活失去色彩",
    #     "expected_params": {"emotion": "悲伤", "valence": 2, "arousal": 3}
    # # },
    # {
    #     "case_name": "depression_case_2",
    #     "input": "阴沉的天气与内心的孤独，让我陷入深深的抑郁",
    #     "expected_params": {"emotion": "悲伤", "valence": 2, "arousal": 3}
    # },
    # {
    #     "case_name": "depression_case_3",
    #     "input": "连续的失败和失望让我觉得世界一片黑暗",
    #     "expected_params": {"emotion": "悲伤", "valence": 2, "arousal": 3}
    # },

    # # 焦虑/恐惧系列
    # {
    #     "case_name": "anxiety_case_1",
    #     "input": "明天有重要演讲，心跳加速坐立不安",
    #     "expected_params": {"emotion": "恐惧", "valence": 4, "arousal": 8}
    # },
    # {
    #     "case_name": "anxiety_case_2",
    #     "input": "对未来充满未知感，让我夜夜难以入眠",
    #     "expected_params": {"emotion": "恐惧", "valence": 4, "arousal": 8}
    # },
    # {
    #     "case_name": "anxiety_case_3",
    #     "input": "在高压环境中，我不断感受到强烈的焦虑与紧张",
    #     "expected_params": {"emotion": "恐惧", "valence": 4, "arousal": 8}
    # # },

    # # 快乐系列
    # {
    #     "case_name": "joyful_case_1",
    #     "input": "收到梦想公司的offer，兴奋得睡不着觉",
    #     "expected_params": {"emotion": "快乐", "valence": 8, "arousal": 7}
    # },
    # {
    #     "case_name": "joyful_case_2",
    #     "input": "今天得知了一个好消息，整个人都充满无限喜悦",
    #     "expected_params": {"emotion": "快乐", "valence": 8, "arousal": 7}
    # },
    # {
    #     "case_name": "joyful_case_3",
    #     "input": "在阳光灿烂的日子里，我感觉心情如同飞扬的音符",
    #     "expected_params": {"emotion": "快乐", "valence": 8, "arousal": 7}
    # },

    # # 愤怒系列
    # {
    #     "case_name": "anger_case_1",
    #     "input": "遭遇背叛和不公正待遇，我内心充满了愤怒与不满",
    #     "expected_params": {"emotion": "愤怒", "valence": 3, "arousal": 8}
    # },
    # {
    #     "case_name": "anger_case_2",
    #     "input": "在激烈争执中，我几乎无法控制内心的怒火",
    #     "expected_params": {"emotion": "愤怒", "valence": 3, "arousal": 8}
    # },
    # {
    #     "case_name": "anger_case_3",
    #     "input": "面对持续的不正义，我的心中燃起了熊熊怒焰",
    #     "expected_params": {"emotion": "愤怒", "valence": 3, "arousal": 8}
    # },

    # # 平静系列
    # {
    #     "case_name": "calm_case_1",
    #     "input": "夜深人静时，我静静欣赏着窗外的月光，感受到内心的平静",
    #     "expected_params": {"emotion": "平静", "valence": 7, "arousal": 2}
    # },
    # {
    #     "case_name": "calm_case_2",
    #     "input": "在湖边静坐，微风拂过，我感到无比的放松与安宁",
    #     "expected_params": {"emotion": "平静", "valence": 7, "arousal": 2}
    # },
    # {
    #     "case_name": "calm_case_3",
    #     "input": "清晨的鸟鸣与柔和的阳光让我心情彻底平复",
    #     "expected_params": {"emotion": "平静", "valence": 7, "arousal": 2}
    # },

    # # 怀旧系列
    # {
    #     "case_name": "nostalgic_case_1",
    #     "input": "回忆起童年美好的时光，心中涌起淡淡的怀旧与温馨",
    #     "expected_params": {"emotion": "怀旧", "valence": 6, "arousal": 3}
    # },
    # {
    #     "case_name": "nostalgic_case_2",
    #     "input": "翻看旧日照片，让我沉浸在温馨而久远的记忆中",
    #     "expected_params": {"emotion": "怀旧", "valence": 6, "arousal": 3}
    # },
    # {
    #     "case_name": "nostalgic_case_3",
    #     "input": "悠扬的老歌唤起了我深藏心底的往日回忆",
    #     "expected_params": {"emotion": "怀旧", "valence": 6, "arousal": 3}
    # },

    # 希望系列
    {
        "case_name": "hopeful_case_1",
        "input": "经历了一连串挫折后，我依然对未来充满坚定的希望",
        "expected_params": {"emotion": "希望", "valence": 7, "arousal": 4}
    },
    {
        "case_name": "hopeful_case_2",
        "input": "在逆境中，我看到了前方的一线光明",
        "expected_params": {"emotion": "希望", "valence": 7, "arousal": 4}
    },
    {
        "case_name": "hopeful_case_3",
        "input": "尽管前路艰辛，我对明天依然满怀信心",
        "expected_params": {"emotion": "希望", "valence": 7, "arousal": 4}
    },

    # 困惑系列
    {
        "case_name": "confused_case_1",
        "input": "面对前路的未知和不确定性，我感到无比迷茫和困惑",
        "expected_params": {"emotion": "困惑", "valence": 4, "arousal": 5}
    },
    {
        "case_name": "confused_case_2",
        "input": "在每个抉择点上，我都感到前所未有的迷失",
        "expected_params": {"emotion": "困惑", "valence": 4, "arousal": 5}
    },
    {
        "case_name": "confused_case_3",
        "input": "站在人生的十字路口，我不知道该迈向哪个方向",
        "expected_params": {"emotion": "困惑", "valence": 4, "arousal": 5}
    },
]


# # 创建保存目录
# output_dir = Path("./generated_music./250223")
# output_dir.mkdir(exist_ok=True)
# for i in range(1):
#     print(f'{i} set started.')
#     # 修改后的生成逻辑
#     for test_case in test_cases:
#         try:
#             # 生成时间戳
#             timestamp = int(time.time())
            
#             # 生成prompt
#             prompt = create_music_prompt(test_case["input"])
            
#             # 保存prompt文本
#             prompt_path = output_dir / f"{test_case['case_name']}_{timestamp}.txt"
#             with open(prompt_path, "w", encoding="utf-8") as f:
#                 f.write(f"Input: {test_case['input']}\n")
#                 f.write(f"Generated Prompt: {prompt}\n")
#                 f.write(f"Expected Params: {test_case['expected_params']}")
            
#             # 生成音频
#             output = model.generate(
#                 descriptions=[prompt],
#                 progress=True,  # 批量生成时关闭进度条
#                 return_tokens=True
#             )
#             # 修改音频保存部分为：
#             audio_data = output[0].cpu().numpy()
            
#             # 维度处理流程
#             if audio_data.ndim == 3:  # 处理批次维度
#                 audio_data = audio_data.squeeze(0)
                
#             if audio_data.ndim == 2 and audio_data.shape[0] < audio_data.shape[1]:
#                 audio_data = audio_data.T  # 确保是(samples, channels)格式
            
#             # 振幅归一化（防止削波）
#             audio_data = audio_data / np.max(np.abs(audio_data))
#             # 保存音频文件（WAV格式）
#             audio_path = output_dir / f"{test_case['case_name']}_{timestamp}.wav"
#             sf.write(audio_path, 
#                     audio_data,
#                     samplerate=32000)
    
            
        
#             # 可选：保存扩散解码器输出
#             if USE_DIFFUSION_DECODER:
#                 out_diffusion = mbd.tokens_to_wav(output[1])
#                 diffusion_path = output_dir / f"{test_case['case_name']}_diffusion_{timestamp}.wav"
#                 sf.write(diffusion_path, 
#                         out_diffusion.cpu().numpy(),
#                         samplerate=32000)
                
#             print(f"成功生成案例 [{test_case['case_name']}] 文件保存在 {audio_path}")
            
#         except Exception as e:
#             print(f"案例 [{test_case['case_name']}] 生成失败: {str(e)}")
#             continue

# print("所有测试用例处理完成")

###### from audiocraft.utils.notebook import display_audio

output = model.generate(
    descriptions=[
        #'80s pop track with bassy drums and synth',
        #'90s rock song with loud guitars and heavy drums',
        #'Progressive rock drum and bass solo',
        'Punk Rock song with loud drum and power guitar',
        #'Bluesy guitar instrumental with soulful licks and a driving rhythm section',
        #'Jazz Funk song with slap bass and powerful saxophone',
        # 'drum and bass beat with intense percussions'
    ],
    progress=True, return_tokens=True
)
display_audio(output[0], sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)

In [None]:
import time

# 开始计时
start_time = time.time()

# 生成输出
output = model.generate(
    descriptions=[
        'Progressive rock drum and bass solo',
    ],
    progress=True, return_tokens=True
)

# 显示音频
display_audio(output[0], sample_rate=32000)

# 如果使用扩散解码器
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)

# 结束计时
end_time = time.time()
elapsed_time = end_time - start_time

# 输出所需时间
print(f"所需时间: {elapsed_time:.2f} 秒")

### Melody-conditional Generation

In [None]:
import torchaudio
from audiocraft.utils.notebook import display_audio

model = MusicGen.get_pretrained('facebook/musicgen-melody')
model.set_generation_params(duration=8)

melody_waveform, sr = torchaudio.load("../assets/bach.mp3")
melody_waveform = melody_waveform.unsqueeze(0).repeat(2, 1, 1)
output = model.generate_with_chroma(
    descriptions=[
        '80s pop track with bassy drums and synth',
        '90s rock song with loud guitars and heavy drums',
    ],
    melody_wavs=melody_waveform,
    melody_sample_rate=sr,
    progress=True, return_tokens=True
)
display_audio(output[0], sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)