In [1]:
# Testing batch inference of Music Flamingo
from transformers import (
    MusicFlamingoForConditionalGeneration,
    AutoProcessor,
    TextStreamer
)
import torch
import librosa


In [2]:
model = MusicFlamingoForConditionalGeneration.from_pretrained(
    "./music_flamingo_fp8",
    device_map="cuda",
    dtype="auto",
    attn_implementation="sdpa",
)
processor = AutoProcessor.from_pretrained("./music_flamingo_fp8")
streamer = TextStreamer(processor)


Loading weights:   0%|          | 0/1027 [00:00<?, ?it/s]

In [4]:
# Perf flags
torch.set_float32_matmul_precision("high")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Enable KV cache (default config has use_cache=False)
for cfg in (model.config, model.language_model.config, getattr(model.config, "text_config", None)):
    if cfg is not None and hasattr(cfg, "use_cache"):
        cfg.use_cache = True
model.generation_config.use_cache = True
model.language_model.generation_config.use_cache = True
model.generation_config.cache_implementation = "dynamic"
model.language_model.generation_config.cache_implementation = "dynamic"
model.generation_config.max_new_tokens = 2048
model.generation_config.do_sample = False


In [6]:
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates."},
            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3"},
        ],
    }
]

# Build prompt without loading audio inside apply_chat_template
prompt = processor.apply_chat_template(
    conversation,
    tokenize=False,
    add_generation_prompt=True,
)

audio, _ = librosa.load(
    "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3",
    sr=16000,
)
inputs = processor(
    text=prompt,
    audio=audio,
    return_tensors="pt",
).to(model.device)


In [7]:
#warmup
model.generate(**inputs, max_new_tokens=256)

The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


tensor([[151644,   8948,    198,  ...,   8778,    752,  45648]],
       device='cuda:0')

In [8]:
import time

torch.cuda.synchronize()
start = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=256)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start
new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
toks_per_s = new_tokens / elapsed
toks_per_s


14.1 s ± 28.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
toks_per_s


22.06896551724138