In [1]:
import glob
import torch
from safetensors.torch import safe_open

from transformers import AutoProcessor, MusicFlamingoConfig
from transformers.models.musicflamingo.modeling_musicflamingo import (
    MusicFlamingoEncoder,
    MusicFlamingoMultiModalProjector,
)

from vllm import LLM, SamplingParams


# Local checkpoint directory. The same path is used to load the HF processor
# and config, to read the raw .safetensors weights, and as the vLLM model path.
CKPT_DIR = "./music_flamingo_fp8"  # your local folder
# Number of audio items in the prompt; passed to vLLM as the per-prompt audio limit.
AUDIO_COUNT = 1                    # one audio item in the prompt


def load_submodule_weights_from_safetensors(module, ckpt_dir: str, prefix: str) -> list[str]:
    """Load the checkpoint tensors stored under ``prefix`` into ``module``.

    Every ``*.safetensors`` file directly inside ``ckpt_dir`` is scanned, so
    sharded checkpoints work too. Keys that start with ``prefix`` (e.g.
    ``"audio_tower."``) are collected with the prefix stripped and then loaded
    non-strictly into ``module``.

    Args:
        module: Object exposing ``load_state_dict(state_dict, strict=...)``,
            typically a ``torch.nn.Module``.
        ckpt_dir: Directory containing the ``.safetensors`` shard(s).
        prefix: Key prefix that selects the sub-module's tensors.

    Returns:
        The keys ``module`` expects that were not present in the checkpoint.
        A few missing entries can be fine (e.g. buffers); callers decide.

    Raises:
        FileNotFoundError: No ``.safetensors`` file exists in ``ckpt_dir``.
        RuntimeError: No checkpoint key starts with ``prefix``, or the
            checkpoint holds keys under ``prefix`` that ``module`` does not
            define.
    """
    # Escape the directory so glob metacharacters in the path ("[", "*", "?")
    # are matched literally; sort for a deterministic shard order.
    pattern = f"{glob.escape(ckpt_dir)}/*.safetensors"
    st_files = sorted(glob.glob(pattern))
    if not st_files:
        raise FileNotFoundError(f"No .safetensors files found in: {ckpt_dir}")

    sd = {}
    for path in st_files:
        with safe_open(path, framework="pt", device="cpu") as f:
            for k in f.keys():
                if k.startswith(prefix):
                    sd[k[len(prefix):]] = f.get_tensor(k)

    # With strict=False an empty state dict "loads" successfully and leaves the
    # module at its random initialisation, so a wrong prefix must fail loudly.
    if not sd:
        raise RuntimeError(
            f"No tensors with prefix {prefix!r} found in {len(st_files)} "
            f".safetensors file(s) under: {ckpt_dir}"
        )

    missing, unexpected = module.load_state_dict(sd, strict=False)
    if unexpected:
        raise RuntimeError(f"Unexpected keys when loading {prefix}: {unexpected}")
    return missing


# --- 1) Build prompt + audio features using the HF processor ---
processor = AutoProcessor.from_pretrained(CKPT_DIR)

# Single user turn: a text instruction followed by one audio clip.
_instruction = {
    "type": "text",
    "text": "Describe this track in full detail - tell me the genre, tempo, and key, "
            "then dive into the instruments, production style, and overall mood it creates.",
}
_clip = {
    "type": "audio",
    "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3",
}
conversation = [{"role": "user", "content": [_instruction, _clip]}]

# Rendered chat-template string handed to vLLM; it should contain exactly one
# audio placeholder for the single audio item.
prompt = processor.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)

# Tokenized + processed tensors, used to compute the audio embeddings in Transformers.
inputs = processor.apply_chat_template(
    conversation, tokenize=True, add_generation_prompt=True, return_dict=True
)

# --- 2) Run ONLY the audio tower + projector in Transformers to create audio_embeds ---
cfg = MusicFlamingoConfig.from_pretrained(CKPT_DIR)

# Instantiate both sub-modules in eval mode on the GPU, then fill each one from
# the checkpoint using its key prefix.
audio_tower = MusicFlamingoEncoder(cfg).eval().cuda()
projector = MusicFlamingoMultiModalProjector(cfg).eval().cuda()
for _submodule, _key_prefix in (
    (audio_tower, "audio_tower."),
    (projector, "multi_modal_projector."),
):
    load_submodule_weights_from_safetensors(_submodule, CKPT_DIR, prefix=_key_prefix)

# Move the processor outputs to the GPU; "audio_times" is optional and stays
# None when the processor did not produce it.
input_features, input_features_mask = (
    inputs[_name].cuda() for _name in ("input_features", "input_features_mask")
)
audio_times = inputs.get("audio_times")
audio_times = None if audio_times is None else audio_times.cuda()

with torch.inference_mode():
    # Mimic the model's get_audio_features() logic, but keep the result 3-D for vLLM.
    # Cast the features to the encoder's parameter dtype before the forward pass.
    input_features = input_features.to(dtype=audio_tower.conv1.weight.dtype)
    enc_out = audio_tower(input_features, input_features_mask=input_features_mask, audio_times=audio_times)
    audio_embeds = projector(enc_out.last_hidden_state)  # (B, Tproj, H)

    # Number of valid (non-padding) projected frames per batch row.
    post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1  # (B,)

    # Keep only the valid frames of EVERY row and concatenate them into a single
    # audio item. For B == 1 this equals slicing row 0 to its valid length.
    # NOTE(review): assumes all B rows are windows of the one audio item in the
    # prompt (the processor may split a long clip into several windows) —
    # TODO confirm against the processor's output for long audio.
    frame_idx = torch.arange(audio_embeds.shape[1], device=post_lengths.device)
    valid = frame_idx[None, :] < post_lengths[:, None]  # (B, Tproj) bool
    audio_embeds = audio_embeds[valid.to(audio_embeds.device)].unsqueeze(0)  # (1, feature_size, H)

# A CPU tensor is fine for vLLM; shape must be (num_items, feature_size, hidden_size).
audio_embeds_for_vllm = audio_embeds.to("cpu")             # (1, feature_size, H)


# --- 3) Run generation in vLLM using the *precomputed* audio embeddings ---
# AudioFlamingo3 uses a single placeholder token per audio item (commonly "<sound>").
# Your `prompt` from the HF processor should already be in the right format.
_engine_args = {
    "model": CKPT_DIR,
    "trust_remote_code": True,
    "limit_mm_per_prompt": {"audio": AUDIO_COUNT},
    "enable_mm_embeds": True,  # required for embedding inputs
    # NOTE: add "max_model_len" here if you hit context length errors.
}
llm = LLM(**_engine_args)

sampling = SamplingParams(max_tokens=256, temperature=0.7, top_p=0.95)

# One request: the rendered prompt plus the precomputed audio embeddings,
# shaped (1, feature_size, hidden).
_request = {
    "prompt": prompt,
    "multi_modal_data": {"audio": audio_embeds_for_vllm},
}
outputs = llm.generate(_request, sampling_params=sampling)

# Print the text of the first completion of the first (only) request.
_first_completion = outputs[0].outputs[0]
print(_first_completion.text)




ImportError: cannot import name 'ALLOWED_LAYER_TYPES' from 'transformers.configuration_utils' (/home/henry/projects/navidrome/musicembed/.venv/lib/python3.13/site-packages/transformers/configuration_utils.py)