In [1]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.4: Fast Qwen3_Vl patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.493 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:01<00:00,  1.54it/s]


In [2]:
example = "This is an instrumental piece with a smooth jazz" \
"and funk fusion style, The tempo is moderate, and the key is major," \
" The primary instruments are a clean electric guitar playing a melodic " \
"lead, a bass guitar providing a walking bass line, and drums with a laid-back" \
" groove featuring prominent hi-hat work, A keyboard provides chordal " \
"accompaniment, often with a Rhodes-like tone, The song structure is " \
"primarily verse-chorus based, with a clear melodic theme introduced " \
"by the guitar, Production elements include a warm, slightly reverbed" \
" sound on the guitar and a well-balanced mix that allows each instrument to be distinctly heard"

In [13]:
FastVisionModel.for_inference(model) # Enable for inference!

image = "./City_of_Tears_Fountain_Square.png"
instruction = f"Generate a paragraph prompt as short as possible that would fit as background music to the image. \
    Mention the genre, instrumentation, tempo, key, and atmosphere using short phrases separated by commas. \
    The prompt should structurally match this example, do not make it any longer than 2 sentences: {example}"

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
output = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

This is a haunting ambient piece with cinematic strings and ethereal pads, The tempo is slow and mournful, key is minor, atmosphere is somber and dripping with melancholy, The instrumentation includes a solo cello with bowed decay, a muted piano with sparse arpeggios, and distant, echoing chimes, A low, sustained drone adds weight beneath subtle tremolo strings, The mood is isolated, reverent, and enveloped in a veil of rain-soaked silence.<|im_end|>


In [4]:
# 1. Calculate where the new tokens begin
input_len = inputs['input_ids'].shape[1]
# 2. Slice and decode just the generated part
generated_tokens = output[:, input_len:]
generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [None]:
generated_text

'This is a haunting ambient track with a dark cinematic texture, blending elements of neoclassical piano and atmospheric synth pads. The tempo is slow and deliberate, creating a sense of suspended stillness, with a minor key palette that deepens the melancholic mood. A solitary, resonant piano melody drifts like a whisper, accompanied by low, rumbling bass drones that evoke ancient stone and rain-washed streets. Subtle, detuned strings layer beneath, adding a sense of vast, echoing space. The production is sparse and reverbed, with a cavernous reverb tail on the piano notes and faint, ghostly choral pads in the distance. The arrangement features a slow, swelling crescendo in the chorus, building tension before collapsing into quietude. The overall sonic environment feels like a lonely, rainy evening in a forgotten city, perfectly mirroring the imageâ€™s mood of solemn grandeur and quiet melancholy.'

In [None]:
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/music-flamingo-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates."},
            {"type": "audio", "path": "./data/audio/City_of_Tears.mp3"},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=1024)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:00<00:00, 97541.95it/s]
Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 830/830 [00:03<00:00, 243.05it/s, Materializing param=multi_modal_projector.linear_2.weight]                         
Some parameters are on the meta device because they were offloaded to the cpu.


In [2]:
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=1024)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.13 GiB. GPU 0 has a total capacity of 23.49 GiB of which 2.11 GiB is free. Including non-PyTorch memory, this process has 18.88 GiB memory in use. Of the allocated memory 18.35 GiB is allocated by PyTorch, and 74.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:00<00:00, 119837.26it/s]
Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 830/830 [00:02<00:00, 331.14it/s, Materializing param=multi_modal_projector.linear_2.weight]                         


RuntimeError: Input type (float) and bias type (c10::Half) should be the same

In [15]:
SONG_1 = "./data/audio/City_of_Tears.mp3"
SONG_2 = "./data/audio/Crossroads.mp3"
    
try:
    model, processor = load_model()
    
    # Run inference
    result = compare_songs(model, processor, SONG_1, SONG_2)
    
    print("\n--- Model Response ---")
    print(result)
    print("----------------------")
    
except Exception as e:
    print(f"\nError: {e}")
    print("Ensure you have the latest 'transformers' installed and valid audio paths.")

Loading model: nvidia/music-flamingo-hf...

Error: The checkpoint you are trying to load has model type `audioflamingo3` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`
Ensure you have the latest 'transformers' installed and valid audio paths.


This is an instrumental piece with a smooth jazz and funk fusion style, The tempo is moderate, and the key is major, The primary instruments are a clean electric guitar playing a melodic lead, a bass guitar providing a walking bass line, and drums with a laid-back groove featuring prominent hi-hat work, A keyboard provides chordal accompaniment, often with a Rhodes-like tone, The song structure is primarily verse-chorus based, with a clear melodic theme introduced by the guitar, Production elements include a warm, slightly reverbed sound on the guitar and a well-balanced mix that allows each instrument to be distinctly heard

A high-energy electronic dance music track with a driving beat and a melancholic yet uplifting atmosphere, The tempo is fast, around 130 BPM, The key is minor, contributing to the melancholic feel, The song features a prominent synth melody that is both catchy and emotive, The instrumentation includes a powerful kick drum, a crisp snare, a driving bassline, and various synth pads and arpeggios, The production is clean and polished, with a wide stereo image and a good balance between all elements, The song structure is typical for EDM, with an intro, build-ups, drops, and breakdowns, The intro features a sustained synth pad and a subtle arpeggio, gradually building in intensity, The build-ups incorporate white noise sweeps and increasing drum intensity, The drops are characterized by a powerful kick drum, a driving bassline, and the main synth melody, The breakdowns feature a more atmospheric sound with pads and arpeggios, before building back up to another drop, There are no vocals in this track


## In the future I'd like to try training a model using audio flamingo annotations, increase the dataset size