In [None]:
!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers accelerate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-o0q5z4jk
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-o0q5z4jk
  Resolved https://github.com/huggingface/transformers to commit 40dc11cd3eb4126652aa41ef8272525affd4a636
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so files stored there can be read by path.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

# Checkpoint identifier shared by the processor and the model.
model_id = "nvidia/music-flamingo-hf"

# The processor provides the chat template and decoding used by the inference loop.
processor = AutoProcessor.from_pretrained(model_id)

# device_map="auto" delegates weight placement to the loader.
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/830 [00:00<?, ?it/s]

In [None]:
import json
# Load the per-song records. The encoding is given explicitly so the JSON text
# is decoded the same way regardless of the runtime's locale default.
with open('dataset.json', 'r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

In [None]:
import glob
import os

# Directory (under the /content/gdrive mount) that is searched for the *_AI.mp3 files.
base_path = "/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/"
search_pattern = os.path.join(base_path, "*.mp3")

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the file list (and everything derived from it) is stable across runs.
ai_mp3_files = sorted(glob.glob(search_pattern))

# Print the list of files
print(f"Found {len(ai_mp3_files)} MP3 files:")
for mp3_path in ai_mp3_files:
    print(mp3_path)

Found 12 MP3 files:
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Fungal_Wastes_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/City_of_Tears_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Dirtmouth_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Crossroads_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Crystal_Peak_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Queens_Gardens_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Radiance_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Soul_Sanctum_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Resting_Grounds_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Greenpath_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/White_Palace_AI.mp3
/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Kingdoms_Edge_AI.mp3


In [None]:
# Inspect one record to see which fields each song entry in `dataset` carries.
dataset['Crystal_Peak']

{'audio_filepath': 'audio/Crystal_Peak.mp3',
 'image_filepath': 'images/Crystal_Peak.png',
 'prompt': 'This is an instrumental piece with an ambient and mystical style, The tempo is slow and flowing, and the key is minor, The atmosphere is cavernous, shimmering, and full of wonder. The primary instruments are crystalline bell sounds like a glockenspiel and celeste playing delicate, echoing melodies, ethereal synth pads providing a vast background, and high-pitched string sections creating a sense of tension and beauty, Production elements include a large, hall-like reverb creating a sense of immense space and a clear, resonant mix emphasizing the bell tones.'}

In [None]:
# Pair every AI-generated MP3 with the text prompt stored under its song key.
# Files are named "<song_key>_AI.mp3"; stripping that suffix yields the key
# looked up in `dataset`.
AI_SUFFIX = '_AI.mp3'

paired_data = []  # list of (filepath, prompt) tuples

for filepath in ai_mp3_files:
    filename = os.path.basename(filepath)
    if not filename.endswith(AI_SUFFIX):
        print(f"File '{filename}' does not match expected pattern")
        continue

    # Slice by the suffix's own length rather than a hard-coded character
    # count, so the two cannot drift apart if the suffix ever changes.
    song_key = filename[:-len(AI_SUFFIX)]
    if song_key not in dataset:
        print(f"Key '{song_key}' not found in dataset")
        continue

    paired_data.append((filepath, dataset[song_key]['prompt']))

print(f"Successfully paired {len(paired_data)} items.")
# Display a few examples (at most three)
for pair_number, pair in enumerate(paired_data[:3], start=1):
    print(f"Pair {pair_number}: {pair}")

Successfully paired 12 items.
Pair 1: ('/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Fungal_Wastes_AI.mp3', 'This is a slow, minor-key instrumental with an ambient, mystical style and a cavernous, shimmering atmosphere. The arrangement features crystalline glockenspiel and celeste notes echoing over ethereal synth pads and high strings, processed with large hall reverb to create a sense of immense space.')
Pair 2: ('/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/City_of_Tears_AI.mp3', 'This is an instrumental piece with an orchestral and ambient style, The tempo is slow, and the key is minor, The primary instruments are a string section playing soaring melodies, a piano providing gentle accompaniment, and a soft, wordless choir, A light percussion element is present, The atmosphere is melancholic, sorrowful, and beautiful.')
Pair 3: ('/content/gdrive/MyDrive/CMSC848I_Audio/ai_generated/Dirtmouth_AI.mp3', 'wordless female vocal, somber, atmospheric; simple piano melody in a 

In [None]:
import torch

# For every (audio file, description) pair, ask the model whether the audio
# matches the description and record the answer next to its inputs.
all_responses = []

print(f"Processing {len(paired_data)} pairs...")

for audio_path, description in paired_data:
    question = f"Prompt: {description}. Does the following music match this description, answer yes or no."
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": question},
            {"type": "audio", "path": audio_path},
        ],
    }

    # Render and tokenise the single-turn chat, then move the batch to the
    # model's device.
    inputs = processor.apply_chat_template(
        [user_turn],
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
    ).to(model.device)

    # Floating-point entries are cast to the model's dtype (e.g. bfloat16);
    # non-floating entries are left untouched.
    float_keys = [name for name, tensor in inputs.items() if torch.is_floating_point(tensor)]
    for name in float_keys:
        inputs[name] = inputs[name].to(model.dtype)

    outputs = model.generate(**inputs, max_new_tokens=1024)

    # Decode only the positions after the input length, i.e. skip the part of
    # the output that corresponds to the prompt.
    input_length = inputs.input_ids.shape[1]
    response = processor.batch_decode(outputs[:, input_length:], skip_special_tokens=True)[0]

    print(f"File: {os.path.basename(audio_path)}")
    print(f"Response: {response}\n")

    all_responses.append(
        {"filepath": audio_path, "prompt": description, "response": response}
    )

print("Finished processing all pairs.")

Processing 12 pairs...
File: Fungal_Wastes_AI.mp3
Response: Yes

File: City_of_Tears_AI.mp3
Response: Yes

File: Dirtmouth_AI.mp3
Response: Yes

File: Crossroads_AI.mp3
Response: Yes

File: Crystal_Peak_AI.mp3
Response: Yes

File: Queens_Gardens_AI.mp3
Response: Yes

File: Radiance_AI.mp3
Response: No

File: Soul_Sanctum_AI.mp3
Response: Yes

File: Resting_Grounds_AI.mp3
Response: Yes

File: Greenpath_AI.mp3
Response: Yes

File: White_Palace_AI.mp3
Response: Yes

File: Kingdoms_Edge_AI.mp3
Response: Yes

Finished processing all pairs.
