In [1]:
import torchaudio
import torch
import io
from utils import INPUT_DATA_DIR
import json
from IPython.display import Audio, display
from pydantic import BaseModel, Field
from typing import List
from google import genai
from dotenv import load_dotenv
import os
from google.genai import types

# Load variables from a local .env file into the environment so that later
# os.getenv lookups (e.g. the API key) can see them.
load_dotenv()
# Directory that the relative audio paths used further down are joined onto.
base_dir = INPUT_DATA_DIR / "music-bench" / "datashare"

In [2]:
class Concept(BaseModel):
    """Represents a single identified musical concept."""

    # NOTE(review): pydantic copies the class docstring and every Field
    # description into the generated JSON schema. This model is nested in
    # ConceptLabels, which is passed as the response schema, so these strings
    # presumably reach the model as instructions -- confirm before rewording.
    # Short label for the concept.
    name: str = Field(..., description="Concise name for the musical concept.")
    # Score for this concept; ge/le restrict it to the closed interval [0, 1].
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score (0-1).")
    # Free-text explanation of the concept.
    description: str = Field(..., description="Brief description of the concept.")


class ConceptLabels(BaseModel):
    """Represents the overall analysis result for a set of audio clips."""

    # NOTE(review): this class is passed as the response schema of the
    # generate_content call; pydantic includes the docstring and the Field
    # descriptions in the JSON schema, so treat them as prompt text --
    # confirm before rewording.
    # Individual concepts; all fields below are required (Field(...) has no default).
    concepts: List[Concept] = Field(..., description="List of specific concepts identified.")
    # Set-level description, name and score for the concept shared by the clips.
    overall_summary: str = Field(..., description="Concise description of the shared concept.")
    overall_name: str = Field(..., description="Concise name for the shared concept.")
    # ge/le restrict the score to the closed interval [0, 1].
    overall_confidence: float = Field(..., ge=0.0, le=1.0, description="Overall confidence score (0-1).")


# Instruction text sent alongside the concatenated audio in the generate_content
# call. Surrounding whitespace is removed by .strip(); every other character is
# part of the request, so edits here change what the model is asked to do.
PROMPT = """
Listen very carefully to this set of audio clips, which consists of song snippets concatenated in random order. You need to discover common musical patterns across the whole set, to identify what musical feature is shared across all clips. You will need to listen carefully. For each potential concept you identify, output a name, a confidence score between 0 and 1 (where 1 is highest confidence), and a concise description of the concept.
At a higher level, describe the overall concept shared across the set, give it a suitable name, and provide an overall confidence score (0 to 1).
Describe the **underlying concepts** not the specific audio snippets (e.g. your description could say "the concept" but not "the audio snippets"). However, try to avoid such verbiage altogether and concisely describe the musical concept’s main attributes.
Include NO FILLER text.
Focus on being specific. Concepts could relate to genre (e.g., hip-hop, salsa, reggaeton, balkan), instruments (e.g., piano, cello, guitar, flute), recording/production techniques (e.g., reverberation, drones, noise, DJ scratching, beatboxing, drum machine, hi-hat patterns, fingerpicking, live recording artifacts, low-pass filtering), or more nuanced musical ideas (e.g., drum solo, chill dance rhythm, serene woodwinds arrangement). These are illustrative examples, NOT a fixed list to choose from.
""".strip()

In [3]:
# Sample rate (Hz) every clip is resampled to, and the rate the MP3 is written at.
TARGET_SAMPLE_RATE = 32000

# Load the mapping from feature id to the audio files grouped under it.
with open(INPUT_DATA_DIR / "interp" / "features_grouped.json", "r", encoding="utf-8") as fh:
    feat = json.load(fh)

key = "f4606"
# De-duplicate once and sort. Iterating a set of strings directly can yield a
# different order in every kernel session (hash randomisation), which would
# change the concatenated audio -- and potentially the model's answer --
# between otherwise identical runs.
paths = sorted(set(feat[key]))
if not paths:
    # Fail with a clear message instead of an opaque error from torch.cat([]).
    raise ValueError(f"No audio paths listed for feature {key!r}")

# Render an audio player per clip for manual listening.
for p in paths:
    display(Audio(str(base_dir / p)))

audios = []
for path in paths:
    audio_tensor, sr = torchaudio.load(str(base_dir / path))
    transform = torchaudio.transforms.Resample(sr, TARGET_SAMPLE_RATE)
    # Keep only the first channel so every clip is a 1-D waveform; any
    # additional channels are discarded rather than mixed down.
    audios.append(transform(audio_tensor)[0])
concatenated = torch.cat(audios, dim=0)  # 1-D tensor: (total_time,)

# Encode the joined waveform as MP3 in memory; unsqueeze adds the leading
# channel dimension back so the tensor is (1, total_time) when saved.
buffer = io.BytesIO()
torchaudio.save(buffer, concatenated.unsqueeze(dim=0), TARGET_SAMPLE_RATE, format="mp3")

audio_bytes = buffer.getvalue()

In [8]:
# NOTE(review): "GENMINI_API" looks like a misspelling of "GEMINI". It is left
# untouched because it has to match the variable name defined in the
# environment / .env file -- confirm which spelling is actually set.
client = genai.Client(api_key=os.getenv("GENMINI_API"))

# The in-memory MP3 wrapped as a request part.
audio_part = types.Part.from_bytes(data=audio_bytes, mime_type="audio/mp3")

# Request JSON output shaped by the ConceptLabels model.
generation_config = {
    "response_mime_type": "application/json",
    "response_schema": ConceptLabels,
}

# Send the text instructions followed by the audio in a single request.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[PROMPT, audio_part],
    config=generation_config,
)

print(response.text)

2025-06-12 10:47:32,757 INFO AFC is enabled with max remote calls: 10.
2025-06-12 10:47:40,096 INFO HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
2025-06-12 10:47:40,098 INFO AFC remote call 1 is done.
{
  "concepts": [
    {
      "name": "Middle Eastern Music",
      "confidence": 0.95,
      "description": "Use of instruments like the Oud and Ney flute, and Middle Eastern melodic scales."
    },
    {
      "name": "Percussion",
      "confidence": 0.85,
      "description": "Prominent use of percussion instruments, driving the rhythm."
    },
    {
      "name": "Improvisation",
      "confidence": 0.75,
      "description": "Apparent improvisation within a traditional framework."
    }
  ],
  "overall_summary": "Middle Eastern instrumental music featuring traditional instruments and improvisation.",
  "overall_name": "Arabic Instrumental",
  "overall_confidence": 0.9
}


In [9]:
# Parse the raw JSON through the declared schema rather than json.loads alone,
# so wrong types, missing fields or confidence values outside [0, 1] raise a
# validation error here instead of propagating silently. model_dump() converts
# the validated model back into a plain dict for display.
ConceptLabels.model_validate_json(response.text).model_dump()

{'concepts': [{'name': 'Middle Eastern Music',
   'confidence': 0.95,
   'description': 'Use of instruments like the Oud and Ney flute, and Middle Eastern melodic scales.'},
  {'name': 'Percussion',
   'confidence': 0.85,
   'description': 'Prominent use of percussion instruments, driving the rhythm.'},
  {'name': 'Improvisation',
   'confidence': 0.75,
   'description': 'Apparent improvisation within a traditional framework.'}],
 'overall_summary': 'Middle Eastern instrumental music featuring traditional instruments and improvisation.',
 'overall_name': 'Arabic Instrumental',
 'overall_confidence': 0.9}