In [None]:
from huggingface_hub import login

import os
from getpass import getpass

# Never store a Hugging Face token in the notebook itself: anything committed or
# shared with the file leaks the credential. Read it from the HF_TOKEN environment
# variable and fall back to a hidden interactive prompt when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=HF_TOKEN)

In [None]:
# Install a transformers version that supports Gemma 3n (>= 4.53).
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q "transformers>=4.53.0" "timm>=1.0.16"

from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
import os
from glob import glob
from IPython.display import display, Markdown


In [None]:
# Hugging Face model id of the checkpoint used by the rest of the notebook.
GEMMA_PATH = "google/gemma-3n-E4B-it"

# `processor` turns chat messages into model inputs; `model` generates the reply.
processor = AutoProcessor.from_pretrained(GEMMA_PATH)
model = AutoModelForImageTextToText.from_pretrained(GEMMA_PATH, torch_dtype="auto", device_map="auto")

# Show the device and dtype reported by the loaded model.
print(f"Device: {model.device}", f"DType: {model.dtype}", sep="\n")


In [None]:
# def describe_audio(path: str, max_tokens: int = 64) -> str:
#     """Run Gemma-3n on a single audio file and return a short description."""
#     system_msg = {
#         'role': 'system',
#         'content': [
#             {
#                 'type': 'text',
#                 'text': (
#                     'You are an assistant that listens to a given background noise audio clip '
#                     'and writes a short, high-level description of what is '
#                     'happening in 1-2 sentences. Mainly describe the sound/noise - how is it and also do mention how loud/unpleasant it is. Based on these descriptions we will be using that audio as background noise when simulating a vapi voice assistant when calling an agent - so that it looks like a real background noise based on the assistants situation. So need the descriptions to select this particular background noise. Start "Heading: "Give a short heading for this background noise"; Description: This noise.."'
#                 ),
#             }
#         ],
#     }

#     user_msg = {
#         'role': 'user',
#         'content': [
#             {
#                 'type': 'text',
#                 'text': (
#                     f"Please describe the following audio file named "
#                     f"'{os.path.basename(path)}' in 1-2 sentences."
#                 ),
#             },
#             {
#                 'type': 'audio',
#                 'audio': path,
#             },
#         ],
#     }

#     messages = [system_msg, user_msg]

#     input_ids = processor.apply_chat_template(
#         messages,
#         add_generation_prompt=True,
#         tokenize=True,
#         return_dict=True,
#         return_tensors='pt',
#     )
#     input_len = input_ids['input_ids'].shape[-1]

#     input_ids = input_ids.to(model.device, dtype=model.dtype)
#     outputs = model.generate(
#         **input_ids,
#         max_new_tokens=max_tokens,
#         disable_compile=True,
#     )

#     decoded = processor.batch_decode(
#         outputs[:, input_len:],
#         skip_special_tokens=True,
#         clean_up_tokenization_spaces=True,
#     )
#     return decoded[0].strip()


def describe_audio(path: str, max_tokens: int = 256) -> str:
    """Ask Gemma-3n to describe one audio clip and return its raw reply.

    The model is instructed to answer with a single JSON object. The reply is
    returned as text; it is neither parsed nor validated here. Uses the
    module-level ``processor`` and ``model``.

    Args:
        path: Location of the audio file. It is handed to the processor as the
            audio input and also quoted in the user prompt.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The text generated after the prompt, with surrounding whitespace stripped.
    """
    instructions = (
        "You are an assistant that listens to a background noise audio clip and "
        "outputs a single JSON object describing it for use as call background.\n\n"
        "Your ONLY output must be a valid JSON object, with no extra text, in this exact shape: {...}\n\n"
        "- Do NOT wrap the JSON in ``` or any code fences.\n"
        "- Do NOT prefix it with 'json' or any language label.\n"
        "- Do NOT write anything before or after the JSON object.\n"
        "{\n"
        '  "id": "short_snake_case_identifier",\n'
        '  "path": "<<<COPY THE EXACT path STRING PROVIDED>>>",\n'
        '  "name": "Short human-readable name",\n'
        '  "environment": "one of: office, home, outdoors, transit, retail, vehicle, neutral",\n'
        '  "noise_level": "one of: low, medium, high",\n'
        '  "energy": "one of: calm, neutral, busy, chaotic",\n'
        '  "pleasantness": "one of: pleasant, neutral, unpleasant",\n'
        '  "voices_intelligible": true or false,\n'
        '  "context_examples": [\n'
        '    "one example of a caller situation where this background fits",\n'
        '    "optionally a second short example"\n'
        "  ],\n"
        '  "notes": "1–2 sentences summarizing what the noise sounds like and how loud/unpleasant it is"\n'
        "}\n\n"
        "- Use the audio to set noise_level, energy and pleasantness.\n"
        "- Use generic, reusable context examples (e.g., “caller in a busy office” or “caller walking through a mall”).\n"
        "- Always copy the path string exactly from the user message into the `path` field.\n"
        "- Do NOT include any commentary or text outside the JSON object. Also don't include any code blocks like: '```json' in your response.. just the json object that I can parse it directly using python's parser."
    )
    request_text = (
        "Given this audio file, produce the JSON object as described in the system message. "
        f"The file path is: {path}"
    )

    # System turn carries the output contract; user turn carries the request and the clip.
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": instructions}]},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": request_text},
                {"type": "audio", "audio": path},
            ],
        },
    ]

    batch = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Remember the prompt length so that only the continuation is decoded below.
    prompt_length = batch["input_ids"].shape[-1]

    batch = batch.to(model.device, dtype=model.dtype)
    generated = model.generate(**batch, max_new_tokens=max_tokens, disable_compile=True)

    continuation = generated[:, prompt_length:]
    texts = processor.batch_decode(
        continuation,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return texts[0].strip()


In [None]:
# import os
# import json
# from glob import glob

# def describe_audio_directory(directory: str, patterns=None, json_filename="audio_descriptions.json"):
#     """Find all audio files in a directory, describe them, and save to JSON."""
#     if patterns is None:
#         patterns = ('*.wav', '*.mp3', '*.flac', '*.ogg', '*.m4a')

#     paths = []
#     for pattern in patterns:
#         paths.extend(glob(os.path.join(directory, pattern)))

#     paths = sorted(set(paths))
#     if not paths:
#         print(f'No audio files found in {directory}')
#         return []

#     results = []
#     for path in paths:
#         print(f"\n=== {os.path.basename(path)} ===")
#         try:
#             description = describe_audio(path)
#         except Exception as e:
#             print(f'Error describing {path}: {e}')
#             continue
#         print(description)
#         results.append({
#             "path": path,
#             "description": description,
#         })

#     # Save results as JSON
#     json_path = os.path.join(directory, json_filename)
#     with open(json_path, "w", encoding="utf-8") as f:
#         json.dump(results, f, ensure_ascii=False, indent=2)

#     print(f"\nSaved {len(results)} descriptions to {json_path}")
#     return results

import json

def _parse_model_json(raw: str) -> dict:
    """Decode the model's reply into a dict, tolerating text around the JSON object.

    Raises:
        ValueError: If no JSON value can be decoded from ``raw``
            (``json.JSONDecodeError`` is a ``ValueError`` subclass) or the
            decoded value is not a JSON object.
    """
    text = raw.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The reply may be wrapped in ``` fences or carry a preamble despite the
        # prompt: decode from the first "{" and ignore whatever follows the object.
        start = text.find("{")
        if start == -1:
            raise
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def describe_audio_directory(directory: str, patterns=None, json_filename="audio_descriptions.json"):
    """Describe every audio file in a directory and save the parsed objects as JSON.

    Each matching file is passed to ``describe_audio``; the reply is parsed into a
    dict and its ``path`` key is overwritten with the real file path. Files whose
    description fails or whose reply cannot be parsed into a JSON object are
    reported and skipped, so one bad reply never aborts the whole run.

    Args:
        directory: Directory searched (non-recursively) for audio files.
        patterns: Glob patterns to match inside ``directory``. Defaults to
            common audio extensions (wav, mp3, flac, ogg, m4a).
        json_filename: Name of the JSON file written inside ``directory``.

    Returns:
        The list of parsed description dicts, in sorted path order. An empty
        list (and no file written) when no audio file matches.
    """
    if patterns is None:
        patterns = ("*.wav", "*.mp3", "*.flac", "*.ogg", "*.m4a")

    paths = []
    for pattern in patterns:
        paths.extend(glob(os.path.join(directory, pattern)))

    # De-duplicate (patterns may overlap) and fix the processing order.
    paths = sorted(set(paths))
    if not paths:
        print(f"No audio files found in {directory}")
        return []

    results = []
    for path in paths:
        print(f"\n=== {os.path.basename(path)} ===")
        try:
            raw = describe_audio(path)
        except Exception as e:
            # Best effort: a failure on one clip must not lose the others.
            print(f"Error describing {path}: {e}")
            continue

        print("Model raw output:")
        print(raw)
        try:
            obj = _parse_model_json(raw)
        except ValueError as e:
            print(f"Failed to parse JSON for {path}: {e}")
            continue

        # Ensure path is correct, even if the model messed it up
        obj["path"] = path

        results.append(obj)

    # Save results as JSON
    json_path = os.path.join(directory, json_filename)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(results)} descriptions to {json_path}")
    return results


In [None]:
# Directory holding the audio clips to describe; point it at another path if needed.
audio_dir = "/content/bg_noises"

# Describe every clip found there; the bare name below shows the parsed objects as the cell output.
results = describe_audio_directory(directory=audio_dir)
results


In [None]:
# Pretty-print the collected descriptions. ensure_ascii=False keeps non-ASCII
# characters readable, matching how describe_audio_directory writes the JSON file.
print(json.dumps(results, ensure_ascii=False, indent=2))

In [None]:
# Clone the bg_noises repository into the current working directory.
# NOTE(review): an earlier cell reads audio from '/content/bg_noises'; presumably this
# clone has to run before that cell on a fresh kernel — confirm the cell order.
!git clone https://github.com/KarthikAvinashFI/bg_noises.git

In [None]:
# Use the explicit %cd magic: a bare `cd` only works while IPython automagic is on
# and no Python variable named `cd` exists.
%cd bg_noises/