In [None]:
import torch
import librosa
import transformers
from transformers import WhisperForAudioCaptioning

# Pick the compute device once so the model and every input tensor end up in
# the same place. Hard-coding "cuda" raises on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model, tokenizer and feature extractor from one checkpoint.
# NOTE(review): WhisperForAudioCaptioning is imported from `transformers` at the
# top of this cell; presumably it is really provided by the checkpoint authors'
# own code rather than by the transformers package -- confirm the import.
checkpoint = "MU-NLPC/whisper-tiny-audio-captioning"
model = WhisperForAudioCaptioning.from_pretrained(checkpoint).to(device)
tokenizer = transformers.WhisperTokenizer.from_pretrained(checkpoint, language="en", task="transcribe")
feature_extractor = transformers.WhisperFeatureExtractor.from_pretrained(checkpoint)

# Load and preprocess audio.
input_file = "your_audio_file.wav"  # Change this to your actual file path
# Resample while loading so the audio matches the rate the extractor is configured for.
audio, sampling_rate = librosa.load(input_file, sr=feature_extractor.sampling_rate)
features = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)

# Prepare the caption-style prefix that is forced at the start of decoding
# (optional but helps with formatting).
style_prefix = "clotho > caption: "
style_prefix_tokens = tokenizer(
    "", text_target=style_prefix, return_tensors="pt", add_special_tokens=False
).labels.to(device)

# Generate the caption; inference only, so gradients are disabled.
model.eval()
with torch.no_grad():
    outputs = model.generate(
        inputs=features,
        forced_ac_decoder_ids=style_prefix_tokens,
        max_length=100,  # Upper bound on the generated sequence length
    )

# Decode and print the caption (first and only item of the batch).
caption = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Generated Audio Caption:", caption)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: Unrecognized configuration class <class 'transformers.models.whisper.configuration_whisper.WhisperConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, Qwen2AudioConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.

In [2]:
%pip install librosa

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Using cached soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Collecting standard-aifc (from librosa)
  Downloading standard_aifc-3.13.0-py3-none-any.whl.metadata (969 bytes)
Collecting standard-sunau (from librosa)
  Downloading standard_sunau-3.13.0-py3-none-any.whl.metadata (914 bytes)
Collecting cffi>=1.0 (from soundfile>=0.12.1->librosa)
  Downloading cffi-1.17.1-cp313-cp313-win_amd64.whl.metadata (1.6 kB)
Collecting stan

In [10]:
import torch
from transformers import AutoModel, PreTrainedTokenizerFast
import torchaudio

audio_file_path = "audio/rain.mp3"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_mono_waveform(path, target_sample_rate):
    """Load an audio file, resample it and down-mix it to a single channel.

    Args:
        path: Location of the audio file to read.
        target_sample_rate: Sample rate the waveform is resampled to.

    Returns:
        A tuple ``(waveform, source_sample_rate)`` where ``waveform`` is a 1-D
        tensor of samples and ``source_sample_rate`` is the rate reported by
        ``torchaudio.load`` for the file.
    """
    waveform, source_sample_rate = torchaudio.load(path)
    waveform = torchaudio.functional.resample(waveform, source_sample_rate, target_sample_rate)
    # The loaded tensor is (channels, samples); averaging over dim 0 mixes all
    # channels down and, for a single channel, simply drops that dimension.
    return waveform.mean(0), source_sample_rate


# use the model trained on AudioCaps
model = AutoModel.from_pretrained(
    "wsntxxn/effb2-trm-audio-captioning",
    trust_remote_code=True
).to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "wsntxxn/audiocaps-simple-tokenizer"
)

# inference on a single audio clip
wav, sr = load_mono_waveform(audio_file_path, model.config.sample_rate)
# Add the batch dimension and keep the input on the same device as the model;
# the model was moved to `device`, so the audio has to follow it.
wav = wav.unsqueeze(0).to(device)

with torch.no_grad():
    word_idxs = model(
        audio=wav,
        audio_length=[wav.size(1)],
    )

caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
print(caption)

# inference on a batch (the same clip twice, as a demonstration)
wav1, sr1 = load_mono_waveform(audio_file_path, model.config.sample_rate)
wav2, sr2 = load_mono_waveform(audio_file_path, model.config.sample_rate)

# Pad to the longest clip so the waveforms stack into one (batch, samples)
# tensor; the unpadded lengths are passed separately via `audio_length`.
wav_batch = torch.nn.utils.rnn.pad_sequence([wav1, wav2], batch_first=True).to(device)

with torch.no_grad():
    word_idxs = model(
        audio=wav_batch,
        audio_length=[wav1.size(0), wav2.size(0)],
    )

captions = tokenizer.batch_decode(word_idxs, skip_special_tokens=True)
print(captions)


['', '']


In [6]:
%pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: efficientnet_pytorch
  Building wheel for efficientnet_pytorch (pyproject.toml): started
  Building wheel for efficientnet_pytorch (pyproject.toml): finished with status 'done'
  Created wheel for efficientnet_pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16520 sha256=56530814876d06b182560439afbe0248310b9bc664cef6385d67de4e1ffa138e
  Stored in directory: c:\users\krishnaraj\appdata\local\pip\cache\wheels\5b\2f\2c\f72934c756bb8333dc80c448b1c97e40665b27b7fd15d6be9f
Successfully built efficientnet_pytorch
Installing co