<font size=64px>Qwen Audio</font>

Notebook by [Trelis Research](https://trelis.com/about).

Find:
- [Trelis on YouTube](https://youtube.com/@trelisresearch).
- [The Trelis Newsletter](https://blog.trelis.com).
- [Qwen Audio Fine-tuning and Inference Scripts (paid)](https://trelis.com/advanced-transcription).

In [None]:
!pip install transformers hf_transfer bitsandbytes accelerate -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Switch on hf_transfer so that Hugging Face Hub weight uploads/downloads take the fast path
import os

os.environ.update({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

## Load the model (quantized to fit on a Colab T4 GPU)

In [None]:
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
import torch

# Hub id used for both the processor and the model, defined once so they cannot drift apart.
MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"

# Pick the 4-bit compute dtype from the GPU generation instead of editing it by hand:
# bfloat16 on compute capability >= 8.0 (Ampere, Ada Lovelace, Hopper),
# float16 otherwise (e.g. the Colab T4, or when no CUDA device is visible).
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization, so the 7B model fits in limited GPU memory.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# The processor bundles the tokenizer and the audio feature extractor for this checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# device_map="auto" lets accelerate decide where the quantized weights are placed.
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
)

PackageNotFoundError: No package metadata was found for bitsandbytes

### Use audio from a URL (provided by Qwen)

In [None]:
from IPython.display import Audio, display
from urllib.request import urlopen
from io import BytesIO
import librosa

# Sample clip hosted by Qwen. Defined once and reused below, so the audio that is
# played back is guaranteed to be the audio referenced in the conversation.
audio_url_1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"

# Download the clip; the context manager closes the HTTP response once the bytes are read.
with urlopen(audio_url_1) as remote_audio:
    audio_bytes = remote_audio.read()

# sr=None keeps the file's native sampling rate; this waveform is only used for playback here.
audio_data, sampling_rate = librosa.load(BytesIO(audio_bytes), sr=None)

# Play the audio
display(Audio(audio_data, rate=sampling_rate))

# Single-turn prompt: the remote clip plus a text instruction.
conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": audio_url_1},
        {"type": "text", "text": "Describe the speaker, including their age."},
    ]}
]

### OR Upload and select a sound file

In [None]:
from IPython.display import Audio, display
import librosa
import torch

# Local recording to send to the model (swap in the path of your own upload)
local_audio_file = "/content/Tell me a fun short story.m4a"  # Example for Colab local file

# Sampling rate reported by the processor's feature extractor
expected_sr = processor.feature_extractor.sampling_rate

# IMPORTANT: resample while loading so the waveform is at that rate
audio_data, sr = librosa.load(local_audio_file, sr=expected_sr)

# Bundle the waveform with its metadata in one place
audio_info = dict(
    type="audio",
    audio_data=audio_data,
    sampling_rate=expected_sr,
    file_name=local_audio_file,
)

# Confirm the load and offer playback
print(f"Audio file '{audio_info['file_name']}' successfully loaded.")
display(Audio(audio_info["audio_data"], rate=audio_info["sampling_rate"]))

# True  -> ask for a transcription plus a description of the speaker's emotions.
# False -> send the audio on its own, so the request is made purely by voice
#          (e.g. asking for a short story).
ADD_TRANSCRIPTION_PROMPT = True

# The in-memory waveform takes the place of a remote audio URL
user_content = [{"type": "audio", "audio": audio_info["audio_data"]}]
if ADD_TRANSCRIPTION_PROMPT:
    user_content.append(
        {"type": "text", "text": "Transcribe this audio AND provide a description of the speaker's emotions. Respond in the following format:\n\nTranscription: <transcript>\nEmotion: <emotion>"}
    )

conversation = [{"role": "user", "content": user_content}]

Audio file '/content/Tell me a fun short story.m4a' successfully loaded.


  audio_data, sr = librosa.load(local_audio_file, sr=expected_sr)  # Align sampling rate during load
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


## Get a transcription that includes emotions.

In [None]:
# Render the conversation into the model's chat prompt, as text, with the generation prompt appended.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

# Sampling rate of the feature extractor; remote clips are resampled to it on load.
target_sr = processor.feature_extractor.sampling_rate

# Collect one waveform per audio entry, in conversation order.
audios = []
for message in conversation:
    if not isinstance(message["content"], list):
        continue
    for ele in message["content"]:
        if ele["type"] != "audio":
            continue
        if "audio_url" in ele:  # Remote audio: download, then decode and resample
            # The context manager closes the HTTP response after the bytes are read.
            with urlopen(ele["audio_url"]) as remote_audio:
                audio_bytes = remote_audio.read()
            audios.append(librosa.load(BytesIO(audio_bytes), sr=target_sr)[0])
        elif "audio" in ele:  # Local audio: waveform is passed through unchanged
            # NOTE: no resampling happens here, so the caller must already have
            # loaded the waveform at `target_sr`.
            audios.append(ele["audio"])

# Passing sampling_rate explicitly lets the feature extractor validate the rate
# instead of warning about possible silent errors. Inputs go to the model's own
# device rather than a hard-coded "cuda", since the model uses device_map="auto".
inputs = processor(
    text=text,
    audios=audios,
    sampling_rate=target_sr,
    return_tensors="pt",
    padding=True,
).to(model.device)

# max_new_tokens limits only the reply. max_length would also count the prompt
# (including the expanded audio tokens), leaving little or no room for the answer
# on longer clips.
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated reply is decoded.
generated_ids = generated_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'Transcription': 'Go ahead and tell me a fun short story.', 'Emotion': 'Amused'}
