### Multi Round Omni Chatting with Qwen2.5-Omni

This notebook demonstrates how to use Qwen2.5-Omni for multiple rounds of audio and video dialogues.

In [1]:
# !pip install git+https://github.com/huggingface/transformers@v4.51.3-Qwen2.5-Omni-preview
# !pip install qwen-omni-utils
# !pip install openai
# !pip install flash-attn --no-build-isolation

In [2]:
from qwen_omni_utils import process_mm_info

# @title inference function
def inference(conversations, use_audio_in_video=True):
    """Run one generation pass over a multimodal chat history.

    Relies on the notebook-level ``processor`` and ``model`` objects, which
    must be loaded before this function is called.

    Args:
        conversations: Chat history as a list of role/content message dicts,
            passed as-is to ``processor.apply_chat_template`` and
            ``process_mm_info``.
        use_audio_in_video: Forwarded unchanged to ``process_mm_info``, the
            processor call and ``model.generate`` so that all three stages
            use the same setting. Defaults to ``True``, the value that was
            previously hard-coded.

    Returns:
        A ``(text, audio)`` tuple. ``text`` is what ``processor.batch_decode``
        returns for the first element of the ``generate`` output; ``audio`` is
        the second element of the ``generate`` output.
    """
    prompt = processor.apply_chat_template(conversations, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(conversations, use_audio_in_video=use_audio_in_video)
    inputs = processor(
        text=prompt,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=use_audio_in_video,
    )
    # Rebind the result instead of discarding it: this keeps the call correct
    # even if `.to()` returns a new object rather than mutating in place.
    inputs = inputs.to(model.device).to(model.dtype)

    output = model.generate(**inputs, use_audio_in_video=use_audio_in_video, return_audio=True)

    text = processor.batch_decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    audio = output[1]
    return text, audio

Load the model and its processor.

In [3]:
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

# Checkpoint identifier shared by the model and the processor loaders.
model_path = "Qwen/Qwen2.5-Omni-3B"

# Keyword options forwarded to the model's `from_pretrained`:
# bfloat16 weights, "auto" device map, and the flash-attention-2 implementation.
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "attn_implementation": "flash_attention_2",
}
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(model_path, **model_load_options)

# The processor is loaded from the same checkpoint as the model.
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [6]:
import librosa
import audioread

from IPython.display import Video
from IPython.display import Audio

#### Omni Chatting Round 1

In [None]:
# Start a fresh dialogue that contains only the system prompt.
system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}],
}
conversations = [system_turn]

video_path_round_1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw1.mp4"

# Preview the input clip, then its audio track loaded at sr=16000.
display(Video(video_path_round_1, width=640, height=360))
round_1_waveform = librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_1), sr=16000)[0]
display(Audio(round_1_waveform, rate=16000))

## Run inference with the locally loaded Hugging Face model.
user_turn_round_1 = {"role": "user", "content": [{"type": "video", "video": video_path_round_1}]}
conversations.append(user_turn_round_1)

response, audio = inference(conversations)
print(response[0])

# Play back the generated speech.
display(Audio(audio, rate=24000))

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.


#### Omni Chatting Round 2

In [None]:
# Record the model's previous answer as the assistant turn so this round sees the history.
# NOTE(review): assumes the decoded text from `inference` contains the prompt followed by a
# line holding only the role name "assistant" and then the reply — confirm against the
# output printed by the previous round.
previous_transcript = response[0]
assistant_marker = "\nassistant\n"
if assistant_marker in previous_transcript:
    # Keep the whole reply; taking only the last line would truncate a multi-line answer.
    previous_reply = previous_transcript.rsplit(assistant_marker, 1)[-1]
else:
    # Marker not found: fall back to the last line of the decoded text.
    previous_reply = previous_transcript.split("\n")[-1]
conversations.append({"role": "assistant", "content": previous_reply})

video_path_round_2 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw2.mp4"

# Preview the input clip, then its audio track loaded at sr=16000.
display(Video(video_path_round_2, width=640, height=360))
display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_2), sr=16000)[0], rate=16000))

## Run inference with the locally loaded Hugging Face model.
conversations.append({"role": "user", "content": [{"type": "video", "video": video_path_round_2}]})

response, audio = inference(conversations)
print(response[0])

# Play back the generated speech.
display(Audio(audio, rate=24000))

#### Omni Chatting Round 3

In [None]:
# Record the model's previous answer as the assistant turn so this round sees the history.
# NOTE(review): assumes the decoded text from `inference` contains the prompt followed by a
# line holding only the role name "assistant" and then the reply — confirm against the
# output printed by the previous round.
previous_transcript = response[0]
assistant_marker = "\nassistant\n"
if assistant_marker in previous_transcript:
    # Keep the whole reply; taking only the last line would truncate a multi-line answer.
    previous_reply = previous_transcript.rsplit(assistant_marker, 1)[-1]
else:
    # Marker not found: fall back to the last line of the decoded text.
    previous_reply = previous_transcript.split("\n")[-1]
conversations.append({"role": "assistant", "content": previous_reply})

video_path_round_3 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw3.mp4"

# Preview the input clip, then its audio track loaded at sr=16000.
display(Video(video_path_round_3, width=640, height=360))
display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_3), sr=16000)[0], rate=16000))

## Run inference with the locally loaded Hugging Face model.
conversations.append({"role": "user", "content": [{"type": "video", "video": video_path_round_3}]})

response, audio = inference(conversations)
print(response[0])

# Play back the generated speech.
display(Audio(audio, rate=24000))