# Install packages

In [None]:
!pip uninstall transformers

In [None]:
# transformers is installed from GitHub at a pinned commit (the hash after "@").
!pip -q install -U git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356
# NOTE(review): accelerate is presumably required for device_map="auto" at model load — confirm.
!pip -q install accelerate
# NOTE(review): presumably provides the qwen_omni_utils module imported below; the [decord] extra is requested.
!pip -q install qwen-omni-utils[decord]
# NOTE(review): presumably backs attn_implementation="flash_attention_2"; installed with pip's build isolation disabled.
!pip -q install -U flash-attn --no-build-isolation

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

# Import libraries

In [None]:
import soundfile as sf
import librosa
from IPython.display import Audio, display

from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

# Load model

In [None]:
# Load the Qwen2.5-Omni-7B checkpoint by its hub id.
# torch_dtype="auto" and device_map="auto" leave dtype and device placement to the library.
# NOTE(review): the recorded cell output says the Token2Wav sub-model must run in fp32, so its
# attention falls back to sdpa despite the flash_attention_2 request below.
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
)

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

# Model information

In [None]:
import os
import torch

def get_model_size(model):
    """Print the total number of parameters held by ``model``.

    Args:
        model: Any object whose ``parameters()`` method yields items that
            expose ``numel()`` (for example a ``torch.nn.Module``).

    Returns:
        None. The count is written to stdout as ``Total parameters: <n>``.
    """
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params}")

# Print the parameter count of the whole loaded model.
get_model_size(model)

Total parameters: 10732225408


In [None]:
# Print the parameter count of the `thinker` sub-model only.
get_model_size(model.thinker)

Total parameters: 8931813888


In [None]:
# Print the parameter count of the `talker` sub-model only.
get_model_size(model.talker)

Total parameters: 1351360256


In [None]:
# Print the parameter count of the thinker's `visual` component.
get_model_size(model.thinker.visual)

Total parameters: 676550144


In [None]:
# Print the parameter count of the thinker's `base_model` attribute.
get_model_size(model.thinker.base_model)

Total parameters: 7070619136


In [None]:
# Load the processor published under the same hub id as the model. The cells below use it to
# render chat prompts, pack the model inputs and decode generated ids.
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# Text -> Audio

In [None]:
# Single-turn chat: a system prompt followed by one plain-text user question.
conversation = [
    {
        "role": "system",
        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
    },
    {
        "role": "user",
        "content": "who are you?"
    }
]

# Preparation for inference
# Render the conversation to a prompt string (tokenize=False) with the generation prompt appended.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

# NOTE(review): presumably gathers the audio/image/video inputs referenced by the conversation;
# this conversation contains text only — confirm what is returned in that case.
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)

# Pack prompt and media into PyTorch tensors, then move them to the model's device and dtype.
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
inputs = inputs.to(model.device).to(model.dtype)


In [None]:

# Inference: Generation of the output text and audio
# The result of generate() is unpacked into generated token ids and an audio tensor.
text_ids, audio = model.generate(**inputs, use_audio_in_video=True)

# Decode the ids to strings. Per the recorded output, the decoded text contains the
# system and user turns as well as the assistant reply.
text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)

# Flatten the audio tensor to 1-D, copy it to host memory as a NumPy array and save it
# to "output.wav" with a 24000 Hz sample rate.
sf.write(
    "output.wav",
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=24000,
)


Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


['system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\nwho are you?\nassistant\nI am Qwen, a large-scale language model developed by Alibaba Cloud. How can I assist you?']


In [None]:


# Read the saved file back at 24000 Hz and embed an audio player in the cell output.
# Note that this rebinds `audio` from the generated tensor to the loaded array.
audio, sr = librosa.load("output.wav", sr=24000)
display(Audio(audio, rate=sr))

In [None]:
def prompt_model(prompt, speaker="Chelsie", output_path="output.wav", sample_rate=24000):
    """Send one text prompt to the model, print the decoded reply and play the speech.

    Relies on the notebook-level ``model`` and ``processor`` objects.

    Args:
        prompt: Text used as the single user turn of the conversation.
        speaker: Voice name forwarded to ``model.generate`` as ``spk``.
        output_path: File the generated waveform is saved to.
        sample_rate: Sample rate in Hz used both for saving and for playback.

    Returns:
        None. The decoded text is printed, the waveform is written to
        ``output_path`` and an audio player is displayed.
    """
    conversation = [
        {
            "role": "system",
            "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
        },
        {
            "role": "user",
            "content": prompt
        }
    ]

    # Preparation for inference: render the prompt string and pack the model inputs.
    prompt_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)

    inputs = processor(text=prompt_text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device).to(model.dtype)

    # Inference: generation of the output text ids and the audio tensor.
    text_ids, audio = model.generate(**inputs, spk=speaker, use_audio_in_video=True)

    decoded = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(decoded)

    # Flatten once and reuse the same array for the file and the player, so the
    # file that was just written does not have to be read back for playback.
    waveform = audio.reshape(-1).detach().cpu().numpy()
    sf.write(output_path, waveform, samplerate=sample_rate)
    display(Audio(waveform, rate=sample_rate))



In [None]:
# Ask with the default speaker.
prompt_model("What do you think the meaning of life is? Is it 42?")

Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


['system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\nWhat do you think the meaning of life is? Is it 42?\nassistant\nWell, that\'s a really interesting question. I don\'t think 42 is the meaning of life. It\'s just a number in Douglas Adams\' "The Hitchhiker\'s Guide to the Galaxy." For me, the meaning of life could be different things for different people. Some might say it\'s about finding happiness, others might think it\'s about making a positive impact on the world. What do you think?']


In [None]:
# Same helper, this time selecting the "Ethan" speaker.
prompt_model("Compare Paris and Beijing?", speaker="Ethan" )

Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


['system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\nCompare Paris and Beijing?\nassistant\nWell, Paris is a really romantic and artistic city. It has the Eiffel Tower, which is super famous. There are lots of beautiful parks and cafes. The food is amazing too, like croissants and cheese.Beijing, on the other hand, has a rich history. The Forbidden City is a must - see. It has a different kind of beauty, more traditional and cultural. The food there is also great, with things like Peking duck.Both cities are great in their own ways. If you like art and history, Beijing might be more for you. But if you want a more romantic and modern experience, Paris could be better. So, which one do you think you might be more interested in?']


In [None]:
# Explicitly passes "Chelsie", which is also the helper's default speaker.
prompt_model("What is the difference between Llamas, Vicunas and Alpacas", speaker="Chelsie" )

Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


["system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\nWhat is the difference between Llamas, Vicunas and Alpacas\nassistant\nWell, llamas are the largest of the three. They're often used for transportation and as pack animals. Vicunas are the smallest and are considered the most valuable in terms of their wool. Alpacas are in between in size. Llamas have a more shaggy coat, while vicunas have a really fine, soft wool. Alpacas have a more refined look. Oh, and llamas are more common in South America, vicunas are more endangered and found in the Andes, and alpacas are also found in South America but are more domesticated.If you want to know more about these animals, like their behavior or how they're raised, just let me know."]


## Batch Inference

In [None]:
# Sample conversations for batch inference.
# All four start with the same system prompt, so it is written once here and wrapped
# in a fresh message dict for each conversation.
# NOTE(review): the "/path/to/..." media entries look like placeholders — point them at
# real files before running this cell.
SYSTEM_PROMPT = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."

# Conversation with video only
conversation1 = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": [{"type": "video", "video": "/path/to/video.mp4"}]},
]

# Conversation with audio only
conversation2 = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": [{"type": "audio", "audio": "/path/to/audio.wav"}]},
]

# Conversation with pure text
conversation3 = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "who are you?"},
]

# Conversation with mixed media
mixed_media_parts = [
    {"type": "image", "image": "/path/to/image.jpg"},
    {"type": "video", "video": "/path/to/video.mp4"},
    {"type": "audio", "audio": "/path/to/audio.wav"},
    {"type": "text", "text": "What are the elements can you see and hear in these medias?"},
]
conversation4 = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": mixed_media_parts},
]

# Combine messages for batch processing
conversations = [conversation1, conversation2, conversation3, conversation4]

# Preparation for batch inference: one prompt per conversation plus the gathered media.
text = processor.apply_chat_template(conversations, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversations, use_audio_in_video=True)

inputs = processor(
    text=text,
    audios=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
).to(model.device).to(model.dtype)

# Batch inference: return_audio=False is passed, and the single result is decoded as token ids.
text_ids = model.generate(**inputs, use_audio_in_video=True, return_audio=False)
text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)


In [None]:
# A bare expression as the cell's last line: shows the module tree of the thinker sub-model.
model.thinker

Qwen2_5OmniThinkerForConditionalGeneration(
  (audio_tower): Qwen2_5OmniAudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (positional_embedding): SinusoidsPositionEmbedding()
    (audio_bos_eos_token): Embedding(2, 3584)
    (layers): ModuleList(
      (0-31): 32 x Qwen2_5OmniAudioEncoderLayer(
        (self_attn): Qwen2_5OmniAudioFlashAttention2(
          (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (fc2): Li