In [2]:
!pip install transformers hf_transfer bitsandbytes accelerate -qU

In [3]:
import os
import torch
from transformers import AutoProcessor, AutoModelForCausalLM, LEDForConditionalGeneration, LEDTokenizer
import soundfile as sf

In [4]:
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
import torch

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16, #bfloat16 if on ampere, lovelace, ada or hopper
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4")

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
models = Qwen2AudioForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-Audio-7B-Instruct",
    quantization_config=quant_config,
    device_map="auto",
    )

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
led_tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
led_model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").to("cuda")

In [6]:
# อ่าน sampling rate จาก environment variable ที่กำหนดใน shell script
sr = int(os.getenv("SR", 16000))  # ค่า default คือ 16000 หากไม่มีการตั้งค่า

In [7]:
# กำหนด path ของไฟล์เสียง
movie_audio_path = 'A Minecraft Movie.wav'

In [8]:
# กำหนด sampling rate ตามที่โมเดล Qwen-Audio ต้องการ
expected_sr = processor.feature_extractor.sampling_rate  # ปกติคือ 16000 Hz

In [9]:
# โหลดไฟล์เสียงด้วย librosa
audio_data, sr = librosa.load(movie_audio_path, sr=expected_sr)

In [10]:
# กำหนดความยาวของแต่ละ segment (เช่น 30 วินาที)
segment_length = 30 * expected_sr  # 30 วินาทีในหน่วย samples

In [11]:
# คำนวณจำนวน segments
num_segments = len(audio_data) // segment_length

In [12]:
# แบ่งไฟล์เสียงเป็น segments
segments = [audio_data[i*segment_length:(i+1)*segment_length] for i in range(num_segments)]

In [13]:
# เตรียมตัวแปรสำหรับเก็บผลลัพธ์
final_response_parts = []
conversation_history = []

In [14]:
# Create directory for saving audio segments if it doesn't exist
segment_dir = 'segments'
if not os.path.exists(segment_dir):
    os.makedirs(segment_dir)

# Allow configurable segment length (in seconds)
segment_duration_seconds = os.getenv("SEGMENT_DURATION", "600")  # Default 30 seconds
segment_length = int(float(segment_duration_seconds) * expected_sr)

# Recalculate segments with the configurable length
num_segments = len(audio_data) // segment_length
segments = [audio_data[i*segment_length:(i+1)*segment_length] for i in range(num_segments)]

# Save each segment as a separate WAV file
for i, segment in enumerate(segments):
    segment_path = os.path.join(segment_dir, f'segment_{i}.wav')
    sf.write(segment_path, segment, sr)
    
print(f"Saved {len(segments)} audio segments to {segment_dir}/")
print(f"Segment duration: {segment_duration_seconds} seconds")

Saved 9 audio segments to segments/
Segment duration: 600 seconds


In [15]:
audio_info = {
    "type": "audio",
    "audio_data": segment,
    "sampling_rate": sr,
    "file_name": f"segment_{i}.wav"
}

In [16]:
for i, segment in enumerate(segments):
    audio_info = {"type": "audio", "audio_data": segment, "sampling_rate": sr, "file_name": f"segment_{i}.wav"}
    conversation = []
    if conversation_history:
        conversation.extend(conversation_history)
    instruction_text = "Describe the sound effects and tell me the mood in this segment." if i == 0 else "Continuing from the previous, describe the sound effects and mood in this next part."
    conversation.append({"role": "user", "content": [{"type": "audio", "audio": audio_info["audio_data"]}, {"type": "text", "text": instruction_text}]})
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    inputs = processor(text=text_prompt, audios=[audio_info["audio_data"]], return_tensors="pt", padding=True).to("cuda")
    generated_ids = models.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7, top_p=0.9)
    generated_ids = generated_ids[:, inputs.input_ids.size(1):]
    response_part = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    final_response_parts.append(response_part)
    conversation_history.append({"role": "assistant", "content": [{"type": "text", "text": response_part}]})

  inputs = processor(text=text_prompt, audios=[audio_info["audio_data"]], return_tensors="pt", padding=True).to("cuda")
It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to 

In [28]:
# รวมคำอธิบายของทุก segment เป็นข้อความเดียว
full_description = "\n".join(final_response_parts)


In [None]:
summary_prompt = f"Based on the following audio description of a movie, which includes various sound segments, please infer and summarize what the movie is likely about. Consider the key sounds, their sequence, and the moods they evoke to hypothesize the plot, themes, and overall atmosphere of the film:\n\n{full_description}"

inputs = led_tokenizer(summary_prompt, return_tensors="pt", max_length=16384, truncation=True).to("cuda")

summary_ids = led_model.generate(
    inputs["input_ids"], 
    max_length=1000,  # ปรับให้สั้นลง
    min_length=100,
    num_beams=4,
    length_penalty=1.0,  # ปรับให้กระชับ
    early_stopping=True,
    no_repeat_ngram_size=3
)

summary = led_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

Input ids are automatically padded from 461 to 1024 to be a multiple of `config.attention_window`: 1024


In [37]:
# แสดงผลลัพธ์
print("สรุปผลลัพธ์:")
print(summary)

สรุปผลลัพธ์:
Imagine you are a film critic reviewing the audio aspects of a movie. Summarize the following audio description, highlighting the key sounds, the moods they evoke, and how they contribute to the overall atmosphere of the scene. Be concise but descriptive, as if writing a brief review for a film magazine:The sound effects in this section are described as generic impact sounds. The mood conveyed by the music is one of excitement, anticipation, and anticipation.The audio contains sound effects with no specific source, described asgeneric impact sounds, and also includes music playing in the background. The music is happy, inspiring, motivational, optimistic, positive, and uplifting.The sound effect in this segment is described as:The car engine revving followed by it idling loudly. The atmosphere is tense and tense.Sound effects such as clock ticking or other mechanisms occur intermittently throughout the track, contributing to an overall feeling of tension and unease.The fol