# モデル選択

In [None]:
# Identifier of the Qwen2-VL checkpoint passed to `from_pretrained` below.
# To try the 7B variant instead, comment out the 2B line and enable the 7B one.
model_path = "Qwen/Qwen2-VL-2B-Instruct"
# model_path = "Qwen/Qwen2-VL-7B-Instruct"

# パッケージインストール

In [None]:
# Install the packages this notebook depends on (-q keeps pip output quiet).
!pip install -q av
# NOTE(review): the PyPI package named `ffmpeg` is presumably not the FFmpeg
# command-line tool invoked later via `!ffmpeg` — confirm whether this line is needed.
!pip install -q ffmpeg
!pip install -q qwen_vl_utils
# transformers is installed straight from the GitHub repository with no pinned
# revision, so the installed version can differ between runs.
!pip install -q git+https://github.com/huggingface/transformers

# モデルロード

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Load the model; both the dtype and the device placement are left on "auto".
qwen2_vl_model = Qwen2VLForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto", device_map="auto")

# Load the processor that belongs to the same checkpoint.
qwen2_vl_processor = AutoProcessor.from_pretrained(model_path)

In [None]:
import torch
from qwen_vl_utils import process_vision_info

# Inference helper
def run_inference(processor, model, messages, max_new_tokens=128):
    """Generate a text reply for a chat-style message list.

    Args:
        processor: Object used to render the chat template, build the model
            inputs and decode the generated token IDs.
        model: Model whose ``generate`` method produces the output tokens.
        messages: Chat messages; text, image and video entries are passed
            through ``process_vision_info`` to extract the visual inputs.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The list returned by ``processor.batch_decode`` for the generated
        tokens only (the prompt part is stripped before decoding).
    """
    # Prepare the prompt text and the visual inputs.
    text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Put the inputs on the device the model actually lives on. A hard-coded
    # "cuda:0" can disagree with the placement chosen when the model was loaded.
    inputs = inputs.to(model.device)

    # Generate.
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Each output sequence starts with its prompt tokens; drop them so that
    # only the newly generated tokens are decoded.
    generated_ids = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
    ]

    # Decode.
    return processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

# 画像推論サンプル

### サンプル画像ダウンロード

In [None]:
!wget https://raw.githubusercontent.com/Kazuhito00/Qwen2-VL-Colaboratory-Sample/main/sample.jpg -q -O test.jpg

In [None]:
import cv2
from PIL import Image

# Read the downloaded sample image with OpenCV.
cv_image = cv2.imread('test.jpg')
if cv_image is None:
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cv2.cvtColor below.
    raise RuntimeError("Failed to load 'test.jpg': file is missing or not a readable image")

# Convert from BGR to RGB channel order before handing the array to PIL.
rgb_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb_image)

print(pil_image.size)
# Last expression of the cell: lets the notebook render the image.
pil_image

In [None]:
%%time

# Inference with the image given as a local file path
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image="test.jpg"),
            dict(type="text", text="画像を説明してください"),
        ],
    )
]

output_text = run_inference(
    processor=qwen2_vl_processor,
    model=qwen2_vl_model,
    messages=messages,
)
print(output_text)

In [None]:
%%time

# Inference with the image given as a PIL image object
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=pil_image),
            dict(type="text", text="画像を説明してください"),
        ],
    )
]

output_text = run_inference(
    processor=qwen2_vl_processor,
    model=qwen2_vl_model,
    messages=messages,
)
print(output_text)

In [None]:
%%time

# Inference with a PIL image object and an explicit resize (300 x 200)
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=pil_image, resized_width=300, resized_height=200),
            dict(type="text", text="画像を説明してください"),
        ],
    )
]

output_text = run_inference(
    processor=qwen2_vl_processor,
    model=qwen2_vl_model,
    messages=messages,
)
print(output_text)

In [None]:
%%time

# Inference with the image given as a URL
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image="https://raw.githubusercontent.com/Kazuhito00/Qwen2-VL-Colaboratory-Sample/main/sample.jpg"),
            dict(type="text", text="画像を説明してください"),
        ],
    )
]

output_text = run_inference(
    processor=qwen2_vl_processor,
    model=qwen2_vl_model,
    messages=messages,
)
print(output_text)

In [None]:
%%time

import base64

# Inference with the image embedded as a base64 data URI.
# Re-encode the OpenCV image as JPEG bytes; imencode returns a success flag
# that must be checked, since a failed encode would yield unusable data.
ok, imencode_image = cv2.imencode('.jpg', cv_image)
if not ok:
    raise RuntimeError("cv2.imencode failed to encode the image as JPEG")
base64_image = base64.b64encode(imencode_image)

# Prepare the messages ("image/jpeg" is the standard media type for JPEG data).
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "data:image/jpeg;base64," + base64_image.decode("ascii")},
            {"type": "text", "text": "画像を説明してください"},
        ],
    }
]

output_text = run_inference(qwen2_vl_processor, qwen2_vl_model, messages)
print(output_text)

# 動画推論

### サンプル動画ダウンロード

In [None]:
!wget https://raw.githubusercontent.com/Kazuhito00/Qwen2-VL-Colaboratory-Sample/main/sample.mp4 -q -O test.mp4

In [None]:
# 冒頭4秒のみの動画を生成
!ffmpeg -loglevel quiet -i test.mp4 -t 4 -c copy test_4s.mp4

In [None]:
%%time

# Inference with the video given as a local file path
messages = [
    dict(
        role="user",
        content=[
            # FPS is limited because memory would otherwise run short; width and
            # height can also be specified if needed, as with images.
            dict(type="video", video="test_4s.mp4", fps=1.0),
            dict(type="text", text="動画を説明してください"),
        ],
    )
]

output_text = run_inference(
    processor=qwen2_vl_processor,
    model=qwen2_vl_model,
    messages=messages,
)
print(output_text)