# モデル選択

In [1]:
# Checkpoint to load; the model-loading cell branches on the last path component.
model_path = 'openbmb/MiniCPM-V-2_6-int4'  # requires a T4-class GPU or better
# model_path = 'openbmb/MiniCPM-V-2_6'  # requires an L4-class GPU or better

# パッケージインストール

In [None]:
# Runtime dependencies for the samples in this notebook.
# NOTE(review): transformers, flash_attn and timm are unpinned while the other
# packages are pinned; the model is loaded with trust_remote_code=True, so a
# future release could break it — consider pinning a known-good version.
!pip install -q transformers
!pip install -q -U flash_attn
!pip install -q sentencepiece==0.1.99
!pip install -q accelerate==0.30.1
!pip install -q bitsandbytes==0.43.1
!pip install -q -U timm

# モデル読み込み

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Fix the torch RNG seed so repeated runs of the notebook start from the same state.
torch.manual_seed(0)

# Loader options depend on which checkpoint was selected in the first cell.
model_name = model_path.split('/')[-1]
if model_name == 'MiniCPM-V-2_6':
    attn_implementation = 'sdpa'  # sdpa or flash_attention_2, no eager
    torch_dtype = torch.bfloat16
elif model_name == 'MiniCPM-V-2_6-int4':
    # Leave both at the loader defaults for the int4 checkpoint.
    attn_implementation = None
    torch_dtype = None
else:
    # Without this branch an unknown name would only surface later as a
    # NameError on `attn_implementation`, far from the actual mistake.
    raise ValueError(
        f"Unsupported model_path {model_path!r}: expected a path ending in "
        "'MiniCPM-V-2_6' or 'MiniCPM-V-2_6-int4'"
    )

model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
)
model = model.eval()
if model_name == 'MiniCPM-V-2_6':
    # Only the full-precision checkpoint is moved to the GPU explicitly.
    # NOTE(review): presumably the int4 checkpoint is already placed on the
    # device by the loader — confirm.
    model = model.cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# 画像認識でのマルチターンチャット サンプル

## サンプル画像ダウンロード

In [None]:
# Fetch the sample image into the working directory as sample01.jpg.
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample01.jpg -O sample01.jpg

In [6]:
import cv2
from PIL import Image

cv_image = cv2.imread('sample01.jpg')
# cv2.imread reports failure by returning None instead of raising, so stop here
# with a clear message rather than a cryptic cvtColor assertion below.
if cv_image is None:
    raise FileNotFoundError("sample01.jpg could not be read; run the download cell first")
# Keep the BGR array for the preview cell; the model receives an RGB PIL image.
rgb_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb_image)

In [None]:
from google.colab.patches import cv2_imshow
# Preview the sample with the Colab helper, passing the array exactly as loaded by cv2.imread.
cv2_imshow(cv_image)

## 初回チャット

In [None]:
%%time

# Opening turn: one user message whose content is the image followed by the prompt.
question = "explain this image."
msgs = [dict(role='user', content=[pil_image, question])]

# The image travels inside `msgs`, so the dedicated `image` argument stays None.
answer = model.chat(msgs=msgs, tokenizer=tokenizer, image=None)
print(answer)

## 2回目チャット（1回目の回答を渡し、追加の質問を行う）

In [None]:
%%time

# Continue the same conversation: record the previous reply, then add the follow-up question.
# Note that this mutates `msgs` in place, so re-running the cell appends another pair of turns.
msgs.extend([
    {"role": "assistant", "content": [answer]},
    {"role": "user", "content": ["tell me more about the cartridge."]},
])

answer = model.chat(msgs=msgs, tokenizer=tokenizer, image=None)
print(answer)

# 複数画像認識 サンプル

## サンプル画像ダウンロード

In [None]:
# Fetch the two images used for the multi-image comparison sample.
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample02-01.jpg -O sample02-01.jpg
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample02-02.jpg -O sample02-02.jpg

In [11]:
cv_image01 = cv2.imread('sample02-01.jpg')
cv_image02 = cv2.imread('sample02-02.jpg')
# cv2.imread returns None on failure instead of raising; check before converting.
if cv_image01 is None or cv_image02 is None:
    raise FileNotFoundError(
        "sample02-01.jpg / sample02-02.jpg could not be read; run the download cell first"
    )

# Keep the BGR arrays for the preview cell; the model receives RGB PIL images.
pil_image01 = Image.fromarray(cv2.cvtColor(cv_image01, cv2.COLOR_BGR2RGB))
pil_image02 = Image.fromarray(cv2.cvtColor(cv_image02, cv2.COLOR_BGR2RGB))

In [None]:
from google.colab.patches import cv2_imshow
# Join the two samples horizontally into one preview image and display it.
debug_image = cv2.hconcat([cv_image01, cv_image02])
cv2_imshow(debug_image)

## 2枚の画像を渡し、画像の違いを説明させる

In [None]:
# A single user turn carrying both images, in order, followed by the comparison prompt.
question = 'Compare image 1 and image 2, tell me about the differences between image 1 and image 2.'
msgs = [dict(role='user', content=[pil_image01, pil_image02, question])]

answer = model.chat(msgs=msgs, tokenizer=tokenizer, image=None)
print(answer)

# 動画認識 サンプル

## サンプル動画ダウンロード

In [None]:
# Fetch the sample video into the working directory as sample.mp4.
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample.mp4 -O sample.mp4

## 動画前処理用関数

In [15]:
def preprocess_video(video_path, max_num_frames=64):
    """Sample about one frame per second from a video and return PIL images.

    Frame indices are taken every ``round(fps)`` frames starting at frame 0.
    If that yields more than ``max_num_frames`` indices, they are thinned to
    exactly ``max_num_frames`` evenly spaced picks.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_num_frames: Upper bound on the number of returned frames (>= 1).

    Returns:
        A list of RGB ``PIL.Image`` frames. It can be shorter than the list of
        selected indices because decoding stops at the first failed read, and
        it is empty when the container reports no frames.

    Raises:
        ValueError: If ``max_num_frames`` is below 1, the video cannot be
            opened, or the reported FPS is not a positive finite number.
    """
    if max_num_frames < 1:
        raise ValueError(f'max_num_frames must be >= 1, got {max_num_frames}')

    def uniform_sample(items, count):
        # Pick the element at the centre of each of `count` equal-width buckets.
        gap = len(items) / count
        return [items[int(i * gap + gap / 2)] for i in range(count)]

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f'Could not open video: {video_path!r}')

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not 0 < fps < float('inf'):
            raise ValueError(f'Video reports an invalid FPS ({fps}): {video_path!r}')

        # Step of roughly one second of footage. Clamp to 1 so an FPS below 0.5
        # cannot round down to a zero step, which range() rejects.
        sample_fps = max(1, round(fps))
        frame_idx = list(range(0, total_frames, sample_fps))

        # Thin the selection when it exceeds the requested maximum.
        if len(frame_idx) > max_num_frames:
            print('uniform_sample() 実行')
            frame_idx = uniform_sample(frame_idx, max_num_frames)

        print('frame_idx:', frame_idx)
        # The spacing needs two indices; very short clips have fewer than that.
        if len(frame_idx) >= 2:
            print('フレーム間隔（秒）:', int((frame_idx[-1] - frame_idx[-2]) / sample_fps))

        # Decode the selected frames and store them as RGB PIL images.
        frames = []
        for idx in frame_idx:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame))
    finally:
        # Release the capture even when validation or decoding raises.
        cap.release()

    print('フレーム数:', len(frames))
    return frames

## 動画解析

In [None]:
# Turn the video into a list of sampled frames and send them with the prompt as one user turn.
video_path = "sample.mp4"
frames = preprocess_video(video_path)

question = "describe the video"
msgs = [dict(role='user', content=[*frames, question])]

# Extra decoding options forwarded to model.chat.
params = {
    "use_image_id": False,
    # Normally 2; use 1 on CUDA out-of-memory or when the video resolution exceeds 448x448.
    "max_slice_nums": 1,
}

answer = model.chat(msgs=msgs, tokenizer=tokenizer, image=None, **params)
print(answer)

# コンテキスト内でのフューショット学習 サンプル

## サンプル画像ダウンロード

In [None]:
# Fetch the three images used by the few-shot sample (two examples plus the query image).
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample03-01.jpg -O sample03-01.jpg
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample03-02.jpg -O sample03-02.jpg
!wget https://raw.githubusercontent.com/Kazuhito00/MiniCPM-V2.6-Colaboratory-Sample/main/assets/sample03-03.jpg -O sample03-03.jpg

In [18]:
cv_image03_01 = cv2.imread('sample03-01.jpg')
cv_image03_02 = cv2.imread('sample03-02.jpg')
cv_image03_03 = cv2.imread('sample03-03.jpg')
# cv2.imread returns None on failure instead of raising; check before converting.
if cv_image03_01 is None or cv_image03_02 is None or cv_image03_03 is None:
    raise FileNotFoundError(
        "sample03-01.jpg / sample03-02.jpg / sample03-03.jpg could not be read; "
        "run the download cell first"
    )

# Keep the BGR arrays for the preview cell; the model receives RGB PIL images.
pil_image03_01 = Image.fromarray(cv2.cvtColor(cv_image03_01, cv2.COLOR_BGR2RGB))
pil_image03_02 = Image.fromarray(cv2.cvtColor(cv_image03_02, cv2.COLOR_BGR2RGB))
pil_image03_03 = Image.fromarray(cv2.cvtColor(cv_image03_03, cv2.COLOR_BGR2RGB))

In [None]:
from google.colab.patches import cv2_imshow
# Join all three samples horizontally in one call and display the strip.
debug_image = cv2.hconcat([cv_image03_01, cv_image03_02, cv_image03_03])
cv2_imshow(debug_image)

## 回答例の学習なしでの質問

In [None]:
# Baseline: ask about the third image with no worked examples in the conversation.
question = "What does this picture represent?"
msgs = [dict(role='user', content=[pil_image03_03, question])]

answer = model.chat(msgs=msgs, tokenizer=tokenizer, image=None)
print(answer)

## 回答例の学習ありでの質問

In [None]:
# Few-shot prompt: two worked (image, answer) turns come before the real query
# so the model can pick up the expected answer format from the conversation.
# `question` is defined here so the cell does not rely on an earlier cell's value.
question = "What does this picture represent?"
answer1 = "S"
answer2 = "G"
msgs = [
    {'role': 'user', 'content': [pil_image03_01, question]}, {'role': 'assistant', 'content': [answer1]},
    {'role': 'user', 'content': [pil_image03_02, question]}, {'role': 'assistant', 'content': [answer2]},
    {'role': 'user', 'content': [pil_image03_03, question]}
]

answer = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)