In [None]:
from transformers import AutoProcessor, AutoModel
import torch
from PIL import Image
import requests

# 如果您之前已经下载到本地项目文件夹，请使用本地路径
# model_path = "lmms-lab/LLaVA-OneVision-1.5-4B-Instruct" 
model_path = "/home/ZJH/llava/onevision/llava_onevision_1" # 假设这是您的本地路径

# 1. 加载 Processor 和 Model
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16, 
    device_map="cuda:0", 
    local_files_only=True,
    trust_remote_code=True
)

# 2. 准备图像
image_path = "/home/ZJH/llava/build_llava/R.jpeg"
image = Image.open(image_path).convert("RGB")

# 3. 关键修改：使用标准的 Messages 格式，而不是手动拼接字符串
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},  # 这里只是占位，告诉模板这里有一张图
            {"type": "text", "text": "Describe this image."}
        ]
    }
]

# 4. 使用 apply_chat_template 自动生成正确的 Prompt
# 这会自动添加正确的 <|im_start|>, <|image_pad|> 等特殊 Token
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# 5. 处理输入
inputs = processor(
    images=image, 
    text=prompt, 
    return_tensors="pt"
).to(model.device)

# 6. 推理
output_ids = model.generate(
    **inputs, 
    max_new_tokens=1024,
    do_sample=False
)

# 7. 解码输出
# 只需要解码新生成的 token
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(f"Model Response: {output_text}")

  from .autonotebook import tqdm as notebook_tqdm
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
The tokenizer you are loading from '/home/ZJH/llava/onevision/llava_onevision' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]
The following generation flags are not valid and may be ignored: ['temperatur

Model Response: The image depicts a breathtaking mountainous landscape, likely situated in a high-altitude region. The central focus is a towering snow-capped peak that dominates the scene, its rugged surface covered with pristine white snow and ice. This peak appears to be part of a larger mountain range, as several other peaks can be seen in the background, each varying in height and shape.

In the foreground, there is a serene lake reflecting the vibrant colors of the surrounding environment. The water is calm, creating a mirror-like surface that perfectly captures the reflection of the mountains, trees, and sky above. The lake's edge is lined with dense forests, showcasing a mix of evergreen and deciduous trees. The foliage displays a rich palette of autumnal hues, including shades of green, yellow, orange, and red, indicating that the season is fall.

The terrain around the lake slopes gently downward from the forested area towards the base of the mountains. The lower slopes are c

In [3]:
inputs.keys()

KeysView({'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-1.7777, -1.7777, -1.7777,  ...,  2.0464,  2.0464,  2.0464],
        [-1.7777, -1.7777, -1.7777,  ...,  2.0464,  2.0464,  2.0464],
        [-1.7485, -1.7485, -1.7485,  ...,  2.0748,  2.0748,  2.0748],
        ...,
        [-0.0405, -0.0113,  0.4413,  ..., -1.3665, -1.4091, -1.4376],
        [-0.0550,  0.0179,  0.1493,  ..., -1.2811, -1.1674, -1.1105],
        [ 0.0033,  0.0325, -0.0842,  ..., -1.1389, -1.0821, -1.1532]],
       device='cuda:0'), 'image_grid_thw': tensor([[  1,  66, 136]], device='cuda:0')})

In [4]:
inputs.input_ids.shape

torch.Size([1, 2269])

In [6]:
inputs.attention_mask.shape

torch.Size([1, 2269])

In [5]:
inputs.pixel_values.shape

torch.Size([8976, 588])

In [7]:
inputs.image_grid_thw.shape

torch.Size([1, 3])

In [8]:
model.config

Llavaonevision1_5Config {
  "architectures": [
    "LLaVAOneVision1_5_ForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_llavaonevision1_5.Llavaonevision1_5Config",
    "AutoModel": "modeling_llavaonevision1_5.LLaVAOneVision1_5_ForConditionalGeneration",
    "AutoModelForCausalLM": "modeling_llavaonevision1_5.LLaVAOneVision1_5_ForConditionalGeneration"
  },
  "dtype": "bfloat16",
  "image_token_id": 151655,
  "model_type": "llavaonevision1_5",
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "dtype": "bfloat16",
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 2560,
    "image_token_id": null,
    "initializer_range": 0.02,
    "intermediate_size": 9728,
    "layer_types": [
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention",
      "full_attention

In [3]:
from dataclasses import dataclass
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
from pathlib import Path
from typing import Dict,List,Tuple
import pandas as pd

In [2]:
data_dir="drive-action"
data_dir

'drive-action'

In [4]:
df = pd.read_parquet("/home/ZJH/llava/onevision/drive-action/data/train-00000-of-00013.parquet")


In [9]:
print(df.head())          # 打印前几行


                      question_slice_id   qa_l0                qa_l1  \
0  d752a4f2-1f4b-11f0-b3a2-644ed76caecb  Vision  Navigation Position   
1  db72cfb2-1f4b-11f0-b3a2-644ed76caecb  Vision  Navigation Position   
2  d9789e8a-1f4b-11f0-b3a2-644ed76caecb  Vision  Navigation Position   
3  d9789e8a-1f4b-11f0-b3a2-644ed76caecb  Vision  Navigation Position   
4  db273d40-1f4b-11f0-b3a2-644ed76caecb  Vision  Navigation Position   

      question_category                                         content_cn  \
0  true_false_questions  {'answer': '正确', 'question': '前方47米，导航指示向右前方行驶...   
1  true_false_questions  {'answer': '正确', 'question': '前方17米，导航指示左转掉头，前...   
2      choice_questions  {'answer': 'C', 'question': '前方14米，导航指示向右前方行驶，...   
3  true_false_questions  {'answer': '正确', 'question': '前方14米，导航指示向右前方行驶...   
4      choice_questions  {'answer': 'C', 'question': '前方102米，导航指示右转，前方2...   

                                          content_en  \
0  {'answer': 'True', 'question': '47 mete

In [7]:
print(df.columns)         # 查看字段
print(len(df)) 

Index(['question_slice_id', 'qa_l0', 'qa_l1', 'question_category',
       'content_cn', 'content_en', 'image_0', 'image_1', 'image_2'],
      dtype='object')
1245


In [10]:
type(df['content_en'])

pandas.core.series.Series

In [None]:
print(processor.chat_template)


In [1]:
from transformers import AutoProcessor, AutoModel
import torch
from PIL import Image
import requests

# 如果您之前已经下载到本地项目文件夹，请使用本地路径
# model_path = "lmms-lab/LLaVA-OneVision-1.5-4B-Instruct" 
model_path = "/home/ZJH/llava/onevision/llava_onevision_1" # 假设这是您的本地路径

# 1. 加载 Processor 和 Model
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16, 
    device_map="cuda:0", 
    local_files_only=True,
    trust_remote_code=True
)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


In [2]:
print(model)


LlavaOnevisionModel(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(729, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-25): 26 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activatio

In [3]:
# 1) processor 类型
from transformers import AutoProcessor
proc = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
print("processor type:", type(proc))




processor type: <class 'transformers.models.llava_onevision.processing_llava_onevision.LlavaOnevisionProcessor'>


In [5]:
from PIL import Image
import numpy as np
img = Image.fromarray((np.random.rand(384,384,3)*255).astype("uint8"))
out = proc(images=img, text="hello", return_tensors="pt")
out.keys()

KeysView({'input_ids': tensor([[14990]]), 'attention_mask': tensor([[1]]), 'pixel_values': tensor([[[[[-0.6078,  0.7333, -0.0431,  ...,  0.2549, -0.1922, -0.9216],
           [ 0.0824, -0.2549, -0.9843,  ...,  0.8745,  0.1137,  0.3569],
           [-0.9216,  0.6314,  0.8745,  ..., -0.6941,  0.8196,  0.3804],
           ...,
           [ 0.7098, -0.2549, -0.5529,  ..., -0.9216,  0.1765,  0.0745],
           [-0.0039, -0.2157,  0.9922,  ...,  0.7490,  0.5686,  0.3333],
           [-0.7569, -0.2784,  0.9765,  ...,  0.4196,  0.1373, -0.9765]],

          [[-0.5529, -0.5529, -0.0118,  ...,  0.5843, -0.1137,  0.8039],
           [-0.7255,  0.0588,  0.2549,  ..., -0.4980, -0.7255,  0.8824],
           [-0.9686,  0.5373, -0.4980,  ..., -0.0196, -0.9922,  0.6078],
           ...,
           [ 0.0118, -0.2784,  0.3647,  ...,  0.6941, -0.6706, -0.4824],
           [-0.0667,  0.3882, -0.4902,  ..., -0.8667,  0.0667, -0.9608],
           [-0.3412, -0.1059, -0.7804,  ...,  0.3569,  0.2235, -0.9765]]

In [7]:
from IPython.display import display
from dataclasses import dataclass
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
from pathlib import Path
from typing import Dict,List,Tuple,Union

In [8]:
data_dir="drive_action_output"
chat_file=Path(data_dir).joinpath("drive_action_train.json")
chat_data=pd.read_json(path_or_buf=chat_file)
chat_data.shape

(5000, 3)

In [9]:
from show_onevision.data_onevision import LlavaDataset, TrainLLavaOneVisionCollator

In [10]:
test_llavadataset=LlavaDataset(dataset_dir=data_dir)

In [11]:
test_llavadataset[1234]

('View 1: <image>\nView 2: <image>\nView 3: <image>\nThe navigation indicates a left-turn U-turn. As shown in the picture, please determine the types and number of traffic signals ahead in your lane.\nA. One left-turn arrow signal and one straight circular signal\nB. Two straight circular signals\nC. One left-turn arrow signal and two straight circular signals\nD. Only one straight circular signal.\nPlease answer with the option letter only (A/B/C/D).',
 'C',
 [PosixPath('drive_action_output/images/1234_image_0.jpg'),
  PosixPath('drive_action_output/images/1234_image_1.jpg'),
  PosixPath('drive_action_output/images/1234_image_2.jpg')])

In [14]:
image_paths = test_llavadataset[1234][2]
image_paths

[PosixPath('drive_action_output/images/1234_image_0.jpg'),
 PosixPath('drive_action_output/images/1234_image_1.jpg'),
 PosixPath('drive_action_output/images/1234_image_2.jpg')]

In [13]:
out = proc(images=image_paths, text="hello", return_tensors="pt")
out.keys()

TypeError: only a single or a list of entries is supported but got type=<class 'pathlib.PosixPath'>