In [1]:
import os

# Point the Hugging Face cache root at a machine-specific directory.
# This assignment precedes the `transformers` import below; keep that order.
# NOTE(review): presumably the HF libraries read HF_HOME when they are imported,
# so setting it later would have no effect — confirm against the HF docs.
# NOTE(review): hard-coded absolute path; adjust it for the machine running this.
os.environ['HF_HOME'] = '/DATA2/HuggingFace'

import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images

# Model identifier handed to the `from_pretrained(...)` calls in the next cell.
# NOTE(review): looks like a Hugging Face Hub repo id rather than a local path — confirm.
model_path = "deepseek-ai/deepseek-vl2"

  from .autonotebook import tqdm as notebook_tqdm


Python version is above 3.10, patching the collections module.


In [2]:
# Load the chat processor (it also exposes the tokenizer) and the model weights.
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# Cast the weights to bfloat16, move the model to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

## single image conversation example
# One user turn carrying the prompt text and the image path, followed by an empty
# assistant turn.
# NOTE(review): presumably the "<image>" placeholder is matched to the entries of
# "images" and the empty assistant turn is the slot the model fills — confirm
# against the deepseek_vl2 processor.
conversation = [
    {
        "role": "<|User|>",
        "content": "This is the image: <image>\n Can you tell me what are in the images?",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
    system_prompt=""
).to(vl_gpt.device)  # place the processed batch on the same device as the model

# run image encoder to get the image embeddings
# This cell only performs inference, so autograd is switched off for the encoder
# forward pass; otherwise a gradient graph would be recorded and kept alive
# through `inputs_embeds` for no benefit.
with torch.no_grad():
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# run the model to get the response
# Greedy decoding (do_sample=False) capped at 512 new tokens; the EOS token id is
# also used as the pad token id.
outputs = vl_gpt.language.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True
)

# Decode the first (only) sequence without special tokens and print it after the
# formatted prompt.
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(f"{prepare_inputs['sft_format'][0]}", answer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Add pad token = ['<｜▁pad▁｜>'] to the tokenizer
<｜▁pad▁｜>:2
Add image token = ['<image>'] to the tokenizer
<image>:128815
Add grounding-related tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] to the tokenizer with input_ids
<|ref|>:128816
<|/ref|>:128817
<|det|>:128818
<|/det|>:128819
<|grounding|>:128820
Add chat tokens = ['<|User|>', '<|Assistant|>'] to the tokenizer with input_ids
<|User|>:128821
<|Assistant|>:128822



Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.66s/it]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<|User|>: This is the image: <image>
 Can you tell me what are in the images?

<|Assistant|>: In the image shows two giraffes standing in a field. The giraffes are standing on a grassy field with trees in the background. The sky is clear blue.


In [3]:
# Turn the first generated sequence into a plain Python list of token ids, then
# decode it with special tokens stripped and show only the answer text.
generated_ids = outputs[0].cpu().tolist()
answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(answer)

In the image shows two giraffes standing in a field. The giraffes are standing on a grassy field with trees in the background. The sky is clear blue.


In [4]:
# Inspect the raw generation result: tensor shape, token ids, and the decoded
# text. `decode` is called without `skip_special_tokens` here, so special tokens
# are kept in the printed string.
first_sequence_ids = outputs[0].cpu().tolist()
print(f"Output shape: {outputs.shape}")
print(f"Output: {first_sequence_ids}")
print(f"Output decoded: {tokenizer.decode(first_sequence_ids)}")

Output shape: torch.Size([1, 36])
Output: [1124, 270, 4609, 4849, 1234, 94656, 617, 273, 12286, 295, 260, 2994, 16, 455, 94656, 617, 273, 477, 12286, 377, 260, 124752, 2994, 418, 9693, 295, 270, 6951, 16, 455, 12709, 344, 4521, 8295, 16, 1]
Output decoded: In the image shows two giraffes standing in a field. The giraffes are standing on a grassy field with trees in the background. The sky is clear blue.<｜end▁of▁sentence｜>


In [None]:
# NOTE(review): this random tensor is moved to the GPU and then discarded — it is
# not assigned to a name and is not the cell's last expression, so nothing is
# displayed. Looks like leftover scratch code; confirm and remove.
torch.randn(1, 3, 224, 224).cuda()
from torch import nn
class RoPE(nn.Module):
    """
    RoPE (Rotary Positional Embedding) for Transformer models.
    This class computes the rotary positional embeddings for the input sequences.
    It is used to enhance the positional information in the input embeddings.
    """
    
    def __init__(self, max_seq_len=2048, d_model=1024):
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.inv_freq = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))
        self.register_buffer("inv_freq", self.inv_freq)
        self.register_buffer("seq_len", torch.arange(0, max_seq_len).unsqueeze(1))
        self.register_buffer("seq_len", self.seq_len)
        self.register_buffer("inv_freq", self.inv_freq)
        self.register_buffer("cos", None) 