In [1]:
import argparse
from omegaconf import OmegaConf
from GOT.utils.arguments import *


parser = argparse.ArgumentParser()
parser.add_argument(
    "--configs",
    nargs="*",
    default=["/apps/GOT-OCR2.0/configs/got_test.yaml"],
    help="Path to the config file",
)
parser.add_argument('--local_rank', type=int, default=-1,
                    help='Used for distributed training')  # ✅ 添加这一行
args = parser.parse_args([])

config_list = [OmegaConf.load(c) for c in args.configs]
config = OmegaConf.merge(*config_list)
# model_args, data_args, training_args = parser.parse_yaml_file(
#     "configs/got.yaml")
# config = OmegaConf.load("configs/got.yaml")
# 分别提取字段构造 dataclass
model_args = ModelArguments(
    **{k: v for k, v in config.items() if k in ModelArguments.__dataclass_fields__}
)
data_args = DataArguments(
    **{k: v for k, v in config.items() if k in DataArguments.__dataclass_fields__}
)
training_args = TrainingArguments(
    **{k: v for k, v in config.items() if k in TrainingArguments.__dataclass_fields__}
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from GOT.model import *
# 假设模型已实例化并在cuda
model = GOTQwenForCausalLM.from_pretrained(
    "/data_8t_1/qby/GOT-OCR2_0", use_safetensors=True)
model.to("cuda")
# model.eval()

GOTQwenForCausalLM(
  (model): GOTQwenModel(
    (embed_tokens): Embedding(151860, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1024,), eps=1e-06)
    (rotary_em

In [3]:
vision_tower_dict = model.get_model().initialize_vision_modules(
    vision_tower=model_args.vision_tower,
    pretrained_stage1_model=model_args.pretrained_stage1_model,
    freeze_vision_tower=model_args.freeze_vision_tower,
    use_im_start_end=model_args.use_im_start_end,
    vision_select_layer=model_args.vision_select_layer,
    device=training_args.device
)

In [4]:

from GOT.data.conversation_dataset_qwen import ConversationDataset
from transformers import AutoTokenizer
dataset_cls = ConversationDataset
tokenizer=AutoTokenizer.from_pretrained(
    "/data_8t_1/qby/GOT-OCR2_0",
    trust_remote_code=True,
    # use_fast=False,
    # revision="v1.0.0"
)
data_args.image_token_len = 256
data_args.image_processor = vision_tower_dict['image_processor']
data_args.image_processor_high = vision_tower_dict['image_processor_high']
data_args.use_im_start_end = model_args.use_im_start_end
train_dataset = dataset_cls(
    tokenizer=tokenizer,
    datasets=data_args.train_datasets,
    multimodal_cfg=dict(
        sep_image_conv_front=data_args.sep_image_conv_front,
        image_token_len=data_args.image_token_len,
        image_aspect_ratio=data_args.image_aspect_ratio,
        use_im_start_end=data_args.use_im_start_end,
        image_processor=data_args.image_processor,
        image_processor_high=data_args.image_processor_high,
        box_limit=data_args.box_limit,
    )
)



In [5]:
print("train_dataset:", train_dataset)

train_dataset: <GOT.data.conversation_dataset_qwen.ConversationDataset object at 0x7f95195f2920>


In [6]:
# print("train_dataset[0]:",train_dataset[0])
for k, v in train_dataset[0].items():
    print(f"{k}: {v.shape if isinstance(v, torch.Tensor) else len(v)}")

input_ids: torch.Size([406])
labels: torch.Size([406])
image: 1
image_high: 1


In [7]:
train_dataset

<GOT.data.conversation_dataset_qwen.ConversationDataset at 0x7f95195f2920>

In [8]:
# from torch.utils.data import Dataset, DataLoader
# from GOT.data import DataCollatorForSupervisedDataset
# data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
# dataloader = DataLoader(train_dataset, batch_size=16, collate_fn=data_collator)

In [9]:
# dict(train_dataset)

In [10]:
# dataloader

In [11]:
# for batch in dataloader:
#     print(batch)
#     break

In [12]:
# # 假设你已经定义好了 dataloader
# batch = next(iter(dataloader))

# # 打印内容
# print(batch.keys())  # 查看有哪些字段
# print(batch['input_ids'].shape)
# print(batch['labels'].shape)
# print(batch['attention_mask'].shape)
# print(len(batch['images']))  # 查看图片数量
# print(len(batch['images'][0]))  # 查看第一张图片的数量
# print(batch['images'][0][0].shape)  # 查看第一张图片的形状
# print(batch['images'][0][1].shape)  # 查看第一张图片的
# # print((batch['images'][0][0]))

In [13]:
# from collections import defaultdict
# import torch
# from tqdm import tqdm

# # 初始化记录字典
# field_shapes = defaultdict(list)

# # 遍历整个数据集
# for example in tqdm(train_dataset):
#     for k, v in example.items():
#         if isinstance(v, torch.Tensor):
#             field_shapes[k].append(v.shape[0])
#         elif isinstance(v, list) or isinstance(v, str):
#             field_shapes[k].append(len(v))
#         else:
#             field_shapes[k].append(type(v))

# # 打印每个字段的统计结果
# for k, v_list in field_shapes.items():
#     unique_vals = set(v_list)
#     print(f"{k}:")
#     print(f"  Unique shapes/lengths: {unique_vals}")
#     print(f"  Max: {max(v_list)}, Min: {min(v_list)}, Avg: {sum(v_list)/len(v_list):.2f}" if all(
#         isinstance(x, int) for x in v_list) else "")

In [14]:
# from collections import defaultdict
# import torch
# from tqdm import tqdm

# # 初始化记录字典，保存长度和对应index
# field_max = defaultdict(lambda: {'max_len': -1, 'index': -1})

# for idx, example in enumerate(tqdm(train_dataset)):
#     for k, v in example.items():
#         length = None
#         if isinstance(v, torch.Tensor):
#             length = v.shape[0]
#         elif isinstance(v, list) or isinstance(v, str):
#             length = len(v)
#         else:
#             continue

#         if length > field_max[k]['max_len']:
#             field_max[k]['max_len'] = length
#             field_max[k]['index'] = idx

# print("字段最大长度及对应样本index:")
# for k, info in field_max.items():
#     print(f"{k}: max_len={info['max_len']}, index={info['index']}")

# # 以某个字段为例，保存最大长度样本
# key_of_interest = 'input_ids'  # 比如你想要最长input_ids的样本
# max_index = field_max[key_of_interest]['index']
# longest_sample = train_dataset[max_index]

# # longest_sample 就是你想保存的最大样本
# # 你可以保存为json，或torch保存，根据需求处理

In [15]:
# import torch

# torch.save(longest_sample, "longest_sample.pt")
longest_sample = torch.load("longest_sample.pt")

In [16]:
# for k, v in longest_sample.items():
#     if isinstance(v, torch.Tensor):
#         # 确保v至少有一维且长度大于600才截断
#         if v.size(0) > 600:
#             longest_sample[k] = v[:600]
#         print(f"{k}: {longest_sample[k].shape}")
#     elif isinstance(v, list) or isinstance(v, str):
#         print(f"{k}: {len(v)}")
#     else:
#         print(f"{k}: {type(v)}")

In [17]:
longest_sample

{'input_ids': tensor([151644,   8948,    198,   2610,   1265,   1795,    279,  11221,  15516,
            323,  10339,    697,  11253,    304,   7716,     13, 151645, 151644,
            872,    198, 151857, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859

In [18]:
for k,v in longest_sample.items():
    if isinstance(v, torch.Tensor):
        print(f"{k}: {v.shape}")
    elif isinstance(v, list) or isinstance(v, str):
        print(f"{k}: {len(v)}")
    else:
        print(f"{k}: {type(v)}")  # 打印其他类型的字段

input_ids: torch.Size([919])
labels: torch.Size([919])
image: 1
image_high: 1


In [19]:
# import matplotlib.pyplot as plt
# from collections import Counter

# for k, v_list in field_shapes.items():
#     print(f"Plotting field: {k}")

#     # 判断v_list中元素类型
#     if all(isinstance(x, int) for x in v_list):
#         # 连续数值，用直方图
#         plt.figure(figsize=(8, 4))
#         plt.hist(v_list, bins=30, color='skyblue', edgecolor='black')
#         plt.title(f"{k} length distribution")
#         plt.xlabel("Length")
#         plt.ylabel("Frequency")
#         plt.grid(True, linestyle='--', alpha=0.5)
#         plt.show()

#     elif all(isinstance(x, tuple) for x in v_list):
#         # 形状分布，统计频次并画条形图
#         shape_counts = Counter(v_list)
#         shapes = list(shape_counts.keys())
#         counts = list(shape_counts.values())

#         # 把tuple转成字符串方便显示
#         shapes_str = [str(s) for s in shapes]

#         plt.figure(figsize=(10, 5))
#         plt.bar(range(len(counts)), counts,
#                 color='lightcoral', edgecolor='black')
#         plt.xticks(range(len(counts)), shapes_str, rotation=45, ha='right')
#         plt.title(f"{k} shape distribution")
#         plt.xlabel("Shape")
#         plt.ylabel("Frequency")
#         plt.tight_layout()
#         plt.show()

#     else:
#         print(f"Skipped field {k} with unsupported type for plotting.")

In [20]:
import torch
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# 假设 longest_sample 是字典，包含键 'input_ids', 'attention_mask', 'labels', 'images'
# input_ids, attention_mask, labels是tensor，images是list或tensor
# batch_size = 16


def make_batch(sample, batch_size):
    batch = {}
    for k, v in sample.items():
        if isinstance(v, torch.Tensor):
            batch[k] = v.unsqueeze(0).repeat(batch_size, *([1] * (v.dim())))
        elif isinstance(v, list):
            # 假设 images 是 list of tensors，简单重复list内容
            batch[k] = v * batch_size  # 复制列表，batch_size倍
        else:
            # 其他情况，简单复制
            batch[k] = [v] * batch_size
    return batch

In [21]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [22]:
import torch
from transformers import default_data_collator
from GOT.data import DataCollatorForSupervisedDataset
# 模拟 batch_size = 16
batch_size = 6
# 构造 batch 的输入样本列表
features = [longest_sample] * batch_size  # 复制样本

# 调用默认 collator（会处理 input_ids, attention_mask, labels）
collator= DataCollatorForSupervisedDataset(tokenizer=tokenizer)
batch = collator(features)


In [23]:
# 取一个 batch
# batch = next(iter(dataloader))  # 或者: for batch in dataloader:

# 把数据移动到 GPU（确保模型也在 CUDA 上）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["labels"].to(device)

# 图像可能需要特殊处理（比如 list of tensors 或 Tensor）
images = batch["images"]
images = [(item[0].to(device), item[1].to(device))
          for item in images]
# if isinstance(images, list):
#     images = torch.stack(images)
# images = images.to(device)

Using device: cuda


In [24]:
# model.float()

In [25]:
print(input_ids.device)
print(attention_mask.device)
print(labels.device)
print(images[0][1].device)
print(next(model.parameters()).device)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0


In [26]:
# 清空优化器梯度
optimizer.zero_grad()

# 显存前
print("显存占用（训练前）:", torch.cuda.memory_reserved() / 1024**2, "MB")
print(torch.cuda.memory_summary())

显存占用（训练前）: 2268.0 MB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   2183 MiB |   2214 MiB |   2551 MiB | 377423 KiB |
|       from large pool |   2181 MiB |   2212 MiB |   2548 MiB | 375808 KiB |
|       from small pool |      1 MiB |      2 MiB |      2 MiB |   1615 KiB |
|---------------------------------------------------------------------------|
| Active memory         |   2183 MiB |   2214 MiB |   2551 MiB | 377423 KiB |
|       from large pool |   2181 MiB |   2212 MiB |   2548 MiB | 375808 KiB |
|       from small pool |      1 MiB |      2 MiB |      2 MiB |   1615 KiB |
|------------------------------------------

In [27]:




with torch.cuda.amp.autocast():
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask, labels=labels, images=images)
    print("outputs:", outputs)
    loss = outputs.loss
# loss = outputs.loss
print("Loss:", loss.item())

# 反向传播 + 更新参数
loss.backward()
optimizer.step()

# 显存后
print("显存占用（训练后）:", torch.cuda.memory_allocated() / 1024**2, "MB")


  with torch.cuda.amp.autocast():


outputs: CausalLMOutputWithPast(loss=tensor(0.4377, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[ 5.2305,  4.3984,  5.8164,  ..., -0.9219, -1.6182, -1.7695],
         [ 5.4609,  4.6758,  6.1484,  ..., -0.7266, -1.4258, -1.7080],
         [ 4.6328,  3.7129,  5.3047,  ..., -1.3828, -2.0488, -1.7646],
         ...,
         [20.1719, 16.2500, 17.0000,  ...,  7.8086, 14.9688, 11.0938],
         [17.0625, 18.3438, 21.6875,  ..., 11.2422, 18.5312, 12.0625],
         [16.7812, 16.2656, 19.4688,  ...,  6.0039, 14.3750,  7.7695]],

        [[ 5.2305,  4.3984,  5.8164,  ..., -0.9219, -1.6182, -1.7695],
         [ 5.4609,  4.6758,  6.1484,  ..., -0.7266, -1.4258, -1.7080],
         [ 4.6328,  3.7129,  5.3047,  ..., -1.3828, -2.0488, -1.7646],
         ...,
         [20.1719, 16.2500, 17.0000,  ...,  7.8086, 14.9688, 11.0938],
         [17.0625, 18.3438, 21.6875,  ..., 11.2422, 18.5312, 12.0625],
         [16.7812, 16.2656, 19.4688,  ...,  6.0039, 14.3750,  7.7695]],

        [[ 

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 0 has a total capacity of 23.57 GiB of which 2.37 GiB is free. Including non-PyTorch memory, this process has 21.17 GiB memory in use. Of the allocated memory 20.58 GiB is allocated by PyTorch, and 293.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 1         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  11122 MiB |  21514 MiB | 182150 MiB | 171027 MiB |
|       from large pool |  11120 MiB |  21501 MiB | 181596 MiB | 170476 MiB |
|       from small pool |      2 MiB |     19 MiB |    554 MiB |    551 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  11122 MiB |  21514 MiB | 182150 MiB | 171027 MiB |
|       from large pool |  11120 MiB |  21501 MiB | 181596 MiB | 170476 MiB |
|       from small pool |      2 MiB |     19 MiB |    554 MiB |    551 MiB |
|---------------------------------------------------------------

In [None]:
# batch_size = 16
# seq_len = 200

# # 模拟input_ids：随机生成在词表大小范围内
# vocab_size = model.config.vocab_size
# input_ids = torch.randint(low=0, high=vocab_size, size=(
#     batch_size, seq_len), device=device)

# # 模拟attention_mask：全1
# attention_mask = torch.ones_like(input_ids, device=device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# # 模拟labels（可以和input_ids相同，也可部分设-100忽略loss）
# labels = input_ids.clone()
# labels[:, :10] = -100

# # 模拟 images 输入（batch内每个样本含1张图像，1个patch，3通道1024x1024）
# # GOT代码里图像输入是一个list，列表长度为batch_size，每个元素形如(image_count, 3, H, W)
# # 这里image_count=1, 高度宽度为1024
# images = []

In [None]:




# for _ in range(batch_size):
#     img_tensor = torch.randn(1, 3, 1024, 1024, device=device, dtype=torch.float32)  # 模拟单张图像
#     images.append((None, img_tensor))  # 你的代码中传入的images元素形如 tuple，第二项是Tensor


In [None]:
# len(images)

In [None]:
# len(images[0])
    

In [None]:
# # 记录显存占用
# print("显存占用（训练前）:", torch.cuda.memory_allocated() / 1024**2, "MB")

# optimizer.zero_grad()

# outputs = model(
#     input_ids=input_ids,
#     attention_mask=attention_mask,
#     labels=labels,
#     images=images,
#     return_dict=True,
# )

# loss = outputs.loss
# print("Loss:", loss.item())

# loss.backward()

# optimizer.step()

# print("显存占用（训练后）:", torch.cuda.memory_allocated() / 1024**2, "MB")