In [14]:
from pdata import PersonalizedMMUDataset, PersonalizedT2IDataset, get_personalized_mmu_dataloader, get_personalized_t2i_dataloader
from lightning.pytorch.utilities import CombinedLoader

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from PIL import Image

from models import Showo, MAGVITv2, get_mask_chedule
from training.prompting_utils import UniversalPrompting, create_attention_mask_predict_next, create_attention_mask_for_mmu
from training.utils import get_config, flatten_omega_conf, mask_or_random_replace_tokens, AverageMeter
from transformers import AutoTokenizer
from models.clip_encoder import CLIPVisionTower
from transformers import CLIPImageProcessor
from llava.llava import conversation as conversation_lib

conversation_lib.default_conversation = conversation_lib.conv_templates["phi1.5"]

import os
from omegaconf import DictConfig, ListConfig, OmegaConf
config = OmegaConf.load('configs/showo_demo.yaml')
# device setup
device = torch.device("cuda:7")

In [15]:
# show o tokenizer setup and adding special tokens to universal prompting
# llm model : 'microsoft/phi-1_5'
tokenizer = AutoTokenizer.from_pretrained(config.model.showo.llm_model_path, padding_side ="left")
uni_prompting = UniversalPrompting(tokenizer, max_text_len=config.dataset.preprocessing.max_seq_length,
                                       special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),
                                       ignore_id=-100, cond_dropout_prob=config.training.cond_dropout_prob)

# setting up the magvit-v2, for t2i
vq_model = MAGVITv2.from_pretrained(config.model.vq_model.vq_model_name).to(device)
# vq_model.requires_grad_(False)
# vq_model.eval()

# setting up vision tower: clip-vit only for mmu
# vision_tower_name =config.clip_path
# vision_tower = CLIPVisionTower(vision_tower_name).to(device)
# clip_image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)

# setting up the showo model 
model = Showo.from_pretrained(config.model.showo.pretrained_model_path).to(device)
# model.eval()

# setting up the parameters
temperature = 1  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 1  # retain only the top_k most likely tokens, clamp others to have 0 probability
# LLAVA_SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant. " \
#                 "The assistant gives helpful, detailed, and polite answers to the user's questions."
# LLAVA_SYSTEM_PROMPT_LEN = 28

The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.


Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192
attention implementation:  sdpa


In [16]:
# print(model.showo.get_input_embeddings())
model.showo.get_input_embeddings().num_embeddings
model.showo.get_input_embeddings().num_embeddings - len(tokenizer)
model.showo.get_input_embeddings().weight.data.shape
model.showo.lm_head.weight.shape
model.showo.lm_head.bias.shape

torch.Size([58498])

In [17]:
data_root = "/home/arc/full_mcdata"
concept = "dunpai"

In [18]:
nums_new_token_i = 16

#################################
new_tokens = [f"<{concept}>"] + [f"<token_{i}>" for i in range(nums_new_token_i)]
num_new_tokens = len(new_tokens)  # 17

# 已知的原始参数
# 文本 token 数量（ID 0-50304）
original_text_vocab_size = len(tokenizer)  
# Image token 数量（原 ID 50305-58497）
original_image_vocab_size = model.showo.get_input_embeddings().num_embeddings - len(tokenizer)

original_total_vocab = original_text_vocab_size + original_image_vocab_size  # 58498

# 新的参数
new_text_vocab_size = original_text_vocab_size + num_new_tokens  # 50305 + 17 = 50322
new_total_vocab = original_total_vocab + num_new_tokens          # 58498 + 17 = 58515

# ------------------------------
# Step 1: 修改 Tokenizer 的词汇表
# ------------------------------

# 添加新 token 到 50305-50321 的位置
num_new_tokens = tokenizer.add_tokens(new_tokens)
new_token_ids = tokenizer.convert_tokens_to_ids(new_tokens)
print("新 token ID:", new_token_ids)  # 应输出 50305-50321

# ------------------------------
# Step 2: 调整模型的权重
# ------------------------------
with torch.no_grad():
    # 获取嵌入层权重
    embeddings = model.showo.get_input_embeddings().weight.data
    
    # 扩展嵌入层（58498 -> 58515）
    model.showo.resize_token_embeddings(new_total_vocab)
    # new_embeddings = model.showo.get_input_embeddings().weight.data

    # 将原 Image Token 权重后移 17 位
    original_image_weights = embeddings[original_text_vocab_size:original_total_vocab].clone()
    model.showo.get_input_embeddings().weight.data[new_text_vocab_size:new_total_vocab] = original_image_weights
    
    # 初始化新 token 的权重（用原文本最后 17 个 token）
    # new_text_weights = embeddings[original_text_vocab_size - num_new_tokens : original_text_vocab_size].clone()
    # model.showo.get_input_embeddings().weight.data[original_text_vocab_size : new_text_vocab_size] = new_text_weights
    # print(model.showo.lm_head.weight.data.shape[1])
    # 处理 lm_head（假设与嵌入层共享权重）
    if model.showo.lm_head.weight.data.shape[0] == new_total_vocab:
        # 扩展 lm_head 权重
        lm_head = model.showo.lm_head
        new_lm_head = torch.nn.Linear(
            lm_head.in_features, 
            new_total_vocab, 
            bias=hasattr(lm_head, 'bias')
        )
        new_lm_head.weight.data = lm_head.weight.data.clone()
        new_lm_head.weight.data[new_text_vocab_size:new_total_vocab] = lm_head.weight.data[original_text_vocab_size:original_total_vocab]
        # new_lm_head.weight.data[original_text_vocab_size:new_text_vocab_size] = lm_head.weight.data[original_text_vocab_size - num_new_tokens : original_text_vocab_size]
        if hasattr(lm_head, 'bias'):
            new_lm_head.bias.data = lm_head.bias.data.clone()
            new_lm_head.bias.data[new_text_vocab_size:new_total_vocab] = lm_head.bias.data[original_text_vocab_size:original_total_vocab]
            # new_lm_head.bias.data[original_text_vocab_size:new_text_vocab_size] = lm_head.bias.data[original_text_vocab_size - num_new_tokens : original_text_vocab_size]
        
        model.showo.lm_head = new_lm_head
    else:
        raise ValueError("lm_head weights do not match the input embeddings!")

index_no_updates = torch.ones((new_total_vocab,), dtype=torch.bool)
index_no_updates[new_token_ids] = False
# ------------------------------
# 验证
# ------------------------------
# 检查新 token 的 ID
print("新增文本 token ID:", [tokenizer.convert_tokens_to_ids(t) for t in new_tokens])  # 应输出 50305-50321

# 检查一个原 Image Token 的新 ID
sample_image_token = tokenizer.convert_ids_to_tokens(original_text_vocab_size)  # 原 ID 50305
print(f"Concept Token '{sample_image_token}' 的新 ID:", tokenizer.convert_tokens_to_ids(sample_image_token))  # 应输出 50322

# 检查嵌入层形状
print("嵌入层大小:", model.showo.get_input_embeddings().weight.shape)  # 应显示 torch.Size([58515, 2048])

# 检查 index_no_updates 中 True 的位置和数量，True 应该是 new token ids
print("index_no_updates 中 False 的位置:", torch.nonzero(~index_no_updates).squeeze())  # 应输出 50305-50321
print("index_no_updates 中 True 的数量:", torch.sum(index_no_updates))  # 应输出 58498

with torch.no_grad():
    orig_embeds = model.showo.get_input_embeddings().weight.data.clone()
    orig_lm_head_weight = model.showo.lm_head.weight.data.clone()
    orig_lm_head_bias = model.showo.lm_head.bias.data.clone()

新 token ID: [50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321]
新增文本 token ID: [50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321]
Concept Token '<dunpai>' 的新 ID: 50305
嵌入层大小: torch.Size([58515, 2048])
index_no_updates 中 False 的位置: tensor([50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314,
        50315, 50316, 50317, 50318, 50319, 50320, 50321])
index_no_updates 中 True 的数量: tensor(58498)


In [19]:
concept_embeds = model.showo.get_input_embeddings().weight.data[new_token_ids]
concept_lm_nead_wight = model.showo.lm_head.weight.data[new_token_ids]
concept_lm_nead_bias = model.showo.lm_head.bias.data[new_token_ids]

In [20]:
concept_embeds.shape, concept_lm_nead_wight.shape, concept_lm_nead_bias.shape

(torch.Size([17, 2048]), torch.Size([17, 2048]), torch.Size([17]))

In [21]:
uni_prompting.sptids_dict

{'<|soi|>': tensor([50296]),
 '<|eoi|>': tensor([50297]),
 '<|sov|>': tensor([50298]),
 '<|eov|>': tensor([50299]),
 '<|t2i|>': tensor([50300]),
 '<|mmu|>': tensor([50301]),
 '<|t2v|>': tensor([50302]),
 '<|v2v|>': tensor([50303]),
 '<|lvg|>': tensor([50304]),
 '<|sot|>': tensor([50256]),
 '<|eot|>': tensor([50256]),
 '<|pad|>': tensor([50295])}

In [22]:
vq_model.requires_grad_ = False
vq_model.eval()
model.train()
for names, p in model.named_parameters():
    if "embed_tokens" not in names and "lm_head" not in names:
        p.requires_grad = False
    else:
        p.requires_grad = True

trainable_params = [model.showo.get_input_embeddings().weight, model.showo.lm_head.weight, model.showo.lm_head.bias]
optimizer = torch.optim.AdamW(
            trainable_params, # for optimize the embeddings and the head
            lr=1e-2,
            betas=(0.9, 0.999),
            weight_decay=1e-2,
            eps=1e-08,
        )
for names, p in model.named_parameters():
    if p.requires_grad:
        print(f"{names} requires_grad") # embed_token, lm_head会更新

showo.model.embed_tokens.weight requires_grad
showo.lm_head.weight requires_grad
showo.lm_head.bias requires_grad


In [23]:
model.config.mask_token_id = model.showo.get_input_embeddings().num_embeddings - 1
model.mask_token_id = model.showo.get_input_embeddings().num_embeddings - 1

In [24]:
mask_schedule = get_mask_chedule(config.training.get("mask_schedule", "cosine"))
mask_id = model.mask_token_id
mask_dtype = model.showo.model.embed_tokens.weight.dtype

In [26]:

# t2i_dataset = PersonalizedT2IDataset(data_root, concept)
# t2i_dataloader = DataLoader(t2i_dataset, batch_size=5, shuffle=True, num_workers=10, pin_memory=True)

mmu_dataloader = get_personalized_mmu_dataloader(data_root, concept, tokenizer, batch_size=5, num_workers=0, max_length=128)
t2i_dataloader = get_personalized_t2i_dataloader(data_root, concept, tokenizer, batch_size=2, num_workers=0, max_length=128)


iterables = {
    'mmu_flow': mmu_dataloader,
    't2i_flow': t2i_dataloader
}


combined_dataloader = CombinedLoader(iterables, mode="max_size_cycle")

# Before adding the new tokens, the vocab size is 58498
# vocab size = 58498 = 50295  llm vocabsize
#                    + 10     <|soi|> <|eoi|> <|sov|> <|eov|> <|t2i|> <|mmu|> <|t2v|> <|v2v|> <|lvg|> <|pad|>
#                    + 8192   vq model codebook size
#                    + 1      mask token (token id == 58497)
from typing import Union


uni_prompting.sptids_dict
# {'<|soi|>': tensor([50296]),
#  '<|eoi|>': tensor([50297]),
#  '<|sov|>': tensor([50298]),
#  '<|eov|>': tensor([50299]),
#  '<|t2i|>': tensor([50300]),
#  '<|mmu|>': tensor([50301]),
#  '<|t2v|>': tensor([50302]),
#  '<|v2v|>': tensor([50303]),
#  '<|lvg|>': tensor([50304]),
#  '<|sot|>': tensor([50256]),
#  '<|eot|>': tensor([50256]),
#  '<|pad|>': tensor([50295])}

# uni_prompting.text_tokenizer == tokenizer
def prepare_inputs_and_labels(
        pixel_values_or_image_ids: Union[torch.FloatTensor, torch.LongTensor],
        texts: Union[str, str],
        min_masking_rate: float = 0.0,
        is_train: bool = True,
):

    image_tokens = vq_model.get_code(pixel_values_or_image_ids)
    image_tokens = image_tokens + len(uni_prompting.text_tokenizer)

    # create MLM mask and labels
    input_ids, labels, loss_weight, mask_prob = mask_or_random_replace_tokens(
        image_tokens,
        mask_id,
        config,
        mask_schedule=mask_schedule,
        is_train=is_train,
    )
    input_ids, masks, labels = uni_prompting((texts, input_ids, labels), 't2i')

    return input_ids, labels, mask_prob, image_tokens

Formatting llava instruction data


In [27]:
list_combined_dataloader = list(combined_dataloader)
# one_batch_mmu = list_combined_dataloader[0][0]['mmu_flow']
one_batch_t2i = list_combined_dataloader[0][0]['t2i_flow']

# one_batch_mmu = next(iter(mmu_dataloader))

In [30]:
one_batch_t2i

{'conditions': ['<dunpai> is <token_0><token_1><token_2><token_3><token_4><token_5><token_6><token_7><token_8><token_9><token_10><token_11><token_12><token_13><token_14><token_15>.\nA photo of <dunpai>.',
  '<dunpai> is <token_0><token_1><token_2><token_3><token_4><token_5><token_6><token_7><token_8><token_9><token_10><token_11><token_12><token_13><token_14><token_15>.\nA photo of <dunpai>.'],
 'images': tensor([[[[-0.9922, -0.9922, -0.9922,  ..., -0.4039, -0.3255, -0.3725],
           [-0.9922, -0.9922, -0.9922,  ..., -0.3961, -0.3333, -0.3647],
           [-0.9922, -0.9922, -0.9922,  ..., -0.3725, -0.3725, -0.3647],
           ...,
           [ 0.0118,  0.0039,  0.0196,  ...,  0.0510,  0.0510,  0.0431],
           [-0.0353, -0.0118,  0.0196,  ...,  0.0510,  0.0510,  0.0431],
           [-0.1686, -0.1059, -0.0510,  ...,  0.0510,  0.0510,  0.0431]],
 
          [[-0.9922, -0.9922, -0.9922,  ..., -0.5608, -0.4510, -0.5373],
           [-0.9922, -0.9922, -0.9922,  ..., -0.5529, -0.4510, 

In [28]:
pixel_values = one_batch_t2i['images'].to(device)
texts = one_batch_t2i['conditions']
input_ids, labels, mask_prob, image_tokens_ori = prepare_inputs_and_labels(pixel_values, texts, is_train=True)

In [29]:
input_ids

tensor([[50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50300,
         50256, 50305,   318,   220, 50306, 50307, 50308, 50309, 50310, 50311,
         50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321,
            13,   198,    32,  4590,   286,   220, 5

In [43]:
model.output_size = new_total_vocab
save_path = os.path.join("saves", concept, "notebook_v2")
os.makedirs(save_path, exist_ok=True)
for epoch in range(0, 100):
    print(f"Epoch {epoch+1}")
    loss_list = []
    loss_t2i_list = []
    loss_mmu_list = []
    for batch, batch_idx, dataloader_idx in tqdm(list_combined_dataloader):
        batch_size_mmu = batch["mmu_flow"]["images"].shape[0]
        batch_size_t2i = batch["t2i_flow"]["images"].shape[0]
        
        # t2i format
        pixel_values, texts = batch["t2i_flow"]["images"], batch["t2i_flow"]["conditions"]
        pixel_values = pixel_values.to(device)
        input_ids, labels, mask_prob, image_tokens_ori = prepare_inputs_and_labels(pixel_values, texts, is_train=True)
        attention_mask = create_attention_mask_predict_next(input_ids,
                                                                pad_id=int(uni_prompting.sptids_dict['<|pad|>']),
                                                                soi_id=int(uni_prompting.sptids_dict['<|soi|>']),
                                                                eoi_id=int(uni_prompting.sptids_dict['<|eoi|>']),
                                                                rm_pad_in_image=True,
                                                                return_inverse_mask=True)
        attention_mask = attention_mask.to(mask_dtype)
        # 美国队长的盾牌
        
        # mmu format
        pixel_values_mmu, input_ids_mmu, labels_mmu = (batch["mmu_flow"]["images"],
                                                      batch["mmu_flow"]["input_ids"],
                                                      batch["mmu_flow"]["labels"])
        pixel_values_mmu = pixel_values_mmu.to(device, non_blocking=True)
        input_ids_mmu = input_ids_mmu.to(device, non_blocking=True)
        image_tokens_mmu = vq_model.get_code(pixel_values_mmu)
        image_tokens_mmu = image_tokens_mmu + len(uni_prompting.text_tokenizer)
        
        input_ids_mmu = torch.cat([
                    (torch.ones(input_ids_mmu.shape[0], 1) * uni_prompting.sptids_dict['<|mmu|>']).to(
                        device),
                    (torch.ones(input_ids_mmu.shape[0], 1) * uni_prompting.sptids_dict['<|soi|>']).to(
                        device),
                    image_tokens_mmu,
                    (torch.ones(input_ids_mmu.shape[0], 1) * uni_prompting.sptids_dict['<|eoi|>']).to(
                        device),
                    input_ids_mmu,
                ], dim=1).long()

        labels_mmu = torch.cat([
                    (torch.ones(input_ids_mmu.shape[0], 1) * uni_prompting.ignore_id).to(device),
                    (torch.ones(input_ids_mmu.shape[0], 1) * uni_prompting.ignore_id).to(device),
                    torch.ones_like(image_tokens_mmu) * uni_prompting.ignore_id,
                    (torch.ones(input_ids_mmu.shape[0], 1) * uni_prompting.ignore_id).to(device),
                    labels_mmu.to(device)
                ], dim=1).long()
        
        
        attention_mask_mmu = create_attention_mask_for_mmu(input_ids_mmu.to(input_ids.device),
                                                               eoi_id=int(uni_prompting.sptids_dict['<|eoi|>']))
        attention_mask_mmu = attention_mask_mmu.to(mask_dtype)
        attention_mask = torch.cat([attention_mask, attention_mask_mmu], dim=0)
        input_ids = torch.cat((input_ids, input_ids_mmu.to(input_ids.device)), dim=0)
        labels = torch.cat((labels, labels_mmu.to(input_ids.device)), dim=0)
        
        optimizer.zero_grad()
        
        logits, loss_t2i, loss_lm, loss_mmu = model(
                    input_ids=input_ids,
                    input_embeddings=None,
                    attention_mask=attention_mask,
                    labels=labels,
                    label_smoothing=0.0,
                    batch_size_t2i=batch_size_t2i,
                    batch_size_lm=0,
                    batch_size_mmu=batch_size_mmu,
                    max_seq_length=128,
                )
        loss = 0.8 * loss_t2i + 0.2 * loss_mmu
        # loss = loss_t2i
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        loss_t2i_list.append(loss_t2i.item())
        loss_mmu_list.append(loss_mmu.item())
        # tqdm.set_postfix(loss=loss.item(), loss_t2i=loss_t2i.item(), loss_mmu=loss_mmu.item())
        # tqdm.write(f"loss: {loss.item()}, loss_t2i: {loss_t2i.item()}, loss_mmu: {loss_mmu.item()}")
        # 恢复原始权重
        with torch.no_grad():
            model.showo.get_input_embeddings().weight.data[index_no_updates] = orig_embeds[index_no_updates]
            model.showo.lm_head.weight.data[index_no_updates] = orig_lm_head_weight[index_no_updates]
            model.showo.lm_head.bias.data[index_no_updates] = orig_lm_head_bias[index_no_updates]
    print(f"Epoch {epoch+1} loss: {np.mean(loss_list)}, loss_t2i: {np.mean(loss_t2i_list)}, loss_mmu: {np.mean(loss_mmu_list)}")
    print(f"  Token-Norm: {model.showo.get_input_embeddings().weight[new_token_ids].norm().item()}")
    print(f"  index_no_updates-Token-Norm: {model.showo.get_input_embeddings().weight[index_no_updates].norm().item()}")
    print(f"  LM-Head-Weight-Norm: {model.showo.lm_head.weight[new_token_ids].norm().item()}")
    print(f"  index_no_updates-LM-Head-Weight-Norm: {model.showo.lm_head.weight[index_no_updates].norm().item()}")
    print(f"  LM-Head-Bias-Norm: {model.showo.lm_head.bias[new_token_ids].norm().item()}")
    print(f"  index_no_updates-LM-Head-Bias-Norm: {model.showo.lm_head.bias[index_no_updates].norm().item()}")
    
    
    # print(f"Epoch {epoch} loss: {np.mean(loss_list)}, loss_t2i: {np.mean(loss_t2i_list)}")
    if (epoch+1) % 10 == 0:
        save_path_embed = os.path.join(save_path, f"epoch_{epoch+1}_embed.pt")
        save_path_lm_head_weight = os.path.join(save_path, f"epoch_{epoch+1}_lm_head_weight.pt")
        save_path_lm_head_bias = os.path.join(save_path, f"epoch_{epoch+1}_lm_head_bias.pt")
        
        torch.save(model.showo.get_input_embeddings().weight.data[new_token_ids], save_path_embed)
        torch.save(model.showo.lm_head.weight.data[new_token_ids], save_path_lm_head_weight)
        torch.save(model.showo.lm_head.bias.data[new_token_ids], save_path_lm_head_bias)
        
        

Epoch 1


100%|██████████| 49/49 [00:18<00:00,  2.58it/s]


Epoch 1 loss: 5.189498925695614, loss_t2i: 6.1075884468701425, loss_mmu: 1.517140389705191
  Token-Norm: 16.996217727661133
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.835942268371582
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1776151657104492
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 2


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 2 loss: 5.295677515925194, loss_t2i: 6.332796359548763, loss_mmu: 1.1472018342845294
  Token-Norm: 21.274869918823242
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.899560928344727
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1911085844039917
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 3


100%|██████████| 49/49 [00:18<00:00,  2.58it/s]


Epoch 3 loss: 5.165294744530502, loss_t2i: 6.198772659107131, loss_mmu: 1.0313825874912494
  Token-Norm: 23.274295806884766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.896822929382324
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1979981660842896
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 4


100%|██████████| 49/49 [00:18<00:00,  2.58it/s]


Epoch 4 loss: 5.064726595975915, loss_t2i: 6.091950496848749, loss_mmu: 0.9558303161543242
  Token-Norm: 25.646060943603516
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.889705657958984
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2019548416137695
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 5


100%|██████████| 49/49 [00:18<00:00,  2.58it/s]


Epoch 5 loss: 5.013992090614474, loss_t2i: 6.042409862790789, loss_mmu: 0.9003206351581885
  Token-Norm: 26.977981567382812
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.885478019714355
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.205296277999878
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 6


100%|██████████| 49/49 [00:18<00:00,  2.58it/s]


Epoch 6 loss: 5.0297971015073815, loss_t2i: 6.072145184692071, loss_mmu: 0.8604043168681008
  Token-Norm: 28.584562301635742
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.880767822265625
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2078670263290405
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 7


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 7 loss: 5.202958934161128, loss_t2i: 6.293621014575569, loss_mmu: 0.8403100663301896
  Token-Norm: 30.82159423828125
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.890098571777344
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2112854719161987
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 8


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 8 loss: 5.020822062784312, loss_t2i: 6.07710870431394, loss_mmu: 0.7956751090835552
  Token-Norm: 31.94410514831543
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.882380485534668
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.212541937828064
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 9


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 9 loss: 5.044492040361677, loss_t2i: 6.116896755841314, loss_mmu: 0.7548727479820349
  Token-Norm: 33.53121566772461
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.878503799438477
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2139770984649658
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 10


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 10 loss: 4.975795891820168, loss_t2i: 6.0345093133498215, loss_mmu: 0.7409418045866246
  Token-Norm: 35.438446044921875
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.866326332092285
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2141520977020264
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 11


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 11 loss: 4.9183997378057365, loss_t2i: 5.958429545772319, loss_mmu: 0.7582805339170962
  Token-Norm: 36.7804069519043
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.856635093688965
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.214709758758545
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 12


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 12 loss: 4.667988524145009, loss_t2i: 5.660765302424529, loss_mmu: 0.696881002765529
  Token-Norm: 37.442665100097656
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.835716247558594
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2140326499938965
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 13


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 13 loss: 4.8703071389879495, loss_t2i: 5.924354828133875, loss_mmu: 0.654116051309571
  Token-Norm: 38.152374267578125
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.824703216552734
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.213559865951538
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 14


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 14 loss: 5.265481374701675, loss_t2i: 6.427951209399165, loss_mmu: 0.6156015380532766
  Token-Norm: 39.1376953125
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.82219409942627
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2142239809036255
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 15


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 15 loss: 5.003269404781108, loss_t2i: 6.1031877118714, loss_mmu: 0.6035960464909369
  Token-Norm: 40.6532096862793
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.80922794342041
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.213906168937683
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 16


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 16 loss: 4.86877711938352, loss_t2i: 5.931649222665904, loss_mmu: 0.6172883867913362
  Token-Norm: 42.53426742553711
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.801727294921875
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2140268087387085
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 17


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 17 loss: 4.817201429483842, loss_t2i: 5.871483564376831, loss_mmu: 0.6000725870609892
  Token-Norm: 43.78966522216797
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.785484313964844
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2131733894348145
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 18


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 18 loss: 5.133930549329641, loss_t2i: 6.2786456224869704, loss_mmu: 0.55506963942352
  Token-Norm: 44.71426010131836
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.773560523986816
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2125784158706665
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 19


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 19 loss: 4.696778007916042, loss_t2i: 5.742798226220267, loss_mmu: 0.5126967684232763
  Token-Norm: 45.42453384399414
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.759502410888672
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2116600275039673
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 20


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 20 loss: 4.75642035931957, loss_t2i: 5.817049284370578, loss_mmu: 0.5139044109845952
  Token-Norm: 46.323463439941406
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.733633041381836
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2094109058380127
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 21


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 21 loss: 4.896249853834814, loss_t2i: 5.999451452372026, loss_mmu: 0.48344308373575307
  Token-Norm: 47.04182815551758
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.713618278503418
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2076553106307983
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 22


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 22 loss: 4.974328070270772, loss_t2i: 6.103254833999945, loss_mmu: 0.45862059660103854
  Token-Norm: 48.0160026550293
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.69416618347168
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.205739974975586
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 23


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 23 loss: 5.0723150603625236, loss_t2i: 6.224542121497953, loss_mmu: 0.46340640971665176
  Token-Norm: 48.98190689086914
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.67795467376709
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2039215564727783
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 24


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 24 loss: 4.986224836232711, loss_t2i: 6.122357407394721, loss_mmu: 0.44169392178253253
  Token-Norm: 49.811790466308594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.663269996643066
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.20263671875
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 25


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 25 loss: 4.918222563607352, loss_t2i: 6.042555142422112, loss_mmu: 0.4208917227402634
  Token-Norm: 50.5679817199707
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.642878532409668
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.2006381750106812
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 26


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 26 loss: 4.819636578462561, loss_t2i: 5.9200418676648825, loss_mmu: 0.4180146835896434
  Token-Norm: 51.622047424316406
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.619129180908203
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.198496699333191
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 27


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 27 loss: 4.655815095317607, loss_t2i: 5.71860392239629, loss_mmu: 0.4046594150630491
  Token-Norm: 52.27448654174805
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.591724395751953
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.196151852607727
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 28


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 28 loss: 4.886978908460968, loss_t2i: 6.006993853316015, loss_mmu: 0.406918595126849
  Token-Norm: 53.46338653564453
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.5697660446167
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1938990354537964
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 29


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 29 loss: 4.900942301263615, loss_t2i: 6.024939220778796, loss_mmu: 0.4049541430015649
  Token-Norm: 54.58707046508789
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.543538093566895
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1911532878875732
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 30


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 30 loss: 4.952775604870855, loss_t2i: 6.094614243020817, loss_mmu: 0.3854208017642401
  Token-Norm: 55.48569869995117
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.524956703186035
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.189111351966858
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 31


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 31 loss: 4.724914054481351, loss_t2i: 5.813105485877212, loss_mmu: 0.3721478220303448
  Token-Norm: 56.08204650878906
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.507229804992676
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1875278949737549
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 32


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 32 loss: 4.823928375633395, loss_t2i: 5.939631700515747, loss_mmu: 0.36111477815679144
  Token-Norm: 57.03557205200195
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.490757942199707
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.186226725578308
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 33


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 33 loss: 4.837252526867147, loss_t2i: 5.955988611493792, loss_mmu: 0.36230804824403356
  Token-Norm: 58.13853073120117
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.464902877807617
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1836320161819458
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 34


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 34 loss: 4.946010073836969, loss_t2i: 6.095531706907312, loss_mmu: 0.34792304685225295
  Token-Norm: 59.127662658691406
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.456421852111816
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1830030679702759
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 35


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 35 loss: 4.676813393223043, loss_t2i: 5.761096365597783, loss_mmu: 0.3396810413889435
  Token-Norm: 59.80320358276367
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.429275512695312
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1804980039596558
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 36


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 36 loss: 4.81628308490831, loss_t2i: 5.9349779498820405, loss_mmu: 0.341503136601223
  Token-Norm: 60.898040771484375
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.403074264526367
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1778982877731323
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 37


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 37 loss: 4.8213330093695195, loss_t2i: 5.946488380432129, loss_mmu: 0.3207110745680271
  Token-Norm: 61.62082290649414
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.378660202026367
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1754966974258423
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 38


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 38 loss: 4.864654472896031, loss_t2i: 5.999318263968643, loss_mmu: 0.32599864645423937
  Token-Norm: 62.3652458190918
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.35384750366211
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1729170083999634
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 39


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 39 loss: 4.726211769240243, loss_t2i: 5.83201366541337, loss_mmu: 0.30300383292594735
  Token-Norm: 62.997093200683594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.32106876373291
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1697155237197876
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 40


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 40 loss: 4.805381760305288, loss_t2i: 5.9305563070336165, loss_mmu: 0.30468313422586235
  Token-Norm: 63.983421325683594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.292482376098633
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1667113304138184
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 41


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 41 loss: 5.034850903919765, loss_t2i: 6.21753158861277, loss_mmu: 0.3041279795674645
  Token-Norm: 64.73188018798828
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.270048141479492
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.164336919784546
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 42


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 42 loss: 4.746253261760789, loss_t2i: 5.85784405104968, loss_mmu: 0.2998897679036065
  Token-Norm: 65.65949249267578
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.247820854187012
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1623096466064453
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 43


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 43 loss: 4.902717196211523, loss_t2i: 6.054760178741144, loss_mmu: 0.2945450082497329
  Token-Norm: 66.44782257080078
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.224882125854492
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1599465608596802
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 44


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 44 loss: 4.6737165597020365, loss_t2i: 5.766732580807744, loss_mmu: 0.301652020717762
  Token-Norm: 67.7010498046875
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.198692321777344
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1573801040649414
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 45


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 45 loss: 4.641802413122995, loss_t2i: 5.7280228673195355, loss_mmu: 0.2969203947530109
  Token-Norm: 68.78501892089844
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.168322563171387
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1545039415359497
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 46


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 46 loss: 4.981347210553228, loss_t2i: 6.152627482706187, loss_mmu: 0.29622574747368996
  Token-Norm: 70.30530548095703
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.145332336425781
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1520687341690063
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 47


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 47 loss: 4.916989793582839, loss_t2i: 6.075122142324642, loss_mmu: 0.28446016377028155
  Token-Norm: 71.47810363769531
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.124200820922852
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1497411727905273
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 48


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 48 loss: 4.298855367971926, loss_t2i: 5.301739578344384, loss_mmu: 0.2873181277619941
  Token-Norm: 72.27383422851562
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.089882850646973
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1465892791748047
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 49


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 49 loss: 4.774450905468999, loss_t2i: 5.899993327199196, loss_mmu: 0.27228083577463214
  Token-Norm: 73.11774444580078
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.067911148071289
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.144546627998352
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 50


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 50 loss: 4.733472035855663, loss_t2i: 5.849007416744621, loss_mmu: 0.2713301164972387
  Token-Norm: 74.18463134765625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.050017356872559
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1427398920059204
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 51


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 51 loss: 4.52496958995352, loss_t2i: 5.583332300186157, loss_mmu: 0.29151832863536414
  Token-Norm: 75.4850082397461
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 12.023186683654785
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1400620937347412
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 52


100%|██████████| 49/49 [00:19<00:00,  2.56it/s]


Epoch 52 loss: 4.76845577784947, loss_t2i: 5.890713818219243, loss_mmu: 0.27942332709018064
  Token-Norm: 76.79292297363281
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.993514060974121
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1369779109954834
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 53


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 53 loss: 4.626686091325721, loss_t2i: 5.715763714848732, loss_mmu: 0.27037509504173485
  Token-Norm: 77.89508056640625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.968205451965332
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1345468759536743
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 54


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 54 loss: 4.71954235252069, loss_t2i: 5.833400687392877, loss_mmu: 0.2641085470570441
  Token-Norm: 78.5687026977539
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.957320213317871
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1339139938354492
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 55


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 55 loss: 4.762157036333668, loss_t2i: 5.8783594004961905, loss_mmu: 0.2973473225054996
  Token-Norm: 80.10250854492188
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.943018913269043
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.133036494255066
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 56


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 56 loss: 4.855512619018555, loss_t2i: 5.998793139749644, loss_mmu: 0.2823900664035155
  Token-Norm: 81.21875
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.9271240234375
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1317847967147827
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 57


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 57 loss: 4.73468478601806, loss_t2i: 5.851159460690557, loss_mmu: 0.26878582296550885
  Token-Norm: 82.22695922851562
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.90692138671875
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1298468112945557
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 58


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 58 loss: 4.51431861945561, loss_t2i: 5.577749692663854, loss_mmu: 0.26059371724306624
  Token-Norm: 82.99173736572266
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.883216857910156
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1277236938476562
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 59


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 59 loss: 4.458051097636321, loss_t2i: 5.5085514516246565, loss_mmu: 0.2560492498914198
  Token-Norm: 83.76292419433594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.857090950012207
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1252646446228027
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 60


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 60 loss: 4.696412086486816, loss_t2i: 5.804881056960748, loss_mmu: 0.2625358218935375
  Token-Norm: 84.71622467041016
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.830610275268555
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1227010488510132
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 61


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 61 loss: 4.634181523809628, loss_t2i: 5.726162657445791, loss_mmu: 0.26625680052960404
  Token-Norm: 85.6576156616211
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.806462287902832
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1203635931015015
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 62


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 62 loss: 4.626322157528936, loss_t2i: 5.713282035321606, loss_mmu: 0.2784822178928524
  Token-Norm: 86.69084930419922
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.788455963134766
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1188842058181763
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 63


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 63 loss: 4.495390468714189, loss_t2i: 5.553118275136364, loss_mmu: 0.2644787764328779
  Token-Norm: 87.81450653076172
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.763651847839355
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.116567850112915
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 64


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 64 loss: 4.397152251126815, loss_t2i: 5.43378717072156, loss_mmu: 0.2506121025439732
  Token-Norm: 88.4156494140625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.742775917053223
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1149344444274902
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 65


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 65 loss: 4.419666382731224, loss_t2i: 5.464218937620824, loss_mmu: 0.24145583530925974
  Token-Norm: 88.85820007324219
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.721578598022461
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.113385558128357
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 66


100%|██████████| 49/49 [00:19<00:00,  2.58it/s]


Epoch 66 loss: 4.711656750464926, loss_t2i: 5.8276486640073815, loss_mmu: 0.24768873593028712
  Token-Norm: 89.95762634277344
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.701179504394531
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.111526370048523
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 67


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 67 loss: 4.5652457864917055, loss_t2i: 5.640650053413546, loss_mmu: 0.2636284950968562
  Token-Norm: 91.06458282470703
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.688762664794922
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1105891466140747
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 68


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 68 loss: 4.405778128273633, loss_t2i: 5.4469601864717445, loss_mmu: 0.24104951706011685
  Token-Norm: 91.63909912109375
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.67497444152832
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1097865104675293
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 69


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 69 loss: 4.476972633478593, loss_t2i: 5.534047516024843, loss_mmu: 0.248672696666754
  Token-Norm: 92.73479461669922
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.659244537353516
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1087665557861328
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 70


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 70 loss: 4.513823572470217, loss_t2i: 5.580911130321269, loss_mmu: 0.24547281764371662
  Token-Norm: 93.39141845703125
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.637185096740723
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1070187091827393
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 71


100%|██████████| 49/49 [00:19<00:00,  2.56it/s]


Epoch 71 loss: 4.635739134282482, loss_t2i: 5.735668209134316, loss_mmu: 0.23602254074827142
  Token-Norm: 94.19572448730469
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.623221397399902
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1064621210098267
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 72


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 72 loss: 4.590314164453623, loss_t2i: 5.670053503951248, loss_mmu: 0.2713564358157467
  Token-Norm: 95.60423278808594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.594978332519531
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1039098501205444
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 73


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 73 loss: 4.534712565188506, loss_t2i: 5.603719974050716, loss_mmu: 0.2586825720927849
  Token-Norm: 96.51533508300781
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.578919410705566
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1030545234680176
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 74


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 74 loss: 4.308073223853598, loss_t2i: 5.322956294429545, loss_mmu: 0.2485406683491809
  Token-Norm: 97.31774139404297
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.550769805908203
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1003071069717407
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 75


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 75 loss: 4.479966119844086, loss_t2i: 5.541365414249654, loss_mmu: 0.23436842843586084
  Token-Norm: 97.69742584228516
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.521967887878418
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0977599620819092
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 76


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 76 loss: 4.339845457855536, loss_t2i: 5.368432636163672, loss_mmu: 0.22549620747794302
  Token-Norm: 97.89163970947266
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.501585006713867
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0961933135986328
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 77


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 77 loss: 4.494328048764443, loss_t2i: 5.553012400257344, loss_mmu: 0.25959030398148664
  Token-Norm: 99.16558074951172
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.476913452148438
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0939109325408936
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 78


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 78 loss: 4.889912678270924, loss_t2i: 6.049317126371423, loss_mmu: 0.25229462255172586
  Token-Norm: 99.69847869873047
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.457950592041016
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0920782089233398
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 79


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 79 loss: 4.141848612804802, loss_t2i: 5.1168360150590235, loss_mmu: 0.2418986206532133
  Token-Norm: 100.2342758178711
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.445889472961426
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0918203592300415
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 80


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 80 loss: 4.533057653174108, loss_t2i: 5.609157056224589, loss_mmu: 0.2286597069124786
  Token-Norm: 100.64663696289062
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.431002616882324
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0909552574157715
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 81


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 81 loss: 4.28366714107747, loss_t2i: 5.299872707347481, loss_mmu: 0.2188446182301458
  Token-Norm: 100.92536926269531
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.409988403320312
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0894914865493774
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 82


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 82 loss: 4.6171678912882905, loss_t2i: 5.71110571647177, loss_mmu: 0.24141625923161603
  Token-Norm: 102.08844757080078
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.39212703704834
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0881602764129639
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 83


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 83 loss: 4.402704316742566, loss_t2i: 5.447232531041515, loss_mmu: 0.2245911546431634
  Token-Norm: 102.50737762451172
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.370017051696777
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0867185592651367
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 84


100%|██████████| 49/49 [00:19<00:00,  2.56it/s]


Epoch 84 loss: 4.396136041806669, loss_t2i: 5.440334614442319, loss_mmu: 0.21934136242738791
  Token-Norm: 102.81697082519531
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.3525972366333
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0857369899749756
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 85


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 85 loss: 4.569373284067426, loss_t2i: 5.657959668003783, loss_mmu: 0.21502744817003913
  Token-Norm: 103.1275634765625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.331624031066895
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0844526290893555
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 86


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 86 loss: 4.439378218991416, loss_t2i: 5.49574215193184, loss_mmu: 0.21392211781776682
  Token-Norm: 103.58335876464844
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.304508209228516
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0820649862289429
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 87


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 87 loss: 4.713911110041093, loss_t2i: 5.832911812529272, loss_mmu: 0.23790804528612264
  Token-Norm: 104.81611633300781
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.275556564331055
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0794199705123901
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 88


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 88 loss: 4.28752678511094, loss_t2i: 5.3022798020012525, loss_mmu: 0.22851426738827507
  Token-Norm: 105.21306610107422
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.258687019348145
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0787286758422852
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 89


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 89 loss: 4.17160465279404, loss_t2i: 5.161347739550532, loss_mmu: 0.212631795289261
  Token-Norm: 105.3882827758789
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.224209785461426
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0755670070648193
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 90


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 90 loss: 4.286380039185894, loss_t2i: 5.304309626014865, loss_mmu: 0.21466132655397666
  Token-Norm: 105.9671859741211
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.202486038208008
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0745114088058472
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 91


100%|██████████| 49/49 [00:19<00:00,  2.56it/s]


Epoch 91 loss: 4.333718358253946, loss_t2i: 5.365644621605775, loss_mmu: 0.20601282915936744
  Token-Norm: 106.00858306884766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.18237018585205
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0734630823135376
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 92


100%|██████████| 49/49 [00:19<00:00,  2.56it/s]


Epoch 92 loss: 4.287458961107293, loss_t2i: 5.309067616657335, loss_mmu: 0.20102418622724255
  Token-Norm: 106.30217742919922
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.157427787780762
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0715179443359375
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 93


100%|██████████| 49/49 [00:19<00:00,  2.56it/s]


Epoch 93 loss: 4.333679133531999, loss_t2i: 5.365951861654009, loss_mmu: 0.204587723793728
  Token-Norm: 106.69204711914062
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.130715370178223
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0692965984344482
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 94


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 94 loss: 4.466922716826809, loss_t2i: 5.532744459351715, loss_mmu: 0.20363533198453335
  Token-Norm: 106.93561553955078
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.114020347595215
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0687267780303955
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 95


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 95 loss: 4.379576439760169, loss_t2i: 5.420200068123487, loss_mmu: 0.21708169132851216
  Token-Norm: 107.562744140625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.091907501220703
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0675996541976929
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 96


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 96 loss: 4.135096670413504, loss_t2i: 5.1150897352062925, loss_mmu: 0.21512399632863852
  Token-Norm: 108.05756378173828
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.060322761535645
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0648179054260254
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 97


100%|██████████| 49/49 [00:19<00:00,  2.55it/s]


Epoch 97 loss: 4.242300481212382, loss_t2i: 5.2511633391283, loss_mmu: 0.20684889190811284
  Token-Norm: 108.57537078857422
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.026708602905273
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.061898946762085
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 98


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 98 loss: 4.240787192266815, loss_t2i: 5.249189677287121, loss_mmu: 0.2071767123796198
  Token-Norm: 108.91477966308594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 11.000885009765625
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.060003399848938
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 99


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


Epoch 99 loss: 4.180792133418882, loss_t2i: 5.177845445214485, loss_mmu: 0.19257837499738956
  Token-Norm: 108.94597625732422
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 10.974903106689453
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.058174967765808
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 100


100%|██████████| 49/49 [00:19<00:00,  2.57it/s]

Epoch 100 loss: 4.056426503220383, loss_t2i: 5.0219403772938005, loss_mmu: 0.1943706366404587
  Token-Norm: 109.27006530761719
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 10.950366020202637
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.056618571281433
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438





tensor([50301, 50296, 50424, 50872, 51385, 51136, 50904, 50617, 52184, 51160,
        52201, 55256, 51161, 55191, 54775, 54521, 55265, 54745, 50940, 51326,
        50361, 50425, 54282, 50616, 50693, 54285, 50696, 50644, 50793, 50793,
        55400, 51160, 54680, 55257, 53000, 54266, 53434, 52915, 54007, 53498,
        50425, 51214, 51208, 51716, 50440, 50661, 50648, 50661, 50665, 54745,
        52914, 57539, 53191, 53451, 52930, 53962, 52942, 52918, 52986, 54278,
        51144, 50438, 51188, 50376, 52130, 50646, 50868, 52674, 50831, 56715,
        53765, 52738, 50498, 50371, 53938, 50563, 50946, 50930, 51938, 51187,
        50898, 50851, 51380, 51915, 53434, 52523, 52411, 53939, 51401, 53683,
        50329, 52382, 52915, 51363, 50846, 51350, 52435, 50342, 58049, 55858,
        51610, 50498, 52890, 55492, 55162, 57550, 57645, 55678, 57633, 57612,
        57732, 55793, 57868, 57788, 54390, 55508, 51860, 54089, 51353, 56126,
        57689, 56021, 51416, 54329, 55499, 52473, 57702, 56985, 