In [1]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device before any model is loaded.
print(torch.cuda.is_available())

True


In [1]:
from pdata import PersonalizedMMUDataset, PersonalizedT2IDataset, get_personalized_mmu_dataloader, get_personalized_t2i_dataloader
from lightning.pytorch.utilities import CombinedLoader

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from PIL import Image

from models import Showo, MAGVITv2, get_mask_chedule
from training.prompting_utils import UniversalPrompting, create_attention_mask_predict_next, create_attention_mask_for_mmu
from training.utils import get_config, flatten_omega_conf, mask_or_random_replace_tokens, AverageMeter
from transformers import AutoTokenizer
from models.clip_encoder import CLIPVisionTower
from transformers import CLIPImageProcessor
from llava.llava import conversation as conversation_lib

# Make the "phi1.5" conversation template the library-wide default.
conversation_lib.default_conversation = conversation_lib.conv_templates["phi1.5"]

import os
from omegaconf import DictConfig, ListConfig, OmegaConf
# Load the Show-o demo configuration; the path is relative to the notebook's working directory.
config = OmegaConf.load('configs/showo_demo.yaml')
# device setup
# Prefer the GPU index this notebook was developed on, but fall back to the
# first visible GPU (or the CPU) so that later `.to(device)` calls do not fail
# with an invalid device ordinal on hosts exposing fewer devices.
PREFERRED_GPU_INDEX = 7
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif PREFERRED_GPU_INDEX < torch.cuda.device_count():
    device = torch.device(f"cuda:{PREFERRED_GPU_INDEX}")
else:
    device = torch.device("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm


[2025-02-23 16:30:35,825] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/hpyky/miniconda3/envs/showo/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [3]:
# Show-o tokenizer (left padded) wrapped by UniversalPrompting, which is given
# the task / image special tokens listed below.
# (original note: llm model is 'microsoft/phi-1_5')
tokenizer = AutoTokenizer.from_pretrained(
    config.model.showo.llm_model_path,
    padding_side="left",
)
uni_prompting = UniversalPrompting(
    tokenizer,
    max_text_len=config.dataset.preprocessing.max_seq_length,
    special_tokens=(
        "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
        "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>",
    ),
    ignore_id=-100,
    cond_dropout_prob=config.training.cond_dropout_prob,
)

# MAGVIT-v2 image tokenizer, used for t2i.
vq_model = MAGVITv2.from_pretrained(config.model.vq_model.vq_model_name)
vq_model = vq_model.to(device)
# vq_model.requires_grad_(False)
# vq_model.eval()

# CLIP-ViT vision tower (MMU only) is left disabled in this notebook.
# vision_tower_name =config.clip_path
# vision_tower = CLIPVisionTower(vision_tower_name).to(device)
# clip_image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)

# Show-o model itself.
model = Showo.from_pretrained(config.model.showo.pretrained_model_path)
model = model.to(device)
# model.eval()

# Sampling parameters.
# temperature: 1.0 = no change, < 1.0 = less random, > 1.0 = more random predictions.
temperature = 1
# top_k: keep only the top_k most likely tokens, clamp the others to 0 probability.
top_k = 1
# LLAVA_SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant. " \
#                 "The assistant gives helpful, detailed, and polite answers to the user's questions."
# LLAVA_SYSTEM_PROMPT_LEN = 28

Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192


The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.


attention implementation:  sdpa


  if self.w_clip_vit:


In [4]:
# Quick inspection of the embedding table and LM head sizes before the vocabulary is extended.
# NOTE: only the value of the last expression is displayed by the notebook; the
# preceding bare expressions are evaluated but their results are discarded.
# print(model.showo.get_input_embeddings())
model.showo.get_input_embeddings().num_embeddings
# Number of embedding rows beyond the tokenizer's vocabulary.
model.showo.get_input_embeddings().num_embeddings - len(tokenizer)
model.showo.get_input_embeddings().weight.data.shape
model.showo.lm_head.weight.shape
# This is the expression whose value is shown as the cell output.
model.showo.lm_head.bias.shape

torch.Size([58498])

In [5]:
# Dataset root passed to the personalized dataloaders.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_root = "/home/hpyky/full_mcdata"
# Concept identifier: used to build the "<concept>" token, passed to the
# dataloaders, and used as the checkpoint sub-directory name.
concept = "dunpai"

In [6]:
nums_new_token_i = 16

#################################
# One concept token plus `nums_new_token_i` auxiliary tokens.
new_tokens = [f"<{concept}>"] + [f"<token_{i}>" for i in range(nums_new_token_i)]
num_new_tokens = len(new_tokens)  # 17

# Guard against re-running this cell on an already-extended tokenizer/model:
# a second run would shift the image-token rows once more and corrupt them.
existing_vocab = tokenizer.get_vocab()
already_present = [tok for tok in new_tokens if tok in existing_vocab]
if already_present:
    raise RuntimeError(
        f"Tokens {already_present} are already in the tokenizer; "
        "reload the tokenizer and model before running this cell again."
    )

# Known original sizes.
# Number of text tokens (IDs 0-50304).
original_text_vocab_size = len(tokenizer)
# Number of image tokens (original IDs 50305-58497).
original_image_vocab_size = model.showo.get_input_embeddings().num_embeddings - len(tokenizer)

original_total_vocab = original_text_vocab_size + original_image_vocab_size  # 58498

# New sizes.
new_text_vocab_size = original_text_vocab_size + num_new_tokens  # 50305 + 17 = 50322
new_total_vocab = original_total_vocab + num_new_tokens          # 58498 + 17 = 58515

# ------------------------------
# Step 1: extend the tokenizer vocabulary
# ------------------------------

# The new tokens are expected to land on IDs 50305-50321, i.e. directly after the text vocab.
num_added_tokens = tokenizer.add_tokens(new_tokens)
new_token_ids = tokenizer.convert_tokens_to_ids(new_tokens)
print("新 token ID:", new_token_ids)  # expected: 50305-50321

# The row shift below is only valid if every token was added and the new IDs
# are contiguous right after the original text vocabulary.
assert num_added_tokens == num_new_tokens, "not all new tokens were added to the tokenizer"
assert new_token_ids == list(range(original_text_vocab_size, new_text_vocab_size)), \
    "new token IDs are not contiguous after the original text vocabulary"

# ------------------------------
# Step 2: adjust the model weights
# ------------------------------
with torch.no_grad():
    # Snapshot the image-token rows BEFORE resizing, so the values being moved
    # do not depend on what the resize leaves in the enlarged matrices.
    embeddings = model.showo.get_input_embeddings().weight.data
    original_image_weights = embeddings[original_text_vocab_size:original_total_vocab].clone()
    old_lm_head = model.showo.lm_head
    original_image_head_weight = old_lm_head.weight.data[original_text_vocab_size:original_total_vocab].clone()
    original_image_head_bias = (
        old_lm_head.bias.data[original_text_vocab_size:original_total_vocab].clone()
        if old_lm_head.bias is not None
        else None
    )

    # Grow the embedding table (58498 -> 58515).
    model.showo.resize_token_embeddings(new_total_vocab)

    # Move the original image-token rows back by `num_new_tokens` positions.
    model.showo.get_input_embeddings().weight.data[new_text_vocab_size:new_total_vocab] = original_image_weights

    # NOTE: the rows of the new tokens (original_text_vocab_size:new_text_vocab_size)
    # are not re-initialised here; they keep whatever the resize left in them
    # (presumably the former first image-token rows — verify against the
    # transformers version in use). The alternative of copying the last
    # `num_new_tokens` text-token rows is intentionally left disabled:
    # new_text_weights = embeddings[original_text_vocab_size - num_new_tokens : original_text_vocab_size].clone()
    # model.showo.get_input_embeddings().weight.data[original_text_vocab_size : new_text_vocab_size] = new_text_weights

    # The LM head must have been resized together with the embeddings.
    lm_head = model.showo.lm_head
    if lm_head.weight.data.shape[0] != new_total_vocab:
        raise ValueError("lm_head weights do not match the input embeddings!")

    # Apply the same shift to the LM head rows (and bias entries, when a bias exists).
    lm_head.weight.data[new_text_vocab_size:new_total_vocab] = original_image_head_weight
    if lm_head.bias is not None:
        lm_head.bias.data[new_text_vocab_size:new_total_vocab] = original_image_head_bias

# Boolean mask over the vocabulary: True = row must stay frozen, False = new token row.
index_no_updates = torch.ones((new_total_vocab,), dtype=torch.bool)
index_no_updates[new_token_ids] = False
# ------------------------------
# Verification
# ------------------------------
# IDs of the newly added text tokens.
print("新增文本 token ID:", [tokenizer.convert_tokens_to_ids(t) for t in new_tokens])  # expected: 50305-50321

# The tokenizer holds no image tokens, so looking up ID `original_text_vocab_size`
# returns the first newly added token (the concept token) and its ID, i.e. 50305.
sample_image_token = tokenizer.convert_ids_to_tokens(original_text_vocab_size)
print(f"Concept Token '{sample_image_token}' 的新 ID:", tokenizer.convert_tokens_to_ids(sample_image_token))  # expected: 50305

# The actual image-token shift is verified on the weights themselves.
assert torch.equal(
    model.showo.get_input_embeddings().weight.data[new_text_vocab_size:new_total_vocab],
    original_image_weights,
), "image-token embeddings were not shifted correctly"
assert torch.equal(
    model.showo.lm_head.weight.data[new_text_vocab_size:new_total_vocab],
    original_image_head_weight,
), "image-token lm_head rows were not shifted correctly"

# Embedding table shape.
print("嵌入层大小:", model.showo.get_input_embeddings().weight.shape)  # expected: torch.Size([58515, 2048])

# index_no_updates: the False positions must be exactly the new token IDs;
# the number of True entries must equal the original vocabulary size.
print("index_no_updates 中 False 的位置:", torch.nonzero(~index_no_updates).squeeze())  # expected: 50305-50321
print("index_no_updates 中 True 的数量:", torch.sum(index_no_updates))  # expected: 58498

# Reference copies used to restore the frozen rows after every optimizer step.
with torch.no_grad():
    orig_embeds = model.showo.get_input_embeddings().weight.data.clone()
    orig_lm_head_weight = model.showo.lm_head.weight.data.clone()
    orig_lm_head_bias = (
        model.showo.lm_head.bias.data.clone()
        if model.showo.lm_head.bias is not None
        else None
    )

新 token ID: [50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321]
新增文本 token ID: [50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321]
Concept Token '<dunpai>' 的新 ID: 50305
嵌入层大小: torch.Size([58515, 2048])
index_no_updates 中 False 的位置: tensor([50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314,
        50315, 50316, 50317, 50318, 50319, 50320, 50321])
index_no_updates 中 True 的数量: tensor(58498)


In [7]:
# Copies of the embedding / LM-head rows that belong to the new tokens, taken
# at the time this cell runs (indexing with a list of IDs returns a copy).
# NOTE(review): "nead"/"wight" are misspellings of "head"/"weight"; the names
# are kept because another cell references them.
concept_embeds = model.showo.get_input_embeddings().weight.data[new_token_ids]
concept_lm_nead_wight = model.showo.lm_head.weight.data[new_token_ids]
concept_lm_nead_bias = model.showo.lm_head.bias.data[new_token_ids]

In [8]:
# Display the shapes of the three per-token snapshots (one row / entry per new token).
concept_embeds.shape, concept_lm_nead_wight.shape, concept_lm_nead_bias.shape

(torch.Size([17, 2048]), torch.Size([17, 2048]), torch.Size([17]))

In [9]:
# Display the special-token -> ID mapping held by UniversalPrompting.
uni_prompting.sptids_dict

{'<|soi|>': tensor([50296]),
 '<|eoi|>': tensor([50297]),
 '<|sov|>': tensor([50298]),
 '<|eov|>': tensor([50299]),
 '<|t2i|>': tensor([50300]),
 '<|mmu|>': tensor([50301]),
 '<|t2v|>': tensor([50302]),
 '<|v2v|>': tensor([50303]),
 '<|lvg|>': tensor([50304]),
 '<|sot|>': tensor([50256]),
 '<|eot|>': tensor([50256]),
 '<|pad|>': tensor([50295])}

In [10]:
# Freeze the VQ image tokenizer. `requires_grad_` is a method and must be
# CALLED; assigning `False` to it only shadows the method on the instance and
# leaves every VQ parameter trainable.
vq_model.requires_grad_(False)
vq_model.eval()
model.train()

# Only the token embeddings and the LM head stay trainable; everything else is frozen.
for names, p in model.named_parameters():
    p.requires_grad = ("embed_tokens" in names) or ("lm_head" in names)

# The optimizer sees the full embedding / head tensors; rows that must not
# change are expected to be restored from saved copies after each step.
trainable_params = [model.showo.get_input_embeddings().weight, model.showo.lm_head.weight, model.showo.lm_head.bias]
optimizer = torch.optim.AdamW(
            trainable_params, # for optimize the embeddings and the head
            lr=1e-2,
            betas=(0.9, 0.999),
            weight_decay=1e-2,
            eps=1e-08,
        )

# List what will actually be updated (expected: embed_tokens and lm_head only).
for names, p in model.named_parameters():
    if p.requires_grad:
        print(f"{names} requires_grad") # embed_tokens and lm_head are updated

showo.model.embed_tokens.weight requires_grad
showo.lm_head.weight requires_grad
showo.lm_head.bias requires_grad


In [11]:
# The mask token is the last row of the embedding table; after the table has
# been resized, both the config and the model attribute must point at the new last ID.
last_token_id = model.showo.get_input_embeddings().num_embeddings - 1
model.config.mask_token_id = last_token_id
model.mask_token_id = last_token_id

In [12]:
# Masking schedule named in the training config ("cosine" when unspecified).
mask_schedule_name = config.training.get("mask_schedule", "cosine")
mask_schedule = get_mask_chedule(mask_schedule_name)

# ID used to mask image tokens, and the dtype attention masks are cast to.
mask_id = model.mask_token_id
mask_dtype = model.showo.model.embed_tokens.weight.dtype

In [13]:

# t2i_dataset = PersonalizedT2IDataset(data_root, concept)
# t2i_dataloader = DataLoader(t2i_dataset, batch_size=5, shuffle=True, num_workers=10, pin_memory=True)

# Settings shared by both personalized dataloaders.
shared_loader_kwargs = {"num_workers": 0, "max_length": 128}

# Multimodal-understanding (mmu) and text-to-image (t2i) loaders for the concept.
mmu_dataloader = get_personalized_mmu_dataloader(
    data_root, concept, tokenizer, batch_size=5, **shared_loader_kwargs
)
t2i_dataloader = get_personalized_t2i_dataloader(
    data_root, concept, tokenizer, batch_size=2, **shared_loader_kwargs
)

# Each combined batch is a dict with one entry per flow.
iterables = dict(mmu_flow=mmu_dataloader, t2i_flow=t2i_dataloader)

# "max_size_cycle": iterate until the longest loader is exhausted, cycling the shorter one.
combined_dataloader = CombinedLoader(iterables, mode="max_size_cycle")

# Before adding the new tokens, the vocab size is 58498
# vocab size = 58498 = 50295  llm vocabsize
#                    + 10     <|soi|> <|eoi|> <|sov|> <|eov|> <|t2i|> <|mmu|> <|t2v|> <|v2v|> <|lvg|> <|pad|>
#                    + 8192   vq model codebook size
#                    + 1      mask token (token id == 58497)
from typing import Union


# Bare expression kept for reference only: it is not the last statement of the
# cell, so the notebook does not display its value (see the commented dict below).
uni_prompting.sptids_dict
# {'<|soi|>': tensor([50296]),
#  '<|eoi|>': tensor([50297]),
#  '<|sov|>': tensor([50298]),
#  '<|eov|>': tensor([50299]),
#  '<|t2i|>': tensor([50300]),
#  '<|mmu|>': tensor([50301]),
#  '<|t2v|>': tensor([50302]),
#  '<|v2v|>': tensor([50303]),
#  '<|lvg|>': tensor([50304]),
#  '<|sot|>': tensor([50256]),
#  '<|eot|>': tensor([50256]),
#  '<|pad|>': tensor([50295])}

# uni_prompting.text_tokenizer == tokenizer
def prepare_inputs_and_labels(
        pixel_values_or_image_ids: Union[torch.FloatTensor, torch.LongTensor],
        texts: Union[str, list],
        min_masking_rate: float = 0.0,
        is_train: bool = True,
):
    """Build masked text-to-image training inputs for one batch.

    The images are encoded into discrete VQ codes, the codes are offset by the
    current tokenizer length so they do not collide with text-token IDs, part
    of them is masked / replaced, and the result is combined with the text
    prompts in the 't2i' format of `uni_prompting`.

    Args:
        pixel_values_or_image_ids: Batch passed to `vq_model.get_code`.
        texts: Prompt(s) for the batch; callers in this notebook pass a list
            of strings, one per image.
        min_masking_rate: Accepted for interface compatibility; not used by
            this implementation.
        is_train: Forwarded to `mask_or_random_replace_tokens`.

    Returns:
        A tuple `(input_ids, labels, mask_prob, image_tokens)` where
        `input_ids` / `labels` are the outputs of `uni_prompting`, `mask_prob`
        is returned by `mask_or_random_replace_tokens`, and `image_tokens` are
        the offset, unmasked image token IDs.
    """
    # The VQ tokenizer only produces discrete codes here, so no gradient can
    # flow through it; skip autograd bookkeeping for the encoder forward pass.
    with torch.no_grad():
        image_tokens = vq_model.get_code(pixel_values_or_image_ids)
    # Shift code indices past the text vocabulary (which includes any added tokens).
    image_tokens = image_tokens + len(uni_prompting.text_tokenizer)

    # create MLM mask and labels
    input_ids, labels, loss_weight, mask_prob = mask_or_random_replace_tokens(
        image_tokens,
        mask_id,
        config,
        mask_schedule=mask_schedule,
        is_train=is_train,
    )
    # `loss_weight` and the masks returned by `uni_prompting` are not used by callers.
    input_ids, masks, labels = uni_prompting((texts, input_ids, labels), 't2i')

    return input_ids, labels, mask_prob, image_tokens

Formatting llava instruction data


In [27]:
# Materialize every combined batch once. Each element is a
# (batch, batch_idx, dataloader_idx) triple, and `batch` is a dict keyed by flow name.
# NOTE: because the list is built a single time, any loop over it sees the
# same batches in the same order on every pass.
list_combined_dataloader = list(combined_dataloader)
# one_batch_mmu = list_combined_dataloader[0][0]['mmu_flow']
# First t2i batch, used below for a dry run of the input preparation.
one_batch_t2i = list_combined_dataloader[0][0]['t2i_flow']

# one_batch_mmu = next(iter(mmu_dataloader))

In [30]:
# Display the sample t2i batch (a dict with 'conditions' and 'images').
one_batch_t2i

{'conditions': ['<dunpai> is <token_0><token_1><token_2><token_3><token_4><token_5><token_6><token_7><token_8><token_9><token_10><token_11><token_12><token_13><token_14><token_15>.\nA photo of <dunpai>.',
  '<dunpai> is <token_0><token_1><token_2><token_3><token_4><token_5><token_6><token_7><token_8><token_9><token_10><token_11><token_12><token_13><token_14><token_15>.\nA photo of <dunpai>.'],
 'images': tensor([[[[-0.9922, -0.9922, -0.9922,  ..., -0.4039, -0.3255, -0.3725],
           [-0.9922, -0.9922, -0.9922,  ..., -0.3961, -0.3333, -0.3647],
           [-0.9922, -0.9922, -0.9922,  ..., -0.3725, -0.3725, -0.3647],
           ...,
           [ 0.0118,  0.0039,  0.0196,  ...,  0.0510,  0.0510,  0.0431],
           [-0.0353, -0.0118,  0.0196,  ...,  0.0510,  0.0510,  0.0431],
           [-0.1686, -0.1059, -0.0510,  ...,  0.0510,  0.0510,  0.0431]],
 
          [[-0.9922, -0.9922, -0.9922,  ..., -0.5608, -0.4510, -0.5373],
           [-0.9922, -0.9922, -0.9922,  ..., -0.5529, -0.4510, 

In [28]:
# Dry run of the t2i input preparation on the sample batch.
pixel_values = one_batch_t2i['images'].to(device)
texts = one_batch_t2i['conditions']
# Returns the prompt+image token sequence, its labels, the mask probability
# and the unmasked (offset) image tokens.
input_ids, labels, mask_prob, image_tokens_ori = prepare_inputs_and_labels(pixel_values, texts, is_train=True)

In [29]:
# Display the prepared token IDs (padding, task token, prompt, then image tokens).
input_ids

tensor([[50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
         50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50300,
         50256, 50305,   318,   220, 50306, 50307, 50308, 50309, 50310, 50311,
         50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321,
            13,   198,    32,  4590,   286,   220, 5

In [31]:
# The model must score the enlarged vocabulary.
model.output_size = new_total_vocab
save_path = os.path.join("saves", concept, "t2i_wo_sys")
os.makedirs(save_path, exist_ok=True)

# Training hyper-parameters.
NUM_EPOCHS = 100
SAVE_EVERY_N_EPOCHS = 10
T2I_LOSS_WEIGHT = 0.8
MMU_LOSS_WEIGHT = 0.2
MAX_SEQ_LENGTH = 128

# Special-token IDs as plain ints; they do not change during training, so
# resolve them once instead of on every step.
pad_id = int(uni_prompting.sptids_dict['<|pad|>'])
soi_id = int(uni_prompting.sptids_dict['<|soi|>'])
eoi_id = int(uni_prompting.sptids_dict['<|eoi|>'])
mmu_id = int(uni_prompting.sptids_dict['<|mmu|>'])
ignore_id = int(uni_prompting.ignore_id)


def _id_column(batch_size, token_id):
    """Return a (batch_size, 1) int64 tensor on `device` filled with `token_id`."""
    return torch.full((batch_size, 1), token_id, dtype=torch.long, device=device)


for epoch in range(0, NUM_EPOCHS):
    print(f"Epoch {epoch+1}")
    loss_list = []
    loss_t2i_list = []
    loss_mmu_list = []
    for batch, batch_idx, dataloader_idx in tqdm(list_combined_dataloader):
        batch_size_mmu = batch["mmu_flow"]["images"].shape[0]
        batch_size_t2i = batch["t2i_flow"]["images"].shape[0]

        # ---- t2i format ----
        pixel_values, texts = batch["t2i_flow"]["images"], batch["t2i_flow"]["conditions"]
        pixel_values = pixel_values.to(device)
        input_ids, labels, mask_prob, image_tokens_ori = prepare_inputs_and_labels(pixel_values, texts, is_train=True)
        attention_mask = create_attention_mask_predict_next(input_ids,
                                                            pad_id=pad_id,
                                                            soi_id=soi_id,
                                                            eoi_id=eoi_id,
                                                            rm_pad_in_image=True,
                                                            return_inverse_mask=True)
        attention_mask = attention_mask.to(mask_dtype)

        # ---- mmu format ----
        pixel_values_mmu, input_ids_mmu, labels_mmu = (batch["mmu_flow"]["images"],
                                                      batch["mmu_flow"]["input_ids"],
                                                      batch["mmu_flow"]["labels"])
        pixel_values_mmu = pixel_values_mmu.to(device, non_blocking=True)
        input_ids_mmu = input_ids_mmu.to(device, non_blocking=True)
        # The VQ tokenizer only yields discrete codes; no autograd graph is needed.
        with torch.no_grad():
            image_tokens_mmu = vq_model.get_code(pixel_values_mmu)
        # Shift code indices past the (extended) text vocabulary.
        image_tokens_mmu = image_tokens_mmu + len(uni_prompting.text_tokenizer)

        # Sequence layout: <|mmu|> <|soi|> image tokens <|eoi|> text tokens.
        mmu_rows = input_ids_mmu.shape[0]
        input_ids_mmu = torch.cat([
                    _id_column(mmu_rows, mmu_id),
                    _id_column(mmu_rows, soi_id),
                    image_tokens_mmu,
                    _id_column(mmu_rows, eoi_id),
                    input_ids_mmu,
                ], dim=1).long()

        # Every prefix / image position is ignored by the loss; only the text labels count.
        labels_mmu = torch.cat([
                    _id_column(mmu_rows, ignore_id),
                    _id_column(mmu_rows, ignore_id),
                    torch.full_like(image_tokens_mmu, ignore_id),
                    _id_column(mmu_rows, ignore_id),
                    labels_mmu.to(device)
                ], dim=1).long()

        attention_mask_mmu = create_attention_mask_for_mmu(input_ids_mmu.to(input_ids.device),
                                                           eoi_id=eoi_id)
        attention_mask_mmu = attention_mask_mmu.to(mask_dtype)

        # Stack the t2i samples first, then the mmu samples, along the batch dimension.
        attention_mask = torch.cat([attention_mask, attention_mask_mmu], dim=0)
        input_ids = torch.cat((input_ids, input_ids_mmu.to(input_ids.device)), dim=0)
        labels = torch.cat((labels, labels_mmu.to(input_ids.device)), dim=0)

        optimizer.zero_grad()

        logits, loss_t2i, loss_lm, loss_mmu = model(
                    input_ids=input_ids,
                    input_embeddings=None,
                    attention_mask=attention_mask,
                    labels=labels,
                    label_smoothing=0.0,
                    batch_size_t2i=batch_size_t2i,
                    batch_size_lm=0,
                    batch_size_mmu=batch_size_mmu,
                    max_seq_length=MAX_SEQ_LENGTH,
                )
        loss = T2I_LOSS_WEIGHT * loss_t2i + MMU_LOSS_WEIGHT * loss_mmu
        # loss = loss_t2i
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        loss_t2i_list.append(loss_t2i.item())
        loss_mmu_list.append(loss_mmu.item())

        # Restore every row that must stay frozen, so only the new token rows
        # keep the update applied by the optimizer step.
        with torch.no_grad():
            model.showo.get_input_embeddings().weight.data[index_no_updates] = orig_embeds[index_no_updates]
            model.showo.lm_head.weight.data[index_no_updates] = orig_lm_head_weight[index_no_updates]
            model.showo.lm_head.bias.data[index_no_updates] = orig_lm_head_bias[index_no_updates]

    # Per-epoch summary: mean losses, plus norms of the trained rows and of the
    # frozen rows (the latter should stay constant across epochs).
    print(f"Epoch {epoch+1} loss: {np.mean(loss_list)}, loss_t2i: {np.mean(loss_t2i_list)}, loss_mmu: {np.mean(loss_mmu_list)}")
    print(f"  Token-Norm: {model.showo.get_input_embeddings().weight[new_token_ids].norm().item()}")
    print(f"  index_no_updates-Token-Norm: {model.showo.get_input_embeddings().weight[index_no_updates].norm().item()}")
    print(f"  LM-Head-Weight-Norm: {model.showo.lm_head.weight[new_token_ids].norm().item()}")
    print(f"  index_no_updates-LM-Head-Weight-Norm: {model.showo.lm_head.weight[index_no_updates].norm().item()}")
    print(f"  LM-Head-Bias-Norm: {model.showo.lm_head.bias[new_token_ids].norm().item()}")
    print(f"  index_no_updates-LM-Head-Bias-Norm: {model.showo.lm_head.bias[index_no_updates].norm().item()}")

    # Periodically save only the rows belonging to the new tokens.
    if (epoch+1) % SAVE_EVERY_N_EPOCHS == 0:
        save_path_embed = os.path.join(save_path, f"epoch_{epoch+1}_embed.pt")
        save_path_lm_head_weight = os.path.join(save_path, f"epoch_{epoch+1}_lm_head_weight.pt")
        save_path_lm_head_bias = os.path.join(save_path, f"epoch_{epoch+1}_lm_head_bias.pt")

        torch.save(model.showo.get_input_embeddings().weight.data[new_token_ids], save_path_embed)
        torch.save(model.showo.lm_head.weight.data[new_token_ids], save_path_lm_head_weight)
        torch.save(model.showo.lm_head.bias.data[new_token_ids], save_path_lm_head_bias)
        
        

Epoch 1


100%|██████████| 48/48 [00:18<00:00,  2.53it/s]


Epoch 1 loss: 5.60643916328748, loss_t2i: 6.633129944403966, loss_mmu: 1.499675625314315
  Token-Norm: 16.50321388244629
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.198519706726074
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1127179861068726
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 2


100%|██████████| 48/48 [00:18<00:00,  2.57it/s]


Epoch 2 loss: 5.280611013372739, loss_t2i: 6.307993019620578, loss_mmu: 1.1710822482903798
  Token-Norm: 19.651737213134766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.258724212646484
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1222505569458008
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 3


100%|██████████| 48/48 [00:18<00:00,  2.57it/s]


Epoch 3 loss: 5.330645129084587, loss_t2i: 6.386648600300153, loss_mmu: 1.1066311101118724
  Token-Norm: 22.207183837890625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.26913070678711
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.125584363937378
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 4


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 4 loss: 5.167608181635539, loss_t2i: 6.206409643093745, loss_mmu: 1.0124017621080081
  Token-Norm: 23.946935653686523
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.276002883911133
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1269809007644653
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 5


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 5 loss: 5.1951029648383455, loss_t2i: 6.257631311813991, loss_mmu: 0.9449890106916428
  Token-Norm: 25.022785186767578
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.29468822479248
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1283031702041626
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 6


100%|██████████| 48/48 [00:18<00:00,  2.53it/s]


Epoch 6 loss: 5.378634100159009, loss_t2i: 6.489446098605792, loss_mmu: 0.9353858170409998
  Token-Norm: 26.816238403320312
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.311322212219238
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1282968521118164
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 7


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 7 loss: 5.127976482113202, loss_t2i: 6.193236713608106, loss_mmu: 0.8669351624945799
  Token-Norm: 27.57149314880371
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.325873374938965
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1284843683242798
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 8


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 8 loss: 5.024218554298083, loss_t2i: 6.0707163115342455, loss_mmu: 0.8382270478953918
  Token-Norm: 28.239473342895508
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.345934867858887
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1286001205444336
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 9


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 9 loss: 5.114073286453883, loss_t2i: 6.19205467402935, loss_mmu: 0.8021475467830896
  Token-Norm: 28.826065063476562
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.35556411743164
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.12822425365448
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 10


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 10 loss: 4.84836062292258, loss_t2i: 5.870374555389087, loss_mmu: 0.7603046658138434
  Token-Norm: 29.450265884399414
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.371969223022461
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1281378269195557
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 11


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 11 loss: 5.11290368437767, loss_t2i: 6.204727679491043, loss_mmu: 0.7456071594109138
  Token-Norm: 30.35487174987793
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.39567756652832
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1277689933776855
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 12


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 12 loss: 4.798916384577751, loss_t2i: 5.8184264451265335, loss_mmu: 0.7208757903426886
  Token-Norm: 31.485708236694336
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.404095649719238
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1266428232192993
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 13


100%|██████████| 48/48 [00:18<00:00,  2.57it/s]


Epoch 13 loss: 5.104570041100184, loss_t2i: 6.206378931800525, loss_mmu: 0.6973338533813754
  Token-Norm: 32.45843505859375
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.425854682922363
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1263798475265503
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 14


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 14 loss: 4.974209914604823, loss_t2i: 6.0340076585610705, loss_mmu: 0.7350185010582209
  Token-Norm: 34.095149993896484
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.437376976013184
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1253812313079834
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 15


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 15 loss: 4.971584764619668, loss_t2i: 6.0351056307554245, loss_mmu: 0.7175007412830988
  Token-Norm: 36.403724670410156
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.444942474365234
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1242337226867676
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 16


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 16 loss: 5.017747059464455, loss_t2i: 6.096885032951832, loss_mmu: 0.7011946802958846
  Token-Norm: 37.36294174194336
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.448554992675781
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1228054761886597
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 17


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 17 loss: 5.217306425174077, loss_t2i: 6.352408314744632, loss_mmu: 0.6768985347201427
  Token-Norm: 38.61777114868164
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.455872535705566
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1211857795715332
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 18


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 18 loss: 5.074927628040314, loss_t2i: 6.177715351184209, loss_mmu: 0.6637764656916261
  Token-Norm: 39.63446044921875
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.45915699005127
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1196657419204712
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 19


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 19 loss: 5.156219224135081, loss_t2i: 6.289958203832309, loss_mmu: 0.621262826025486
  Token-Norm: 40.327030181884766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.460576057434082
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1178330183029175
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 20


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 20 loss: 5.338389525810878, loss_t2i: 6.510338008403778, loss_mmu: 0.6505949643130103
  Token-Norm: 43.01359176635742
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.46848201751709
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1163867712020874
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 21


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 21 loss: 4.947128479679425, loss_t2i: 6.014040117462476, loss_mmu: 0.6794815625374516
  Token-Norm: 44.37313461303711
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.471556663513184
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1147611141204834
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 22


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 22 loss: 5.155593857169151, loss_t2i: 6.287154803673427, loss_mmu: 0.6293496632327636
  Token-Norm: 45.081764221191406
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.478492736816406
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.113276481628418
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 23


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 23 loss: 4.891323626041412, loss_t2i: 5.961634799838066, loss_mmu: 0.6100784946853915
  Token-Norm: 45.790435791015625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.472529411315918
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1110687255859375
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 24


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 24 loss: 4.801683311661084, loss_t2i: 5.854350144664447, loss_mmu: 0.591015518642962
  Token-Norm: 46.349422454833984
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.464130401611328
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.108757495880127
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 25


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 25 loss: 5.338821028669675, loss_t2i: 6.52657983203729, loss_mmu: 0.5877853001778325
  Token-Norm: 47.24104309082031
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.467007637023926
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1069004535675049
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 26


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 26 loss: 4.988081723451614, loss_t2i: 6.0919165561596555, loss_mmu: 0.5727419964969158
  Token-Norm: 48.000274658203125
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.446073532104492
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.1036972999572754
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 27


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 27 loss: 5.113727276523908, loss_t2i: 6.251602858304977, loss_mmu: 0.5622243092705806
  Token-Norm: 48.73143768310547
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.436604499816895
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.101298451423645
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 28


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 28 loss: 4.865791608889897, loss_t2i: 5.947033266226451, loss_mmu: 0.5408246647566557
  Token-Norm: 49.2645378112793
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.419601440429688
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0983093976974487
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 29


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 29 loss: 5.162978440523148, loss_t2i: 6.319735964139302, loss_mmu: 0.5359480769063035
  Token-Norm: 50.16530227661133
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.407896995544434
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0956178903579712
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 30


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 30 loss: 4.885270078976949, loss_t2i: 5.96879818290472, loss_mmu: 0.5511571578681469
  Token-Norm: 51.23274230957031
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.390824317932129
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0928205251693726
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 31


100%|██████████| 48/48 [00:18<00:00,  2.56it/s]


Epoch 31 loss: 4.927416091163953, loss_t2i: 6.031465371449788, loss_mmu: 0.5112184289221963
  Token-Norm: 51.83692932128906
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.37846565246582
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0902451276779175
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 32


100%|██████████| 48/48 [00:19<00:00,  2.47it/s]


Epoch 32 loss: 5.1576909472544985, loss_t2i: 6.321021909515063, loss_mmu: 0.5043664959569772
  Token-Norm: 52.57219696044922
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.372321128845215
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0879299640655518
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 33


100%|██████████| 48/48 [00:19<00:00,  2.48it/s]


Epoch 33 loss: 5.088080217440923, loss_t2i: 6.236344123880069, loss_mmu: 0.4950240481023987
  Token-Norm: 53.440940856933594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.350380897521973
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0849158763885498
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 34


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 34 loss: 5.1670421461264295, loss_t2i: 6.335723419984181, loss_mmu: 0.4923166576772928
  Token-Norm: 54.56230926513672
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.336828231811523
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0822933912277222
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 35


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 35 loss: 4.825842812657356, loss_t2i: 5.915202697118123, loss_mmu: 0.468402689943711
  Token-Norm: 55.28520584106445
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.316173553466797
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.079227089881897
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 36


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 36 loss: 4.8309565782547, loss_t2i: 5.923739358782768, loss_mmu: 0.4598249443806708
  Token-Norm: 55.982357025146484
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.306953430175781
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0770199298858643
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 37


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 37 loss: 4.706924043595791, loss_t2i: 5.773643443981807, loss_mmu: 0.4400461548939347
  Token-Norm: 56.497013092041016
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.290350914001465
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0745315551757812
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 38


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 38 loss: 4.814702734351158, loss_t2i: 5.91025714079539, loss_mmu: 0.43248450150713325
  Token-Norm: 57.26054382324219
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.28498363494873
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0728697776794434
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 39


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 39 loss: 4.874694640437762, loss_t2i: 5.984710618853569, loss_mmu: 0.4346303263058265
  Token-Norm: 58.10774612426758
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.26872730255127
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0702449083328247
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 40


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 40 loss: 4.7524591982364655, loss_t2i: 5.8356652309497195, loss_mmu: 0.41963461724420387
  Token-Norm: 58.91019058227539
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.248220443725586
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0675140619277954
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 41


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 41 loss: 5.112925156950951, loss_t2i: 6.286912401517232, loss_mmu: 0.41697567235678434
  Token-Norm: 59.7945442199707
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.23697280883789
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.065278172492981
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 42


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 42 loss: 4.748321359356244, loss_t2i: 5.831887602806091, loss_mmu: 0.41405582210669917
  Token-Norm: 60.84090805053711
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.207799911499023
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.062090277671814
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 43


100%|██████████| 48/48 [00:18<00:00,  2.55it/s]


Epoch 43 loss: 4.720501939455668, loss_t2i: 5.800266397496064, loss_mmu: 0.4014436121409138
  Token-Norm: 61.879249572753906
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.177001953125
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.058803677558899
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 44


100%|██████████| 48/48 [00:19<00:00,  2.48it/s]


Epoch 44 loss: 4.99936377008756, loss_t2i: 6.1472574671109514, loss_mmu: 0.4077887331756453
  Token-Norm: 63.29739761352539
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.15695571899414
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0560611486434937
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 45


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 45 loss: 4.962284028530121, loss_t2i: 6.100229526559512, loss_mmu: 0.4105015861180921
  Token-Norm: 64.37730407714844
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.133788108825684
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0530738830566406
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 46


100%|██████████| 48/48 [00:18<00:00,  2.54it/s]


Epoch 46 loss: 4.990025530258815, loss_t2i: 6.139670963088672, loss_mmu: 0.3914432874880731
  Token-Norm: 65.22154998779297
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.117213249206543
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0505095720291138
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 47


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 47 loss: 4.728444864352544, loss_t2i: 5.811965644359589, loss_mmu: 0.39436114983012277
  Token-Norm: 66.57929229736328
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.10224437713623
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0481339693069458
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 48


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 48 loss: 4.977364433308442, loss_t2i: 6.126232519745827, loss_mmu: 0.3818916372644405
  Token-Norm: 67.74710083007812
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.07744312286377
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0450552701950073
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 49


100%|██████████| 48/48 [00:19<00:00,  2.49it/s]


Epoch 49 loss: 4.879481126864751, loss_t2i: 6.007941037416458, loss_mmu: 0.36564103009489674
  Token-Norm: 68.348876953125
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.055148124694824
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0423305034637451
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 50


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 50 loss: 4.749847277998924, loss_t2i: 5.847471823294957, loss_mmu: 0.3593486485381921
  Token-Norm: 69.10747528076172
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.022100448608398
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.038961410522461
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 51


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 51 loss: 4.77489997446537, loss_t2i: 5.8806935747464495, loss_mmu: 0.35172537015751004
  Token-Norm: 69.86431884765625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 14.01135540008545
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0369819402694702
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 52


100%|██████████| 48/48 [00:19<00:00,  2.50it/s]


Epoch 52 loss: 4.926793540517489, loss_t2i: 6.070079065859318, loss_mmu: 0.3536510909131418
  Token-Norm: 70.86109924316406
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.974313735961914
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.033300757408142
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 53


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 53 loss: 4.8332086106141405, loss_t2i: 5.948513930042584, loss_mmu: 0.3719867958376805
  Token-Norm: 71.99189758300781
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.960841178894043
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0312248468399048
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 54


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 54 loss: 4.765469479064147, loss_t2i: 5.870177199443181, loss_mmu: 0.34663844512154657
  Token-Norm: 72.5674057006836
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.950042724609375
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0294253826141357
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 55


100%|██████████| 48/48 [00:19<00:00,  2.46it/s]


Epoch 55 loss: 4.749857907493909, loss_t2i: 5.851576179265976, loss_mmu: 0.3429842982441187
  Token-Norm: 73.30559539794922
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.922405242919922
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0266164541244507
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 56


100%|██████████| 48/48 [00:19<00:00,  2.45it/s]


Epoch 56 loss: 4.866997371117274, loss_t2i: 5.99760144452254, loss_mmu: 0.3445806757857402
  Token-Norm: 74.17671203613281
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.918268203735352
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0251338481903076
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 57


100%|██████████| 48/48 [00:19<00:00,  2.41it/s]


Epoch 57 loss: 4.621662278970082, loss_t2i: 5.693930387496948, loss_mmu: 0.3325893253398438
  Token-Norm: 74.65109252929688
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.887138366699219
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0221635103225708
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 58


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 58 loss: 4.941544324159622, loss_t2i: 6.095337371031444, loss_mmu: 0.3263718195570012
  Token-Norm: 75.37403869628906
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.87208366394043
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0201146602630615
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 59


100%|██████████| 48/48 [00:19<00:00,  2.49it/s]


Epoch 59 loss: 4.914949541290601, loss_t2i: 6.062211707234383, loss_mmu: 0.3259006786781053
  Token-Norm: 76.09632110595703
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.876079559326172
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.019339919090271
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 60


100%|██████████| 48/48 [00:19<00:00,  2.50it/s]


Epoch 60 loss: 4.686458778878053, loss_t2i: 5.778512631853421, loss_mmu: 0.3182426748486857
  Token-Norm: 76.69302368164062
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.854310035705566
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0169963836669922
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 61


100%|██████████| 48/48 [00:19<00:00,  2.46it/s]


Epoch 61 loss: 4.846143367389838, loss_t2i: 5.97802413503329, loss_mmu: 0.3186199396538238
  Token-Norm: 77.46121978759766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.83076286315918
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0144814252853394
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 62


100%|██████████| 48/48 [00:19<00:00,  2.49it/s]


Epoch 62 loss: 4.966697583595912, loss_t2i: 6.128142977754275, loss_mmu: 0.3209155910493185
  Token-Norm: 78.73760223388672
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.796619415283203
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0112816095352173
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 63


100%|██████████| 48/48 [00:19<00:00,  2.51it/s]


Epoch 63 loss: 4.680572683612506, loss_t2i: 5.769075204928716, loss_mmu: 0.32656231833000976
  Token-Norm: 79.6781005859375
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.770702362060547
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0087629556655884
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 64


100%|██████████| 48/48 [00:19<00:00,  2.50it/s]


Epoch 64 loss: 4.926241149504979, loss_t2i: 6.078775107860565, loss_mmu: 0.3161048063387473
  Token-Norm: 80.34098815917969
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.74937629699707
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0066206455230713
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 65


100%|██████████| 48/48 [00:19<00:00,  2.50it/s]


Epoch 65 loss: 4.6512059817711515, loss_t2i: 5.7350709189971285, loss_mmu: 0.3157457730267197
  Token-Norm: 81.21074676513672
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.72185230255127
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0040472745895386
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 66


100%|██████████| 48/48 [00:19<00:00,  2.50it/s]


Epoch 66 loss: 4.892579009135564, loss_t2i: 6.036086435119311, loss_mmu: 0.31854880151028436
  Token-Norm: 82.02020263671875
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.705190658569336
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.001867651939392
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 67


100%|██████████| 48/48 [00:19<00:00,  2.45it/s]


Epoch 67 loss: 4.78677174448967, loss_t2i: 5.9071711748838425, loss_mmu: 0.30517385449881357
  Token-Norm: 82.75885009765625
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.71005630493164
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 1.0014833211898804
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 68


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 68 loss: 5.005562017361323, loss_t2i: 6.180033137400945, loss_mmu: 0.3076768486450116
  Token-Norm: 83.57341766357422
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.687980651855469
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9991980195045471
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 69


100%|██████████| 48/48 [00:19<00:00,  2.48it/s]


Epoch 69 loss: 4.832304825385411, loss_t2i: 5.9645018974939985, loss_mmu: 0.3035163127351552
  Token-Norm: 84.48741912841797
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.659146308898926
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9964211583137512
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 70


100%|██████████| 48/48 [00:19<00:00,  2.52it/s]


Epoch 70 loss: 4.769666949907939, loss_t2i: 5.886111577351888, loss_mmu: 0.30388798797503114
  Token-Norm: 85.22830963134766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.620843887329102
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9932073354721069
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 71


100%|██████████| 48/48 [00:19<00:00,  2.47it/s]


Epoch 71 loss: 4.717006539305051, loss_t2i: 5.820044194658597, loss_mmu: 0.3048553200593839
  Token-Norm: 86.17371368408203
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.589157104492188
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9903821349143982
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 72


100%|██████████| 48/48 [00:19<00:00,  2.44it/s]


Epoch 72 loss: 5.034942706425984, loss_t2i: 6.219238231579463, loss_mmu: 0.29776042498027283
  Token-Norm: 86.93061065673828
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.573551177978516
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9883728623390198
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 73


100%|██████████| 48/48 [00:20<00:00,  2.40it/s]


Epoch 73 loss: 4.912772764762242, loss_t2i: 6.067466673751672, loss_mmu: 0.2939970172786464
  Token-Norm: 87.72671508789062
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.557528495788574
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9865900874137878
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 74


100%|██████████| 48/48 [00:19<00:00,  2.50it/s]


Epoch 74 loss: 4.593718305230141, loss_t2i: 5.668252065777779, loss_mmu: 0.2955828147629897
  Token-Norm: 88.39545440673828
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.548011779785156
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.985378086566925
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 75


100%|██████████| 48/48 [00:20<00:00,  2.39it/s]


Epoch 75 loss: 4.283024340867996, loss_t2i: 5.282157063484192, loss_mmu: 0.28649295028299093
  Token-Norm: 88.84901428222656
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.526243209838867
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9833845496177673
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 76


100%|██████████| 48/48 [00:20<00:00,  2.37it/s]


Epoch 76 loss: 4.644215539097786, loss_t2i: 5.733505740761757, loss_mmu: 0.28705442789942026
  Token-Norm: 89.6181869506836
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.494319915771484
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9806681275367737
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 77


100%|██████████| 48/48 [00:19<00:00,  2.43it/s]


Epoch 77 loss: 4.838824550310771, loss_t2i: 5.974965776006381, loss_mmu: 0.2942592901332925
  Token-Norm: 90.42679595947266
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.4639253616333
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9779826998710632
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 78


100%|██████████| 48/48 [00:20<00:00,  2.38it/s]


Epoch 78 loss: 4.795732413729032, loss_t2i: 5.922023360927899, loss_mmu: 0.29056813575637835
  Token-Norm: 90.97496795654297
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.459291458129883
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9773116111755371
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 79


100%|██████████| 48/48 [00:19<00:00,  2.46it/s]


Epoch 79 loss: 4.748799040913582, loss_t2i: 5.866625641783078, loss_mmu: 0.27749237208627164
  Token-Norm: 91.55365753173828
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.4421968460083
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9756230711936951
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 80


100%|██████████| 48/48 [00:19<00:00,  2.44it/s]


Epoch 80 loss: 4.695927267273267, loss_t2i: 5.8004749193787575, loss_mmu: 0.2777362580721577
  Token-Norm: 92.2234115600586
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.419573783874512
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9735788702964783
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 81


100%|██████████| 48/48 [00:19<00:00,  2.47it/s]


Epoch 81 loss: 4.6511649290720625, loss_t2i: 5.744304940104485, loss_mmu: 0.2786045287890981
  Token-Norm: 92.92076110839844
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.392826080322266
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9714959859848022
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 82


100%|██████████| 48/48 [00:19<00:00,  2.46it/s]


Epoch 82 loss: 4.740452840924263, loss_t2i: 5.855845014254252, loss_mmu: 0.2788836448453367
  Token-Norm: 93.59732818603516
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.367218971252441
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9694010615348816
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 83


100%|██████████| 48/48 [00:20<00:00,  2.34it/s]


Epoch 83 loss: 4.885017280777295, loss_t2i: 6.036806404590607, loss_mmu: 0.27786024124361575
  Token-Norm: 94.42491912841797
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.341917991638184
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9672917127609253
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 84


100%|██████████| 48/48 [00:20<00:00,  2.32it/s]


Epoch 84 loss: 4.589840526382129, loss_t2i: 5.668114100893338, loss_mmu: 0.2767459271320452
  Token-Norm: 95.09696197509766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.323046684265137
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9655101895332336
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 85


100%|██████████| 48/48 [00:19<00:00,  2.42it/s]


Epoch 85 loss: 4.69173438847065, loss_t2i: 5.795849457383156, loss_mmu: 0.2752737127399693
  Token-Norm: 95.65254211425781
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.30129623413086
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9636515378952026
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 86


100%|██████████| 48/48 [00:20<00:00,  2.40it/s]


Epoch 86 loss: 4.483996624747912, loss_t2i: 5.536447594563167, loss_mmu: 0.2741922549903393
  Token-Norm: 96.45869445800781
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.291725158691406
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9631807208061218
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 87


100%|██████████| 48/48 [00:19<00:00,  2.40it/s]


Epoch 87 loss: 4.655292173226674, loss_t2i: 5.750990768273671, loss_mmu: 0.27249741010988754
  Token-Norm: 97.14942169189453
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.25753402709961
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9605458378791809
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 88


100%|██████████| 48/48 [00:19<00:00,  2.42it/s]


Epoch 88 loss: 5.053495069344838, loss_t2i: 6.24857963124911, loss_mmu: 0.2731564828039457
  Token-Norm: 97.77544403076172
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.270474433898926
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9612802863121033
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 89


100%|██████████| 48/48 [00:20<00:00,  2.38it/s]


Epoch 89 loss: 4.856747211577992, loss_t2i: 6.00328305332611, loss_mmu: 0.2706034573105474
  Token-Norm: 98.46116638183594
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.2608060836792
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9607225656509399
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 90


100%|██████████| 48/48 [00:19<00:00,  2.41it/s]


Epoch 90 loss: 4.559513757626216, loss_t2i: 5.632428268591563, loss_mmu: 0.26785547053441405
  Token-Norm: 99.09678649902344
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.23636531829834
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9588407874107361
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 91


100%|██████████| 48/48 [00:20<00:00,  2.38it/s]


Epoch 91 loss: 4.549325630068779, loss_t2i: 5.619448482990265, loss_mmu: 0.26883394131436944
  Token-Norm: 99.61195373535156
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.232706069946289
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9584458470344543
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 92


100%|██████████| 48/48 [00:19<00:00,  2.41it/s]


Epoch 92 loss: 4.606464579701424, loss_t2i: 5.692927867174149, loss_mmu: 0.26061084827718634
  Token-Norm: 100.25029754638672
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.2153902053833
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9573018550872803
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 93


100%|██████████| 48/48 [00:19<00:00,  2.41it/s]


Epoch 93 loss: 4.66227591286103, loss_t2i: 5.761431102951367, loss_mmu: 0.2656546803967406
  Token-Norm: 100.92002868652344
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.17808723449707
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9544312357902527
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 94


100%|██████████| 48/48 [00:20<00:00,  2.36it/s]


Epoch 94 loss: 4.515634514391422, loss_t2i: 5.579460064570109, loss_mmu: 0.2603318675731619
  Token-Norm: 101.2801742553711
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.143588066101074
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9520848393440247
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 95


100%|██████████| 48/48 [00:19<00:00,  2.42it/s]


Epoch 95 loss: 4.434241759280364, loss_t2i: 5.476129213968913, loss_mmu: 0.26669158949516714
  Token-Norm: 101.95359802246094
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.09901237487793
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9488313794136047
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 96


100%|██████████| 48/48 [00:20<00:00,  2.36it/s]


Epoch 96 loss: 4.72545596708854, loss_t2i: 5.8410436908404035, loss_mmu: 0.2631043044384569
  Token-Norm: 102.47635650634766
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.087751388549805
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9481560587882996
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 97


100%|██████████| 48/48 [00:20<00:00,  2.38it/s]


Epoch 97 loss: 4.646288494269053, loss_t2i: 5.7415865908066435, loss_mmu: 0.26509572216309607
  Token-Norm: 103.11553192138672
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.085599899291992
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9482777118682861
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 98


100%|██████████| 48/48 [00:19<00:00,  2.41it/s]


Epoch 98 loss: 4.526162003477414, loss_t2i: 5.591716552774112, loss_mmu: 0.2639434208783011
  Token-Norm: 103.64498138427734
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.072162628173828
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9476011991500854
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 99


100%|██████████| 48/48 [00:20<00:00,  2.35it/s]


Epoch 99 loss: 4.649280746777852, loss_t2i: 5.744374513626099, loss_mmu: 0.26890539308078587
  Token-Norm: 104.42333221435547
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.076780319213867
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9484653472900391
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438
Epoch 100


100%|██████████| 48/48 [00:20<00:00,  2.34it/s]

Epoch 100 loss: 4.427635793884595, loss_t2i: 5.467959562937419, loss_mmu: 0.26634043486167985
  Token-Norm: 104.93689727783203
  index_no_updates-Token-Norm: 475.90191650390625
  LM-Head-Weight-Norm: 13.048620223999023
  index_no_updates-LM-Head-Weight-Norm: 636.0464477539062
  LM-Head-Bias-Norm: 0.9466903805732727
  index_no_updates-LM-Head-Bias-Norm: 200.17538452148438



