### Train a ALM in Google Colab!

### Clone the repository if you don't have it already

In [1]:
import os

if not os.path.isdir('nanoALM'):
    !git clone https://github.com/LWL220184016/nanoVLM_From_Huggingface.git
%cd nanoVLM_From_Huggingface/
!ls

fatal: destination path 'nanoVLM_From_Huggingface' already exists and is not an empty directory.
/content/nanoVLM_From_Huggingface
assets			debug_tokenizer_dataset_compatibility.py
benchmark-inference.py	generate.py
benchmark_suite.py	measure_vram.py
checkpoints		models
compare1.py		nanoALM.ipynb
compare2.py		README.md
data			train.py
debug_func.py


### Imports and Setup

In [2]:
# If you get an "Error" from pip's dependency resolver but the cell complets fine, this is not an issue, you can continue :)
!pip -q install torch
!pip -q install gcsfs
!pip -q install tqdm
!pip -q install huggingface_hub
!pip -q install librosa
!pip install --upgrade datasets


Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2025.3.0-py3-none-any.whl (193 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2025.3.0


In [3]:
# Let's authentificate with the Hugging Face Hub so you can push your model
# from huggingface_hub import notebook_login
# notebook_login()

In [4]:
# Decide on the name of your model here!
# You will need your HF user name and the name you want to give to it
# For me, this would be "lusxvr/nanoALM"
# hf_model_name = "YOUR_HF_USER_NAME/nanoALM"

In [5]:
# nanoALM Imports (please check out the implementations in detail, that's where all the interessting stuff is!)
from data.collators import AudioQACollator, SAVEECollator
from data.datasets import SAVEEDataset, AudioQADataset
from data.processors import get_audio_processor
from data.processors import get_tokenizer
from models.audio_language_model import AudioLanguageModel
import models.utils as utils

# Libraries
import math
import time
import torch
from tqdm import tqdm
import torch.optim as optim
import matplotlib.pyplot as plt
from dataclasses import dataclass
from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets

#Otherwise, the tokenizer will through a warning
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

# To reload the modules if you change something in the code
%reload_ext autoreload
%autoreload 2

Using device: cuda


### Get the dataloaders

In [6]:
def get_dataloaders(train_cfg, alm_cfg):
    # Create datasets
    audio_processor = get_audio_processor(alm_cfg.audio_sample_rate)
    tokenizer = get_tokenizer(alm_cfg.lm_tokenizer)

    # text = "splitting datasets, disable in get_dataloaders function"
    # print(f"\n\033[38;5;05m{text}05m\033[0m")
    # Load and combine all training datasets
    combined_train_data = []
    for dataset_name in train_cfg.train_dataset_name:
        train_ds = load_dataset(
        path = train_cfg.train_dataset_path,
        name = dataset_name,
        # split='train[:1000]'
    )
        combined_train_data.append(train_ds['train'])
    train_ds = concatenate_datasets(combined_train_data)

    test_ds = load_dataset(train_cfg.test_dataset_path)
    train_ds = train_ds.shuffle(seed=0) # Shuffle the training dataset, so train and val get equal contributions from all concatinated datasets

    # Apply cutoff if specified
    if train_cfg.data_cutoff_idx is None:
        total_samples = len(train_ds)  # Use the entire dataset
    else:
        total_samples = min(len(train_ds), train_cfg.data_cutoff_idx)

    val_size = int(total_samples * train_cfg.val_ratio)
    train_size = total_samples - val_size

    train_dataset = AudioQADataset(train_ds.select(range(train_size)), tokenizer, audio_processor)
    val_dataset = AudioQADataset(train_ds.select(range(train_size, total_samples)), tokenizer, audio_processor)
    test_dataset = SAVEEDataset(test_ds, tokenizer, audio_processor)

    # Create collators
    aqa_collator = AudioQACollator(tokenizer, alm_cfg.lm_max_length)
    savee_collator = SAVEECollator(tokenizer)

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=train_cfg.batch_size,
        shuffle=True,
        collate_fn=aqa_collator,
        num_workers=2,
        pin_memory=True,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=train_cfg.batch_size,
        shuffle=False,
        collate_fn=aqa_collator,
        num_workers=2,
        pin_memory=True,
        drop_last=True,
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=train_cfg.savee_batch_size,
        shuffle=False,
        collate_fn=savee_collator,
        pin_memory=True,
        )

    return train_loader, val_loader, test_loader

### Prepare the testing function

In [7]:
def test_savee(model, tokenizer, test_loader, device):
    total_examples = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in test_loader:
            audio = batch['audios'].to(device)
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            correct_answer = tokenizer.batch_decode(labels, skip_special_tokens=True)

            gen = model.generate(input_ids, audio, attention_mask)
            model_output = tokenizer.batch_decode(gen, skip_special_tokens=True)

            is_correct = utils.check_multiple_choice_with_regex(model_output, correct_answer)

            total_examples += len(is_correct)
            if is_correct:
                correct_predictions += sum(is_correct)
    accuracy = correct_predictions / total_examples if total_examples > 0 else 0
    return accuracy

### Add debug

In [8]:
# 在训练开始前添加这个检查函数
def debug_model_dimensions(model, input_ids, audio):
    """调试模型各层的维度"""
    print("=== Model Dimension Debug ===")

    # 检查音频编码器
    audio_features = model.audio_encoder(audio)
    print(f"Audio features shape: {audio_features.shape}")

    # 检查模态投影器
    audio_embeds = model.MP(audio_features)
    print(f"Audio embeds shape: {audio_embeds.shape}")

    # 检查文本嵌入
    text_embeds = model.decoder.token_embedding(input_ids)
    print(f"Text embeds shape: {text_embeds.shape}")

    # 检查拼接后的嵌入
    inputs_embeds = torch.cat([audio_embeds, text_embeds], dim=1)
    print(f"Combined embeds shape: {inputs_embeds.shape}")

    # 检查语言模型输出
    logits = model.decoder(inputs_embeds)
    print(f"Logits shape: {logits.shape}")
    print(f"Vocab size (last dim): {logits.shape[-1]}")

    # 检查语言模型配置
    print(f"LM vocab size config: {model.cfg.lm_vocab_size}")
    print(f"Decoder vocab size: {getattr(model.decoder, 'vocab_size', 'Not found')}")

    return logits.shape[-1]

# 在训练循环开始前调用
# vocab_size = debug_model_dimensions(model, input_ids, audios)

In [9]:
def debug_training_step(model, input_ids, audios, attention_mask, labels):
    """调试训练步骤"""
    # 添加这些调试行：
    print(f"Batch debug - input_ids shape: {input_ids.shape}, max: {input_ids.max().item()}")
    print(f"Batch debug - labels shape: {labels.shape}, max: {labels.max().item()}")
    print(f"Batch debug - Model vocab config: {model.cfg.lm_vocab_size}")

    # 检查decoder的实际vocab_size
    if hasattr(model.decoder, 'head') and hasattr(model.decoder.head, 'out_features'):
        print(f"Decoder head in_features: {model.decoder.head.in_features}")
        print(f"Decoder head out_features: {model.decoder.head.out_features}")

### Prepare the training loop

In [None]:
def get_lr(it, max_lr, max_steps):
    min_lr = max_lr * 0.1
    warmup_steps = max_steps * 0.03
    # 1) linear warmup for warmup_iters steps
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps
    # 2) if it > lr_decay_iters, return min learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
    return min_lr + coeff * (max_lr - min_lr)

def train(train_cfg, alm_cfg):
    train_loader, val_loader, test_loader = get_dataloaders(train_cfg, alm_cfg)
    tokenizer = get_tokenizer(alm_cfg.lm_tokenizer)

    # Initialize model
    if train_cfg.resume_from_alm_checkpoint:
        model = AudioLanguageModel.from_pretrained(alm_cfg.alm_checkpoint_path)
    else:
        model = AudioLanguageModel(alm_cfg)

    print(f"nanoALM initialized with {sum(p.numel() for p in model.parameters()):,} parameters")
    print(f"Training summary: {len(train_loader.dataset)} samples, {len(train_loader)} batches/epoch, batch size {train_cfg.batch_size}")

    # Define optimizer groups
    param_groups = [{'params': model.MP.parameters(), 'lr': train_cfg.lr_mp},
                    {'params': list(model.decoder.parameters()) + list(model.audio_encoder.parameters()), 'lr': train_cfg.lr_backbones}]
    optimizer = optim.AdamW(param_groups)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    if train_cfg.compile:
        model = torch.compile(model)

    epoch_times = []
    batch_losses = []
    val_losses = []
    val_plot_steps = []
    best_accuracy = 0
    global_step = 0
    for epoch in range(train_cfg.epochs):
        epoch_start_time = time.time()
        model.train()
        total_train_loss = 0
        total_tokens_processed = 0

        for batch in tqdm(train_loader):
            batch_start_time = time.time()
            audios = batch["audio"].to(device)
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # debug_model_dimensions(model, input_ids, audios)  # Debug model dimensions with dummy data
            optimizer.zero_grad()

            with torch.autocast(device_type='cuda', dtype=torch.float16): # Mixed precision training
                # debug_training_step(model, input_ids, audios, attention_mask, labels)  # Debug training step
                _, loss = model(input_ids, audios, attention_mask=attention_mask, targets=labels)

            loss.backward()

            adj_lr_mp = get_lr(global_step, train_cfg.lr_mp, len(train_loader) * train_cfg.epochs)
            adj_lr_backbones = get_lr(global_step, train_cfg.lr_backbones, len(train_loader) * train_cfg.epochs)
            optimizer.param_groups[0]['lr'] = adj_lr_mp
            optimizer.param_groups[1]['lr'] = adj_lr_backbones

            optimizer.step()

            batch_loss = loss.item()
            total_train_loss += batch_loss
            batch_losses.append(batch_loss)

            num_tokens = torch.sum(attention_mask).item()
            # 修改音頻token計算：根據實際的音頻處理方式
            audio_tokens = audios.shape[0] * alm_cfg.mp_target_length  # 使用配置的目標長度
            num_tokens += audio_tokens
            total_tokens_processed += num_tokens

            batch_end_time = time.time()
            batch_duration = batch_end_time - batch_start_time
            tokens_per_second = num_tokens / batch_duration

            if global_step % 5 == 0:
                model.eval()
                torch.cuda.empty_cache()  # Clear GPU memory
                with torch.no_grad():
                    total_val_loss = 0
                    for batch in val_loader:
                        audios = batch["audio"].to(device)
                        input_ids = batch["input_ids"].to(device)
                        labels = batch["labels"].to(device)
                        attention_mask = batch["attention_mask"].to(device)

                        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                            # debug_training_step(model, input_ids, audios, attention_mask, labels)  # Debug training step
                            _, loss = model(input_ids, audios, attention_mask=attention_mask, targets=labels)

                        total_val_loss += loss.item()
                    avg_val_loss = total_val_loss / len(val_loader)
                    val_losses.append(avg_val_loss)
                    val_plot_steps.append(global_step)
                epoch_accuracy = 0
                if train_cfg.eval_in_epochs:
                    epoch_accuracy = test_savee(model, tokenizer, test_loader, device)
                    if epoch_accuracy > best_accuracy:
                      best_accuracy = epoch_accuracy
                      model.save_pretrained(save_directory=alm_cfg.alm_checkpoint_path)
                    print(f"\nStep: {global_step}, Loss: {batch_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Tokens/s: {tokens_per_second:.2f}, Accuracy: {epoch_accuracy:.4f}")
                model.train()

            global_step += 1

        avg_train_loss = total_train_loss / len(train_loader)

        epoch_end_time = time.time()
        epoch_duration = epoch_end_time - epoch_start_time
        epoch_times.append(epoch_duration)

        epoch_tokens_per_second = total_tokens_processed / epoch_duration

        print(f"Epoch {epoch+1}/{train_cfg.epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Time: {epoch_duration:.2f}s | T/s: {epoch_tokens_per_second:.2f}")

    # Summary Statistics
    if not train_cfg.eval_in_epochs:
      model.save_pretrained(save_directory=alm_cfg.alm_checkpoint_path)
    try:
        model.push_to_hub(hf_model_name)
    except Exception as e:
        print(f"Error pushing model to hub: {e}")

    avg_epoch_time = sum(epoch_times) / len(epoch_times)
    total_training_time = sum(epoch_times)
    total_samples_processed = len(train_loader.dataset) * train_cfg.epochs
    avg_time_per_sample = total_training_time / total_samples_processed
    print(f"Average time per epoch: {avg_epoch_time:.2f}s")
    print(f"Average time per sample: {avg_time_per_sample:.4f}s")

    plt.plot(batch_losses, label='Train Loss')
    plt.plot(val_plot_steps, val_losses, label='Val Loss')
    plt.xlabel('Batch')
    plt.ylabel('Loss')
    plt.title('Loss Curve')
    plt.grid(True)
    plt.legend()
    plt.show()

    # With this code you can test the accuracy of the model on the SAVEE dataset
    # But if you only train with few samples, the accuracy will be very low
    # print("Testing SAVEE Accuracy:")
    # accuracy = test_savee(model, tokenizer, test_loader, device)
    # print(f"SAVEE Accuracy: {accuracy:.4f}")

### Prepare the Configs
Instead of using the config.py file in the repo (which was created to run on one H100), we will create our config here to play around with the parameters easier and adapt them to colabs capabilities

In [11]:
@dataclass
class ALMConfig:
    audio_hidden_dim: int = 768
    audio_inter_dim: int = 4 * audio_hidden_dim
    audio_patch_size: int = 16  # 音频patch大小（时间步数）
    audio_n_heads: int = 12
    audio_dropout: float = 0.0
    audio_n_blocks: int = 12
    audio_ln_eps: float = 1e-6
    audio_model_type: str = 'custom_audio_transformer'

    # 音频处理相关参数
    audio_sample_rate: int = 16000  # 采样率
    audio_n_fft: int = 400  # FFT窗口大小
    audio_hop_length: int = 160  # 跳跃长度
    audio_n_mels: int = 80  # 梅尔滤波器数量
    audio_max_length: int = 1000  # 最大时间步数

    lm_hidden_dim: int = 576
    lm_inter_dim: int = 1536
    lm_rms_eps: float = 1e-5
    lm_re_base: int = 100000
    lm_max_position_embeddings: int = 8192
    lm_vocab_size: int = 49152
    lm_n_heads: int = 9
    lm_n_kv_heads: int = 3
    lm_dropout: float = 0.0
    lm_n_blocks: int = 30
    lm_attn_scaling: float = 1.0
    lm_eos_token_id: int = 0
    lm_use_tokens: bool = False
    lm_tie_weights: bool = True
    lm_model_type: str = 'HuggingFaceTB/SmolLM2-135M'
    lm_tokenizer: str = 'HuggingFaceTB/cosmo2-tokenizer'

    # 模態投影器配置
    mp_projection_type: str = 'adaptive_pool'
    mp_target_length: int = 50
    mp_use_position_aware: bool = True

    # 計算語言模型最大長度
    lm_max_length: int = 128 - 50  # 總長度 - 音頻token長度

    # ALM特定配置
    alm_load_backbone_weights: bool = True
    alm_checkpoint_path: str = 'checkpoints'
    alm_name: str = 'nanoALM-222M'


@dataclass
class TrainConfig:
    lr_mp: float = 1e-3
    lr_backbones: float = 5e-5
    val_ratio: float = 0.2
    compile: bool = False
    data_cutoff_idx: int = 1024 # Let's only use a small subset of the data at first, otherwise it takes very long to see anything :D
    batch_size: int = 12
    savee_batch_size: int = 12
    epochs: int = 20
    eval_in_epochs: bool = False # Deactivating this in colab, because it would evaluate 1500 samples of SAVEE every time otherwise
    resume_from_alm_checkpoint: bool = False # Indicate if the training should be resumed from a checkpoint of the whole ALM or you want to start from scratch
    # train_dataset_path: str = 'AbstractTTS/IEMOCAP'
    # train_dataset_name: tuple[str, ...] = ('default', ) #All options; ("ai2d", "aokvqa", "chart2text", "chartqa", "clevr", "cocoqa", "datikz", "diagram_image_to_text", "docvqa", "dvqa", "figureqa", "finqa", "geomverse", "hateful_memes", "hitab", "iam", "iconqa", "infographic_vqa", "intergps", "localized_narratives", "mapqa", "multihiertt", "ocrvqa", "plotqa", "raven", "rendered_text", "robut_sqa", "robut_wikisql", "robut_wtq", "scienceqa", "screen2words", "st_vqa", "tabmwp", "tallyqa", "tat_qa", "textcaps", "textvqa", "tqa", "vistext", "visual7w", "visualmrc", "vqarad", "vqav2", "vsr", "websight") # "clevr_math", "okvqa", "spot_the_diff", "nlvr2", "mimic_cgd",
    train_dataset_path: str = 'speechbrain/LoquaciousSet'
    train_dataset_name: tuple[str, ...] = ('small', )
    test_dataset_path: str = "AbstractTTS/SAVEE"

### Lets run the training!

In [12]:
import os
# 要創建的目錄路徑
dir_name = ALMConfig.alm_checkpoint_path

try:
    os.mkdir(dir_name)
    print(f"Directory '{dir_name}' created successfully.")
except FileExistsError:
    print(f"Directory '{dir_name}' already exists.")
except FileNotFoundError:
    print(f"Parent directory does not exist for '{dir_name}'.")
except Exception as e:
    print(f"An error occurred: {e}")

Directory 'checkpoints' already exists.


In [13]:
alm_cfg = ALMConfig()
train_cfg = TrainConfig()
train(train_cfg, alm_cfg)

Resolving data files:   0%|          | 0/6323 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/46 [00:00<?, ?it/s]

Loading from backbone weights
Initializing AudioTransformer from scratch with custom_audio_transformer
AudioTransformer initialized with 86,087,424 parameters.
Successfully loaded HuggingFaceTB/SmolLM2-135M weights from safetensors. Model has 134,515,008 parameters.
nanoALM initialized with 221,044,800 parameters
Training summary: 820 samples, 68 batches/epoch, batch size 12


  0%|          | 0/68 [00:00<?, ?it/s]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48816
Batch debug - labels shape: torch.Size([12, 78]), max: 48816
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48816, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48816, min: -100
Debug - vocab_si

  1%|▏         | 1/68 [00:08<09:25,  8.44s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  3%|▎         | 2/68 [00:09<04:13,  3.84s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48987, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48987, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 49041
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  4%|▍         | 3/68 [00:10<02:44,  2.53s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_si

  6%|▌         | 4/68 [00:11<02:09,  2.02s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48932
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48932, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48932, min: -100
Debug - vocab_si

  7%|▋         | 5/68 [00:12<01:45,  1.67s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_si

  9%|▉         | 6/68 [00:16<02:46,  2.69s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 10%|█         | 7/68 [00:17<02:03,  2.02s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48700
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 12%|█▏        | 8/68 [00:18<01:35,  1.59s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48700, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48700, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 13%|█▎        | 9/68 [00:18<01:15,  1.28s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48682
Batch debug - labels shape: torch.Size([12, 78]), max: 48682
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 15%|█▍        | 10/68 [00:19<01:02,  1.09s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48682, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48682, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 16%|█▌        | 11/68 [00:25<02:26,  2.57s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 48702
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 18%|█▊        | 12/68 [00:26<01:53,  2.02s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48702, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48702, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 19%|█▉        | 13/68 [00:26<01:28,  1.61s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48816
Batch debug - labels shape: torch.Size([12, 78]), max: 48816
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 21%|██        | 14/68 [00:27<01:10,  1.31s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48816, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48816, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 49041
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 22%|██▏       | 15/68 [00:28<00:58,  1.10s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49041, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49041, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 24%|██▎       | 16/68 [00:32<01:52,  2.16s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48234
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 25%|██▌       | 17/68 [00:33<01:26,  1.70s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48234, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48234, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48476
Batch debug - labels shape: torch.Size([12, 78]), max: 48476
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 26%|██▋       | 18/68 [00:34<01:09,  1.38s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48476, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48476, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 48558
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 28%|██▊       | 19/68 [00:34<00:56,  1.16s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48558, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48558, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49066
Batch debug - labels shape: torch.Size([12, 78]), max: 49066
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 29%|██▉       | 20/68 [00:35<00:51,  1.08s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_si

 31%|███       | 21/68 [00:42<02:08,  2.73s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 32%|███▏      | 22/68 [00:42<01:36,  2.10s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48682
Batch debug - labels shape: torch.Size([12, 78]), max: 47657
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47657, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47657, min: -100
Debug - vocab_si

 34%|███▍      | 23/68 [00:43<01:14,  1.66s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49126, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49126, min: -100
Debug - vocab_si

 35%|███▌      | 24/68 [00:44<00:59,  1.34s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48987, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48987, min: -100
Debug - vocab_si

 37%|███▋      | 25/68 [00:44<00:48,  1.13s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48476
Batch debug - labels shape: torch.Size([12, 78]), max: 48476
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48476, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48476, min: -100
Debug - vocab_si

 38%|███▊      | 26/68 [00:49<01:29,  2.14s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49126, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49126, min: -100
Debug - vocab_si

 40%|███▉      | 27/68 [00:49<01:09,  1.68s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_si

 41%|████      | 28/68 [00:50<00:58,  1.45s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48534
Batch debug - labels shape: torch.Size([12, 78]), max: 48534
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48534, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48534, min: -100
Debug - vocab_si

 43%|████▎     | 29/68 [00:51<00:52,  1.35s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48476
Batch debug - labels shape: torch.Size([12, 78]), max: 48476
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48476, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48476, min: -100
Debug - vocab_si

 44%|████▍     | 30/68 [00:53<00:49,  1.31s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48962
Batch debug - labels shape: torch.Size([12, 78]), max: 48962
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48962, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48962, min: -100
Debug - vocab_si

 46%|████▌     | 31/68 [00:57<01:29,  2.41s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_si

 47%|████▋     | 32/68 [00:58<01:07,  1.87s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 49%|████▊     | 33/68 [00:59<00:52,  1.50s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 50%|█████     | 34/68 [00:59<00:42,  1.24s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 49041
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 51%|█████▏    | 35/68 [01:00<00:35,  1.06s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49041, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49041, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 53%|█████▎    | 36/68 [01:06<01:24,  2.63s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 47699
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 54%|█████▍    | 37/68 [01:07<01:03,  2.05s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49146
Batch debug - labels shape: torch.Size([12, 78]), max: 49146
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49146, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49146, min: -100
Debug - vocab_si

 56%|█████▌    | 38/68 [01:08<00:48,  1.63s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48932
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 57%|█████▋    | 39/68 [01:08<00:38,  1.34s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48932, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48932, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 47687
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 59%|█████▉    | 40/68 [01:09<00:31,  1.14s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47687, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47687, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48932
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 60%|██████    | 41/68 [01:14<00:58,  2.18s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 62%|██████▏   | 42/68 [01:14<00:44,  1.72s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 63%|██████▎   | 43/68 [01:15<00:34,  1.39s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48476
Batch debug - labels shape: torch.Size([12, 78]), max: 48476
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48476, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48476, min: -100
Debug - vocab_si

 65%|██████▍   | 44/68 [01:15<00:27,  1.16s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_si

 66%|██████▌   | 45/68 [01:16<00:25,  1.10s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48987, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48987, min: -100
Debug - vocab_si

 68%|██████▊   | 46/68 [01:22<00:53,  2.43s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48044
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 69%|██████▉   | 47/68 [01:23<00:39,  1.90s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48044, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48044, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 48902
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 71%|███████   | 48/68 [01:23<00:30,  1.52s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48480
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48480, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48480, min: -100
Debug - vocab_si

 72%|███████▏  | 49/68 [01:24<00:23,  1.25s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 47657
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 74%|███████▎  | 50/68 [01:25<00:19,  1.07s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47657, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47657, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 75%|███████▌  | 51/68 [01:30<00:38,  2.26s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 47926
Batch debug - labels shape: torch.Size([12, 78]), max: 47926
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 76%|███████▋  | 52/68 [01:31<00:31,  1.96s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 78%|███████▊  | 53/68 [01:32<00:24,  1.66s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48987, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48987, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 48815
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 79%|███████▉  | 54/68 [01:32<00:18,  1.35s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 81%|████████  | 55/68 [01:33<00:14,  1.15s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48866
Batch debug - labels shape: torch.Size([12, 78]), max: 48866
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 82%|████████▏ | 56/68 [01:38<00:26,  2.23s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 84%|████████▍ | 57/68 [01:39<00:19,  1.76s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 85%|████████▌ | 58/68 [01:39<00:14,  1.44s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48987, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48987, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 87%|████████▋ | 59/68 [01:40<00:10,  1.20s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 48558
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 88%|████████▊ | 60/68 [01:41<00:08,  1.05s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_si

 90%|████████▉ | 61/68 [01:47<00:18,  2.64s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 91%|█████████ | 62/68 [01:48<00:12,  2.06s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 93%|█████████▎| 63/68 [01:48<00:08,  1.64s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49126, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49126, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48234
Batch debug - labels shape: torch.Size([12, 78]), max: 47657
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 94%|█████████▍| 64/68 [01:49<00:05,  1.35s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47657, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47657, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 96%|█████████▌| 65/68 [01:50<00:03,  1.13s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_si

 97%|█████████▋| 66/68 [01:54<00:04,  2.22s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 48476
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 99%|█████████▊| 67/68 [01:55<00:01,  1.76s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48816
Batch debug - labels shape: torch.Size([12, 78]), max: 48051
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


100%|██████████| 68/68 [01:56<00:00,  1.54s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48051, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48051, min: -100
Debug - vocab_size (logits dim -1): 49152


100%|██████████| 68/68 [01:56<00:00,  1.72s/it]


Epoch 1/20 | Train Loss: 4.0630 | Val Loss: 3.3344 | Time: 116.67s | T/s: 631.80


  0%|          | 0/68 [00:00<?, ?it/s]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 48815
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


  1%|▏         | 1/68 [00:01<01:37,  1.46s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48815, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48815, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  3%|▎         | 2/68 [00:02<01:05,  1.00it/s]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  4%|▍         | 3/68 [00:06<02:49,  2.61s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 48687
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  6%|▌         | 4/68 [00:07<01:57,  1.84s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48687, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48687, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48208
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  7%|▋         | 5/68 [00:07<01:29,  1.41s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48208, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48208, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 47687
Batch debug - labels shape: torch.Size([12, 78]), max: 47687
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

  9%|▉         | 6/68 [00:08<01:11,  1.16s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47687, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47687, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 49041
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 10%|█         | 7/68 [00:09<01:01,  1.00s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49041, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49041, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48866
Batch debug - labels shape: torch.Size([12, 78]), max: 48866
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 12%|█▏        | 8/68 [00:15<02:41,  2.69s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 49041
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 13%|█▎        | 9/68 [00:16<02:00,  2.05s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49041, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49041, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 15%|█▍        | 10/68 [00:16<01:33,  1.62s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48816
Batch debug - labels shape: torch.Size([12, 78]), max: 48816
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 16%|█▌        | 11/68 [00:17<01:15,  1.32s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48816, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48816, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48682
Batch debug - labels shape: torch.Size([12, 78]), max: 48682
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 18%|█▊        | 12/68 [00:18<01:02,  1.11s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48932
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48932, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48932, min: -100
Debug - vocab_si

 19%|█▉        | 13/68 [00:22<02:01,  2.21s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48558
Batch debug - labels shape: torch.Size([12, 78]), max: 48558
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 21%|██        | 14/68 [00:23<01:34,  1.74s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48558, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48558, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 22%|██▏       | 15/68 [00:24<01:16,  1.44s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48702
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 24%|██▎       | 16/68 [00:25<01:08,  1.32s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48702, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48702, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 25%|██▌       | 17/68 [00:26<01:04,  1.27s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 47197
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47197, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47197, min: -100
Debug - vocab_si

 26%|██▋       | 18/68 [00:31<01:55,  2.32s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 28%|██▊       | 19/68 [00:31<01:29,  1.82s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 29%|██▉       | 20/68 [00:32<01:10,  1.48s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48234
Batch debug - labels shape: torch.Size([12, 78]), max: 48234
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 31%|███       | 21/68 [00:33<00:57,  1.23s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 32%|███▏      | 22/68 [00:33<00:48,  1.05s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48687
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 34%|███▍      | 23/68 [00:40<01:56,  2.58s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 35%|███▌      | 24/68 [00:40<01:28,  2.01s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49126, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49126, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48476
Batch debug - labels shape: torch.Size([12, 78]), max: 48051
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 37%|███▋      | 25/68 [00:41<01:09,  1.61s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48051, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48051, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 47792
Batch debug - labels shape: torch.Size([12, 78]), max: 47792
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 38%|███▊      | 26/68 [00:42<00:55,  1.33s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47792, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47792, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 47483
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 40%|███▉      | 27/68 [00:42<00:46,  1.13s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47483, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47483, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48007
Batch debug - labels shape: torch.Size([12, 78]), max: 48007
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 41%|████      | 28/68 [00:47<01:30,  2.26s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48558
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 43%|████▎     | 29/68 [00:48<01:09,  1.79s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48446
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 44%|████▍     | 30/68 [00:49<00:55,  1.45s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48446, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48446, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 46%|████▌     | 31/68 [00:49<00:45,  1.23s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 47%|████▋     | 32/68 [00:50<00:43,  1.22s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 48816
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 49%|████▊     | 33/68 [00:56<01:24,  2.41s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 50%|█████     | 34/68 [00:56<01:04,  1.89s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49126, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49126, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 47657
Batch debug - labels shape: torch.Size([12, 78]), max: 47657
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 51%|█████▏    | 35/68 [00:57<00:50,  1.52s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152


 53%|█████▎    | 36/68 [00:58<00:40,  1.27s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 54%|█████▍    | 37/68 [00:58<00:33,  1.09s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48985
Batch debug - labels shape: torch.Size([12, 78]), max: 48985
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48985, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48985, min: -100
Debug - vocab_si

 56%|█████▌    | 38/68 [01:05<01:19,  2.66s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 47699
Batch debug - labels shape: torch.Size([12, 78]), max: 47699
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 57%|█████▋    | 39/68 [01:05<01:00,  2.08s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 47699, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 47699, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48816
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 59%|█████▉    | 40/68 [01:06<00:46,  1.65s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48816, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48816, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 60%|██████    | 41/68 [01:07<00:36,  1.36s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48559
Batch debug - labels shape: torch.Size([12, 78]), max: 48235
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 62%|██████▏   | 42/68 [01:07<00:29,  1.15s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48235, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48235, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 63%|██████▎   | 43/68 [01:12<00:55,  2.21s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49126
Batch debug - labels shape: torch.Size([12, 78]), max: 49126
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 65%|██████▍   | 44/68 [01:13<00:42,  1.75s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49126, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49126, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 66%|██████▌   | 45/68 [01:13<00:32,  1.41s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48932
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48932, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48932, min: -100
Debug - vocab_si

 68%|██████▊   | 46/68 [01:14<00:25,  1.18s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_si

 69%|██████▉   | 47/68 [01:15<00:21,  1.01s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49128
Batch debug - labels shape: torch.Size([12, 78]), max: 49128
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49128, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49128, min: -100
Debug - vocab_si

 71%|███████   | 48/68 [01:20<00:48,  2.45s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48910, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48910, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48932
Batch debug - labels shape: torch.Size([12, 78]), max: 48932
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 72%|███████▏  | 49/68 [01:21<00:36,  1.91s/it]

Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48932, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48932, min: -100
Debug - vocab_size (logits dim -1): 49152
=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48910
Batch debug - labels shape: torch.Size([12, 78]), max: 48910
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder

 74%|███████▎  | 50/68 [01:22<00:28,  1.56s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 48987
Batch debug - labels shape: torch.Size([12, 78]), max: 48987
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 48987, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 48987, min: -100
Debug - vocab_si

 75%|███████▌  | 51/68 [01:23<00:23,  1.40s/it]

=== Model Dimension Debug ===
Audio features shape: torch.Size([12, 62, 768])
Audio embeds shape: torch.Size([12, 50, 576])
Text embeds shape: torch.Size([12, 78, 576])
Combined embeds shape: torch.Size([12, 128, 576])
Logits shape: torch.Size([12, 128, 576])
Vocab size (last dim): 576
LM vocab size config: 49152
Decoder vocab size: Not found
Batch debug - input_ids shape: torch.Size([12, 78]), max: 49041
Batch debug - labels shape: torch.Size([12, 78]), max: 49041
Batch debug - Model vocab config: 49152
Decoder head in_features: 576
Decoder head out_features: 49152
Final logits shape: torch.Size([12, 128, 49152])
Expected vocab_size: 49152
Debug - logits shape: torch.Size([12, 128, 49152])
Debug - audio_embeds shape: torch.Size([12, 50, 576])
Debug - targets shape: torch.Size([12, 78])
Debug - targets max: 49041, min: -100
Debug - shift_logits shape: torch.Size([12, 77, 49152])
Debug - shift_labels shape: torch.Size([12, 77])
Debug - shift_labels max: 49041, min: -100
Debug - vocab_si

 75%|███████▌  | 51/68 [01:24<00:28,  1.66s/it]


KeyboardInterrupt: 

As you can see the model trains, so feel free to play around with the architecture or data! Let us know what you build with it!

PS: If you want to test the model, check out generate.py to see how to do inference with it