<a href="https://colab.research.google.com/github/Hiromi06/machine-translation/blob/main/train_on_chunk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3

In [None]:
import os
import sys
import pprint
import torch
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from transformers import BitsAndBytesConfig
from tqdm import tqdm
import gc
import bitsandbytes as bnb
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to gigabytes (1 GB = 1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to label a runtime as "high-RAM".
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
# 4-bit quantization settings passed to `from_pretrained` when the model is loaded:
# NF4 quantization type, double quantization enabled, bfloat16 compute dtype.
_nf4_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
nf4_config = BitsAndBytesConfig(**_nf4_kwargs)

In [None]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
# Imported in this cell because they are needed before the distributed-setup cell runs.
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

model_name = 'bert-base-multilingual-cased'

# `rank` was previously used without ever being defined (NameError on a fresh kernel).
# Launchers such as torchrun export LOCAL_RANK; a plain notebook kernel is treated as rank 0.
rank = int(os.environ.get('LOCAL_RANK', 0))
device = torch.device(f'cuda:{rank}' if torch.cuda.is_available() else 'cpu')

config = BertConfig.from_pretrained(model_name)
bert_masked_model = BertForMaskedLM.from_pretrained(model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True, quantization_config=nf4_config)

# DDP can only be constructed once a process group exists; on a single-process
# notebook kernel the bare model is used instead.
if dist.is_available() and dist.is_initialized():
    bert_masked_model = DDP(bert_masked_model, device_ids=[rank])

optimizer = torch.optim.AdamW(bert_masked_model.parameters(), lr=2e-5)




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

# Initialize the process group
# The default env:// rendezvous reads RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT
# from the environment. A plain notebook kernel does not set them, so the
# initialisation is skipped there instead of raising.
_ddp_env_vars = ('RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT')
if all(name in os.environ for name in _ddp_env_vars):
    if not dist.is_initialized():
        dist.init_process_group(backend='nccl')

    # Wrap the model with DDP (unless an earlier cell already wrapped it)
    model = bert_masked_model if isinstance(bert_masked_model, DDP) else DDP(bert_masked_model)
else:
    print('Distributed environment variables are not set; running without DDP')
    model = bert_masked_model

In [None]:
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'  # non-used port number
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the default process group if one is active.

    Safe to call when no group was ever initialised (for example from a
    ``finally`` block after a failed ``setup``), in which case it does nothing.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

In [None]:
class TranslationDataset(Dataset):
    """Pairs source-language encodings with target-language encodings by index.

    Both sequences are indexed with the same integer; each element is expected
    to be a mapping that provides ``'input_ids'`` (and, for the source side,
    ``'attention_mask'``).
    """

    def __init__(self, input_data, label_data):
        """Store the source (`input_data`) and target (`label_data`) sequences."""
        self.input_data = input_data
        self.label_data = label_data

    def __len__(self):
        """Number of samples, taken from the source-side sequence."""
        return len(self.input_data)

    def __getitem__(self, idx):
        """Return the training sample at position `idx` as a dict."""
        source = self.input_data[idx]
        target = self.label_data[idx]
        sample = {field: source[field] for field in ('input_ids', 'attention_mask')}
        # For translation, the label is the input_ids of the target language
        sample['labels'] = target['input_ids']
        return sample


def get_file_paths(directory, prefix):
    """Return the paths of ``.pt`` files in `directory` whose names start with `prefix`.

    Entries are returned in the sorted order of their file names, each joined
    onto `directory`.
    """
    matches = []
    for entry in sorted(os.listdir(directory)):
        if entry.startswith(prefix) and entry.endswith('.pt'):
            matches.append(os.path.join(directory, entry))
    return matches

def load_chunk(file_path):
    """Deserialize one saved chunk from `file_path` and return it.

    NOTE(review): ``torch.load`` may unpickle arbitrary objects, so only chunk
    files produced by a trusted source should be loaded.
    """
    chunk = torch.load(file_path)
    return chunk

def clear_memory():
    """Run the Python garbage collector and release cached CUDA memory.

    The CUDA calls are skipped when no GPU is available: ``torch.cuda.ipc_collect``
    initialises CUDA and therefore raises on a CPU-only runtime.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.ipc_collect()
        torch.cuda.empty_cache()

def train_on_chunk(bert_masked_model, optimizer, scheduler, device, input_chunk_path, label_chunk_path, batch_size=8, log_shapes=False):
    """Train the model for one pass over a single pair of encoded chunk files.

    Args:
        bert_masked_model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        input_chunk_path: Path of the source-language chunk file.
        label_chunk_path: Path of the target-language chunk file.
        batch_size: Number of samples per batch.
        log_shapes: When True, print the tensor shapes of every batch (debug aid;
            off by default because it floods the output on large chunks).

    Returns:
        The mean per-batch training loss for this chunk (NaN for an empty chunk).
    """
    input_data = load_chunk(input_chunk_path)
    label_data = load_chunk(label_chunk_path)

    dataset = TranslationDataset(input_data, label_data)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    total_loss = 0.0
    bert_masked_model.train()

    loop = tqdm(dataloader, leave=True)
    for step, batch in enumerate(loop, start=1):
        # squeeze(1) drops the singleton middle dimension of the stored encodings,
        # e.g. [16, 1, 512] -> [16, 512].
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_masks = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].squeeze(1).to(device)

        if log_shapes:
            print(f"input_ids shape: {input_ids.shape}")
            print(f"attention_masks shape: {attention_masks.shape}")
            print(f"labels shape: {labels.shape}")

        optimizer.zero_grad()

        # Regular (Non-AMP) Training
        outputs = bert_masked_model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Running mean over the batches processed so far (not over the whole chunk).
        loop.set_postfix(loss=total_loss / step)

        scheduler.step()

    num_batches = len(dataloader)
    avg_train_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Train Loss: {avg_train_loss}")

    # Clear memory
    del input_data, label_data, dataset, dataloader
    clear_memory()

    return avg_train_loss

def validate_on_chunks(bert_masked_model, device, val_input_files, val_label_files, batch_size=8):
    """Evaluate the model on every validation chunk pair and report the mean loss.

    Args:
        bert_masked_model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        device: Device the batch tensors are moved to.
        val_input_files: Paths of the source-language validation chunks.
        val_label_files: Paths of the target-language validation chunks, paired
            with `val_input_files` by position.
        batch_size: Number of samples per batch.

    Returns:
        The mean per-batch validation loss over all chunks (NaN when there are
        no batches at all).
    """
    bert_masked_model.eval()
    total_eval_loss = 0.0
    num_batches = 0

    for input_chunk_path, label_chunk_path in zip(val_input_files, val_label_files):
        input_data = load_chunk(input_chunk_path)
        label_data = load_chunk(label_chunk_path)

        dataset = TranslationDataset(input_data, label_data)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_masks = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['labels'].squeeze(1).to(device)

                outputs = bert_masked_model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                total_eval_loss += outputs.loss.item()
                num_batches += 1

        # Clear memory
        del input_data, label_data, dataset, dataloader
        clear_memory()

    # Losses are accumulated per batch, so the mean must be taken over batches,
    # not over the number of chunk files.
    avg_val_loss = total_eval_loss / num_batches if num_batches else float('nan')
    print(f"Validation Loss: {avg_val_loss}")

    return avg_val_loss




# Prepare file paths for training and validation
encoded_data_dir = '/content/drive/MyDrive/machine_learning'
ja_train_files = get_file_paths(encoded_data_dir, 'ja_train_encoded_chunk')
en_train_files = get_file_paths(encoded_data_dir, 'en_train_encoded_chunk')
ja_test_files = get_file_paths(encoded_data_dir, 'ja_test_encoded_chunk')
en_test_files = get_file_paths(encoded_data_dir, 'en_test_encoded_chunk')

# Chunks are paired by position, so source and target lists must line up.
if len(ja_train_files) != len(en_train_files):
    raise ValueError(
        f"Mismatched training chunks: {len(ja_train_files)} ja vs {len(en_train_files)} en"
    )
if len(ja_test_files) != len(en_test_files):
    raise ValueError(
        f"Mismatched validation chunks: {len(ja_test_files)} ja vs {len(en_test_files)} en"
    )

NUM_EPOCHS = 3
BATCH_SIZE = 8

# The scheduler is stepped once per batch inside train_on_chunk, so the schedule
# length must be counted in batches, not chunks. Counting costs one extra load
# of each training chunk up front; without it the learning rate would decay to
# zero after only a handful of batches.
batches_per_epoch = 0
for chunk_path in ja_train_files:
    chunk = load_chunk(chunk_path)
    batches_per_epoch += (len(chunk) + BATCH_SIZE - 1) // BATCH_SIZE  # ceil division
    del chunk
    clear_memory()

num_training_steps = NUM_EPOCHS * batches_per_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Train the model one chunk at a time
for epoch in range(NUM_EPOCHS):
    for i, (ja_path, en_path) in enumerate(zip(ja_train_files, en_train_files), start=1):
        print(f"Epoch {epoch + 1}, Chunk {i}")
        train_on_chunk(bert_masked_model, optimizer, scheduler, device, ja_path, en_path, batch_size=BATCH_SIZE)

    # Perform validation after each epoch
    print(f"Epoch {epoch + 1}, Validation")
    validate_on_chunks(bert_masked_model, device, ja_test_files, en_test_files, batch_size=BATCH_SIZE)

# Save the final model
# A DistributedDataParallel wrapper has no save_pretrained(); the wrapped model
# is reachable through `.module`, so unwrap it when present.
model_to_save = getattr(bert_masked_model, 'module', bert_masked_model)
model_to_save.save_pretrained('/content/drive/MyDrive/machine_learning')

Epoch 1, Chunk 1


  0%|          | 0/49298 [00:00<?, ?it/s]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 1/49298 [00:00<5:54:44,  2.32it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 2/49298 [00:00<4:52:58,  2.80it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 3/49298 [00:01<4:33:08,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 4/49298 [00:01<4:24:13,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 5/49298 [00:01<4:19:11,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 6/49298 [00:01<4:16:27,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 7/49298 [00:02<4:13:59,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 8/49298 [00:02<4:13:11,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 9/49298 [00:02<4:12:10,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 10/49298 [00:03<4:11:56,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 11/49298 [00:03<4:11:38,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 12/49298 [00:03<4:11:36,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 13/49298 [00:04<4:12:12,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 14/49298 [00:04<4:11:51,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 15/49298 [00:04<4:11:39,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 16/49298 [00:05<4:11:33,  3.27it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 17/49298 [00:05<4:11:23,  3.27it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 18/49298 [00:05<4:10:29,  3.28it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 19/49298 [00:05<4:10:49,  3.27it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 20/49298 [00:06<4:10:50,  3.27it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 21/49298 [00:06<4:10:54,  3.27it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 22/49298 [00:06<4:11:54,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 23/49298 [00:07<4:12:01,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 24/49298 [00:07<4:12:00,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 25/49298 [00:07<4:12:02,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 26/49298 [00:08<4:12:08,  3.26it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 27/49298 [00:08<4:12:48,  3.25it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 28/49298 [00:08<4:12:59,  3.25it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 29/49298 [00:09<4:13:10,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 30/49298 [00:09<4:13:18,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 31/49298 [00:09<4:12:39,  3.25it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 32/49298 [00:09<4:13:01,  3.25it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 33/49298 [00:10<4:12:59,  3.25it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 34/49298 [00:10<4:13:20,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 35/49298 [00:10<4:13:27,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 36/49298 [00:11<4:13:22,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 37/49298 [00:11<4:13:26,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 38/49298 [00:11<4:13:51,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 39/49298 [00:12<4:14:08,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 40/49298 [00:12<4:13:44,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 41/49298 [00:12<4:13:32,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 42/49298 [00:13<4:13:40,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 43/49298 [00:13<4:13:53,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 44/49298 [00:13<4:13:36,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 45/49298 [00:13<4:13:43,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 46/49298 [00:14<4:14:05,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 47/49298 [00:14<4:13:53,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 48/49298 [00:14<4:13:43,  3.24it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 49/49298 [00:15<4:13:55,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 50/49298 [00:15<4:14:25,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 51/49298 [00:15<4:14:34,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 52/49298 [00:16<4:14:38,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 53/49298 [00:16<4:14:22,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 54/49298 [00:16<4:14:21,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 55/49298 [00:17<4:14:38,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 56/49298 [00:17<4:14:21,  3.23it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 57/49298 [00:17<4:14:58,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 58/49298 [00:17<4:14:47,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 59/49298 [00:18<4:15:07,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 60/49298 [00:18<4:15:09,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 61/49298 [00:18<4:14:48,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 62/49298 [00:19<4:14:40,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 63/49298 [00:19<4:15:06,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 64/49298 [00:19<4:14:51,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 65/49298 [00:20<4:15:09,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 66/49298 [00:20<4:15:09,  3.22it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 67/49298 [00:20<4:15:18,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 68/49298 [00:21<4:15:53,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 69/49298 [00:21<4:15:24,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 70/49298 [00:21<4:15:17,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 71/49298 [00:22<4:15:18,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 72/49298 [00:22<4:16:04,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 73/49298 [00:22<4:15:54,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 74/49298 [00:22<4:15:56,  3.21it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 75/49298 [00:23<4:16:27,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 76/49298 [00:23<4:16:21,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 77/49298 [00:23<4:16:11,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 78/49298 [00:24<4:16:13,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 79/49298 [00:24<4:16:26,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 80/49298 [00:24<4:16:10,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 81/49298 [00:25<4:16:09,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 82/49298 [00:25<4:16:22,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 83/49298 [00:25<4:16:30,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 84/49298 [00:26<4:16:15,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 85/49298 [00:26<4:15:55,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 86/49298 [00:26<4:16:22,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 87/49298 [00:27<4:16:23,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 88/49298 [00:27<4:16:26,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 89/49298 [00:27<4:16:17,  3.20it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 90/49298 [00:27<4:16:57,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 91/49298 [00:28<4:16:59,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 92/49298 [00:28<4:17:27,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 93/49298 [00:28<4:17:23,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 94/49298 [00:29<4:17:12,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 95/49298 [00:29<4:17:25,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 96/49298 [00:29<4:17:29,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 97/49298 [00:30<4:17:01,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 98/49298 [00:30<4:17:23,  3.19it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 99/49298 [00:30<4:17:49,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 100/49298 [00:31<4:17:52,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 101/49298 [00:31<4:17:31,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 102/49298 [00:31<4:17:57,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 103/49298 [00:32<4:18:01,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 104/49298 [00:32<4:17:59,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 105/49298 [00:32<4:18:03,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 106/49298 [00:32<4:17:38,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 107/49298 [00:33<4:17:38,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 108/49298 [00:33<4:17:29,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 109/49298 [00:33<4:17:56,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 110/49298 [00:34<4:17:49,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 111/49298 [00:34<4:17:54,  3.18it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 112/49298 [00:34<4:18:16,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 113/49298 [00:35<4:18:21,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 114/49298 [00:35<4:18:57,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 115/49298 [00:35<4:18:51,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 116/49298 [00:36<4:19:14,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 117/49298 [00:36<4:19:04,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 118/49298 [00:36<4:18:28,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 119/49298 [00:37<4:18:36,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 120/49298 [00:37<4:18:50,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 121/49298 [00:37<4:18:39,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 122/49298 [00:38<4:18:54,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 123/49298 [00:38<4:18:51,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 124/49298 [00:38<4:18:51,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 125/49298 [00:38<4:18:56,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 126/49298 [00:39<4:18:55,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 127/49298 [00:39<4:18:57,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 128/49298 [00:39<4:19:05,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 129/49298 [00:40<4:19:11,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 130/49298 [00:40<4:19:01,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 131/49298 [00:40<4:18:45,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 132/49298 [00:41<4:18:48,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 133/49298 [00:41<4:19:02,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 134/49298 [00:41<4:18:29,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 135/49298 [00:42<4:18:37,  3.17it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 136/49298 [00:42<4:19:21,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 137/49298 [00:42<4:19:12,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 138/49298 [00:43<4:19:06,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 139/49298 [00:43<4:18:55,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 140/49298 [00:43<4:19:11,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 141/49298 [00:44<4:19:27,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 142/49298 [00:44<4:19:35,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 143/49298 [00:44<4:19:27,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 144/49298 [00:44<4:19:06,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 145/49298 [00:45<4:19:11,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 146/49298 [00:45<4:19:17,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 147/49298 [00:45<4:19:30,  3.16it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 148/49298 [00:46<4:19:39,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 149/49298 [00:46<4:19:49,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 150/49298 [00:46<4:20:07,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 151/49298 [00:47<4:19:58,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 152/49298 [00:47<4:20:04,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 153/49298 [00:47<4:19:37,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 154/49298 [00:48<4:19:48,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 155/49298 [00:48<4:19:53,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 156/49298 [00:48<4:20:05,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 157/49298 [00:49<4:19:51,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 158/49298 [00:49<4:19:42,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 159/49298 [00:49<4:20:04,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 160/49298 [00:50<4:20:14,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 161/49298 [00:50<4:20:04,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 162/49298 [00:50<4:20:11,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 163/49298 [00:51<4:20:07,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 164/49298 [00:51<4:19:56,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 165/49298 [00:51<4:20:21,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 166/49298 [00:51<4:20:05,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 167/49298 [00:52<4:20:05,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 168/49298 [00:52<4:20:02,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 169/49298 [00:52<4:20:19,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 170/49298 [00:53<4:20:14,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 171/49298 [00:53<4:20:26,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 172/49298 [00:53<4:20:46,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 173/49298 [00:54<4:20:15,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 174/49298 [00:54<4:20:28,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 175/49298 [00:54<4:20:43,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 176/49298 [00:55<4:20:53,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 177/49298 [00:55<4:20:37,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 178/49298 [00:55<4:20:29,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 179/49298 [00:56<4:20:05,  3.15it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 180/49298 [00:56<4:20:20,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 181/49298 [00:56<4:20:30,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 182/49298 [00:57<4:20:41,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 183/49298 [00:57<4:20:56,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 184/49298 [00:57<4:20:43,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 185/49298 [00:58<4:21:16,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 186/49298 [00:58<4:21:09,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 187/49298 [00:58<4:21:21,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 188/49298 [00:58<4:21:24,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 189/49298 [00:59<4:21:33,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 190/49298 [00:59<4:21:25,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 191/49298 [00:59<4:21:38,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 192/49298 [01:00<4:21:44,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 193/49298 [01:00<4:21:41,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 194/49298 [01:00<4:21:29,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 195/49298 [01:01<4:21:27,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 196/49298 [01:01<4:21:40,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 197/49298 [01:01<4:21:25,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 198/49298 [01:02<4:20:35,  3.14it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 199/49298 [01:02<4:21:27,  3.13it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 200/49298 [01:02<4:22:11,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 201/49298 [01:03<4:22:35,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 202/49298 [01:03<4:22:36,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 203/49298 [01:03<4:22:39,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 204/49298 [01:04<4:22:28,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 205/49298 [01:04<4:21:58,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 206/49298 [01:04<4:21:58,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 207/49298 [01:05<4:22:05,  3.12it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 208/49298 [01:05<4:23:05,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 209/49298 [01:05<4:22:48,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 210/49298 [01:06<4:22:47,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 211/49298 [01:06<4:22:46,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 212/49298 [01:06<4:22:50,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 213/49298 [01:06<4:22:46,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 214/49298 [01:07<4:22:40,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 215/49298 [01:07<4:22:45,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 216/49298 [01:07<4:22:53,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 217/49298 [01:08<4:23:07,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 218/49298 [01:08<4:22:53,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 219/49298 [01:08<4:23:26,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 220/49298 [01:09<4:23:12,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 221/49298 [01:09<4:23:06,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 222/49298 [01:09<4:22:48,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 223/49298 [01:10<4:22:37,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 224/49298 [01:10<4:22:52,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 225/49298 [01:10<4:23:23,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 226/49298 [01:11<4:23:30,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 227/49298 [01:11<4:23:26,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 228/49298 [01:11<4:23:19,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 229/49298 [01:12<4:23:28,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 230/49298 [01:12<4:23:40,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 231/49298 [01:12<4:24:20,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 232/49298 [01:13<4:24:17,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 233/49298 [01:13<4:24:04,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 234/49298 [01:13<4:23:46,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 235/49298 [01:14<4:23:36,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 236/49298 [01:14<4:23:20,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 237/49298 [01:14<4:23:12,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 238/49298 [01:15<4:23:15,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 239/49298 [01:15<4:23:16,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 240/49298 [01:15<4:23:29,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 241/49298 [01:16<4:23:38,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 242/49298 [01:16<4:23:17,  3.11it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 243/49298 [01:16<4:23:33,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 244/49298 [01:16<4:23:42,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 245/49298 [01:17<4:24:10,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  0%|          | 246/49298 [01:17<4:24:11,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 247/49298 [01:17<4:23:50,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 248/49298 [01:18<4:23:48,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 249/49298 [01:18<4:23:51,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 250/49298 [01:18<4:23:52,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 251/49298 [01:19<4:24:09,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 252/49298 [01:19<4:24:02,  3.10it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 253/49298 [01:19<4:24:12,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 254/49298 [01:20<4:24:15,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 255/49298 [01:20<4:24:44,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 256/49298 [01:20<4:24:42,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 257/49298 [01:21<4:24:33,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 258/49298 [01:21<4:24:25,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 259/49298 [01:21<4:24:34,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 260/49298 [01:22<4:24:33,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 261/49298 [01:22<4:24:12,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 262/49298 [01:22<4:24:22,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 263/49298 [01:23<4:24:21,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 264/49298 [01:23<4:24:48,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 265/49298 [01:23<4:24:29,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 266/49298 [01:24<4:24:42,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 267/49298 [01:24<4:24:58,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 268/49298 [01:24<4:24:49,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 269/49298 [01:25<4:24:37,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 270/49298 [01:25<4:24:33,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 271/49298 [01:25<4:24:26,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 272/49298 [01:26<4:24:17,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 273/49298 [01:26<4:24:12,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 274/49298 [01:26<4:24:35,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 275/49298 [01:27<4:24:22,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 276/49298 [01:27<4:24:47,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 277/49298 [01:27<4:24:25,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 278/49298 [01:27<4:24:24,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 279/49298 [01:28<4:24:27,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 280/49298 [01:28<4:24:13,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 281/49298 [01:28<4:24:11,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 282/49298 [01:29<4:24:38,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 283/49298 [01:29<4:24:27,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 284/49298 [01:29<4:24:37,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 285/49298 [01:30<4:24:53,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 286/49298 [01:30<4:24:57,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 287/49298 [01:30<4:24:32,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 288/49298 [01:31<4:24:41,  3.09it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 289/49298 [01:31<4:24:51,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 290/49298 [01:31<4:25:05,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 291/49298 [01:32<4:25:21,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 292/49298 [01:32<4:25:50,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 293/49298 [01:32<4:25:35,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 294/49298 [01:33<4:25:29,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 295/49298 [01:33<4:25:24,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 296/49298 [01:33<4:25:16,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 297/49298 [01:34<4:25:26,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 298/49298 [01:34<4:25:16,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 299/49298 [01:34<4:25:11,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 300/49298 [01:35<4:25:17,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 301/49298 [01:35<4:25:34,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 302/49298 [01:35<4:25:52,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 303/49298 [01:36<4:25:31,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 304/49298 [01:36<4:25:46,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 305/49298 [01:36<4:25:28,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 306/49298 [01:37<4:25:21,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 307/49298 [01:37<4:25:26,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 308/49298 [01:37<4:25:28,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 309/49298 [01:38<4:25:24,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 310/49298 [01:38<4:25:10,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 311/49298 [01:38<4:25:34,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 312/49298 [01:39<4:25:27,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 313/49298 [01:39<4:25:33,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 314/49298 [01:39<4:26:04,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 315/49298 [01:40<4:26:14,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 316/49298 [01:40<4:26:19,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 317/49298 [01:40<4:25:58,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 318/49298 [01:40<4:25:43,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 319/49298 [01:41<4:25:36,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 320/49298 [01:41<4:25:55,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 321/49298 [01:41<4:25:58,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 322/49298 [01:42<4:26:18,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 323/49298 [01:42<4:26:21,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 324/49298 [01:42<4:25:59,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 325/49298 [01:43<4:25:58,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 326/49298 [01:43<4:25:55,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 327/49298 [01:43<4:25:49,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 328/49298 [01:44<4:25:59,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 329/49298 [01:44<4:26:14,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 330/49298 [01:44<4:26:25,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 331/49298 [01:45<4:26:40,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 332/49298 [01:45<4:26:29,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 333/49298 [01:45<4:25:53,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 334/49298 [01:46<4:26:30,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 335/49298 [01:46<4:26:41,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 336/49298 [01:46<4:26:37,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 337/49298 [01:47<4:26:40,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 338/49298 [01:47<4:26:43,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 339/49298 [01:47<4:26:33,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 340/49298 [01:48<4:26:38,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 341/49298 [01:48<4:26:36,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 342/49298 [01:48<4:27:12,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 343/49298 [01:49<4:27:25,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 344/49298 [01:49<4:27:19,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 345/49298 [01:49<4:27:15,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 346/49298 [01:50<4:26:53,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 347/49298 [01:50<4:26:42,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 348/49298 [01:50<4:26:14,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 349/49298 [01:51<4:24:54,  3.08it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 350/49298 [01:51<4:25:47,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 351/49298 [01:51<4:26:15,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 352/49298 [01:52<4:25:58,  3.07it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 353/49298 [01:52<4:26:43,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 354/49298 [01:52<4:27:01,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 355/49298 [01:53<4:26:26,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 356/49298 [01:53<4:26:32,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 357/49298 [01:53<4:26:27,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 358/49298 [01:54<4:26:49,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 359/49298 [01:54<4:26:41,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 360/49298 [01:54<4:26:46,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 361/49298 [01:55<4:26:28,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 362/49298 [01:55<4:26:17,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 363/49298 [01:55<4:26:20,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 364/49298 [01:56<4:26:31,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 365/49298 [01:56<4:26:54,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 366/49298 [01:56<4:27:03,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 367/49298 [01:56<4:27:05,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 368/49298 [01:57<4:27:23,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 369/49298 [01:57<4:27:17,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 370/49298 [01:57<4:27:39,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 371/49298 [01:58<4:27:50,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 372/49298 [01:58<4:27:23,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 373/49298 [01:58<4:27:06,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 374/49298 [01:59<4:27:32,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 375/49298 [01:59<4:27:11,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 376/49298 [01:59<4:27:36,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 377/49298 [02:00<4:27:17,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 378/49298 [02:00<4:26:56,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 379/49298 [02:00<4:27:08,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 380/49298 [02:01<4:27:24,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 381/49298 [02:01<4:26:59,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 382/49298 [02:01<4:27:10,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 383/49298 [02:02<4:27:18,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 384/49298 [02:02<4:26:40,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 385/49298 [02:02<4:26:58,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 386/49298 [02:03<4:27:18,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 387/49298 [02:03<4:27:15,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 388/49298 [02:03<4:27:45,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 389/49298 [02:04<4:27:33,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 390/49298 [02:04<4:27:41,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 391/49298 [02:04<4:26:18,  3.06it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 392/49298 [02:05<4:27:27,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 393/49298 [02:05<4:27:45,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 394/49298 [02:05<4:27:32,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 395/49298 [02:06<4:27:16,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 396/49298 [02:06<4:27:45,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 397/49298 [02:06<4:27:58,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 398/49298 [02:07<4:27:37,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 399/49298 [02:07<4:27:59,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 400/49298 [02:07<4:27:50,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 401/49298 [02:08<4:28:07,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 402/49298 [02:08<4:27:46,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 403/49298 [02:08<4:27:17,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 404/49298 [02:09<4:26:59,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 405/49298 [02:09<4:27:42,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 406/49298 [02:09<4:27:47,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 407/49298 [02:10<4:26:58,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 408/49298 [02:10<4:27:25,  3.05it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 409/49298 [02:10<4:27:41,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 410/49298 [02:11<4:28:07,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 411/49298 [02:11<4:28:18,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 412/49298 [02:11<4:28:19,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 413/49298 [02:12<4:28:23,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 414/49298 [02:12<4:29:01,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 415/49298 [02:12<4:28:15,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 416/49298 [02:13<4:28:30,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 417/49298 [02:13<4:28:20,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 418/49298 [02:13<4:28:26,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 419/49298 [02:14<4:28:07,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 420/49298 [02:14<4:28:06,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 421/49298 [02:14<4:28:41,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 422/49298 [02:15<4:28:15,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 423/49298 [02:15<4:28:09,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 424/49298 [02:15<4:28:27,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 425/49298 [02:16<4:28:02,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 426/49298 [02:16<4:27:41,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 427/49298 [02:16<4:27:35,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 428/49298 [02:17<4:28:27,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 429/49298 [02:17<4:28:30,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 430/49298 [02:17<4:28:18,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 431/49298 [02:18<4:28:17,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 432/49298 [02:18<4:28:43,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 433/49298 [02:18<4:28:33,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 434/49298 [02:19<4:28:36,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 435/49298 [02:19<4:28:53,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 436/49298 [02:19<4:28:20,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 437/49298 [02:19<4:28:08,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 438/49298 [02:20<4:28:38,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 439/49298 [02:20<4:28:19,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 440/49298 [02:20<4:28:33,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 441/49298 [02:21<4:28:27,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 442/49298 [02:21<4:28:39,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 443/49298 [02:21<4:28:31,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 444/49298 [02:22<4:28:28,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 445/49298 [02:22<4:28:28,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 446/49298 [02:22<4:28:04,  3.04it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 447/49298 [02:23<4:29:04,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 448/49298 [02:23<4:29:03,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 449/49298 [02:23<4:28:53,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 450/49298 [02:24<4:28:51,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 451/49298 [02:24<4:28:44,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 452/49298 [02:24<4:29:00,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 453/49298 [02:25<4:28:53,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 454/49298 [02:25<4:28:39,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 455/49298 [02:25<4:28:29,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 456/49298 [02:26<4:28:41,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 457/49298 [02:26<4:28:37,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 458/49298 [02:26<4:29:02,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 459/49298 [02:27<4:29:16,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 460/49298 [02:27<4:29:02,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 461/49298 [02:27<4:28:59,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 462/49298 [02:28<4:28:44,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 463/49298 [02:28<4:28:51,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 464/49298 [02:28<4:28:20,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 465/49298 [02:29<4:28:29,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 466/49298 [02:29<4:28:54,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 467/49298 [02:29<4:28:41,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 468/49298 [02:30<4:29:15,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 469/49298 [02:30<4:28:50,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 470/49298 [02:30<4:29:13,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 471/49298 [02:31<4:29:32,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 472/49298 [02:31<4:29:14,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 473/49298 [02:31<4:29:04,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 474/49298 [02:32<4:29:17,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 475/49298 [02:32<4:29:01,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 476/49298 [02:32<4:29:18,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 477/49298 [02:33<4:29:07,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 478/49298 [02:33<4:28:50,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 479/49298 [02:33<4:28:56,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 480/49298 [02:34<4:28:53,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 481/49298 [02:34<4:28:50,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 482/49298 [02:34<4:28:33,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 483/49298 [02:35<4:28:48,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 484/49298 [02:35<4:28:57,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 485/49298 [02:35<4:29:17,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 486/49298 [02:36<4:28:50,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 487/49298 [02:36<4:29:22,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 488/49298 [02:36<4:29:58,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 489/49298 [02:37<4:29:39,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 490/49298 [02:37<4:29:52,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 491/49298 [02:37<4:29:36,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 492/49298 [02:38<4:29:52,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 493/49298 [02:38<4:29:54,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 494/49298 [02:38<4:29:35,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 495/49298 [02:39<4:29:23,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 496/49298 [02:39<4:29:17,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 497/49298 [02:39<4:29:40,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 498/49298 [02:40<4:29:50,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 499/49298 [02:40<4:29:50,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 500/49298 [02:40<4:29:20,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 501/49298 [02:41<4:28:47,  3.03it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 502/49298 [02:41<4:29:35,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 503/49298 [02:41<4:29:21,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 504/49298 [02:42<4:29:05,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 505/49298 [02:42<4:29:33,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 506/49298 [02:42<4:29:17,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 507/49298 [02:43<4:29:00,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 508/49298 [02:43<4:29:18,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 509/49298 [02:43<4:29:31,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 510/49298 [02:44<4:29:30,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 511/49298 [02:44<4:29:45,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 512/49298 [02:44<4:29:12,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 513/49298 [02:45<4:29:34,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 514/49298 [02:45<4:30:07,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 515/49298 [02:45<4:30:00,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 516/49298 [02:46<4:29:49,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 517/49298 [02:46<4:29:33,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 518/49298 [02:46<4:29:56,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 519/49298 [02:47<4:29:43,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 520/49298 [02:47<4:30:05,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 521/49298 [02:47<4:30:11,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 522/49298 [02:48<4:29:55,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 523/49298 [02:48<4:30:17,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 524/49298 [02:48<4:30:02,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 525/49298 [02:49<4:29:50,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 526/49298 [02:49<4:29:50,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 527/49298 [02:49<4:30:01,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 528/49298 [02:50<4:30:05,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 529/49298 [02:50<4:29:32,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 530/49298 [02:50<4:30:02,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 531/49298 [02:51<4:29:43,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 532/49298 [02:51<4:30:06,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 533/49298 [02:51<4:29:59,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 534/49298 [02:52<4:29:40,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 535/49298 [02:52<4:30:01,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 536/49298 [02:52<4:29:32,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 537/49298 [02:53<4:29:32,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 538/49298 [02:53<4:29:54,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 539/49298 [02:53<4:30:06,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 540/49298 [02:54<4:30:23,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 541/49298 [02:54<4:30:00,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 542/49298 [02:54<4:30:17,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 543/49298 [02:55<4:29:45,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 544/49298 [02:55<4:29:35,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 545/49298 [02:55<4:29:31,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 546/49298 [02:56<4:29:32,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 547/49298 [02:56<4:29:39,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 548/49298 [02:56<4:29:21,  3.02it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 549/49298 [02:57<4:29:54,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 550/49298 [02:57<4:30:11,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 551/49298 [02:57<4:30:12,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 552/49298 [02:58<4:30:05,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 553/49298 [02:58<4:29:49,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 554/49298 [02:58<4:29:39,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 555/49298 [02:59<4:29:29,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 556/49298 [02:59<4:29:34,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 557/49298 [02:59<4:29:50,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 558/49298 [03:00<4:30:23,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 559/49298 [03:00<4:30:28,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 560/49298 [03:00<4:29:39,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 561/49298 [03:01<4:30:44,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 562/49298 [03:01<4:30:39,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 563/49298 [03:01<4:30:15,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 564/49298 [03:02<4:30:13,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 565/49298 [03:02<4:30:33,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 566/49298 [03:02<4:29:55,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 567/49298 [03:03<4:29:42,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 568/49298 [03:03<4:29:48,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 569/49298 [03:03<4:29:47,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 570/49298 [03:04<4:30:07,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 571/49298 [03:04<4:30:24,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 572/49298 [03:04<4:30:26,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 573/49298 [03:05<4:29:54,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 574/49298 [03:05<4:30:05,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 575/49298 [03:05<4:29:46,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 576/49298 [03:06<4:29:49,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 577/49298 [03:06<4:30:12,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 578/49298 [03:06<4:30:05,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 579/49298 [03:07<4:30:19,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 580/49298 [03:07<4:30:21,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 581/49298 [03:07<4:30:49,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 582/49298 [03:08<4:30:46,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 583/49298 [03:08<4:30:28,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 584/49298 [03:08<4:30:04,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 585/49298 [03:09<4:30:00,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 586/49298 [03:09<4:29:52,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 587/49298 [03:09<4:30:13,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 588/49298 [03:10<4:30:24,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 589/49298 [03:10<4:30:55,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 590/49298 [03:10<4:30:03,  3.01it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 591/49298 [03:11<4:30:30,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 592/49298 [03:11<4:31:08,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 593/49298 [03:11<4:30:44,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 594/49298 [03:12<4:30:29,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 595/49298 [03:12<4:30:55,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 596/49298 [03:12<4:30:53,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 597/49298 [03:13<4:30:46,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 598/49298 [03:13<4:31:02,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 599/49298 [03:13<4:30:45,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 600/49298 [03:14<4:30:31,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 601/49298 [03:14<4:30:37,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 602/49298 [03:14<4:30:43,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 603/49298 [03:15<4:30:24,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 604/49298 [03:15<4:30:51,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 605/49298 [03:15<4:30:48,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 606/49298 [03:16<4:30:54,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 607/49298 [03:16<4:31:00,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 608/49298 [03:16<4:30:42,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 609/49298 [03:17<4:30:32,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 610/49298 [03:17<4:30:27,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 611/49298 [03:17<4:30:42,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 612/49298 [03:18<4:30:54,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 613/49298 [03:18<4:30:42,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 614/49298 [03:18<4:30:28,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 615/49298 [03:19<4:31:14,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|          | 616/49298 [03:19<4:30:42,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 617/49298 [03:19<4:30:26,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 618/49298 [03:20<4:30:48,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 619/49298 [03:20<4:30:41,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 620/49298 [03:20<4:30:44,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 621/49298 [03:21<4:30:41,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 622/49298 [03:21<4:30:52,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 623/49298 [03:21<4:30:43,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 624/49298 [03:22<4:30:22,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 625/49298 [03:22<4:31:18,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 626/49298 [03:22<4:31:16,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 627/49298 [03:23<4:30:40,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 628/49298 [03:23<4:30:20,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 629/49298 [03:23<4:31:03,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 630/49298 [03:24<4:30:30,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 631/49298 [03:24<4:30:49,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 632/49298 [03:24<4:31:07,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 633/49298 [03:25<4:31:20,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 634/49298 [03:25<4:31:11,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 635/49298 [03:25<4:31:13,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 636/49298 [03:26<4:31:39,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 637/49298 [03:26<4:30:50,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 638/49298 [03:26<4:30:34,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 639/49298 [03:27<4:30:47,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 640/49298 [03:27<4:30:56,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 641/49298 [03:27<4:30:35,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 642/49298 [03:28<4:30:26,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 643/49298 [03:28<4:30:29,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 644/49298 [03:28<4:30:04,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 645/49298 [03:29<4:30:17,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 646/49298 [03:29<4:30:20,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 647/49298 [03:29<4:30:26,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 648/49298 [03:30<4:30:15,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 649/49298 [03:30<4:30:55,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 650/49298 [03:30<4:30:45,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 651/49298 [03:31<4:30:58,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 652/49298 [03:31<4:30:53,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 653/49298 [03:31<4:31:15,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 654/49298 [03:32<4:31:19,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 655/49298 [03:32<4:31:18,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 656/49298 [03:32<4:31:21,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 657/49298 [03:33<4:31:29,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 658/49298 [03:33<4:30:50,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 659/49298 [03:33<4:31:43,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 660/49298 [03:34<4:30:52,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 661/49298 [03:34<4:31:09,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 662/49298 [03:34<4:30:54,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 663/49298 [03:35<4:30:53,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 664/49298 [03:35<4:30:33,  3.00it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 665/49298 [03:35<4:31:13,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 666/49298 [03:36<4:30:41,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 667/49298 [03:36<4:31:33,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 668/49298 [03:36<4:31:17,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 669/49298 [03:37<4:31:32,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 670/49298 [03:37<4:31:21,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 671/49298 [03:37<4:31:44,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 672/49298 [03:38<4:31:34,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 673/49298 [03:38<4:30:42,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 674/49298 [03:38<4:31:02,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 675/49298 [03:39<4:30:48,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 676/49298 [03:39<4:31:12,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 677/49298 [03:39<4:31:15,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 678/49298 [03:40<4:31:18,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 679/49298 [03:40<4:31:20,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 680/49298 [03:40<4:31:38,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 681/49298 [03:41<4:31:21,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 682/49298 [03:41<4:31:13,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 683/49298 [03:41<4:31:32,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 684/49298 [03:42<4:30:54,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 685/49298 [03:42<4:31:21,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 686/49298 [03:42<4:31:21,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 687/49298 [03:43<4:31:20,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 688/49298 [03:43<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 689/49298 [03:43<4:31:32,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 690/49298 [03:44<4:31:43,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 691/49298 [03:44<4:31:43,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 692/49298 [03:44<4:31:19,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 693/49298 [03:45<4:31:17,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 694/49298 [03:45<4:31:07,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 695/49298 [03:45<4:31:13,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 696/49298 [03:46<4:30:50,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 697/49298 [03:46<4:31:52,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 698/49298 [03:46<4:31:35,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 699/49298 [03:47<4:31:46,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 700/49298 [03:47<4:32:06,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 701/49298 [03:47<4:31:25,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 702/49298 [03:48<4:32:04,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 703/49298 [03:48<4:31:33,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 704/49298 [03:48<4:31:34,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 705/49298 [03:49<4:31:11,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 706/49298 [03:49<4:31:31,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 707/49298 [03:49<4:31:24,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 708/49298 [03:50<4:31:12,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 709/49298 [03:50<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 710/49298 [03:50<4:31:33,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 711/49298 [03:51<4:30:59,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 712/49298 [03:51<4:30:26,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 713/49298 [03:51<4:31:20,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 714/49298 [03:52<4:31:47,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 715/49298 [03:52<4:30:37,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 716/49298 [03:52<4:31:44,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 717/49298 [03:53<4:31:54,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 718/49298 [03:53<4:31:06,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 719/49298 [03:53<4:31:34,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 720/49298 [03:54<4:31:25,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 721/49298 [03:54<4:31:46,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 722/49298 [03:54<4:31:45,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 723/49298 [03:55<4:31:40,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 724/49298 [03:55<4:32:06,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 725/49298 [03:55<4:31:26,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 726/49298 [03:56<4:31:48,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 727/49298 [03:56<4:31:37,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 728/49298 [03:56<4:31:11,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 729/49298 [03:57<4:31:39,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 730/49298 [03:57<4:31:00,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 731/49298 [03:57<4:31:13,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 732/49298 [03:58<4:30:58,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 733/49298 [03:58<4:31:21,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 734/49298 [03:58<4:30:43,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 735/49298 [03:59<4:30:48,  2.99it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 736/49298 [03:59<4:31:58,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 737/49298 [03:59<4:32:20,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 738/49298 [04:00<4:32:02,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  1%|▏         | 739/49298 [04:00<4:32:27,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 740/49298 [04:00<4:32:11,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 741/49298 [04:01<4:32:02,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 742/49298 [04:01<4:31:58,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 743/49298 [04:01<4:31:39,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 744/49298 [04:02<4:32:00,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 745/49298 [04:02<4:31:36,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 746/49298 [04:02<4:31:41,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 747/49298 [04:03<4:32:01,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 748/49298 [04:03<4:31:20,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 749/49298 [04:03<4:31:29,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 750/49298 [04:04<4:31:14,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 751/49298 [04:04<4:31:14,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 752/49298 [04:04<4:31:15,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 753/49298 [04:05<4:31:20,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 754/49298 [04:05<4:31:25,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 755/49298 [04:05<4:31:40,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 756/49298 [04:06<4:31:13,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 757/49298 [04:06<4:31:37,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 758/49298 [04:06<4:31:43,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 759/49298 [04:07<4:31:38,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 760/49298 [04:07<4:32:03,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 761/49298 [04:07<4:31:30,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 762/49298 [04:08<4:31:33,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 763/49298 [04:08<4:31:12,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 764/49298 [04:08<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 765/49298 [04:09<4:31:43,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 766/49298 [04:09<4:31:39,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 767/49298 [04:09<4:32:03,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 768/49298 [04:10<4:31:56,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 769/49298 [04:10<4:31:40,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 770/49298 [04:10<4:32:01,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 771/49298 [04:11<4:32:37,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 772/49298 [04:11<4:31:24,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 773/49298 [04:11<4:31:50,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 774/49298 [04:12<4:31:52,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 775/49298 [04:12<4:31:54,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 776/49298 [04:13<4:31:56,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 777/49298 [04:13<4:31:58,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 778/49298 [04:13<4:31:55,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 779/49298 [04:14<4:31:23,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 780/49298 [04:14<4:31:24,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 781/49298 [04:14<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 782/49298 [04:15<4:31:35,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 783/49298 [04:15<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 784/49298 [04:15<4:31:00,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 785/49298 [04:16<4:31:34,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 786/49298 [04:16<4:31:57,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 787/49298 [04:16<4:31:12,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 788/49298 [04:17<4:31:29,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 789/49298 [04:17<4:31:47,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 790/49298 [04:17<4:31:46,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 791/49298 [04:18<4:31:44,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 792/49298 [04:18<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 793/49298 [04:18<4:32:12,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 794/49298 [04:19<4:32:11,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 795/49298 [04:19<4:31:31,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 796/49298 [04:19<4:31:17,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 797/49298 [04:20<4:31:52,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 798/49298 [04:20<4:32:20,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 799/49298 [04:20<4:32:00,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 800/49298 [04:21<4:31:31,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 801/49298 [04:21<4:31:17,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 802/49298 [04:21<4:31:37,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 803/49298 [04:22<4:31:47,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 804/49298 [04:22<4:31:14,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 805/49298 [04:22<4:31:15,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 806/49298 [04:23<4:31:57,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 807/49298 [04:23<4:31:37,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 808/49298 [04:23<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 809/49298 [04:24<4:31:13,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 810/49298 [04:24<4:32:05,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 811/49298 [04:24<4:32:22,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 812/49298 [04:25<4:32:04,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 813/49298 [04:25<4:32:10,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 814/49298 [04:25<4:31:58,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 815/49298 [04:26<4:31:57,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 816/49298 [04:26<4:32:20,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 817/49298 [04:26<4:32:13,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 818/49298 [04:27<4:31:46,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 819/49298 [04:27<4:32:09,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 820/49298 [04:27<4:31:27,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 821/49298 [04:28<4:31:52,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 822/49298 [04:28<4:31:28,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 823/49298 [04:28<4:31:40,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 824/49298 [04:29<4:31:33,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 825/49298 [04:29<4:31:09,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 826/49298 [04:29<4:31:20,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 827/49298 [04:30<4:31:47,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 828/49298 [04:30<4:31:25,  2.98it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 829/49298 [04:30<4:31:34,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 830/49298 [04:31<4:31:43,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 831/49298 [04:31<4:31:55,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 832/49298 [04:31<4:31:38,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 833/49298 [04:32<4:31:43,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 834/49298 [04:32<4:31:32,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 835/49298 [04:32<4:32:05,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 836/49298 [04:33<4:32:23,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 837/49298 [04:33<4:31:29,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 838/49298 [04:33<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 839/49298 [04:34<4:31:33,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 840/49298 [04:34<4:31:30,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 841/49298 [04:34<4:31:43,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 842/49298 [04:35<4:31:55,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 843/49298 [04:35<4:31:50,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 844/49298 [04:35<4:32:13,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 845/49298 [04:36<4:32:09,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 846/49298 [04:36<4:31:55,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 847/49298 [04:36<4:32:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 848/49298 [04:37<4:32:01,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 849/49298 [04:37<4:32:02,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 850/49298 [04:37<4:32:24,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 851/49298 [04:38<4:32:02,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 852/49298 [04:38<4:31:39,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 853/49298 [04:38<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 854/49298 [04:39<4:31:41,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 855/49298 [04:39<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 856/49298 [04:39<4:31:46,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 857/49298 [04:40<4:31:25,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 858/49298 [04:40<4:31:42,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 859/49298 [04:40<4:32:16,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 860/49298 [04:41<4:31:43,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 861/49298 [04:41<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 862/49298 [04:41<4:31:49,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 863/49298 [04:42<4:31:41,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 864/49298 [04:42<4:31:45,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 865/49298 [04:42<4:32:09,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 866/49298 [04:43<4:32:10,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 867/49298 [04:43<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 868/49298 [04:43<4:32:12,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 869/49298 [04:44<4:31:50,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 870/49298 [04:44<4:31:59,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 871/49298 [04:44<4:32:08,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 872/49298 [04:45<4:31:54,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 873/49298 [04:45<4:31:36,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 874/49298 [04:45<4:31:50,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 875/49298 [04:46<4:31:40,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 876/49298 [04:46<4:31:32,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 877/49298 [04:46<4:31:53,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 878/49298 [04:47<4:31:22,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 879/49298 [04:47<4:31:57,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 880/49298 [04:48<4:32:28,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 881/49298 [04:48<4:32:06,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 882/49298 [04:48<4:32:09,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 883/49298 [04:49<4:32:21,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 884/49298 [04:49<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 885/49298 [04:49<4:31:38,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 886/49298 [04:50<4:32:20,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 887/49298 [04:50<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 888/49298 [04:50<4:31:38,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 889/49298 [04:51<4:32:03,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 890/49298 [04:51<4:31:37,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 891/49298 [04:51<4:31:28,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 892/49298 [04:52<4:32:10,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 893/49298 [04:52<4:31:27,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 894/49298 [04:52<4:31:38,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 895/49298 [04:53<4:31:42,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 896/49298 [04:53<4:31:48,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 897/49298 [04:53<4:31:56,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 898/49298 [04:54<4:31:31,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 899/49298 [04:54<4:31:31,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 900/49298 [04:54<4:31:52,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 901/49298 [04:55<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 902/49298 [04:55<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 903/49298 [04:55<4:31:58,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 904/49298 [04:56<4:31:46,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 905/49298 [04:56<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 906/49298 [04:56<4:31:52,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 907/49298 [04:57<4:31:56,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 908/49298 [04:57<4:31:34,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 909/49298 [04:57<4:31:47,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 910/49298 [04:58<4:31:50,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 911/49298 [04:58<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 912/49298 [04:58<4:32:23,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 913/49298 [04:59<4:32:40,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 914/49298 [04:59<4:32:25,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 915/49298 [04:59<4:31:45,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 916/49298 [05:00<4:32:16,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 917/49298 [05:00<4:31:38,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 918/49298 [05:00<4:31:40,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 919/49298 [05:01<4:31:49,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 920/49298 [05:01<4:32:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 921/49298 [05:01<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 922/49298 [05:02<4:32:25,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 923/49298 [05:02<4:32:51,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 924/49298 [05:02<4:31:44,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 925/49298 [05:03<4:31:30,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 926/49298 [05:03<4:32:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 927/49298 [05:03<4:31:58,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 928/49298 [05:04<4:31:50,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 929/49298 [05:04<4:32:18,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 930/49298 [05:04<4:31:25,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 931/49298 [05:05<4:31:00,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 932/49298 [05:05<4:31:47,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 933/49298 [05:05<4:31:33,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 934/49298 [05:06<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 935/49298 [05:06<4:32:14,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 936/49298 [05:06<4:31:16,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 937/49298 [05:07<4:31:04,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 938/49298 [05:07<4:31:38,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 939/49298 [05:07<4:31:28,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 940/49298 [05:08<4:31:16,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 941/49298 [05:08<4:31:36,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 942/49298 [05:08<4:31:18,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 943/49298 [05:09<4:31:42,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 944/49298 [05:09<4:32:02,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 945/49298 [05:09<4:31:52,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 946/49298 [05:10<4:31:21,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 947/49298 [05:10<4:32:08,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 948/49298 [05:10<4:32:05,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 949/49298 [05:11<4:31:37,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 950/49298 [05:11<4:32:04,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 951/49298 [05:11<4:32:20,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 952/49298 [05:12<4:31:36,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 953/49298 [05:12<4:31:33,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 954/49298 [05:12<4:32:23,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 955/49298 [05:13<4:31:45,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 956/49298 [05:13<4:31:23,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 957/49298 [05:13<4:32:14,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 958/49298 [05:14<4:32:01,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 959/49298 [05:14<4:31:56,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 960/49298 [05:14<4:32:02,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 961/49298 [05:15<4:32:22,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 962/49298 [05:15<4:31:55,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 963/49298 [05:15<4:31:39,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 964/49298 [05:16<4:32:13,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 965/49298 [05:16<4:31:35,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 966/49298 [05:16<4:31:28,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 967/49298 [05:17<4:32:07,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 968/49298 [05:17<4:32:16,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 969/49298 [05:18<4:31:21,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 970/49298 [05:18<4:32:08,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 971/49298 [05:18<4:32:19,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 972/49298 [05:19<4:31:52,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 973/49298 [05:19<4:32:05,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 974/49298 [05:19<4:32:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 975/49298 [05:20<4:32:06,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 976/49298 [05:20<4:31:49,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 977/49298 [05:20<4:32:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 978/49298 [05:21<4:32:32,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 979/49298 [05:21<4:31:52,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 980/49298 [05:21<4:31:29,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 981/49298 [05:22<4:32:29,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 982/49298 [05:22<4:31:37,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 983/49298 [05:22<4:31:29,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 984/49298 [05:23<4:31:45,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 985/49298 [05:23<4:31:17,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 986/49298 [05:23<4:31:03,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 987/49298 [05:24<4:31:21,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 988/49298 [05:24<4:31:10,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 989/49298 [05:24<4:31:16,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 990/49298 [05:25<4:31:57,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 991/49298 [05:25<4:31:47,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 992/49298 [05:25<4:31:28,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 993/49298 [05:26<4:31:44,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 994/49298 [05:26<4:32:11,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 995/49298 [05:26<4:31:53,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 996/49298 [05:27<4:32:11,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 997/49298 [05:27<4:31:55,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 998/49298 [05:27<4:31:51,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 999/49298 [05:28<4:32:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1000/49298 [05:28<4:32:44,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1001/49298 [05:28<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1002/49298 [05:29<4:31:28,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1003/49298 [05:29<4:32:30,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1004/49298 [05:29<4:31:52,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1005/49298 [05:30<4:31:56,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1006/49298 [05:30<4:31:57,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1007/49298 [05:30<4:32:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1008/49298 [05:31<4:31:37,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1009/49298 [05:31<4:31:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1010/49298 [05:31<4:32:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1011/49298 [05:32<4:31:26,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1012/49298 [05:32<4:31:08,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1013/49298 [05:32<4:31:34,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1014/49298 [05:33<4:31:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1015/49298 [05:33<4:31:21,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1016/49298 [05:33<4:31:23,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1017/49298 [05:34<4:31:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1018/49298 [05:34<4:31:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1019/49298 [05:34<4:32:02,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1020/49298 [05:35<4:32:22,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1021/49298 [05:35<4:31:30,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1022/49298 [05:35<4:31:43,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1023/49298 [05:36<4:32:18,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1024/49298 [05:36<4:31:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1025/49298 [05:36<4:31:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1026/49298 [05:37<4:32:01,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1027/49298 [05:37<4:31:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1028/49298 [05:37<4:31:26,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1029/49298 [05:38<4:32:30,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1030/49298 [05:38<4:32:41,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1031/49298 [05:38<4:31:48,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1032/49298 [05:39<4:31:47,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1033/49298 [05:39<4:32:09,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1034/49298 [05:39<4:32:20,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1035/49298 [05:40<4:31:20,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1036/49298 [05:40<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1037/49298 [05:40<4:31:47,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1038/49298 [05:41<4:31:17,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1039/49298 [05:41<4:31:34,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1040/49298 [05:41<4:32:08,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1041/49298 [05:42<4:32:10,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1042/49298 [05:42<4:31:47,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1043/49298 [05:43<4:32:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1044/49298 [05:43<4:32:22,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1045/49298 [05:43<4:32:28,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1046/49298 [05:44<4:31:05,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1047/49298 [05:44<4:32:02,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1048/49298 [05:44<4:32:19,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1049/49298 [05:45<4:32:16,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1050/49298 [05:45<4:31:40,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1051/49298 [05:45<4:32:24,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1052/49298 [05:46<4:32:40,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1053/49298 [05:46<4:32:11,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1054/49298 [05:46<4:31:43,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1055/49298 [05:47<4:31:35,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1056/49298 [05:47<4:31:45,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1057/49298 [05:47<4:31:39,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1058/49298 [05:48<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1059/49298 [05:48<4:31:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1060/49298 [05:48<4:31:27,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1061/49298 [05:49<4:31:46,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1062/49298 [05:49<4:31:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1063/49298 [05:49<4:32:06,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1064/49298 [05:50<4:32:08,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1065/49298 [05:50<4:31:01,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1066/49298 [05:50<4:31:50,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1067/49298 [05:51<4:31:41,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1068/49298 [05:51<4:31:17,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1069/49298 [05:51<4:31:37,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1070/49298 [05:52<4:31:51,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1071/49298 [05:52<4:30:57,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1072/49298 [05:52<4:31:23,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1073/49298 [05:53<4:32:09,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1074/49298 [05:53<4:31:59,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1075/49298 [05:53<4:31:53,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1076/49298 [05:54<4:31:26,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1077/49298 [05:54<4:31:37,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1078/49298 [05:54<4:31:26,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1079/49298 [05:55<4:31:22,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1080/49298 [05:55<4:31:43,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1081/49298 [05:55<4:32:15,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1082/49298 [05:56<4:31:38,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1083/49298 [05:56<4:31:22,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1084/49298 [05:56<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1085/49298 [05:57<4:31:40,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1086/49298 [05:57<4:31:07,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1087/49298 [05:57<4:31:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1088/49298 [05:58<4:32:16,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1089/49298 [05:58<4:31:21,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1090/49298 [05:58<4:31:07,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1091/49298 [05:59<4:31:55,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1092/49298 [05:59<4:31:44,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1093/49298 [05:59<4:31:11,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1094/49298 [06:00<4:31:22,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1095/49298 [06:00<4:31:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1096/49298 [06:00<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1097/49298 [06:01<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1098/49298 [06:01<4:31:48,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1099/49298 [06:01<4:32:10,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1100/49298 [06:02<4:32:05,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1101/49298 [06:02<4:31:24,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1102/49298 [06:02<4:31:30,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1103/49298 [06:03<4:32:05,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1104/49298 [06:03<4:32:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1105/49298 [06:03<4:31:46,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1106/49298 [06:04<4:31:45,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1107/49298 [06:04<4:32:07,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1108/49298 [06:04<4:31:48,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1109/49298 [06:05<4:31:46,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1110/49298 [06:05<4:32:41,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1111/49298 [06:06<4:32:08,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1112/49298 [06:06<4:31:30,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1113/49298 [06:06<4:31:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1114/49298 [06:07<4:31:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1115/49298 [06:07<4:31:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1116/49298 [06:07<4:31:08,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1117/49298 [06:08<4:31:39,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1118/49298 [06:08<4:32:06,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1119/49298 [06:08<4:31:58,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1120/49298 [06:09<4:31:16,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1121/49298 [06:09<4:31:17,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1122/49298 [06:09<4:32:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1123/49298 [06:10<4:31:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1124/49298 [06:10<4:31:12,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1125/49298 [06:10<4:31:33,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1126/49298 [06:11<4:32:08,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1127/49298 [06:11<4:32:05,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1128/49298 [06:11<4:31:27,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1129/49298 [06:12<4:31:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1130/49298 [06:12<4:31:47,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1131/49298 [06:12<4:31:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1132/49298 [06:13<4:31:01,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1133/49298 [06:13<4:31:58,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1134/49298 [06:13<4:32:39,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1135/49298 [06:14<4:32:34,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1136/49298 [06:14<4:32:37,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1137/49298 [06:14<4:31:25,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1138/49298 [06:15<4:31:05,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1139/49298 [06:15<4:31:27,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1140/49298 [06:15<4:31:22,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1141/49298 [06:16<4:30:54,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1142/49298 [06:16<4:31:14,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1143/49298 [06:16<4:31:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1144/49298 [06:17<4:31:10,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1145/49298 [06:17<4:30:43,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1146/49298 [06:17<4:31:48,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1147/49298 [06:18<4:31:42,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1148/49298 [06:18<4:31:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1149/49298 [06:18<4:31:04,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1150/49298 [06:19<4:31:36,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1151/49298 [06:19<4:32:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1152/49298 [06:19<4:31:09,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1153/49298 [06:20<4:31:19,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1154/49298 [06:20<4:32:03,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1155/49298 [06:20<4:31:47,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1156/49298 [06:21<4:31:33,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1157/49298 [06:21<4:31:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1158/49298 [06:21<4:31:58,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1159/49298 [06:22<4:31:57,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1160/49298 [06:22<4:31:31,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1161/49298 [06:22<4:31:11,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1162/49298 [06:23<4:31:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1163/49298 [06:23<4:31:31,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1164/49298 [06:23<4:30:31,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1165/49298 [06:24<4:30:52,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1166/49298 [06:24<4:31:33,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1167/49298 [06:24<4:31:10,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1168/49298 [06:25<4:31:14,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1169/49298 [06:25<4:31:42,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1170/49298 [06:25<4:31:44,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1171/49298 [06:26<4:31:28,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1172/49298 [06:26<4:30:49,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1173/49298 [06:26<4:31:23,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1174/49298 [06:27<4:31:51,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1175/49298 [06:27<4:31:22,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1176/49298 [06:27<4:30:49,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1177/49298 [06:28<4:31:35,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1178/49298 [06:28<4:31:06,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1179/49298 [06:29<4:30:58,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1180/49298 [06:29<4:31:20,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1181/49298 [06:29<4:31:51,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1182/49298 [06:30<4:31:28,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1183/49298 [06:30<4:31:08,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1184/49298 [06:30<4:31:43,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1185/49298 [06:31<4:31:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1186/49298 [06:31<4:31:44,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1187/49298 [06:31<4:31:25,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1188/49298 [06:32<4:31:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1189/49298 [06:32<4:31:52,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1190/49298 [06:32<4:31:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1191/49298 [06:33<4:30:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1192/49298 [06:33<4:31:55,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1193/49298 [06:33<4:31:57,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1194/49298 [06:34<4:31:16,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1195/49298 [06:34<4:31:24,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1196/49298 [06:34<4:31:30,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1197/49298 [06:35<4:31:11,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1198/49298 [06:35<4:30:55,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1199/49298 [06:35<4:31:25,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1200/49298 [06:36<4:31:34,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1201/49298 [06:36<4:30:46,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1202/49298 [06:36<4:31:11,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1203/49298 [06:37<4:31:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1204/49298 [06:37<4:31:58,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1205/49298 [06:37<4:31:25,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1206/49298 [06:38<4:31:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1207/49298 [06:38<4:31:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1208/49298 [06:38<4:31:51,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1209/49298 [06:39<4:31:14,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1210/49298 [06:39<4:31:12,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1211/49298 [06:39<4:31:29,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1212/49298 [06:40<4:31:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1213/49298 [06:40<4:30:43,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1214/49298 [06:40<4:30:50,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1215/49298 [06:41<4:30:53,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1216/49298 [06:41<4:30:18,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1217/49298 [06:41<4:31:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1218/49298 [06:42<4:31:33,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1219/49298 [06:42<4:30:56,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1220/49298 [06:42<4:30:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1221/49298 [06:43<4:31:42,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1222/49298 [06:43<4:31:32,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1223/49298 [06:43<4:30:46,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1224/49298 [06:44<4:31:16,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1225/49298 [06:44<4:31:27,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1226/49298 [06:44<4:30:44,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1227/49298 [06:45<4:30:48,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1228/49298 [06:45<4:31:05,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1229/49298 [06:45<4:31:45,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1230/49298 [06:46<4:31:08,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1231/49298 [06:46<4:30:28,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  2%|▏         | 1232/49298 [06:46<4:31:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1233/49298 [06:47<4:31:17,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1234/49298 [06:47<4:30:23,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1235/49298 [06:47<4:30:50,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1236/49298 [06:48<4:31:42,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1237/49298 [06:48<4:31:23,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1238/49298 [06:48<4:30:38,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1239/49298 [06:49<4:31:06,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1240/49298 [06:49<4:31:20,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1241/49298 [06:49<4:31:16,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1242/49298 [06:50<4:30:45,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1243/49298 [06:50<4:30:43,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1244/49298 [06:51<4:31:40,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1245/49298 [06:51<4:31:26,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1246/49298 [06:51<4:30:53,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1247/49298 [06:52<4:30:44,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1248/49298 [06:52<4:31:24,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1249/49298 [06:52<4:31:24,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1250/49298 [06:53<4:31:34,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1251/49298 [06:53<4:30:58,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1252/49298 [06:53<4:31:16,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1253/49298 [06:54<4:31:17,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1254/49298 [06:54<4:30:53,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1255/49298 [06:54<4:30:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1256/49298 [06:55<4:31:05,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1257/49298 [06:55<4:31:18,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1258/49298 [06:55<4:30:16,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1259/49298 [06:56<4:30:19,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1260/49298 [06:56<4:30:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1261/49298 [06:56<4:30:57,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1262/49298 [06:57<4:30:30,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1263/49298 [06:57<4:31:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1264/49298 [06:57<4:31:09,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1265/49298 [06:58<4:31:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1266/49298 [06:58<4:30:24,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1267/49298 [06:58<4:31:09,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1268/49298 [06:59<4:31:14,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1269/49298 [06:59<4:31:03,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1270/49298 [06:59<4:30:34,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1271/49298 [07:00<4:30:45,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1272/49298 [07:00<4:30:59,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1273/49298 [07:00<4:30:28,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1274/49298 [07:01<4:30:19,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1275/49298 [07:01<4:30:49,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1276/49298 [07:01<4:31:19,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1277/49298 [07:02<4:30:26,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1278/49298 [07:02<4:30:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1279/49298 [07:02<4:31:29,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1280/49298 [07:03<4:31:08,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1281/49298 [07:03<4:30:21,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1282/49298 [07:03<4:30:27,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1283/49298 [07:04<4:31:11,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1284/49298 [07:04<4:30:57,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1285/49298 [07:04<4:30:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1286/49298 [07:05<4:30:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1287/49298 [07:05<4:31:19,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1288/49298 [07:05<4:31:36,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1289/49298 [07:06<4:30:50,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1290/49298 [07:06<4:30:33,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1291/49298 [07:06<4:30:52,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1292/49298 [07:07<4:31:01,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1293/49298 [07:07<4:30:53,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1294/49298 [07:07<4:30:36,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1295/49298 [07:08<4:30:51,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1296/49298 [07:08<4:30:51,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1297/49298 [07:08<4:30:31,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1298/49298 [07:09<4:30:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1299/49298 [07:09<4:30:41,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1300/49298 [07:09<4:30:35,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1301/49298 [07:10<4:29:56,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1302/49298 [07:10<4:30:02,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1303/49298 [07:10<4:30:30,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1304/49298 [07:11<4:30:13,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1305/49298 [07:11<4:29:51,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1306/49298 [07:11<4:30:37,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1307/49298 [07:12<4:30:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1308/49298 [07:12<4:29:36,  2.97it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1309/49298 [07:13<4:30:25,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1310/49298 [07:13<4:31:20,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1311/49298 [07:13<4:30:59,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1312/49298 [07:14<4:31:11,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1313/49298 [07:14<4:30:14,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1314/49298 [07:14<4:30:53,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1315/49298 [07:15<4:31:10,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1316/49298 [07:15<4:30:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1317/49298 [07:15<4:30:48,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1318/49298 [07:16<4:30:20,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1319/49298 [07:16<4:30:44,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1320/49298 [07:16<4:31:06,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1321/49298 [07:17<4:30:36,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1322/49298 [07:17<4:30:04,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1323/49298 [07:17<4:30:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1324/49298 [07:18<4:30:55,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1325/49298 [07:18<4:30:24,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1326/49298 [07:18<4:30:52,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1327/49298 [07:19<4:31:12,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1328/49298 [07:19<4:31:07,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1329/49298 [07:19<4:30:32,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1330/49298 [07:20<4:30:48,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1331/49298 [07:20<4:30:54,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1332/49298 [07:20<4:31:07,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1333/49298 [07:21<4:30:53,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1334/49298 [07:21<4:30:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1335/49298 [07:21<4:30:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1336/49298 [07:22<4:31:04,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1337/49298 [07:22<4:30:32,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1338/49298 [07:22<4:29:55,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1339/49298 [07:23<4:30:45,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1340/49298 [07:23<4:30:37,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1341/49298 [07:23<4:30:30,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1342/49298 [07:24<4:29:49,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1343/49298 [07:24<4:30:33,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1344/49298 [07:24<4:30:55,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1345/49298 [07:25<4:30:19,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1346/49298 [07:25<4:30:24,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1347/49298 [07:25<4:31:12,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1348/49298 [07:26<4:31:30,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1349/49298 [07:26<4:31:28,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1350/49298 [07:26<4:31:17,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1351/49298 [07:27<4:30:45,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1352/49298 [07:27<4:31:19,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1353/49298 [07:27<4:31:07,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1354/49298 [07:28<4:31:14,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1355/49298 [07:28<4:30:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1356/49298 [07:28<4:30:25,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1357/49298 [07:29<4:31:12,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1358/49298 [07:29<4:31:04,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1359/49298 [07:29<4:30:53,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1360/49298 [07:30<4:30:18,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1361/49298 [07:30<4:30:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1362/49298 [07:30<4:31:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1363/49298 [07:31<4:31:04,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1364/49298 [07:31<4:30:32,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1365/49298 [07:31<4:30:28,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1366/49298 [07:32<4:31:06,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1367/49298 [07:32<4:31:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1368/49298 [07:32<4:30:16,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1369/49298 [07:33<4:30:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1370/49298 [07:33<4:30:38,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1371/49298 [07:34<4:30:17,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1372/49298 [07:34<4:30:10,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1373/49298 [07:34<4:30:14,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1374/49298 [07:35<4:30:21,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1375/49298 [07:35<4:30:08,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1376/49298 [07:35<4:30:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1377/49298 [07:36<4:30:34,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1378/49298 [07:36<4:30:52,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1379/49298 [07:36<4:29:58,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1380/49298 [07:37<4:30:03,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1381/49298 [07:37<4:30:56,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1382/49298 [07:37<4:30:44,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1383/49298 [07:38<4:30:00,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1384/49298 [07:38<4:30:23,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1385/49298 [07:38<4:30:50,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1386/49298 [07:39<4:30:47,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1387/49298 [07:39<4:31:00,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1388/49298 [07:39<4:30:35,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1389/49298 [07:40<4:30:49,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1390/49298 [07:40<4:31:15,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1391/49298 [07:40<4:30:53,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1392/49298 [07:41<4:30:02,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1393/49298 [07:41<4:30:21,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1394/49298 [07:41<4:30:57,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1395/49298 [07:42<4:30:34,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1396/49298 [07:42<4:29:42,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1397/49298 [07:42<4:30:50,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1398/49298 [07:43<4:31:24,  2.94it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1399/49298 [07:43<4:30:57,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1400/49298 [07:43<4:30:11,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1401/49298 [07:44<4:30:36,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1402/49298 [07:44<4:30:47,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1403/49298 [07:44<4:30:15,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1404/49298 [07:45<4:30:04,  2.96it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1405/49298 [07:45<4:30:30,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1406/49298 [07:45<4:30:53,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1407/49298 [07:46<4:30:27,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1408/49298 [07:46<4:30:09,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1409/49298 [07:46<4:30:36,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1410/49298 [07:47<4:30:27,  2.95it/s, loss=nan]

input_ids shape: torch.Size([4, 512])
attention_masks shape: torch.Size([4, 512])
labels shape: torch.Size([4, 512])


  3%|▎         | 1410/49298 [07:47<4:24:39,  3.02it/s, loss=nan]


KeyboardInterrupt: 