In [2]:
%pip install diffusers transformers accelerate peft
%pip install --quiet kagglehub
%pip install -q bitsandbytes




In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import os
import json
import random
import zipfile
import numpy as np
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, DDPMScheduler, AutoencoderKL
from peft import LoraConfig, get_peft_model
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
from accelerate import Accelerator
from tqdm import tqdm
import kagglehub
from huggingface_hub import login

# Set environment variable to handle memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Étape 0 : Authentification Hugging Face
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    from getpass import getpass
    hf_token = getpass("Entrez votre token Hugging Face : ")
login(hf_token)
print("Authentification HF réussie.")

# Étape 1 : Télécharger le Dataset Kaggle
dataset_name = "fatihkgg/ecommerce-product-images-18k"
dataset_path = kagglehub.dataset_download(dataset_name, force_download=False)
print(f"Dataset downloaded to: {dataset_path}")

# Extraction ZIP
top_level_contents = os.listdir(dataset_path)
print(f"Top-level contents: {top_level_contents}")
zip_files = [f for f in top_level_contents if f.lower().endswith('.zip')]
if zip_files:
    for zip_file in zip_files:
        zip_path = os.path.join(dataset_path, zip_file)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(dataset_path)
        print(f"Extracted {zip_path}")
        os.remove(zip_path)
    top_level_contents = os.listdir(dataset_path)

root_folder = None
if len(top_level_contents) == 1 and os.path.isdir(os.path.join(dataset_path, top_level_contents[0])):
    root_folder = top_level_contents[0]
train_dir = os.path.join(dataset_path, root_folder if root_folder else "", "train")
if not os.path.exists(train_dir):
    raise FileNotFoundError(f"Train dir not found at {train_dir}.")
categories = [d for d in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, d))]
print(f"Categories: {categories}")

# Étape 2 : Hyperparamètres SDXL (PLUS SIMPLE)
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
output_dir = "fine_tuned_sdxl_ecommerce"
resolution = 512  # Résolution modeste
train_batch_size = 1
num_epochs = 3  # Réduit pour tester
learning_rate = 1e-4
lora_rank = 4  # Très petit pour économiser VRAM
gradient_accumulation_steps = 4

# Étape 3 : Dataset
class KaggleEcommerceDataset(Dataset):
    def __init__(self, root_dir, tokenizer, tokenizer_2, resolution, categories, num_images=1000):
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.tokenizer_2 = tokenizer_2
        self.resolution = resolution
        self.image_files = []
        self.captions = {}

        colors = ['rouge', 'bleu', 'vert', 'noir', 'blanc', 'jaune']
        styles = ['moderne', 'classique', 'sportif', 'élégant', 'casual']

        for category in categories:
            cat_dir = os.path.join(self.root_dir, category)
            for img_file in os.listdir(cat_dir):
                if img_file.lower().endswith(('.jpg', '.png', '.jpeg')):
                    img_path = os.path.join(cat_dir, img_file)
                    self.image_files.append(img_path)
                    color = random.choice(colors)
                    style = random.choice(styles)
                    caption = f"produit e-commerce {category}, {color}, {style}, haute qualité"
                    self.captions[img_path] = caption

        # Save generated_captions.json to a writable directory, e.g., /content/
        # The `root_dir` (/kaggle/input/...) is read-only, causing the OSError.
        output_caption_path = os.path.join("/content/", "generated_captions.json")
        with open(output_caption_path, "w") as f:
            json.dump(self.captions, f)

        print(f"Dataset créé : {len(self.image_files)} images. Captions saved to {output_caption_path}")

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = self.image_files[idx]
        image = Image.open(img_path).convert("RGB")
        image = image.resize((self.resolution, self.resolution), Image.LANCZOS)

        # Conversion correcte PIL -> tensor
        image = np.array(image)
        image = torch.from_numpy(image).permute(2, 0, 1).float() / 127.5 - 1.0

        caption = self.captions[img_path]

        # Tokenization pour les 2 encodeurs CLIP de SDXL
        text_input_1 = self.tokenizer(
            caption, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
        )
        text_input_2 = self.tokenizer_2(
            caption, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
        )

        return {
            "pixel_values": image,
            "input_ids_1": text_input_1["input_ids"].squeeze(0),
            "input_ids_2": text_input_2["input_ids"].squeeze(0),
        }

# Étape 4 : Charger le Modèle SDXL
print("Loading tokenizers...")
tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer", token=hf_token)
tokenizer_2 = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer_2", token=hf_token)

print("Loading text encoders...")
text_encoder = CLIPTextModel.from_pretrained(
    model_id, subfolder="text_encoder", token=hf_token, torch_dtype=torch.float16
)
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    model_id, subfolder="text_encoder_2", token=hf_token, torch_dtype=torch.float16
)
text_encoder.requires_grad_(False)
text_encoder_2.requires_grad_(False)

print("Loading VAE...")
vae = AutoencoderKL.from_pretrained(
    model_id, subfolder="vae", token=hf_token, torch_dtype=torch.float16
)
vae.requires_grad_(False)

print("Loading UNet...")
unet = UNet2DConditionModel.from_pretrained(
    model_id, subfolder="unet", token=hf_token, torch_dtype=torch.float16
)

# Enable gradient checkpointing to save memory
unet.enable_gradient_checkpointing()

print("Loading scheduler...")
scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler", token=hf_token)

# Appliquer LoRA sur UNet
print("Applying LoRA to UNet...")
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_rank * 2,
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    lora_dropout=0.05,
    bias="none",
)
unet = get_peft_model(unet, lora_config)
unet.print_trainable_parameters()

# Initialiser Accelerator
accelerator = Accelerator(
    gradient_accumulation_steps=gradient_accumulation_steps,
    mixed_precision="fp16"
)

# Optimizer
optimizer = torch.optim.AdamW(
    [p for p in unet.parameters() if p.requires_grad],
    lr=learning_rate
)

# Dataset et Dataloader
dataset = KaggleEcommerceDataset(train_dir, tokenizer, tokenizer_2, resolution, categories, num_images=100)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=0  # Évite les problèmes multiprocessing
)

lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=num_epochs * len(dataloader)
)

# Préparer avec Accelerator
unet, optimizer, dataloader, lr_scheduler = accelerator.prepare(
    unet, optimizer, dataloader, lr_scheduler
)

# Déplacer les modèles figés
text_encoder = text_encoder.to(accelerator.device)
text_encoder_2 = text_encoder_2.to(accelerator.device)
vae = vae.to(accelerator.device)

# Mode training
unet.train()
text_encoder.eval()
text_encoder_2.eval()
vae.eval()

# Étape 5 : Boucle d'Entraînement
print("Starting training...")
progress_bar = tqdm(range(num_epochs * len(dataloader)), desc="Training")

for epoch in range(num_epochs):
    for step, batch in enumerate(dataloader):
        with accelerator.accumulate(unet):
            # Récupérer les données
            pixel_values = batch["pixel_values"].to(accelerator.device)
            input_ids_1 = batch["input_ids_1"].to(accelerator.device)
            input_ids_2 = batch["input_ids_2"].to(accelerator.device)

            # Encoder images -> latents
            with torch.no_grad():
                latents = vae.encode(pixel_values.half()).latent_dist.sample()
                latents = latents * vae.config.scaling_factor
                latents = latents.float()  # Retour en float32 pour training

            # Encoder le texte avec les 2 encodeurs CLIP
            with torch.no_grad():
                # Premier encodeur
                encoder_output_1 = text_encoder(input_ids_1, output_hidden_states=True)
                text_embeds_1 = encoder_output_1.hidden_states[-2]

                # Deuxième encodeur (avec projection)
                encoder_output_2 = text_encoder_2(input_ids_2, output_hidden_states=True)
                text_embeds_2 = encoder_output_2.hidden_states[-2]
                pooled_embeds = encoder_output_2.text_embeds

                # Concaténer les embeddings
                text_embeds = torch.cat([text_embeds_1, text_embeds_2], dim=-1)

                # Convertir en float32
                text_embeds = text_embeds.float()
                pooled_embeds = pooled_embeds.float()

            # Ajouter du bruit
            noise = torch.randn_like(latents)
            timesteps = torch.randint(
                0, scheduler.config.num_train_timesteps, (latents.shape[0],),
                device=accelerator.device
            ).long()
            noisy_latents = scheduler.add_noise(latents, noise, timesteps)

            # Prédiction du bruit avec UNet
            # SDXL nécessite added_cond_kwargs pour pooled_embeds et time_ids corrects
            add_time_ids = torch.tensor([resolution, resolution, 0, 0, resolution, resolution], dtype=torch.float32, device=accelerator.device)
            added_cond_kwargs = {"text_embeds": pooled_embeds, "time_ids": add_time_ids.repeat(latents.shape[0], 1)}

            model_pred = unet(
                noisy_latents,
                timesteps,
                encoder_hidden_states=text_embeds,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False
            )[0]

            # Calcul de la loss
            loss = torch.nn.functional.mse_loss(model_pred, noise)

            # Backward
            accelerator.backward(loss)

            if accelerator.sync_gradients:
                accelerator.clip_grad_norm_(unet.parameters(), 1.0)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)

        if step % 10 == 0 and accelerator.is_main_process:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item():.4f}")

        # Nettoyer la mémoire plus fréquemment
        if step % 20 == 0:
            torch.cuda.empty_cache()

print("Training complete!")

# Étape 6 : Sauvegarder
if accelerator.is_main_process:
    print("Saving model...")
    os.makedirs(output_dir, exist_ok=True)

    unwrapped_unet = accelerator.unwrap_model(unet)
    unwrapped_unet.save_pretrained(os.path.join(output_dir, "unet_lora"))

    print(f"Model saved to {output_dir}")

# Étape 7 : Inférence
print("Running inference...")
pipeline = StableDiffusionXLPipeline.from_pretrained(
    model_id,
    unet=accelerator.unwrap_model(unet),
    torch_dtype=torch.float16,
    token=hf_token
)
pipeline = pipeline.to("cuda")

prompt = "produit e-commerce t-shirt, bleu, moderne, haute qualité"
image = pipeline(
    prompt,
    num_inference_steps=30,
    guidance_scale=7.5,
    height=resolution,
    width=resolution
).images[0]

image.save("generated_ecommerce_product.png")
print("✅ Image générée : generated_ecommerce_product.png")

Authentification HF réussie.
Using Colab cache for faster access to the 'ecommerce-product-images-18k' dataset.
Dataset downloaded to: /kaggle/input/ecommerce-product-images-18k
Top-level contents: ['ECOMMERCE_PRODUCT_IMAGES']
Categories: ['ELECTRONICS', 'PET_SUPPLIES', 'BABY_PRODUCTS', 'SPORTS_OUTDOOR', 'HOME_KITCHEN_TOOLS', 'HOBBY_ARTS_STATIONERY', 'BEAUTY_HEALTH', 'CLOTHING_ACCESSORIES_JEWELLERY', 'GROCERY']
Loading tokenizers...
Loading text encoders...
Loading VAE...
Loading UNet...
Loading scheduler...
Applying LoRA to UNet...
trainable params: 5,806,080 || all params: 2,573,269,764 || trainable%: 0.2256
Dataset créé : 13992 images. Captions saved to /content/generated_captions.json
Starting training...



Training:   0%|          | 86/41976 [04:37<37:31:46,  3.23s/it]

Training:   0%|          | 1/41976 [00:02<25:07:37,  2.16s/it][A

Epoch 0, Step 0, Loss: nan



Training:   0%|          | 2/41976 [00:03<19:37:18,  1.68s/it][A
Training:   0%|          | 3/41976 [00:04<17:48:55,  1.53s/it][A
Training:   0%|          | 4/41976 [00:06<17:11:20,  1.47s/it][A
Training:   0%|          | 5/41976 [00:07<16:33:55,  1.42s/it][A
Training:   0%|          | 6/41976 [00:08<16:13:57,  1.39s/it][A
Training:   0%|          | 7/41976 [00:11<20:10:19,  1.73s/it][A
Training:   0%|          | 8/41976 [00:13<19:57:54,  1.71s/it][A
Training:   0%|          | 9/41976 [00:14<18:32:06,  1.59s/it][A
Training:   0%|          | 10/41976 [00:15<17:40:59,  1.52s/it][A
Training:   0%|          | 11/41976 [00:17<17:00:55,  1.46s/it][A

Epoch 0, Step 10, Loss: nan



Training:   0%|          | 12/41976 [00:18<16:42:51,  1.43s/it][A
Training:   0%|          | 13/41976 [00:19<16:24:50,  1.41s/it][A
Training:   0%|          | 14/41976 [00:21<16:10:53,  1.39s/it][A
Training:   0%|          | 15/41976 [00:22<16:05:28,  1.38s/it][A
Training:   0%|          | 16/41976 [00:24<17:36:16,  1.51s/it][A
Training:   0%|          | 17/41976 [00:25<17:18:40,  1.49s/it][A
Training:   0%|          | 18/41976 [00:27<16:57:16,  1.45s/it][A
Training:   0%|          | 19/41976 [00:28<16:42:32,  1.43s/it][A
Training:   0%|          | 20/41976 [00:29<16:31:07,  1.42s/it][A
Training:   0%|          | 21/41976 [00:31<17:54:02,  1.54s/it][A

Epoch 0, Step 20, Loss: nan



Training:   0%|          | 22/41976 [00:33<17:18:31,  1.49s/it][A
Training:   0%|          | 23/41976 [00:34<16:48:37,  1.44s/it][A
Training:   0%|          | 24/41976 [00:36<17:59:39,  1.54s/it][A
Training:   0%|          | 25/41976 [00:37<17:39:26,  1.52s/it][A
Training:   0%|          | 26/41976 [00:38<17:06:28,  1.47s/it][A
Training:   0%|          | 27/41976 [00:40<16:38:37,  1.43s/it][A
Training:   0%|          | 28/41976 [00:41<16:28:58,  1.41s/it][A
Training:   0%|          | 29/41976 [00:43<16:17:33,  1.40s/it][A
Training:   0%|          | 30/41976 [00:44<16:06:25,  1.38s/it][A
Training:   0%|          | 31/41976 [00:45<15:58:19,  1.37s/it][A

Epoch 0, Step 30, Loss: nan



Training:   0%|          | 32/41976 [00:47<16:40:22,  1.43s/it][A
Training:   0%|          | 33/41976 [00:48<17:39:40,  1.52s/it][A
Training:   0%|          | 34/41976 [00:50<17:04:45,  1.47s/it][A
Training:   0%|          | 35/41976 [00:52<18:16:10,  1.57s/it][A
Training:   0%|          | 36/41976 [00:53<17:36:06,  1.51s/it][A
Training:   0%|          | 37/41976 [00:54<16:54:46,  1.45s/it][A
Training:   0%|          | 38/41976 [00:56<16:44:43,  1.44s/it][A
Training:   0%|          | 39/41976 [00:57<16:23:07,  1.41s/it][A
Training:   0%|          | 40/41976 [00:59<16:58:34,  1.46s/it][A
Training:   0%|          | 41/41976 [01:01<18:29:58,  1.59s/it][A

Epoch 0, Step 40, Loss: nan



Training:   0%|          | 42/41976 [01:02<19:28:42,  1.67s/it][A
Training:   0%|          | 43/41976 [01:04<18:19:26,  1.57s/it][A
Training:   0%|          | 44/41976 [01:05<17:41:59,  1.52s/it][A
Training:   0%|          | 45/41976 [01:06<17:01:11,  1.46s/it][A
Training:   0%|          | 46/41976 [01:08<16:41:08,  1.43s/it][A
Training:   0%|          | 47/41976 [01:09<16:26:09,  1.41s/it][A
Training:   0%|          | 48/41976 [01:11<16:18:32,  1.40s/it][A
Training:   0%|          | 49/41976 [01:13<18:40:31,  1.60s/it][A
Training:   0%|          | 50/41976 [01:14<18:39:26,  1.60s/it][A
Training:   0%|          | 51/41976 [01:16<17:48:28,  1.53s/it][A

Epoch 0, Step 50, Loss: nan



Training:   0%|          | 52/41976 [01:17<17:20:46,  1.49s/it][A
Training:   0%|          | 53/41976 [01:18<16:44:11,  1.44s/it][A
Training:   0%|          | 54/41976 [01:20<16:27:25,  1.41s/it][A
Training:   0%|          | 55/41976 [01:21<16:10:34,  1.39s/it][A
Training:   0%|          | 56/41976 [01:22<16:07:57,  1.39s/it][A
Training:   0%|          | 57/41976 [01:24<16:20:21,  1.40s/it][A
Training:   0%|          | 58/41976 [01:26<17:57:59,  1.54s/it][A
Training:   0%|          | 59/41976 [01:27<17:18:18,  1.49s/it][A
Training:   0%|          | 60/41976 [01:28<16:57:31,  1.46s/it][A
Training:   0%|          | 61/41976 [01:30<16:32:30,  1.42s/it][A

Epoch 0, Step 60, Loss: nan



Training:   0%|          | 62/41976 [01:31<16:21:59,  1.41s/it][A
Training:   0%|          | 63/41976 [01:33<16:09:42,  1.39s/it][A
Training:   0%|          | 64/41976 [01:34<17:43:59,  1.52s/it][A
Training:   0%|          | 65/41976 [01:36<17:09:51,  1.47s/it][A
Training:   0%|          | 66/41976 [01:38<18:19:25,  1.57s/it][A
Training:   0%|          | 67/41976 [01:39<17:45:53,  1.53s/it][A
Training:   0%|          | 68/41976 [01:40<17:14:13,  1.48s/it][A
Training:   0%|          | 69/41976 [01:42<16:47:20,  1.44s/it][A
Training:   0%|          | 70/41976 [01:43<16:27:58,  1.41s/it][A
Training:   0%|          | 71/41976 [01:44<16:14:13,  1.39s/it][A

Epoch 0, Step 70, Loss: nan



Training:   0%|          | 72/41976 [01:46<16:14:03,  1.39s/it][A
Training:   0%|          | 73/41976 [01:47<15:54:46,  1.37s/it][A
Training:   0%|          | 74/41976 [01:49<16:51:14,  1.45s/it][A
Training:   0%|          | 75/41976 [01:50<17:30:28,  1.50s/it][A
Training:   0%|          | 76/41976 [01:52<17:00:44,  1.46s/it][A
Training:   0%|          | 77/41976 [01:53<16:27:38,  1.41s/it][A
Training:   0%|          | 78/41976 [01:55<17:49:34,  1.53s/it][A
Training:   0%|          | 79/41976 [01:56<17:14:54,  1.48s/it][A
Training:   0%|          | 80/41976 [01:58<16:49:57,  1.45s/it][A
Training:   0%|          | 81/41976 [01:59<16:22:51,  1.41s/it][A

Epoch 0, Step 80, Loss: nan



Training:   0%|          | 82/41976 [02:00<16:56:20,  1.46s/it][A
Training:   0%|          | 83/41976 [02:02<17:44:54,  1.53s/it][A
Training:   0%|          | 84/41976 [02:03<17:14:58,  1.48s/it][A
Training:   0%|          | 85/41976 [02:05<16:43:55,  1.44s/it][A
Training:   0%|          | 86/41976 [02:06<16:24:15,  1.41s/it][A
Training:   0%|          | 87/41976 [02:07<16:05:37,  1.38s/it][A
Training:   0%|          | 88/41976 [02:09<16:08:33,  1.39s/it][A
Training:   0%|          | 89/41976 [02:10<15:54:49,  1.37s/it][A
Training:   0%|          | 90/41976 [02:12<15:50:34,  1.36s/it][A
Training:   0%|          | 91/41976 [02:13<17:08:27,  1.47s/it][A

Epoch 0, Step 90, Loss: nan



Training:   0%|          | 92/41976 [02:15<18:59:11,  1.63s/it][A
Training:   0%|          | 93/41976 [02:17<17:52:54,  1.54s/it][A
Training:   0%|          | 94/41976 [02:18<17:08:42,  1.47s/it][A
Training:   0%|          | 95/41976 [02:19<16:39:37,  1.43s/it][A
Training:   0%|          | 96/41976 [02:21<16:27:38,  1.41s/it][A
Training:   0%|          | 97/41976 [02:22<16:11:54,  1.39s/it][A
Training:   0%|          | 98/41976 [02:23<16:00:30,  1.38s/it][A
Training:   0%|          | 99/41976 [02:25<16:57:27,  1.46s/it][A
Training:   0%|          | 100/41976 [02:27<17:43:09,  1.52s/it][A
Training:   0%|          | 101/41976 [02:28<16:59:21,  1.46s/it][A

Epoch 0, Step 100, Loss: nan



Training:   0%|          | 102/41976 [02:29<16:43:52,  1.44s/it][A
Training:   0%|          | 103/41976 [02:31<16:31:22,  1.42s/it][A
Training:   0%|          | 104/41976 [02:32<16:21:56,  1.41s/it][A
Training:   0%|          | 105/41976 [02:33<16:02:43,  1.38s/it][A
Training:   0%|          | 106/41976 [02:35<15:47:36,  1.36s/it][A
Training:   0%|          | 107/41976 [02:37<18:17:29,  1.57s/it][A
Training:   0%|          | 108/41976 [02:38<18:37:15,  1.60s/it][A
Training:   0%|          | 109/41976 [02:40<17:42:55,  1.52s/it][A
Training:   0%|          | 110/41976 [02:41<17:04:08,  1.47s/it][A
Training:   0%|          | 111/41976 [02:42<16:41:20,  1.44s/it][A

Epoch 0, Step 110, Loss: nan



Training:   0%|          | 112/41976 [02:44<16:32:39,  1.42s/it][A
Training:   0%|          | 113/41976 [02:45<16:10:41,  1.39s/it][A
Training:   0%|          | 114/41976 [02:47<15:55:28,  1.37s/it][A
Training:   0%|          | 115/41976 [02:48<15:49:18,  1.36s/it][A
Training:   0%|          | 116/41976 [02:50<17:28:13,  1.50s/it][A
Training:   0%|          | 117/41976 [02:51<16:58:03,  1.46s/it][A
Training:   0%|          | 118/41976 [02:52<16:37:02,  1.43s/it][A
Training:   0%|          | 119/41976 [02:54<16:21:23,  1.41s/it][A
Training:   0%|          | 120/41976 [02:55<16:15:57,  1.40s/it][A
Training:   0%|          | 121/41976 [02:57<17:46:20,  1.53s/it][A

Epoch 0, Step 120, Loss: nan



Training:   0%|          | 122/41976 [02:58<17:12:53,  1.48s/it][A
Training:   0%|          | 123/41976 [03:00<16:41:30,  1.44s/it][A
Training:   0%|          | 124/41976 [03:01<17:55:52,  1.54s/it][A
Training:   0%|          | 125/41976 [03:03<17:37:06,  1.52s/it][A
Training:   0%|          | 126/41976 [03:04<17:04:47,  1.47s/it][A
Training:   0%|          | 127/41976 [03:06<16:36:21,  1.43s/it][A
Training:   0%|          | 128/41976 [03:07<16:29:50,  1.42s/it][A
Training:   0%|          | 129/41976 [03:08<16:08:45,  1.39s/it][A
Training:   0%|          | 130/41976 [03:10<16:01:02,  1.38s/it][A
Training:   0%|          | 131/41976 [03:11<15:55:48,  1.37s/it][A

Epoch 0, Step 130, Loss: nan



Training:   0%|          | 132/41976 [03:13<16:40:48,  1.44s/it][A
Training:   0%|          | 133/41976 [03:14<17:37:40,  1.52s/it][A
Training:   0%|          | 134/41976 [03:16<17:01:54,  1.47s/it][A
Training:   0%|          | 135/41976 [03:17<16:39:14,  1.43s/it][A
Training:   0%|          | 136/41976 [03:19<18:06:28,  1.56s/it][A
Training:   0%|          | 137/41976 [03:20<17:24:40,  1.50s/it][A
Training:   0%|          | 138/41976 [03:22<16:53:38,  1.45s/it][A
Training:   0%|          | 139/41976 [03:23<16:37:04,  1.43s/it][A
Training:   0%|          | 140/41976 [03:25<17:03:28,  1.47s/it][A
Training:   0%|          | 141/41976 [03:26<17:56:43,  1.54s/it][A

Epoch 0, Step 140, Loss: nan



Training:   0%|          | 142/41976 [03:28<17:19:46,  1.49s/it][A
Training:   0%|          | 143/41976 [03:29<16:49:26,  1.45s/it][A
Training:   0%|          | 144/41976 [03:30<16:34:45,  1.43s/it][A
Training:   0%|          | 145/41976 [03:32<16:10:29,  1.39s/it][A
Training:   0%|          | 146/41976 [03:33<15:56:55,  1.37s/it][A
Training:   0%|          | 147/41976 [03:34<15:48:30,  1.36s/it][A
Training:   0%|          | 148/41976 [03:36<15:53:53,  1.37s/it][A
Training:   0%|          | 149/41976 [03:37<16:52:06,  1.45s/it][A
Training:   0%|          | 150/41976 [03:39<18:50:16,  1.62s/it][A
Training:   0%|          | 151/41976 [03:41<17:51:50,  1.54s/it][A

Epoch 0, Step 150, Loss: nan



Training:   0%|          | 152/41976 [03:42<17:14:42,  1.48s/it][A
Training:   0%|          | 153/41976 [03:43<16:37:06,  1.43s/it][A
Training:   0%|          | 154/41976 [03:45<16:19:04,  1.40s/it][A
Training:   0%|          | 155/41976 [03:46<16:05:29,  1.39s/it][A
Training:   0%|          | 156/41976 [03:47<16:03:54,  1.38s/it][A
Training:   0%|          | 157/41976 [03:49<16:43:30,  1.44s/it][A
Training:   0%|          | 158/41976 [03:51<17:31:28,  1.51s/it][A
Training:   0%|          | 159/41976 [03:52<16:57:36,  1.46s/it][A
Training:   0%|          | 160/41976 [03:53<16:40:20,  1.44s/it][A
Training:   0%|          | 161/41976 [03:55<16:17:59,  1.40s/it][A

Epoch 0, Step 160, Loss: nan



Training:   0%|          | 162/41976 [03:56<16:11:07,  1.39s/it][A
Training:   0%|          | 163/41976 [03:57<16:04:47,  1.38s/it][A
Training:   0%|          | 164/41976 [03:59<17:46:03,  1.53s/it][A
Training:   0%|          | 165/41976 [04:01<17:49:01,  1.53s/it][A
Training:   0%|          | 166/41976 [04:03<18:26:12,  1.59s/it][A
Training:   0%|          | 167/41976 [04:04<17:36:33,  1.52s/it][A
Training:   0%|          | 168/41976 [04:05<17:01:37,  1.47s/it][A
Training:   0%|          | 169/41976 [04:07<16:29:00,  1.42s/it][A
Training:   0%|          | 170/41976 [04:08<16:10:19,  1.39s/it][A
Training:   0%|          | 171/41976 [04:09<15:59:40,  1.38s/it][A

Epoch 0, Step 170, Loss: nan



Training:   0%|          | 172/41976 [04:11<15:59:39,  1.38s/it][A
Training:   0%|          | 173/41976 [04:12<15:45:28,  1.36s/it][A
Training:   0%|          | 174/41976 [04:14<17:10:08,  1.48s/it][A
Training:   0%|          | 175/41976 [04:15<17:16:31,  1.49s/it][A
Training:   0%|          | 176/41976 [04:17<16:52:00,  1.45s/it][A
Training:   0%|          | 177/41976 [04:18<16:24:43,  1.41s/it][A
Training:   0%|          | 178/41976 [04:20<17:46:02,  1.53s/it][A
Training:   0%|          | 179/41976 [04:21<17:06:03,  1.47s/it][A
Training:   0%|          | 180/41976 [04:22<16:46:20,  1.44s/it][A
Training:   0%|          | 181/41976 [04:24<16:23:10,  1.41s/it][A

Epoch 0, Step 180, Loss: nan



Training:   0%|          | 182/41976 [04:25<17:17:44,  1.49s/it][A
Training:   0%|          | 183/41976 [04:27<17:58:17,  1.55s/it][A
Training:   0%|          | 184/41976 [04:28<17:21:02,  1.49s/it][A
Training:   0%|          | 185/41976 [04:30<16:47:19,  1.45s/it][A
Training:   0%|          | 186/41976 [04:31<16:25:09,  1.41s/it][A
Training:   0%|          | 187/41976 [04:33<16:10:56,  1.39s/it][A
Training:   0%|          | 188/41976 [04:34<16:05:30,  1.39s/it][A
Training:   0%|          | 189/41976 [04:35<15:46:23,  1.36s/it][A
Training:   0%|          | 190/41976 [04:37<16:09:03,  1.39s/it][A
Training:   0%|          | 191/41976 [04:39<18:02:52,  1.55s/it][A

Epoch 0, Step 190, Loss: nan



Training:   0%|          | 192/41976 [04:41<20:38:24,  1.78s/it][A
Training:   0%|          | 193/41976 [04:42<19:01:10,  1.64s/it][A
Training:   0%|          | 194/41976 [04:44<17:59:17,  1.55s/it][A
Training:   0%|          | 195/41976 [04:45<17:18:08,  1.49s/it][A
Training:   0%|          | 196/41976 [04:46<16:54:26,  1.46s/it][A
Training:   0%|          | 197/41976 [04:48<16:23:22,  1.41s/it][A
Training:   0%|          | 198/41976 [04:49<16:38:29,  1.43s/it][A
Training:   0%|          | 199/41976 [04:51<17:53:36,  1.54s/it][A
Training:   0%|          | 200/41976 [04:52<17:20:42,  1.49s/it][A
Training:   0%|          | 201/41976 [04:54<16:44:03,  1.44s/it][A

Epoch 0, Step 200, Loss: nan



Training:   0%|          | 202/41976 [04:55<16:28:55,  1.42s/it][A
Training:   0%|          | 203/41976 [04:56<16:06:15,  1.39s/it][A
Training:   0%|          | 204/41976 [04:58<16:07:47,  1.39s/it][A
Training:   0%|          | 205/41976 [04:59<15:53:40,  1.37s/it][A
Training:   0%|          | 206/41976 [05:01<17:40:37,  1.52s/it][A
Training:   0%|          | 207/41976 [05:03<18:36:23,  1.60s/it][A
Training:   0%|          | 208/41976 [05:04<17:50:47,  1.54s/it][A
Training:   0%|          | 209/41976 [05:05<17:04:37,  1.47s/it][A
Training:   1%|          | 210/41976 [05:07<16:36:51,  1.43s/it][A
Training:   1%|          | 211/41976 [05:08<16:16:46,  1.40s/it][A

Epoch 0, Step 210, Loss: nan



Training:   1%|          | 212/41976 [05:09<16:10:54,  1.39s/it][A
Training:   1%|          | 213/41976 [05:11<15:57:14,  1.38s/it][A
Training:   1%|          | 214/41976 [05:12<15:46:46,  1.36s/it][A
Training:   1%|          | 215/41976 [05:14<16:30:42,  1.42s/it][A
Training:   1%|          | 216/41976 [05:15<17:23:37,  1.50s/it][A
Training:   1%|          | 217/41976 [05:17<16:45:15,  1.44s/it][A
Training:   1%|          | 218/41976 [05:18<16:23:58,  1.41s/it][A
Training:   1%|          | 219/41976 [05:19<16:10:51,  1.40s/it][A
Training:   1%|          | 220/41976 [05:21<17:41:09,  1.52s/it][A
Training:   1%|          | 221/41976 [05:22<17:02:01,  1.47s/it][A

Epoch 0, Step 220, Loss: nan



Training:   1%|          | 222/41976 [05:24<16:38:04,  1.43s/it][A
Training:   1%|          | 223/41976 [05:25<16:53:20,  1.46s/it][A
Training:   1%|          | 224/41976 [05:27<18:03:04,  1.56s/it][A
Training:   1%|          | 225/41976 [05:28<17:19:05,  1.49s/it][A
Training:   1%|          | 226/41976 [05:30<16:47:39,  1.45s/it][A
Training:   1%|          | 227/41976 [05:31<16:25:00,  1.42s/it][A
Training:   1%|          | 228/41976 [05:33<16:17:43,  1.41s/it][A
Training:   1%|          | 229/41976 [05:34<16:00:17,  1.38s/it][A
Training:   1%|          | 230/41976 [05:35<15:52:04,  1.37s/it][A
Training:   1%|          | 231/41976 [05:37<15:44:11,  1.36s/it][A

Epoch 0, Step 230, Loss: nan



Training:   1%|          | 232/41976 [05:38<17:16:25,  1.49s/it][A
Training:   1%|          | 233/41976 [05:40<17:07:47,  1.48s/it][A
Training:   1%|          | 234/41976 [05:42<18:15:57,  1.58s/it][A
Training:   1%|          | 235/41976 [05:43<17:26:42,  1.50s/it][A
Training:   1%|          | 236/41976 [05:44<16:58:17,  1.46s/it][A
Training:   1%|          | 237/41976 [05:46<16:26:45,  1.42s/it][A
Training:   1%|          | 238/41976 [05:47<16:06:37,  1.39s/it][A
Training:   1%|          | 239/41976 [05:48<15:58:03,  1.38s/it][A
Training:   1%|          | 240/41976 [05:50<17:03:07,  1.47s/it][A
Training:   1%|          | 241/41976 [05:51<17:20:04,  1.50s/it][A

Epoch 0, Step 240, Loss: nan



Training:   1%|          | 242/41976 [05:53<16:54:26,  1.46s/it][A
Training:   1%|          | 243/41976 [05:54<16:23:47,  1.41s/it][A
Training:   1%|          | 244/41976 [05:56<16:13:25,  1.40s/it][A
Training:   1%|          | 245/41976 [05:57<15:53:43,  1.37s/it][A
Training:   1%|          | 246/41976 [05:58<15:53:26,  1.37s/it][A
Training:   1%|          | 247/41976 [06:00<15:49:14,  1.36s/it][A
Training:   1%|          | 248/41976 [06:02<18:24:06,  1.59s/it][A
Training:   1%|          | 249/41976 [06:03<18:27:20,  1.59s/it][A
Training:   1%|          | 250/41976 [06:05<17:33:58,  1.52s/it][A
Training:   1%|          | 251/41976 [06:06<17:04:24,  1.47s/it][A

Epoch 0, Step 250, Loss: nan



Training:   1%|          | 252/41976 [06:07<16:43:08,  1.44s/it][A
Training:   1%|          | 253/41976 [06:09<16:18:55,  1.41s/it][A
Training:   1%|          | 254/41976 [06:10<16:10:06,  1.40s/it][A
Training:   1%|          | 255/41976 [06:11<15:55:35,  1.37s/it][A
Training:   1%|          | 256/41976 [06:13<16:05:12,  1.39s/it][A
Training:   1%|          | 257/41976 [06:15<17:26:58,  1.51s/it][A
Training:   1%|          | 258/41976 [06:16<16:54:28,  1.46s/it][A
Training:   1%|          | 259/41976 [06:17<16:33:56,  1.43s/it][A
Training:   1%|          | 260/41976 [06:19<16:17:45,  1.41s/it][A
Training:   1%|          | 261/41976 [06:20<16:02:54,  1.38s/it][A

Epoch 0, Step 260, Loss: nan



Training:   1%|          | 262/41976 [06:22<17:34:05,  1.52s/it][A
Training:   1%|          | 263/41976 [06:23<17:00:17,  1.47s/it][A
Training:   1%|          | 264/41976 [06:25<16:44:08,  1.44s/it][A
Training:   1%|          | 265/41976 [06:26<17:42:05,  1.53s/it][A
Training:   1%|          | 266/41976 [06:28<17:33:30,  1.52s/it][A
Training:   1%|          | 267/41976 [06:29<17:05:34,  1.48s/it][A
Training:   1%|          | 268/41976 [06:31<16:47:07,  1.45s/it][A
Training:   1%|          | 269/41976 [06:32<16:25:13,  1.42s/it][A
Training:   1%|          | 270/41976 [06:33<16:10:28,  1.40s/it][A
Training:   1%|          | 271/41976 [06:35<15:56:15,  1.38s/it][A

Epoch 0, Step 270, Loss: nan



Training:   1%|          | 272/41976 [06:36<15:57:54,  1.38s/it][A
Training:   1%|          | 273/41976 [06:37<16:32:24,  1.43s/it][A
Training:   1%|          | 274/41976 [06:39<17:38:41,  1.52s/it][A
Training:   1%|          | 275/41976 [06:41<17:00:59,  1.47s/it][A
Training:   1%|          | 276/41976 [06:42<18:25:10,  1.59s/it][A
Training:   1%|          | 277/41976 [06:44<17:27:40,  1.51s/it][A
Training:   1%|          | 278/41976 [06:45<16:54:44,  1.46s/it][A
Training:   1%|          | 279/41976 [06:46<16:33:30,  1.43s/it][A
Training:   1%|          | 280/41976 [06:48<16:20:21,  1.41s/it][A
Training:   1%|          | 281/41976 [06:49<16:40:25,  1.44s/it][A

Epoch 0, Step 280, Loss: nan



Training:   1%|          | 282/41976 [06:51<17:41:54,  1.53s/it][A
Training:   1%|          | 283/41976 [06:52<17:02:46,  1.47s/it][A
Training:   1%|          | 284/41976 [06:54<16:45:25,  1.45s/it][A
Training:   1%|          | 285/41976 [06:55<16:23:59,  1.42s/it][A
Training:   1%|          | 286/41976 [06:56<16:06:39,  1.39s/it][A
Training:   1%|          | 287/41976 [06:58<15:56:42,  1.38s/it][A
Training:   1%|          | 288/41976 [06:59<16:04:28,  1.39s/it][A
Training:   1%|          | 289/41976 [07:01<15:49:57,  1.37s/it][A
Training:   1%|          | 290/41976 [07:03<19:11:14,  1.66s/it][A
Training:   1%|          | 291/41976 [07:04<18:06:18,  1.56s/it][A

Epoch 0, Step 290, Loss: nan



Training:   1%|          | 292/41976 [07:06<17:27:51,  1.51s/it][A
Training:   1%|          | 293/41976 [07:07<16:49:57,  1.45s/it][A
Training:   1%|          | 294/41976 [07:08<16:29:47,  1.42s/it][A
Training:   1%|          | 295/41976 [07:10<16:14:25,  1.40s/it][A
Training:   1%|          | 296/41976 [07:11<16:10:28,  1.40s/it][A
Training:   1%|          | 297/41976 [07:12<15:55:57,  1.38s/it][A
Training:   1%|          | 298/41976 [07:14<16:47:19,  1.45s/it][A
Training:   1%|          | 299/41976 [07:16<17:18:00,  1.49s/it][A
Training:   1%|          | 300/41976 [07:17<16:48:41,  1.45s/it][A
Training:   1%|          | 301/41976 [07:18<16:19:37,  1.41s/it][A

Epoch 0, Step 300, Loss: nan



Training:   1%|          | 302/41976 [07:20<16:12:08,  1.40s/it][A
Training:   1%|          | 303/41976 [07:21<17:38:31,  1.52s/it][A
Training:   1%|          | 304/41976 [07:23<17:13:20,  1.49s/it][A
Training:   1%|          | 305/41976 [07:24<16:42:42,  1.44s/it][A
Training:   1%|          | 306/41976 [07:26<17:20:00,  1.50s/it][A
Training:   1%|          | 307/41976 [07:27<17:48:36,  1.54s/it][A
Training:   1%|          | 308/41976 [07:29<17:23:15,  1.50s/it][A
Training:   1%|          | 309/41976 [07:30<16:47:56,  1.45s/it][A
Training:   1%|          | 310/41976 [07:32<16:25:06,  1.42s/it][A
Training:   1%|          | 311/41976 [07:33<16:10:45,  1.40s/it][A

Epoch 0, Step 310, Loss: nan



Training:   1%|          | 312/41976 [07:34<16:08:04,  1.39s/it][A
Training:   1%|          | 313/41976 [07:36<15:50:50,  1.37s/it][A
Training:   1%|          | 314/41976 [07:37<15:45:57,  1.36s/it][A
Training:   1%|          | 315/41976 [07:39<17:14:47,  1.49s/it][A
Training:   1%|          | 316/41976 [07:40<16:58:48,  1.47s/it][A
Training:   1%|          | 317/41976 [07:42<18:05:54,  1.56s/it][A
Training:   1%|          | 318/41976 [07:43<17:23:17,  1.50s/it][A
Training:   1%|          | 319/41976 [07:45<16:56:30,  1.46s/it][A
Training:   1%|          | 320/41976 [07:46<16:36:43,  1.44s/it][A
Training:   1%|          | 321/41976 [07:47<16:12:08,  1.40s/it][A

Epoch 0, Step 320, Loss: nan



Training:   1%|          | 322/41976 [07:49<16:06:31,  1.39s/it][A
Training:   1%|          | 323/41976 [07:50<17:14:29,  1.49s/it][A
Training:   1%|          | 324/41976 [07:52<17:20:20,  1.50s/it][A
Training:   1%|          | 325/41976 [07:53<16:45:04,  1.45s/it][A
Training:   1%|          | 326/41976 [07:55<16:17:52,  1.41s/it][A
Training:   1%|          | 327/41976 [07:56<16:06:38,  1.39s/it][A
Training:   1%|          | 328/41976 [07:57<15:59:40,  1.38s/it][A
Training:   1%|          | 329/41976 [07:59<15:46:36,  1.36s/it][A
Training:   1%|          | 330/41976 [08:00<15:49:57,  1.37s/it][A
Training:   1%|          | 331/41976 [08:02<16:19:42,  1.41s/it][A

Epoch 0, Step 330, Loss: nan



Training:   1%|          | 332/41976 [08:04<18:59:06,  1.64s/it][A
Training:   1%|          | 333/41976 [08:05<17:52:17,  1.54s/it][A
Training:   1%|          | 334/41976 [08:06<17:17:20,  1.49s/it][A
Training:   1%|          | 335/41976 [08:08<16:46:07,  1.45s/it][A
Training:   1%|          | 336/41976 [08:09<16:34:42,  1.43s/it][A
Training:   1%|          | 337/41976 [08:10<16:14:24,  1.40s/it][A
Training:   1%|          | 338/41976 [08:12<16:01:05,  1.38s/it][A
Training:   1%|          | 339/41976 [08:13<16:15:00,  1.41s/it][A
Training:   1%|          | 340/41976 [08:15<17:37:32,  1.52s/it][A
Training:   1%|          | 341/41976 [08:16<16:57:05,  1.47s/it][A

Epoch 0, Step 340, Loss: nan



Training:   1%|          | 342/41976 [08:18<17:34:57,  1.52s/it][A
Training:   1%|          | 343/41976 [08:20<17:58:13,  1.55s/it][A
Training:   1%|          | 344/41976 [08:21<17:24:06,  1.50s/it][A
Training:   1%|          | 345/41976 [08:23<18:24:47,  1.59s/it][A
Training:   1%|          | 346/41976 [08:24<17:30:04,  1.51s/it][A
Training:   1%|          | 347/41976 [08:26<17:41:06,  1.53s/it][A
Training:   1%|          | 348/41976 [08:28<18:30:26,  1.60s/it][A
Training:   1%|          | 349/41976 [08:29<17:34:26,  1.52s/it][A
Training:   1%|          | 350/41976 [08:30<17:06:22,  1.48s/it][A
Training:   1%|          | 351/41976 [08:32<16:33:25,  1.43s/it][A

Epoch 0, Step 350, Loss: nan



Training:   1%|          | 352/41976 [08:33<16:16:18,  1.41s/it][A
Training:   1%|          | 353/41976 [08:34<15:56:41,  1.38s/it][A
Training:   1%|          | 354/41976 [08:36<15:49:01,  1.37s/it][A
Training:   1%|          | 355/41976 [08:37<15:47:12,  1.37s/it][A
Training:   1%|          | 356/41976 [08:39<17:32:28,  1.52s/it][A
Training:   1%|          | 357/41976 [08:40<16:58:21,  1.47s/it][A
Training:   1%|          | 358/41976 [08:41<16:32:44,  1.43s/it][A
Training:   1%|          | 359/41976 [08:43<17:54:00,  1.55s/it][A
Training:   1%|          | 360/41976 [08:45<17:14:19,  1.49s/it][A
Training:   1%|          | 361/41976 [08:46<16:39:24,  1.44s/it][A

Epoch 0, Step 360, Loss: nan



Training:   1%|          | 362/41976 [08:47<16:21:12,  1.41s/it][A
Training:   1%|          | 363/41976 [08:49<16:04:22,  1.39s/it][A
Training:   1%|          | 364/41976 [08:50<17:21:43,  1.50s/it][A
Training:   1%|          | 365/41976 [08:52<17:23:51,  1.51s/it][A
Training:   1%|          | 366/41976 [08:53<16:56:25,  1.47s/it][A
Training:   1%|          | 367/41976 [08:55<16:30:46,  1.43s/it][A
Training:   1%|          | 368/41976 [08:56<16:18:19,  1.41s/it][A
Training:   1%|          | 369/41976 [08:57<15:56:08,  1.38s/it][A
Training:   1%|          | 370/41976 [08:59<15:49:26,  1.37s/it][A
Training:   1%|          | 371/41976 [09:00<15:52:53,  1.37s/it][A

Epoch 0, Step 370, Loss: nan



Training:   1%|          | 372/41976 [09:02<16:35:39,  1.44s/it][A
Training:   1%|          | 373/41976 [09:04<19:00:20,  1.64s/it][A
Training:   1%|          | 374/41976 [09:05<17:57:35,  1.55s/it][A
Training:   1%|          | 375/41976 [09:06<17:08:56,  1.48s/it][A
Training:   1%|          | 376/41976 [09:08<16:45:38,  1.45s/it][A
Training:   1%|          | 377/41976 [09:09<16:19:31,  1.41s/it][A
Training:   1%|          | 378/41976 [09:10<16:05:39,  1.39s/it][A
Training:   1%|          | 379/41976 [09:12<15:54:52,  1.38s/it][A
Training:   1%|          | 380/41976 [09:13<16:12:52,  1.40s/it][A
Training:   1%|          | 381/41976 [09:15<17:35:07,  1.52s/it][A

Epoch 0, Step 380, Loss: nan



Training:   1%|          | 382/41976 [09:16<16:59:26,  1.47s/it][A
Training:   1%|          | 383/41976 [09:18<16:37:29,  1.44s/it][A
Training:   1%|          | 384/41976 [09:19<16:28:29,  1.43s/it][A
Training:   1%|          | 385/41976 [09:21<16:07:31,  1.40s/it][A
Training:   1%|          | 386/41976 [09:22<15:58:59,  1.38s/it][A
Training:   1%|          | 387/41976 [09:24<17:28:01,  1.51s/it][A
Training:   1%|          | 388/41976 [09:25<17:01:23,  1.47s/it][A
Training:   1%|          | 389/41976 [09:27<18:01:08,  1.56s/it][A
Training:   1%|          | 390/41976 [09:28<17:38:55,  1.53s/it][A
Training:   1%|          | 391/41976 [09:30<17:04:58,  1.48s/it][A

Epoch 0, Step 390, Loss: nan



Training:   1%|          | 392/41976 [09:31<16:46:00,  1.45s/it][A
Training:   1%|          | 393/41976 [09:32<16:14:14,  1.41s/it][A
Training:   1%|          | 394/41976 [09:34<15:57:02,  1.38s/it][A
Training:   1%|          | 395/41976 [09:35<15:44:53,  1.36s/it][A
Training:   1%|          | 396/41976 [09:36<15:48:05,  1.37s/it][A
Training:   1%|          | 397/41976 [09:38<16:23:49,  1.42s/it][A
Training:   1%|          | 398/41976 [09:40<17:19:07,  1.50s/it][A
Training:   1%|          | 399/41976 [09:41<16:41:52,  1.45s/it][A
Training:   1%|          | 400/41976 [09:42<16:26:41,  1.42s/it][A
Training:   1%|          | 401/41976 [09:44<17:42:46,  1.53s/it][A

Epoch 0, Step 400, Loss: nan



Training:   1%|          | 402/41976 [09:45<17:09:14,  1.49s/it][A
Training:   1%|          | 403/41976 [09:47<16:41:02,  1.44s/it][A
Training:   1%|          | 404/41976 [09:48<16:24:21,  1.42s/it][A
Training:   1%|          | 405/41976 [09:50<16:27:06,  1.42s/it][A
Training:   1%|          | 406/41976 [09:51<17:34:43,  1.52s/it][A
Training:   1%|          | 407/41976 [09:53<16:56:36,  1.47s/it][A
Training:   1%|          | 408/41976 [09:54<16:33:39,  1.43s/it][A
Training:   1%|          | 409/41976 [09:55<16:09:25,  1.40s/it][A
Training:   1%|          | 410/41976 [09:57<16:01:04,  1.39s/it][A
Training:   1%|          | 411/41976 [09:58<15:49:15,  1.37s/it][A

Epoch 0, Step 410, Loss: nan



Training:   1%|          | 412/41976 [09:59<15:53:21,  1.38s/it][A
Training:   1%|          | 413/41976 [10:01<15:51:58,  1.37s/it][A
Training:   1%|          | 414/41976 [10:02<16:56:31,  1.47s/it][A
Training:   1%|          | 415/41976 [10:04<18:44:28,  1.62s/it][A
Training:   1%|          | 416/41976 [10:06<17:57:48,  1.56s/it][A
Training:   1%|          | 417/41976 [10:07<17:10:41,  1.49s/it][A
Training:   1%|          | 418/41976 [10:09<16:39:43,  1.44s/it][A
Training:   1%|          | 419/41976 [10:10<16:21:48,  1.42s/it][A
Training:   1%|          | 420/41976 [10:11<16:12:06,  1.40s/it][A
Training:   1%|          | 421/41976 [10:13<15:53:48,  1.38s/it][A

Epoch 0, Step 420, Loss: nan



Training:   1%|          | 422/41976 [10:14<17:08:41,  1.49s/it][A
Training:   1%|          | 423/41976 [10:16<17:30:18,  1.52s/it][A
Training:   1%|          | 424/41976 [10:17<17:01:59,  1.48s/it][A
Training:   1%|          | 425/41976 [10:19<16:29:39,  1.43s/it][A
Training:   1%|          | 426/41976 [10:20<16:10:10,  1.40s/it][A
Training:   1%|          | 427/41976 [10:21<15:59:37,  1.39s/it][A
Training:   1%|          | 428/41976 [10:23<16:04:35,  1.39s/it][A
Training:   1%|          | 429/41976 [10:25<17:28:34,  1.51s/it][A
Training:   1%|          | 430/41976 [10:26<17:44:55,  1.54s/it][A
Training:   1%|          | 431/41976 [10:28<18:15:18,  1.58s/it][A

Epoch 0, Step 430, Loss: nan



Training:   1%|          | 432/41976 [10:29<17:33:33,  1.52s/it][A
Training:   1%|          | 433/41976 [10:31<17:02:42,  1.48s/it][A
Training:   1%|          | 434/41976 [10:32<16:37:04,  1.44s/it][A
Training:   1%|          | 435/41976 [10:33<16:18:31,  1.41s/it][A
Training:   1%|          | 436/41976 [10:35<16:15:06,  1.41s/it][A
Training:   1%|          | 437/41976 [10:36<15:54:10,  1.38s/it][A
Training:   1%|          | 438/41976 [10:37<15:42:38,  1.36s/it][A
Training:   1%|          | 439/41976 [10:39<17:15:03,  1.50s/it][A
Training:   1%|          | 440/41976 [10:41<16:58:22,  1.47s/it][A
Training:   1%|          | 441/41976 [10:42<16:29:25,  1.43s/it][A

Epoch 0, Step 440, Loss: nan



Training:   1%|          | 442/41976 [10:44<17:54:29,  1.55s/it][A
Training:   1%|          | 443/41976 [10:45<17:10:43,  1.49s/it][A
Training:   1%|          | 444/41976 [10:46<16:49:49,  1.46s/it][A
Training:   1%|          | 445/41976 [10:48<16:18:25,  1.41s/it][A
Training:   1%|          | 446/41976 [10:49<16:02:35,  1.39s/it][A
Training:   1%|          | 447/41976 [10:51<17:16:33,  1.50s/it][A
Training:   1%|          | 448/41976 [10:52<17:20:19,  1.50s/it][A
Training:   1%|          | 449/41976 [10:54<16:41:37,  1.45s/it][A
Training:   1%|          | 450/41976 [10:55<16:16:48,  1.41s/it][A
Training:   1%|          | 451/41976 [10:56<16:00:40,  1.39s/it][A

Epoch 0, Step 450, Loss: nan



Training:   1%|          | 452/41976 [10:58<16:00:56,  1.39s/it][A
Training:   1%|          | 453/41976 [10:59<15:46:46,  1.37s/it][A
Training:   1%|          | 454/41976 [11:00<15:46:17,  1.37s/it][A
Training:   1%|          | 455/41976 [11:02<16:24:46,  1.42s/it][A
Training:   1%|          | 456/41976 [11:04<18:59:02,  1.65s/it][A
Training:   1%|          | 457/41976 [11:05<17:50:23,  1.55s/it][A
Training:   1%|          | 458/41976 [11:07<17:04:00,  1.48s/it][A
Training:   1%|          | 459/41976 [11:08<16:33:32,  1.44s/it][A
Training:   1%|          | 460/41976 [11:09<16:25:36,  1.42s/it][A
Training:   1%|          | 461/41976 [11:11<16:06:03,  1.40s/it][A

Epoch 0, Step 460, Loss: nan



Training:   1%|          | 462/41976 [11:12<16:02:10,  1.39s/it][A
Training:   1%|          | 463/41976 [11:14<16:07:55,  1.40s/it][A
Training:   1%|          | 464/41976 [11:15<17:48:16,  1.54s/it][A
Training:   1%|          | 465/41976 [11:17<17:04:19,  1.48s/it][A
Training:   1%|          | 466/41976 [11:18<16:39:53,  1.45s/it][A
Training:   1%|          | 467/41976 [11:20<16:23:18,  1.42s/it][A
Training:   1%|          | 468/41976 [11:21<16:13:04,  1.41s/it][A
Training:   1%|          | 469/41976 [11:22<15:57:46,  1.38s/it][A
Training:   1%|          | 470/41976 [11:24<17:32:46,  1.52s/it][A
Training:   1%|          | 471/41976 [11:25<16:58:17,  1.47s/it][A

Epoch 0, Step 470, Loss: nan



Training:   1%|          | 472/41976 [11:27<18:16:06,  1.58s/it][A
Training:   1%|          | 473/41976 [11:29<17:21:41,  1.51s/it][A
Training:   1%|          | 474/41976 [11:30<16:50:07,  1.46s/it][A
Training:   1%|          | 475/41976 [11:31<16:29:54,  1.43s/it][A
Training:   1%|          | 476/41976 [11:33<16:18:37,  1.41s/it][A
Training:   1%|          | 477/41976 [11:34<16:04:04,  1.39s/it][A
Training:   1%|          | 478/41976 [11:35<16:00:31,  1.39s/it][A
Training:   1%|          | 479/41976 [11:37<15:47:30,  1.37s/it][A
Training:   1%|          | 480/41976 [11:38<16:57:35,  1.47s/it][A
Training:   1%|          | 481/41976 [11:40<17:27:56,  1.52s/it][A

Epoch 0, Step 480, Loss: nan



Training:   1%|          | 482/41976 [11:41<16:56:18,  1.47s/it][A
Training:   1%|          | 483/41976 [11:43<16:32:19,  1.43s/it][A
Training:   1%|          | 484/41976 [11:45<18:00:40,  1.56s/it][A
Training:   1%|          | 485/41976 [11:46<17:10:55,  1.49s/it][A
Training:   1%|          | 486/41976 [11:47<16:38:13,  1.44s/it][A
Training:   1%|          | 487/41976 [11:49<16:13:46,  1.41s/it][A
Training:   1%|          | 488/41976 [11:50<16:57:28,  1.47s/it][A
Training:   1%|          | 489/41976 [11:52<17:42:46,  1.54s/it][A
Training:   1%|          | 490/41976 [11:53<17:05:07,  1.48s/it][A
Training:   1%|          | 491/41976 [11:55<16:41:14,  1.45s/it][A

Epoch 0, Step 490, Loss: nan



Training:   1%|          | 492/41976 [11:56<16:52:09,  1.46s/it][A
Training:   1%|          | 493/41976 [11:58<17:57:42,  1.56s/it][A
Training:   1%|          | 494/41976 [11:59<17:15:28,  1.50s/it][A
Training:   1%|          | 495/41976 [12:01<16:47:10,  1.46s/it][A
Training:   1%|          | 496/41976 [12:02<17:27:59,  1.52s/it][A
Training:   1%|          | 497/41976 [12:04<17:54:14,  1.55s/it][A
Training:   1%|          | 498/41976 [12:06<18:53:07,  1.64s/it][A
Training:   1%|          | 499/41976 [12:07<17:48:18,  1.55s/it][A
Training:   1%|          | 500/41976 [12:09<17:16:01,  1.50s/it][A
Training:   1%|          | 501/41976 [12:10<16:42:08,  1.45s/it][A

Epoch 0, Step 500, Loss: nan



Training:   1%|          | 502/41976 [12:11<16:27:09,  1.43s/it][A
Training:   1%|          | 503/41976 [12:13<16:09:20,  1.40s/it][A
Training:   1%|          | 504/41976 [12:14<16:41:04,  1.45s/it][A
Training:   1%|          | 505/41976 [12:16<17:32:05,  1.52s/it][A
Training:   1%|          | 506/41976 [12:17<16:53:49,  1.47s/it][A
Training:   1%|          | 507/41976 [12:18<16:25:48,  1.43s/it][A
Training:   1%|          | 508/41976 [12:20<16:13:59,  1.41s/it][A
Training:   1%|          | 509/41976 [12:21<15:54:45,  1.38s/it][A
Training:   1%|          | 510/41976 [12:23<15:48:30,  1.37s/it][A
Training:   1%|          | 511/41976 [12:24<15:42:59,  1.36s/it][A

Epoch 0, Step 510, Loss: nan



Training:   1%|          | 512/41976 [12:26<17:39:08,  1.53s/it][A
Training:   1%|          | 513/41976 [12:28<18:28:24,  1.60s/it][A
Training:   1%|          | 514/41976 [12:29<17:35:48,  1.53s/it][A
Training:   1%|          | 515/41976 [12:30<16:55:47,  1.47s/it][A
Training:   1%|          | 516/41976 [12:32<16:37:26,  1.44s/it][A
Training:   1%|          | 517/41976 [12:33<16:10:56,  1.41s/it][A
Training:   1%|          | 518/41976 [12:34<15:59:48,  1.39s/it][A
Training:   1%|          | 519/41976 [12:36<15:49:04,  1.37s/it][A
Training:   1%|          | 520/41976 [12:37<15:49:40,  1.37s/it][A
Training:   1%|          | 521/41976 [12:39<16:36:03,  1.44s/it][A

Epoch 0, Step 520, Loss: nan



Training:   1%|          | 522/41976 [12:40<17:18:46,  1.50s/it][A
Training:   1%|          | 523/41976 [12:42<16:49:38,  1.46s/it][A
Training:   1%|          | 524/41976 [12:43<16:32:22,  1.44s/it][A
Training:   1%|▏         | 525/41976 [12:44<16:08:43,  1.40s/it][A
Training:   1%|▏         | 526/41976 [12:46<17:34:49,  1.53s/it][A
Training:   1%|▏         | 527/41976 [12:47<16:53:15,  1.47s/it][A
Training:   1%|▏         | 528/41976 [12:49<16:38:31,  1.45s/it][A
Training:   1%|▏         | 529/41976 [12:50<16:57:32,  1.47s/it][A
Training:   1%|▏         | 530/41976 [12:52<17:35:20,  1.53s/it][A
Training:   1%|▏         | 531/41976 [12:53<16:55:45,  1.47s/it][A

Epoch 0, Step 530, Loss: nan



Training:   1%|▏         | 532/41976 [12:55<16:32:37,  1.44s/it][A
Training:   1%|▏         | 533/41976 [12:56<16:12:02,  1.41s/it][A
Training:   1%|▏         | 534/41976 [12:57<15:55:43,  1.38s/it][A
Training:   1%|▏         | 535/41976 [12:59<15:45:57,  1.37s/it][A
Training:   1%|▏         | 536/41976 [13:00<15:46:25,  1.37s/it][A
Training:   1%|▏         | 537/41976 [13:01<15:41:28,  1.36s/it][A
Training:   1%|▏         | 538/41976 [13:03<17:07:58,  1.49s/it][A
Training:   1%|▏         | 539/41976 [13:05<16:49:20,  1.46s/it][A
Training:   1%|▏         | 540/41976 [13:07<18:12:43,  1.58s/it][A
Training:   1%|▏         | 541/41976 [13:08<17:14:58,  1.50s/it][A

Epoch 0, Step 540, Loss: nan



Training:   1%|▏         | 542/41976 [13:09<16:47:10,  1.46s/it][A
Training:   1%|▏         | 543/41976 [13:11<16:21:55,  1.42s/it][A
Training:   1%|▏         | 544/41976 [13:12<16:14:46,  1.41s/it][A
Training:   1%|▏         | 545/41976 [13:13<15:51:10,  1.38s/it][A
Training:   1%|▏         | 546/41976 [13:15<16:53:29,  1.47s/it][A
Training:   1%|▏         | 547/41976 [13:16<17:11:05,  1.49s/it][A
Training:   1%|▏         | 548/41976 [13:18<16:47:52,  1.46s/it][A
Training:   1%|▏         | 549/41976 [13:19<16:18:40,  1.42s/it][A
Training:   1%|▏         | 550/41976 [13:20<16:03:35,  1.40s/it][A
Training:   1%|▏         | 551/41976 [13:22<15:53:59,  1.38s/it][A

Epoch 0, Step 550, Loss: nan



Training:   1%|▏         | 552/41976 [13:23<15:58:06,  1.39s/it][A
Training:   1%|▏         | 553/41976 [13:25<15:44:18,  1.37s/it][A
Training:   1%|▏         | 554/41976 [13:26<16:10:35,  1.41s/it][A
Training:   1%|▏         | 555/41976 [13:28<19:02:47,  1.66s/it][A
Training:   1%|▏         | 556/41976 [13:30<18:04:53,  1.57s/it][A
Training:   1%|▏         | 557/41976 [13:31<17:14:38,  1.50s/it][A
Training:   1%|▏         | 558/41976 [13:32<16:52:09,  1.47s/it][A
Training:   1%|▏         | 559/41976 [13:34<16:23:37,  1.42s/it][A
Training:   1%|▏         | 560/41976 [13:35<16:08:35,  1.40s/it][A
Training:   1%|▏         | 561/41976 [13:36<15:49:41,  1.38s/it][A

Epoch 0, Step 560, Loss: nan



Training:   1%|▏         | 562/41976 [13:38<15:56:04,  1.39s/it][A
Training:   1%|▏         | 563/41976 [13:40<17:24:18,  1.51s/it][A
Training:   1%|▏         | 564/41976 [13:41<17:01:33,  1.48s/it][A
Training:   1%|▏         | 565/41976 [13:42<16:26:02,  1.43s/it][A
Training:   1%|▏         | 566/41976 [13:44<16:07:44,  1.40s/it][A
Training:   1%|▏         | 567/41976 [13:45<15:57:50,  1.39s/it][A
Training:   1%|▏         | 568/41976 [13:46<15:57:13,  1.39s/it][A
Training:   1%|▏         | 569/41976 [13:48<17:20:38,  1.51s/it][A
Training:   1%|▏         | 570/41976 [13:50<16:48:11,  1.46s/it][A
Training:   1%|▏         | 571/41976 [13:51<17:52:10,  1.55s/it][A

Epoch 0, Step 570, Loss: nan



Training:   1%|▏         | 572/41976 [13:53<17:38:43,  1.53s/it][A
Training:   1%|▏         | 573/41976 [13:54<16:56:01,  1.47s/it][A
Training:   1%|▏         | 574/41976 [13:55<16:29:07,  1.43s/it][A
Training:   1%|▏         | 575/41976 [13:57<16:09:31,  1.41s/it][A
Training:   1%|▏         | 576/41976 [13:58<15:58:52,  1.39s/it][A
Training:   1%|▏         | 577/41976 [13:59<15:46:00,  1.37s/it][A
Training:   1%|▏         | 578/41976 [14:01<15:41:07,  1.36s/it][A
Training:   1%|▏         | 579/41976 [14:02<16:15:57,  1.41s/it][A
Training:   1%|▏         | 580/41976 [14:04<17:23:13,  1.51s/it][A
Training:   1%|▏         | 581/41976 [14:05<16:42:42,  1.45s/it][A

Epoch 0, Step 580, Loss: nan



Training:   1%|▏         | 582/41976 [14:07<16:25:46,  1.43s/it][A
Training:   1%|▏         | 583/41976 [14:09<17:44:10,  1.54s/it][A
Training:   1%|▏         | 584/41976 [14:10<17:15:15,  1.50s/it][A
Training:   1%|▏         | 585/41976 [14:11<16:37:25,  1.45s/it][A
Training:   1%|▏         | 586/41976 [14:13<16:16:32,  1.42s/it][A
Training:   1%|▏         | 587/41976 [14:14<16:22:35,  1.42s/it][A
Training:   1%|▏         | 588/41976 [14:16<17:51:42,  1.55s/it][A
Training:   1%|▏         | 589/41976 [14:17<17:07:39,  1.49s/it][A
Training:   1%|▏         | 590/41976 [14:19<16:40:13,  1.45s/it][A
Training:   1%|▏         | 591/41976 [14:20<16:21:01,  1.42s/it][A

Epoch 0, Step 590, Loss: nan



Training:   1%|▏         | 592/41976 [14:21<16:11:25,  1.41s/it][A
Training:   1%|▏         | 593/41976 [14:23<15:59:18,  1.39s/it][A
Training:   1%|▏         | 594/41976 [14:24<15:49:14,  1.38s/it][A
Training:   1%|▏         | 595/41976 [14:25<15:39:36,  1.36s/it][A
Training:   1%|▏         | 596/41976 [14:27<16:53:18,  1.47s/it][A
Training:   1%|▏         | 597/41976 [14:29<18:38:34,  1.62s/it][A
Training:   1%|▏         | 598/41976 [14:30<17:41:24,  1.54s/it][A
Training:   1%|▏         | 599/41976 [14:32<17:01:48,  1.48s/it][A
Training:   1%|▏         | 600/41976 [14:33<16:44:44,  1.46s/it][A
Training:   1%|▏         | 601/41976 [14:35<16:14:26,  1.41s/it][A

Epoch 0, Step 600, Loss: nan



Training:   1%|▏         | 602/41976 [14:36<16:03:28,  1.40s/it][A
Training:   1%|▏         | 603/41976 [14:37<15:50:48,  1.38s/it][A
Training:   1%|▏         | 604/41976 [14:39<16:53:44,  1.47s/it][A
Training:   1%|▏         | 605/41976 [14:40<17:15:16,  1.50s/it][A
Training:   1%|▏         | 606/41976 [14:42<16:44:10,  1.46s/it][A
Training:   1%|▏         | 607/41976 [14:43<16:20:20,  1.42s/it][A
Training:   1%|▏         | 608/41976 [14:45<16:10:42,  1.41s/it][A
Training:   1%|▏         | 609/41976 [14:46<15:51:40,  1.38s/it][A
Training:   1%|▏         | 610/41976 [14:47<15:43:43,  1.37s/it][A
Training:   1%|▏         | 611/41976 [14:49<17:14:35,  1.50s/it][A

Epoch 0, Step 610, Loss: nan



Training:   1%|▏         | 612/41976 [14:51<17:34:23,  1.53s/it][A
Training:   1%|▏         | 613/41976 [14:52<17:56:25,  1.56s/it][A
Training:   1%|▏         | 614/41976 [14:54<17:14:56,  1.50s/it][A
Training:   1%|▏         | 615/41976 [14:55<16:42:51,  1.45s/it][A
Training:   1%|▏         | 616/41976 [14:56<16:27:45,  1.43s/it][A
Training:   1%|▏         | 617/41976 [14:58<16:00:06,  1.39s/it][A
Training:   1%|▏         | 618/41976 [14:59<15:51:24,  1.38s/it][A
Training:   1%|▏         | 619/41976 [15:00<15:41:31,  1.37s/it][A
Training:   1%|▏         | 620/41976 [15:02<15:44:21,  1.37s/it][A
Training:   1%|▏         | 621/41976 [15:03<17:01:39,  1.48s/it][A

Epoch 0, Step 620, Loss: nan



Training:   1%|▏         | 622/41976 [15:05<17:04:25,  1.49s/it][A
Training:   1%|▏         | 623/41976 [15:06<16:35:27,  1.44s/it][A
Training:   1%|▏         | 624/41976 [15:08<16:21:59,  1.42s/it][A
Training:   1%|▏         | 625/41976 [15:09<17:41:46,  1.54s/it][A
Training:   1%|▏         | 626/41976 [15:11<17:03:45,  1.49s/it][A
Training:   1%|▏         | 627/41976 [15:12<16:37:04,  1.45s/it][A
Training:   1%|▏         | 628/41976 [15:14<16:22:09,  1.43s/it][A
Training:   1%|▏         | 629/41976 [15:15<17:09:40,  1.49s/it][A
Training:   2%|▏         | 630/41976 [15:17<17:20:02,  1.51s/it][A
Training:   2%|▏         | 631/41976 [15:18<16:46:00,  1.46s/it][A

Epoch 0, Step 630, Loss: nan



Training:   2%|▏         | 632/41976 [15:20<16:36:13,  1.45s/it][A
Training:   2%|▏         | 633/41976 [15:21<16:12:19,  1.41s/it][A
Training:   2%|▏         | 634/41976 [15:22<16:01:09,  1.39s/it][A
Training:   2%|▏         | 635/41976 [15:24<15:52:57,  1.38s/it][A
Training:   2%|▏         | 636/41976 [15:25<15:47:20,  1.37s/it][A
Training:   2%|▏         | 637/41976 [15:26<16:11:13,  1.41s/it][A
Training:   2%|▏         | 638/41976 [15:29<18:48:03,  1.64s/it][A
Training:   2%|▏         | 639/41976 [15:30<17:48:16,  1.55s/it][A
Training:   2%|▏         | 640/41976 [15:31<17:11:10,  1.50s/it][A
Training:   2%|▏         | 641/41976 [15:33<16:34:14,  1.44s/it][A

Epoch 0, Step 640, Loss: nan



Training:   2%|▏         | 642/41976 [15:34<16:27:42,  1.43s/it][A
Training:   2%|▏         | 643/41976 [15:35<16:07:19,  1.40s/it][A
Training:   2%|▏         | 644/41976 [15:37<16:01:54,  1.40s/it][A
Training:   2%|▏         | 645/41976 [15:38<17:13:51,  1.50s/it][A
Training:   2%|▏         | 646/41976 [15:41<19:12:20,  1.67s/it][A
Training:   2%|▏         | 647/41976 [15:42<18:26:03,  1.61s/it][A
Training:   2%|▏         | 648/41976 [15:43<17:43:13,  1.54s/it][A
Training:   2%|▏         | 649/41976 [15:45<16:56:51,  1.48s/it][A
Training:   2%|▏         | 650/41976 [15:46<16:26:56,  1.43s/it][A
Training:   2%|▏         | 651/41976 [15:47<16:11:24,  1.41s/it][A

Epoch 0, Step 650, Loss: nan



Training:   2%|▏         | 652/41976 [15:49<17:41:19,  1.54s/it][A
Training:   2%|▏         | 653/41976 [15:51<16:55:26,  1.47s/it][A
Training:   2%|▏         | 654/41976 [15:52<17:35:34,  1.53s/it][A
Training:   2%|▏         | 655/41976 [15:54<17:38:43,  1.54s/it][A
Training:   2%|▏         | 656/41976 [15:55<17:02:53,  1.49s/it][A
Training:   2%|▏         | 657/41976 [15:56<16:30:01,  1.44s/it][A
Training:   2%|▏         | 658/41976 [15:58<16:15:11,  1.42s/it][A
Training:   2%|▏         | 659/41976 [15:59<15:57:18,  1.39s/it][A
Training:   2%|▏         | 660/41976 [16:01<15:53:50,  1.39s/it][A
Training:   2%|▏         | 661/41976 [16:02<15:39:09,  1.36s/it][A

Epoch 0, Step 660, Loss: nan



Training:   2%|▏         | 662/41976 [16:03<16:10:55,  1.41s/it][A
Training:   2%|▏         | 663/41976 [16:05<17:25:11,  1.52s/it][A
Training:   2%|▏         | 664/41976 [16:06<16:51:35,  1.47s/it][A
Training:   2%|▏         | 665/41976 [16:08<16:19:11,  1.42s/it][A
Training:   2%|▏         | 666/41976 [16:09<16:07:44,  1.41s/it][A
Training:   2%|▏         | 667/41976 [16:11<17:28:30,  1.52s/it][A
Training:   2%|▏         | 668/41976 [16:12<17:01:08,  1.48s/it][A
Training:   2%|▏         | 669/41976 [16:14<16:27:14,  1.43s/it][A
Training:   2%|▏         | 670/41976 [16:15<16:22:12,  1.43s/it][A
Training:   2%|▏         | 671/41976 [16:17<17:42:44,  1.54s/it][A

Epoch 0, Step 670, Loss: nan



Training:   2%|▏         | 672/41976 [16:18<17:06:35,  1.49s/it][A
Training:   2%|▏         | 673/41976 [16:20<16:35:06,  1.45s/it][A
Training:   2%|▏         | 674/41976 [16:21<16:15:18,  1.42s/it][A
Training:   2%|▏         | 675/41976 [16:22<16:03:34,  1.40s/it][A
Training:   2%|▏         | 676/41976 [16:24<16:05:11,  1.40s/it][A
Training:   2%|▏         | 677/41976 [16:25<15:50:36,  1.38s/it][A
Training:   2%|▏         | 678/41976 [16:26<15:42:33,  1.37s/it][A
Training:   2%|▏         | 679/41976 [16:28<16:47:47,  1.46s/it][A
Training:   2%|▏         | 680/41976 [16:30<17:24:00,  1.52s/it][A
Training:   2%|▏         | 681/41976 [16:32<18:23:27,  1.60s/it][A

Epoch 0, Step 680, Loss: nan



Training:   2%|▏         | 682/41976 [16:33<17:31:11,  1.53s/it][A
Training:   2%|▏         | 683/41976 [16:34<16:54:38,  1.47s/it][A
Training:   2%|▏         | 684/41976 [16:36<16:33:51,  1.44s/it][A
Training:   2%|▏         | 685/41976 [16:37<16:09:49,  1.41s/it][A
Training:   2%|▏         | 686/41976 [16:38<15:58:01,  1.39s/it][A
Training:   2%|▏         | 687/41976 [16:40<16:43:47,  1.46s/it][A
Training:   2%|▏         | 688/41976 [16:42<17:25:53,  1.52s/it][A
Training:   2%|▏         | 689/41976 [16:43<16:46:28,  1.46s/it][A
Training:   2%|▏         | 690/41976 [16:44<16:19:10,  1.42s/it][A
Training:   2%|▏         | 691/41976 [16:46<16:01:47,  1.40s/it][A

Epoch 0, Step 690, Loss: nan



Training:   2%|▏         | 692/41976 [16:47<16:00:53,  1.40s/it][A
Training:   2%|▏         | 693/41976 [16:48<15:42:27,  1.37s/it][A
Training:   2%|▏         | 694/41976 [16:50<15:46:27,  1.38s/it][A
Training:   2%|▏         | 695/41976 [16:52<18:02:31,  1.57s/it][A
Training:   2%|▏         | 696/41976 [16:53<18:33:24,  1.62s/it][A
Training:   2%|▏         | 697/41976 [16:55<17:26:29,  1.52s/it][A
Training:   2%|▏         | 698/41976 [16:56<16:48:47,  1.47s/it][A
Training:   2%|▏         | 699/41976 [16:57<16:17:15,  1.42s/it][A
Training:   2%|▏         | 700/41976 [16:59<16:08:24,  1.41s/it][A
Training:   2%|▏         | 701/41976 [17:00<15:54:13,  1.39s/it][A

Epoch 0, Step 700, Loss: nan



Training:   2%|▏         | 702/41976 [17:01<15:49:40,  1.38s/it][A
Training:   2%|▏         | 703/41976 [17:03<15:46:52,  1.38s/it][A
Training:   2%|▏         | 704/41976 [17:05<17:22:36,  1.52s/it][A
Training:   2%|▏         | 705/41976 [17:06<17:08:05,  1.49s/it][A
Training:   2%|▏         | 706/41976 [17:07<16:39:02,  1.45s/it][A
Training:   2%|▏         | 707/41976 [17:09<16:20:52,  1.43s/it][A
Training:   2%|▏         | 708/41976 [17:10<16:18:25,  1.42s/it][A
Training:   2%|▏         | 709/41976 [17:12<17:40:12,  1.54s/it][A
Training:   2%|▏         | 710/41976 [17:13<17:01:43,  1.49s/it][A
Training:   2%|▏         | 711/41976 [17:15<16:32:00,  1.44s/it][A

Epoch 0, Step 710, Loss: nan



Training:   2%|▏         | 712/41976 [17:16<17:33:00,  1.53s/it][A
Training:   2%|▏         | 713/41976 [17:18<17:33:57,  1.53s/it][A
Training:   2%|▏         | 714/41976 [17:19<16:54:15,  1.47s/it][A
Training:   2%|▏         | 715/41976 [17:21<16:24:23,  1.43s/it][A
Training:   2%|▏         | 716/41976 [17:22<16:10:25,  1.41s/it][A
Training:   2%|▏         | 717/41976 [17:23<15:51:55,  1.38s/it][A
Training:   2%|▏         | 718/41976 [17:25<15:40:56,  1.37s/it][A
Training:   2%|▏         | 719/41976 [17:26<15:33:54,  1.36s/it][A
Training:   2%|▏         | 720/41976 [17:28<15:58:16,  1.39s/it][A
Training:   2%|▏         | 721/41976 [17:29<17:18:45,  1.51s/it][A

Epoch 0, Step 720, Loss: nan



Training:   2%|▏         | 722/41976 [17:31<16:50:27,  1.47s/it][A
Training:   2%|▏         | 723/41976 [17:32<18:00:47,  1.57s/it][A
Training:   2%|▏         | 724/41976 [17:34<17:21:43,  1.52s/it][A
Training:   2%|▏         | 725/41976 [17:35<16:48:22,  1.47s/it][A
Training:   2%|▏         | 726/41976 [17:37<16:20:26,  1.43s/it][A
Training:   2%|▏         | 727/41976 [17:38<16:02:21,  1.40s/it][A
Training:   2%|▏         | 728/41976 [17:39<16:09:17,  1.41s/it][A
Training:   2%|▏         | 729/41976 [17:41<17:19:19,  1.51s/it][A
Training:   2%|▏         | 730/41976 [17:42<16:45:51,  1.46s/it][A
Training:   2%|▏         | 731/41976 [17:44<16:22:57,  1.43s/it][A

Epoch 0, Step 730, Loss: nan



Training:   2%|▏         | 732/41976 [17:45<16:13:47,  1.42s/it][A
Training:   2%|▏         | 733/41976 [17:46<15:54:45,  1.39s/it][A
Training:   2%|▏         | 734/41976 [17:48<15:47:49,  1.38s/it][A
Training:   2%|▏         | 735/41976 [17:49<15:38:21,  1.37s/it][A
Training:   2%|▏         | 736/41976 [17:51<15:41:01,  1.37s/it][A
Training:   2%|▏         | 737/41976 [17:53<18:18:51,  1.60s/it][A
Training:   2%|▏         | 738/41976 [17:54<18:01:38,  1.57s/it][A
Training:   2%|▏         | 739/41976 [17:56<17:12:39,  1.50s/it][A
Training:   2%|▏         | 740/41976 [17:57<16:49:53,  1.47s/it][A
Training:   2%|▏         | 741/41976 [17:58<16:21:35,  1.43s/it][A

Epoch 0, Step 740, Loss: nan



Training:   2%|▏         | 742/41976 [18:00<16:07:58,  1.41s/it][A
Training:   2%|▏         | 743/41976 [18:01<15:54:23,  1.39s/it][A
Training:   2%|▏         | 744/41976 [18:02<15:51:38,  1.38s/it][A
Training:   2%|▏         | 745/41976 [18:04<16:15:04,  1.42s/it][A
Training:   2%|▏         | 746/41976 [18:06<17:22:48,  1.52s/it][A
Training:   2%|▏         | 747/41976 [18:07<16:48:57,  1.47s/it][A
Training:   2%|▏         | 748/41976 [18:08<16:30:54,  1.44s/it][A
Training:   2%|▏         | 749/41976 [18:10<16:08:46,  1.41s/it][A
Training:   2%|▏         | 750/41976 [18:11<15:57:54,  1.39s/it][A
Training:   2%|▏         | 751/41976 [18:13<17:31:35,  1.53s/it][A

Epoch 0, Step 750, Loss: nan



Training:   2%|▏         | 752/41976 [18:14<17:01:16,  1.49s/it][A
Training:   2%|▏         | 753/41976 [18:16<16:50:54,  1.47s/it][A
Training:   2%|▏         | 754/41976 [18:17<17:48:07,  1.55s/it][A
Training:   2%|▏         | 755/41976 [18:19<17:10:20,  1.50s/it][A
Training:   2%|▏         | 756/41976 [18:20<16:41:03,  1.46s/it][A
Training:   2%|▏         | 757/41976 [18:21<16:13:09,  1.42s/it][A
Training:   2%|▏         | 758/41976 [18:23<16:00:11,  1.40s/it][A
Training:   2%|▏         | 759/41976 [18:24<15:46:09,  1.38s/it][A
Training:   2%|▏         | 760/41976 [18:26<15:43:49,  1.37s/it][A
Training:   2%|▏         | 761/41976 [18:27<15:35:49,  1.36s/it][A

Epoch 0, Step 760, Loss: nan



Training:   2%|▏         | 762/41976 [18:29<16:46:15,  1.46s/it][A
Training:   2%|▏         | 763/41976 [18:30<17:12:17,  1.50s/it][A
Training:   2%|▏         | 764/41976 [18:32<16:46:43,  1.47s/it][A
Training:   2%|▏         | 765/41976 [18:33<17:54:22,  1.56s/it][A
Training:   2%|▏         | 766/41976 [18:35<17:07:36,  1.50s/it][A
Training:   2%|▏         | 767/41976 [18:36<16:40:53,  1.46s/it][A
Training:   2%|▏         | 768/41976 [18:37<16:24:34,  1.43s/it][A
Training:   2%|▏         | 769/41976 [18:39<16:02:51,  1.40s/it][A
Training:   2%|▏         | 770/41976 [18:40<16:46:10,  1.47s/it][A
Training:   2%|▏         | 771/41976 [18:42<17:27:45,  1.53s/it][A

Epoch 0, Step 770, Loss: nan



Training:   2%|▏         | 772/41976 [18:43<16:59:31,  1.48s/it][A
Training:   2%|▏         | 773/41976 [18:45<16:25:12,  1.43s/it][A
Training:   2%|▏         | 774/41976 [18:46<16:06:45,  1.41s/it][A
Training:   2%|▏         | 775/41976 [18:47<15:53:28,  1.39s/it][A
Training:   2%|▏         | 776/41976 [18:49<15:53:32,  1.39s/it][A
Training:   2%|▏         | 777/41976 [18:50<15:41:35,  1.37s/it][A
Training:   2%|▏         | 778/41976 [18:52<15:39:37,  1.37s/it][A
Training:   2%|▏         | 779/41976 [18:54<18:54:21,  1.65s/it][A
Training:   2%|▏         | 780/41976 [18:55<17:58:54,  1.57s/it][A
Training:   2%|▏         | 781/41976 [18:57<17:06:54,  1.50s/it][A

Epoch 0, Step 780, Loss: nan



Training:   2%|▏         | 782/41976 [18:58<16:42:46,  1.46s/it][A
Training:   2%|▏         | 783/41976 [18:59<16:17:21,  1.42s/it][A
Training:   2%|▏         | 784/41976 [19:01<16:05:57,  1.41s/it][A
Training:   2%|▏         | 785/41976 [19:02<15:44:25,  1.38s/it][A
Training:   2%|▏         | 786/41976 [19:03<15:34:09,  1.36s/it][A
Training:   2%|▏         | 787/41976 [19:05<16:51:30,  1.47s/it][A
Training:   2%|▏         | 788/41976 [19:06<17:01:56,  1.49s/it][A
Training:   2%|▏         | 789/41976 [19:08<16:23:45,  1.43s/it][A
Training:   2%|▏         | 790/41976 [19:09<16:05:39,  1.41s/it][A
Training:   2%|▏         | 791/41976 [19:10<15:51:08,  1.39s/it][A

Epoch 0, Step 790, Loss: nan



Training:   2%|▏         | 792/41976 [19:12<15:49:01,  1.38s/it][A
Training:   2%|▏         | 793/41976 [19:14<17:09:09,  1.50s/it][A
Training:   2%|▏         | 794/41976 [19:15<16:42:28,  1.46s/it][A
Training:   2%|▏         | 795/41976 [19:17<17:20:55,  1.52s/it][A
Training:   2%|▏         | 796/41976 [19:18<17:53:53,  1.56s/it][A
Training:   2%|▏         | 797/41976 [19:20<18:09:41,  1.59s/it][A
Training:   2%|▏         | 798/41976 [19:22<18:09:56,  1.59s/it][A
Training:   2%|▏         | 799/41976 [19:23<17:16:35,  1.51s/it][A
Training:   2%|▏         | 800/41976 [19:24<16:50:19,  1.47s/it][A
Training:   2%|▏         | 801/41976 [19:26<16:21:32,  1.43s/it][A

Epoch 0, Step 800, Loss: nan



Training:   2%|▏         | 802/41976 [19:27<16:09:03,  1.41s/it][A
Training:   2%|▏         | 803/41976 [19:29<16:55:45,  1.48s/it][A
Training:   2%|▏         | 804/41976 [19:30<17:31:51,  1.53s/it][A
Training:   2%|▏         | 805/41976 [19:32<16:50:31,  1.47s/it][A
Training:   2%|▏         | 806/41976 [19:33<16:26:26,  1.44s/it][A
Training:   2%|▏         | 807/41976 [19:35<17:40:58,  1.55s/it][A
Training:   2%|▏         | 808/41976 [19:36<17:13:02,  1.51s/it][A
Training:   2%|▏         | 809/41976 [19:37<16:36:44,  1.45s/it][A
Training:   2%|▏         | 810/41976 [19:39<16:22:03,  1.43s/it][A
Training:   2%|▏         | 811/41976 [19:40<16:52:22,  1.48s/it][A

Epoch 0, Step 810, Loss: nan



Training:   2%|▏         | 812/41976 [19:42<17:35:55,  1.54s/it][A
Training:   2%|▏         | 813/41976 [19:43<16:52:34,  1.48s/it][A
Training:   2%|▏         | 814/41976 [19:45<16:22:33,  1.43s/it][A
Training:   2%|▏         | 815/41976 [19:46<16:06:14,  1.41s/it][A
Training:   2%|▏         | 816/41976 [19:48<15:59:44,  1.40s/it][A
Training:   2%|▏         | 817/41976 [19:49<15:43:57,  1.38s/it][A
Training:   2%|▏         | 818/41976 [19:50<15:38:01,  1.37s/it][A
Training:   2%|▏         | 819/41976 [19:52<15:34:03,  1.36s/it][A
Training:   2%|▏         | 820/41976 [19:53<17:13:19,  1.51s/it][A
Training:   2%|▏         | 821/41976 [19:55<18:20:49,  1.60s/it][A

Epoch 0, Step 820, Loss: nan



Training:   2%|▏         | 822/41976 [19:57<17:31:24,  1.53s/it][A
Training:   2%|▏         | 823/41976 [19:58<16:53:12,  1.48s/it][A
Training:   2%|▏         | 824/41976 [19:59<16:28:54,  1.44s/it][A
Training:   2%|▏         | 825/41976 [20:01<16:02:59,  1.40s/it][A
Training:   2%|▏         | 826/41976 [20:02<15:48:29,  1.38s/it][A
Training:   2%|▏         | 827/41976 [20:03<15:39:45,  1.37s/it][A
Training:   2%|▏         | 828/41976 [20:05<16:59:09,  1.49s/it][A
Training:   2%|▏         | 829/41976 [20:07<17:08:07,  1.50s/it][A
Training:   2%|▏         | 830/41976 [20:08<16:35:59,  1.45s/it][A
Training:   2%|▏         | 831/41976 [20:09<16:12:11,  1.42s/it][A

Epoch 0, Step 830, Loss: nan



Training:   2%|▏         | 832/41976 [20:11<16:03:36,  1.41s/it][A
Training:   2%|▏         | 833/41976 [20:12<15:47:28,  1.38s/it][A
Training:   2%|▏         | 834/41976 [20:13<15:42:38,  1.37s/it][A
Training:   2%|▏         | 835/41976 [20:15<17:06:03,  1.50s/it][A
Training:   2%|▏         | 836/41976 [20:17<17:55:14,  1.57s/it][A
Training:   2%|▏         | 837/41976 [20:18<17:52:36,  1.56s/it][A
Training:   2%|▏         | 838/41976 [20:20<17:13:16,  1.51s/it][A
Training:   2%|▏         | 839/41976 [20:21<16:43:33,  1.46s/it][A
Training:   2%|▏         | 840/41976 [20:23<16:29:24,  1.44s/it][A
Training:   2%|▏         | 841/41976 [20:24<16:06:14,  1.41s/it][A

Epoch 0, Step 840, Loss: nan



Training:   2%|▏         | 842/41976 [20:25<15:55:21,  1.39s/it][A
Training:   2%|▏         | 843/41976 [20:27<15:43:32,  1.38s/it][A
Training:   2%|▏         | 844/41976 [20:28<16:06:49,  1.41s/it][A
Training:   2%|▏         | 845/41976 [20:30<17:18:40,  1.52s/it][A
Training:   2%|▏         | 846/41976 [20:31<16:45:15,  1.47s/it][A
Training:   2%|▏         | 847/41976 [20:32<16:18:04,  1.43s/it][A
Training:   2%|▏         | 848/41976 [20:34<16:07:17,  1.41s/it][A
Training:   2%|▏         | 849/41976 [20:36<17:23:53,  1.52s/it][A
Training:   2%|▏         | 850/41976 [20:37<16:50:18,  1.47s/it][A
Training:   2%|▏         | 851/41976 [20:38<16:24:35,  1.44s/it][A

Epoch 0, Step 850, Loss: nan



Training:   2%|▏         | 852/41976 [20:40<16:20:28,  1.43s/it][A
Training:   2%|▏         | 853/41976 [20:42<17:31:20,  1.53s/it][A
Training:   2%|▏         | 854/41976 [20:43<16:57:16,  1.48s/it][A
Training:   2%|▏         | 855/41976 [20:44<16:25:51,  1.44s/it][A
Training:   2%|▏         | 856/41976 [20:46<16:15:37,  1.42s/it][A
Training:   2%|▏         | 857/41976 [20:47<15:50:36,  1.39s/it][A
Training:   2%|▏         | 858/41976 [20:48<15:39:24,  1.37s/it][A
Training:   2%|▏         | 859/41976 [20:50<15:34:14,  1.36s/it][A
Training:   2%|▏         | 860/41976 [20:51<15:39:28,  1.37s/it][A
Training:   2%|▏         | 861/41976 [20:53<16:22:27,  1.43s/it][A

Epoch 0, Step 860, Loss: nan



Training:   2%|▏         | 862/41976 [20:54<17:11:21,  1.51s/it][A
Training:   2%|▏         | 863/41976 [20:56<18:15:08,  1.60s/it][A
Training:   2%|▏         | 864/41976 [20:57<17:27:59,  1.53s/it][A
Training:   2%|▏         | 865/41976 [20:59<16:46:30,  1.47s/it][A
Training:   2%|▏         | 866/41976 [21:00<16:22:18,  1.43s/it][A
Training:   2%|▏         | 867/41976 [21:01<16:01:41,  1.40s/it][A
Training:   2%|▏         | 868/41976 [21:03<15:54:03,  1.39s/it][A
Training:   2%|▏         | 869/41976 [21:04<16:18:58,  1.43s/it][A
Training:   2%|▏         | 870/41976 [21:06<17:17:40,  1.51s/it][A
Training:   2%|▏         | 871/41976 [21:07<16:46:36,  1.47s/it][A

Epoch 0, Step 870, Loss: nan



Training:   2%|▏         | 872/41976 [21:09<16:26:38,  1.44s/it][A
Training:   2%|▏         | 873/41976 [21:10<16:03:30,  1.41s/it][A
Training:   2%|▏         | 874/41976 [21:11<15:51:10,  1.39s/it][A
Training:   2%|▏         | 875/41976 [21:13<15:40:28,  1.37s/it][A
Training:   2%|▏         | 876/41976 [21:14<15:47:45,  1.38s/it][A
Training:   2%|▏         | 877/41976 [21:16<17:49:19,  1.56s/it][A
Training:   2%|▏         | 878/41976 [21:18<18:31:25,  1.62s/it][A
Training:   2%|▏         | 879/41976 [21:19<17:32:49,  1.54s/it][A
Training:   2%|▏         | 880/41976 [21:21<16:59:52,  1.49s/it][A
Training:   2%|▏         | 881/41976 [21:22<16:22:28,  1.43s/it][A

Epoch 0, Step 880, Loss: nan



Training:   2%|▏         | 882/41976 [21:23<16:08:53,  1.41s/it][A
Training:   2%|▏         | 883/41976 [21:25<15:54:05,  1.39s/it][A
Training:   2%|▏         | 884/41976 [21:26<15:55:42,  1.40s/it][A
Training:   2%|▏         | 885/41976 [21:27<15:43:06,  1.38s/it][A
Training:   2%|▏         | 886/41976 [21:29<16:54:07,  1.48s/it][A
Training:   2%|▏         | 887/41976 [21:31<17:09:14,  1.50s/it][A
Training:   2%|▏         | 888/41976 [21:32<16:42:49,  1.46s/it][A
Training:   2%|▏         | 889/41976 [21:33<16:13:58,  1.42s/it][A
Training:   2%|▏         | 890/41976 [21:35<15:56:02,  1.40s/it][A
Training:   2%|▏         | 891/41976 [21:37<17:18:55,  1.52s/it][A

Epoch 0, Step 890, Loss: nan



Training:   2%|▏         | 892/41976 [21:38<16:51:16,  1.48s/it][A
Training:   2%|▏         | 893/41976 [21:39<16:20:49,  1.43s/it][A
Training:   2%|▏         | 894/41976 [21:41<17:02:50,  1.49s/it][A
Training:   2%|▏         | 895/41976 [21:42<17:23:42,  1.52s/it][A
Training:   2%|▏         | 896/41976 [21:44<16:52:48,  1.48s/it][A
Training:   2%|▏         | 897/41976 [21:45<16:13:42,  1.42s/it][A
Training:   2%|▏         | 898/41976 [21:46<15:58:32,  1.40s/it][A
Training:   2%|▏         | 899/41976 [21:48<15:44:48,  1.38s/it][A
Training:   2%|▏         | 900/41976 [21:49<15:44:29,  1.38s/it][A
Training:   2%|▏         | 901/41976 [21:51<15:32:50,  1.36s/it][A

Epoch 0, Step 900, Loss: nan



Training:   2%|▏         | 902/41976 [21:52<15:39:35,  1.37s/it][A
Training:   2%|▏         | 903/41976 [21:54<17:02:07,  1.49s/it][A
Training:   2%|▏         | 904/41976 [21:55<16:44:52,  1.47s/it][A
Training:   2%|▏         | 905/41976 [21:57<17:48:24,  1.56s/it][A
Training:   2%|▏         | 906/41976 [21:58<17:05:14,  1.50s/it][A
Training:   2%|▏         | 907/41976 [22:00<16:27:26,  1.44s/it][A
Training:   2%|▏         | 908/41976 [22:01<16:13:04,  1.42s/it][A
Training:   2%|▏         | 909/41976 [22:02<15:48:08,  1.39s/it][A
Training:   2%|▏         | 910/41976 [22:04<15:39:38,  1.37s/it][A
Training:   2%|▏         | 911/41976 [22:05<16:44:35,  1.47s/it][A

Epoch 0, Step 910, Loss: nan



Training:   2%|▏         | 912/41976 [22:07<16:58:45,  1.49s/it][A
Training:   2%|▏         | 913/41976 [22:08<16:27:48,  1.44s/it][A
Training:   2%|▏         | 914/41976 [22:09<16:09:37,  1.42s/it][A
Training:   2%|▏         | 915/41976 [22:11<15:57:21,  1.40s/it][A
Training:   2%|▏         | 916/41976 [22:12<15:53:56,  1.39s/it][A
Training:   2%|▏         | 917/41976 [22:14<15:36:54,  1.37s/it][A
Training:   2%|▏         | 918/41976 [22:15<15:33:40,  1.36s/it][A
Training:   2%|▏         | 919/41976 [22:17<17:59:51,  1.58s/it][A
Training:   2%|▏         | 920/41976 [22:19<18:11:15,  1.59s/it][A
Training:   2%|▏         | 921/41976 [22:20<17:18:13,  1.52s/it][A

Epoch 0, Step 920, Loss: nan



Training:   2%|▏         | 922/41976 [22:21<16:54:14,  1.48s/it][A
Training:   2%|▏         | 923/41976 [22:23<16:23:10,  1.44s/it][A
Training:   2%|▏         | 924/41976 [22:24<16:09:55,  1.42s/it][A
Training:   2%|▏         | 925/41976 [22:25<15:47:54,  1.39s/it][A
Training:   2%|▏         | 926/41976 [22:27<15:44:01,  1.38s/it][A
Training:   2%|▏         | 927/41976 [22:28<15:51:35,  1.39s/it][A
Training:   2%|▏         | 928/41976 [22:30<17:30:44,  1.54s/it][A
Training:   2%|▏         | 929/41976 [22:31<16:49:35,  1.48s/it][A
Training:   2%|▏         | 930/41976 [22:33<16:23:59,  1.44s/it][A
Training:   2%|▏         | 931/41976 [22:34<16:02:54,  1.41s/it][A

Epoch 0, Step 930, Loss: nan



Training:   2%|▏         | 932/41976 [22:35<15:56:42,  1.40s/it][A
Training:   2%|▏         | 933/41976 [22:37<17:17:43,  1.52s/it][A
Training:   2%|▏         | 934/41976 [22:39<16:44:08,  1.47s/it][A
Training:   2%|▏         | 935/41976 [22:40<16:22:03,  1.44s/it][A
Training:   2%|▏         | 936/41976 [22:42<17:48:31,  1.56s/it][A
Training:   2%|▏         | 937/41976 [22:43<17:01:15,  1.49s/it][A
Training:   2%|▏         | 938/41976 [22:44<16:24:42,  1.44s/it][A
Training:   2%|▏         | 939/41976 [22:46<16:01:39,  1.41s/it][A
Training:   2%|▏         | 940/41976 [22:47<15:56:31,  1.40s/it][A
Training:   2%|▏         | 941/41976 [22:48<15:43:38,  1.38s/it][A

Epoch 0, Step 940, Loss: nan



Training:   2%|▏         | 942/41976 [22:50<15:41:52,  1.38s/it][A
Training:   2%|▏         | 943/41976 [22:51<15:36:28,  1.37s/it][A
Training:   2%|▏         | 944/41976 [22:53<16:27:53,  1.44s/it][A
Training:   2%|▏         | 945/41976 [22:54<17:06:55,  1.50s/it][A
Training:   2%|▏         | 946/41976 [22:56<16:35:33,  1.46s/it][A
Training:   2%|▏         | 947/41976 [22:58<17:47:57,  1.56s/it][A
Training:   2%|▏         | 948/41976 [22:59<17:11:57,  1.51s/it][A
Training:   2%|▏         | 949/41976 [23:00<16:33:41,  1.45s/it][A
Training:   2%|▏         | 950/41976 [23:02<16:31:41,  1.45s/it][A
Training:   2%|▏         | 951/41976 [23:04<17:44:25,  1.56s/it][A

Epoch 0, Step 950, Loss: nan



Training:   2%|▏         | 952/41976 [23:05<18:19:56,  1.61s/it][A
Training:   2%|▏         | 953/41976 [23:07<18:01:55,  1.58s/it][A
Training:   2%|▏         | 954/41976 [23:08<17:19:49,  1.52s/it][A
Training:   2%|▏         | 955/41976 [23:09<16:41:05,  1.46s/it][A
Training:   2%|▏         | 956/41976 [23:11<16:21:05,  1.44s/it][A
Training:   2%|▏         | 957/41976 [23:12<15:55:24,  1.40s/it][A
Training:   2%|▏         | 958/41976 [23:14<15:41:31,  1.38s/it][A
Training:   2%|▏         | 959/41976 [23:15<15:30:38,  1.36s/it][A
Training:   2%|▏         | 960/41976 [23:16<16:01:02,  1.41s/it][A
Training:   2%|▏         | 961/41976 [23:19<18:39:56,  1.64s/it][A

Epoch 0, Step 960, Loss: nan



Training:   2%|▏         | 962/41976 [23:20<17:45:15,  1.56s/it][A
Training:   2%|▏         | 963/41976 [23:21<16:58:21,  1.49s/it][A
Training:   2%|▏         | 964/41976 [23:23<16:38:06,  1.46s/it][A
Training:   2%|▏         | 965/41976 [23:24<16:09:05,  1.42s/it][A
Training:   2%|▏         | 966/41976 [23:25<15:52:13,  1.39s/it][A
Training:   2%|▏         | 967/41976 [23:27<15:38:23,  1.37s/it][A
Training:   2%|▏         | 968/41976 [23:28<15:41:38,  1.38s/it][A
Training:   2%|▏         | 969/41976 [23:30<17:04:00,  1.50s/it][A
Training:   2%|▏         | 970/41976 [23:31<16:37:15,  1.46s/it][A
Training:   2%|▏         | 971/41976 [23:32<16:11:18,  1.42s/it][A

Epoch 0, Step 970, Loss: nan



Training:   2%|▏         | 972/41976 [23:34<16:03:57,  1.41s/it][A
Training:   2%|▏         | 973/41976 [23:35<15:46:04,  1.38s/it][A
Training:   2%|▏         | 974/41976 [23:37<15:37:59,  1.37s/it][A
Training:   2%|▏         | 975/41976 [23:38<17:12:50,  1.51s/it][A
Training:   2%|▏         | 976/41976 [23:40<16:43:14,  1.47s/it][A
Training:   2%|▏         | 977/41976 [23:41<17:28:43,  1.53s/it][A
Training:   2%|▏         | 978/41976 [23:43<17:19:17,  1.52s/it][A
Training:   2%|▏         | 979/41976 [23:44<16:41:29,  1.47s/it][A
Training:   2%|▏         | 980/41976 [23:46<16:21:39,  1.44s/it][A
Training:   2%|▏         | 981/41976 [23:47<15:57:07,  1.40s/it][A

Epoch 0, Step 980, Loss: nan



Training:   2%|▏         | 982/41976 [23:48<15:45:57,  1.38s/it][A
Training:   2%|▏         | 983/41976 [23:50<15:35:22,  1.37s/it][A
Training:   2%|▏         | 984/41976 [23:51<15:36:02,  1.37s/it][A
Training:   2%|▏         | 985/41976 [23:52<15:56:27,  1.40s/it][A
Training:   2%|▏         | 986/41976 [23:54<17:04:17,  1.50s/it][A
Training:   2%|▏         | 987/41976 [23:56<16:31:36,  1.45s/it][A
Training:   2%|▏         | 988/41976 [23:57<16:19:01,  1.43s/it][A
Training:   2%|▏         | 989/41976 [23:59<17:27:51,  1.53s/it][A
Training:   2%|▏         | 990/41976 [24:00<16:50:27,  1.48s/it][A
Training:   2%|▏         | 991/41976 [24:01<16:21:21,  1.44s/it][A

Epoch 0, Step 990, Loss: nan



Training:   2%|▏         | 992/41976 [24:03<16:08:11,  1.42s/it][A
Training:   2%|▏         | 993/41976 [24:04<16:02:23,  1.41s/it][A
Training:   2%|▏         | 994/41976 [24:06<17:23:59,  1.53s/it][A
Training:   2%|▏         | 995/41976 [24:07<16:52:09,  1.48s/it][A
Training:   2%|▏         | 996/41976 [24:09<16:39:04,  1.46s/it][A
Training:   2%|▏         | 997/41976 [24:10<16:11:33,  1.42s/it][A
Training:   2%|▏         | 998/41976 [24:11<15:55:46,  1.40s/it][A
Training:   2%|▏         | 999/41976 [24:13<15:46:29,  1.39s/it][A
Training:   2%|▏         | 1000/41976 [24:14<15:46:25,  1.39s/it][A
Training:   2%|▏         | 1001/41976 [24:15<15:30:22,  1.36s/it][A

Epoch 0, Step 1000, Loss: nan



Training:   2%|▏         | 1002/41976 [24:17<16:24:17,  1.44s/it][A
Training:   2%|▏         | 1003/41976 [24:19<18:33:10,  1.63s/it][A
Training:   2%|▏         | 1004/41976 [24:21<17:40:23,  1.55s/it][A
Training:   2%|▏         | 1005/41976 [24:22<16:54:13,  1.49s/it][A
Training:   2%|▏         | 1006/41976 [24:23<16:25:19,  1.44s/it][A
Training:   2%|▏         | 1007/41976 [24:24<15:59:01,  1.40s/it][A
Training:   2%|▏         | 1008/41976 [24:26<15:49:43,  1.39s/it][A
Training:   2%|▏         | 1009/41976 [24:27<15:33:48,  1.37s/it][A
Training:   2%|▏         | 1010/41976 [24:29<16:12:57,  1.43s/it][A
Training:   2%|▏         | 1011/41976 [24:30<17:07:14,  1.50s/it][A

Epoch 0, Step 1010, Loss: nan



Training:   2%|▏         | 1012/41976 [24:32<16:39:50,  1.46s/it][A
Training:   2%|▏         | 1013/41976 [24:33<16:10:55,  1.42s/it][A
Training:   2%|▏         | 1014/41976 [24:34<15:56:13,  1.40s/it][A
Training:   2%|▏         | 1015/41976 [24:36<15:44:40,  1.38s/it][A
Training:   2%|▏         | 1016/41976 [24:37<15:37:45,  1.37s/it][A
Training:   2%|▏         | 1017/41976 [24:39<17:15:12,  1.52s/it][A
Training:   2%|▏         | 1018/41976 [24:41<17:15:53,  1.52s/it][A
Training:   2%|▏         | 1019/41976 [24:42<17:56:49,  1.58s/it][A
Training:   2%|▏         | 1020/41976 [24:44<17:14:21,  1.52s/it][A
Training:   2%|▏         | 1021/41976 [24:45<16:32:28,  1.45s/it][A

Epoch 0, Step 1020, Loss: nan



Training:   2%|▏         | 1022/41976 [24:46<16:13:35,  1.43s/it][A
Training:   2%|▏         | 1023/41976 [24:48<15:54:49,  1.40s/it][A
Training:   2%|▏         | 1024/41976 [24:49<15:47:09,  1.39s/it][A
Training:   2%|▏         | 1025/41976 [24:50<15:32:47,  1.37s/it][A
Training:   2%|▏         | 1026/41976 [24:52<15:24:39,  1.35s/it][A
Training:   2%|▏         | 1027/41976 [24:53<16:30:18,  1.45s/it][A
Training:   2%|▏         | 1028/41976 [24:55<16:54:39,  1.49s/it][A
Training:   2%|▏         | 1029/41976 [24:56<16:17:06,  1.43s/it][A
Training:   2%|▏         | 1030/41976 [24:58<15:57:16,  1.40s/it][A
Training:   2%|▏         | 1031/41976 [24:59<17:24:54,  1.53s/it][A

Epoch 0, Step 1030, Loss: nan



Training:   2%|▏         | 1032/41976 [25:01<16:52:42,  1.48s/it][A
Training:   2%|▏         | 1033/41976 [25:02<16:16:44,  1.43s/it][A
Training:   2%|▏         | 1034/41976 [25:03<15:59:21,  1.41s/it][A
Training:   2%|▏         | 1035/41976 [25:05<16:37:06,  1.46s/it][A
Training:   2%|▏         | 1036/41976 [25:07<17:19:03,  1.52s/it][A
Training:   2%|▏         | 1037/41976 [25:08<16:37:17,  1.46s/it][A
Training:   2%|▏         | 1038/41976 [25:09<16:22:09,  1.44s/it][A
Training:   2%|▏         | 1039/41976 [25:11<16:02:58,  1.41s/it][A
Training:   2%|▏         | 1040/41976 [25:12<15:55:14,  1.40s/it][A
Training:   2%|▏         | 1041/41976 [25:13<15:38:48,  1.38s/it][A

Epoch 0, Step 1040, Loss: nan



Training:   2%|▏         | 1042/41976 [25:15<15:32:32,  1.37s/it][A
Training:   2%|▏         | 1043/41976 [25:16<15:33:36,  1.37s/it][A
Training:   2%|▏         | 1044/41976 [25:18<17:08:09,  1.51s/it][A
Training:   2%|▏         | 1045/41976 [25:20<18:04:47,  1.59s/it][A
Training:   2%|▏         | 1046/41976 [25:21<17:16:13,  1.52s/it][A
Training:   2%|▏         | 1047/41976 [25:22<16:40:05,  1.47s/it][A
Training:   2%|▏         | 1048/41976 [25:24<16:24:10,  1.44s/it][A
Training:   2%|▏         | 1049/41976 [25:25<15:59:56,  1.41s/it][A
Training:   3%|▎         | 1050/41976 [25:26<15:49:27,  1.39s/it][A
Training:   3%|▎         | 1051/41976 [25:28<15:37:07,  1.37s/it][A

Epoch 0, Step 1050, Loss: nan



Training:   3%|▎         | 1052/41976 [25:30<17:05:01,  1.50s/it][A
Training:   3%|▎         | 1053/41976 [25:31<16:53:59,  1.49s/it][A
Training:   3%|▎         | 1054/41976 [25:32<16:24:32,  1.44s/it][A
Training:   3%|▎         | 1055/41976 [25:34<16:04:53,  1.41s/it][A
Training:   3%|▎         | 1056/41976 [25:35<15:55:27,  1.40s/it][A
Training:   3%|▎         | 1057/41976 [25:36<15:40:56,  1.38s/it][A
Training:   3%|▎         | 1058/41976 [25:38<15:34:20,  1.37s/it][A
Training:   3%|▎         | 1059/41976 [25:40<17:10:16,  1.51s/it][A
Training:   3%|▎         | 1060/41976 [25:41<17:43:04,  1.56s/it][A
Training:   3%|▎         | 1061/41976 [25:43<17:43:48,  1.56s/it][A

Epoch 0, Step 1060, Loss: nan



Training:   3%|▎         | 1062/41976 [25:44<17:02:57,  1.50s/it][A
Training:   3%|▎         | 1063/41976 [25:46<16:29:28,  1.45s/it][A
Training:   3%|▎         | 1064/41976 [25:47<16:16:14,  1.43s/it][A
Training:   3%|▎         | 1065/41976 [25:48<15:51:26,  1.40s/it][A
Training:   3%|▎         | 1066/41976 [25:50<15:40:35,  1.38s/it][A
Training:   3%|▎         | 1067/41976 [25:51<15:32:51,  1.37s/it][A
Training:   3%|▎         | 1068/41976 [25:52<15:57:04,  1.40s/it][A
Training:   3%|▎         | 1069/41976 [25:54<17:05:45,  1.50s/it][A
Training:   3%|▎         | 1070/41976 [25:56<16:34:11,  1.46s/it][A
Training:   3%|▎         | 1071/41976 [25:57<16:11:23,  1.42s/it][A

Epoch 0, Step 1070, Loss: nan



Training:   3%|▎         | 1072/41976 [25:58<16:00:03,  1.41s/it][A
Training:   3%|▎         | 1073/41976 [26:00<17:16:16,  1.52s/it][A
Training:   3%|▎         | 1074/41976 [26:01<16:39:32,  1.47s/it][A
Training:   3%|▎         | 1075/41976 [26:03<16:12:04,  1.43s/it][A
Training:   3%|▎         | 1076/41976 [26:04<15:59:04,  1.41s/it][A
Training:   3%|▎         | 1077/41976 [26:06<17:08:19,  1.51s/it][A
Training:   3%|▎         | 1078/41976 [26:07<16:45:16,  1.47s/it][A
Training:   3%|▎         | 1079/41976 [26:09<16:19:58,  1.44s/it][A
Training:   3%|▎         | 1080/41976 [26:10<16:18:22,  1.44s/it][A
Training:   3%|▎         | 1081/41976 [26:11<15:53:23,  1.40s/it][A

Epoch 0, Step 1080, Loss: nan



Training:   3%|▎         | 1082/41976 [26:13<15:46:48,  1.39s/it][A
Training:   3%|▎         | 1083/41976 [26:14<15:36:07,  1.37s/it][A
Training:   3%|▎         | 1084/41976 [26:15<15:35:37,  1.37s/it][A
Training:   3%|▎         | 1085/41976 [26:17<16:13:41,  1.43s/it][A
Training:   3%|▎         | 1086/41976 [26:19<16:53:11,  1.49s/it][A
Training:   3%|▎         | 1087/41976 [26:20<17:59:54,  1.58s/it][A
Training:   3%|▎         | 1088/41976 [26:22<17:17:49,  1.52s/it][A
Training:   3%|▎         | 1089/41976 [26:23<16:39:18,  1.47s/it][A
Training:   3%|▎         | 1090/41976 [26:24<16:13:47,  1.43s/it][A
Training:   3%|▎         | 1091/41976 [26:26<16:02:22,  1.41s/it][A

Epoch 0, Step 1090, Loss: nan



Training:   3%|▎         | 1092/41976 [26:27<15:56:55,  1.40s/it][A
Training:   3%|▎         | 1093/41976 [26:29<16:19:41,  1.44s/it][A
Training:   3%|▎         | 1094/41976 [26:30<17:17:23,  1.52s/it][A
Training:   3%|▎         | 1095/41976 [26:32<16:38:43,  1.47s/it][A
Training:   3%|▎         | 1096/41976 [26:33<16:18:17,  1.44s/it][A
Training:   3%|▎         | 1097/41976 [26:34<15:53:38,  1.40s/it][A
Training:   3%|▎         | 1098/41976 [26:36<15:43:58,  1.39s/it][A
Training:   3%|▎         | 1099/41976 [26:37<15:30:11,  1.37s/it][A
Training:   3%|▎         | 1100/41976 [26:38<15:31:17,  1.37s/it][A
Training:   3%|▎         | 1101/41976 [26:40<17:29:28,  1.54s/it][A

Epoch 0, Step 1100, Loss: nan



Training:   3%|▎         | 1102/41976 [26:42<18:22:53,  1.62s/it][A
Training:   3%|▎         | 1103/41976 [26:44<17:25:40,  1.54s/it][A
Training:   3%|▎         | 1104/41976 [26:45<17:15:25,  1.52s/it][A
Training:   3%|▎         | 1105/41976 [26:47<18:06:32,  1.60s/it][A
Training:   3%|▎         | 1106/41976 [26:48<17:14:28,  1.52s/it][A
Training:   3%|▎         | 1107/41976 [26:49<16:33:39,  1.46s/it][A
Training:   3%|▎         | 1108/41976 [26:51<16:17:16,  1.43s/it][A
Training:   3%|▎         | 1109/41976 [26:52<15:58:39,  1.41s/it][A
Training:   3%|▎         | 1110/41976 [26:54<17:20:39,  1.53s/it][A
Training:   3%|▎         | 1111/41976 [26:55<16:42:49,  1.47s/it][A

Epoch 0, Step 1110, Loss: nan



Training:   3%|▎         | 1112/41976 [26:57<16:23:07,  1.44s/it][A
Training:   3%|▎         | 1113/41976 [26:58<15:56:34,  1.40s/it][A
Training:   3%|▎         | 1114/41976 [26:59<15:42:51,  1.38s/it][A
Training:   3%|▎         | 1115/41976 [27:01<17:13:03,  1.52s/it][A
Training:   3%|▎         | 1116/41976 [27:03<16:41:29,  1.47s/it][A
Training:   3%|▎         | 1117/41976 [27:04<16:11:17,  1.43s/it][A
Training:   3%|▎         | 1118/41976 [27:06<17:10:31,  1.51s/it][A
Training:   3%|▎         | 1119/41976 [27:07<17:04:23,  1.50s/it][A
Training:   3%|▎         | 1120/41976 [27:08<16:35:47,  1.46s/it][A
Training:   3%|▎         | 1121/41976 [27:10<16:06:58,  1.42s/it][A

Epoch 0, Step 1120, Loss: nan



Training:   3%|▎         | 1122/41976 [27:11<15:58:10,  1.41s/it][A
Training:   3%|▎         | 1123/41976 [27:12<15:42:26,  1.38s/it][A
Training:   3%|▎         | 1124/41976 [27:14<15:40:41,  1.38s/it][A
Training:   3%|▎         | 1125/41976 [27:15<15:30:13,  1.37s/it][A
Training:   3%|▎         | 1126/41976 [27:17<16:00:32,  1.41s/it][A
Training:   3%|▎         | 1127/41976 [27:18<17:04:13,  1.50s/it][A
Training:   3%|▎         | 1128/41976 [27:20<16:36:08,  1.46s/it][A
Training:   3%|▎         | 1129/41976 [27:22<17:45:07,  1.56s/it][A
Training:   3%|▎         | 1130/41976 [27:23<17:01:44,  1.50s/it][A
Training:   3%|▎         | 1131/41976 [27:24<16:30:37,  1.46s/it][A

Epoch 0, Step 1130, Loss: nan



Training:   3%|▎         | 1132/41976 [27:26<16:17:38,  1.44s/it][A
Training:   3%|▎         | 1133/41976 [27:27<15:57:25,  1.41s/it][A
Training:   3%|▎         | 1134/41976 [27:28<16:06:48,  1.42s/it][A
Training:   3%|▎         | 1135/41976 [27:30<17:27:15,  1.54s/it][A
Training:   3%|▎         | 1136/41976 [27:32<16:59:27,  1.50s/it][A
Training:   3%|▎         | 1137/41976 [27:33<16:21:50,  1.44s/it][A
Training:   3%|▎         | 1138/41976 [27:34<16:02:02,  1.41s/it][A
Training:   3%|▎         | 1139/41976 [27:36<15:42:45,  1.39s/it][A
Training:   3%|▎         | 1140/41976 [27:37<15:51:47,  1.40s/it][A
Training:   3%|▎         | 1141/41976 [27:38<15:36:38,  1.38s/it][A

Epoch 0, Step 1140, Loss: nan



Training:   3%|▎         | 1142/41976 [27:40<15:36:34,  1.38s/it][A
Training:   3%|▎         | 1143/41976 [27:42<18:56:20,  1.67s/it][A
Training:   3%|▎         | 1144/41976 [27:44<17:56:50,  1.58s/it][A
Training:   3%|▎         | 1145/41976 [27:45<17:02:52,  1.50s/it][A
Training:   3%|▎         | 1146/41976 [27:46<16:30:27,  1.46s/it][A
Training:   3%|▎         | 1147/41976 [27:48<16:06:35,  1.42s/it][A
Training:   3%|▎         | 1148/41976 [27:49<15:59:12,  1.41s/it][A
Training:   3%|▎         | 1149/41976 [27:50<15:38:30,  1.38s/it][A
Training:   3%|▎         | 1150/41976 [27:52<15:33:09,  1.37s/it][A
Training:   3%|▎         | 1151/41976 [27:53<16:22:12,  1.44s/it][A

Epoch 0, Step 1150, Loss: nan



Training:   3%|▎         | 1152/41976 [27:55<17:06:29,  1.51s/it][A
Training:   3%|▎         | 1153/41976 [27:56<16:28:09,  1.45s/it][A
Training:   3%|▎         | 1154/41976 [27:57<16:02:47,  1.42s/it][A
Training:   3%|▎         | 1155/41976 [27:59<15:49:36,  1.40s/it][A
Training:   3%|▎         | 1156/41976 [28:00<15:48:49,  1.39s/it][A
Training:   3%|▎         | 1157/41976 [28:02<17:11:33,  1.52s/it][A
Training:   3%|▎         | 1158/41976 [28:03<16:35:46,  1.46s/it][A
Training:   3%|▎         | 1159/41976 [28:05<16:55:55,  1.49s/it][A
Training:   3%|▎         | 1160/41976 [28:07<17:40:41,  1.56s/it][A
Training:   3%|▎         | 1161/41976 [28:08<16:51:24,  1.49s/it][A

Epoch 0, Step 1160, Loss: nan



Training:   3%|▎         | 1162/41976 [28:09<16:28:39,  1.45s/it][A
Training:   3%|▎         | 1163/41976 [28:11<16:11:14,  1.43s/it][A
Training:   3%|▎         | 1164/41976 [28:12<15:58:30,  1.41s/it][A
Training:   3%|▎         | 1165/41976 [28:13<15:38:02,  1.38s/it][A
Training:   3%|▎         | 1166/41976 [28:15<15:27:58,  1.36s/it][A
Training:   3%|▎         | 1167/41976 [28:16<15:22:20,  1.36s/it][A
Training:   3%|▎         | 1168/41976 [28:18<16:53:50,  1.49s/it][A
Training:   3%|▎         | 1169/41976 [28:19<16:35:46,  1.46s/it][A
Training:   3%|▎         | 1170/41976 [28:21<16:11:24,  1.43s/it][A
Training:   3%|▎         | 1171/41976 [28:22<17:29:17,  1.54s/it][A

Epoch 0, Step 1170, Loss: nan



Training:   3%|▎         | 1172/41976 [28:24<16:54:14,  1.49s/it][A
Training:   3%|▎         | 1173/41976 [28:25<16:17:04,  1.44s/it][A
Training:   3%|▎         | 1174/41976 [28:26<15:53:37,  1.40s/it][A
Training:   3%|▎         | 1175/41976 [28:28<15:35:09,  1.38s/it][A
Training:   3%|▎         | 1176/41976 [28:29<16:43:50,  1.48s/it][A
Training:   3%|▎         | 1177/41976 [28:31<17:01:36,  1.50s/it][A
Training:   3%|▎         | 1178/41976 [28:32<16:27:40,  1.45s/it][A
Training:   3%|▎         | 1179/41976 [28:34<16:14:07,  1.43s/it][A
Training:   3%|▎         | 1180/41976 [28:35<16:25:57,  1.45s/it][A
Training:   3%|▎         | 1181/41976 [28:37<16:02:04,  1.41s/it][A

Epoch 0, Step 1180, Loss: nan



Training:   3%|▎         | 1182/41976 [28:38<15:50:52,  1.40s/it][A
Training:   3%|▎         | 1183/41976 [28:39<15:45:14,  1.39s/it][A
Training:   3%|▎         | 1184/41976 [28:41<16:26:41,  1.45s/it][A
Training:   3%|▎         | 1185/41976 [28:43<18:46:17,  1.66s/it][A
Training:   3%|▎         | 1186/41976 [28:44<17:41:53,  1.56s/it][A
Training:   3%|▎         | 1187/41976 [28:46<16:54:48,  1.49s/it][A
Training:   3%|▎         | 1188/41976 [28:47<16:28:10,  1.45s/it][A
Training:   3%|▎         | 1189/41976 [28:48<15:57:53,  1.41s/it][A
Training:   3%|▎         | 1190/41976 [28:50<15:46:37,  1.39s/it][A
Training:   3%|▎         | 1191/41976 [28:51<15:34:35,  1.37s/it][A

Epoch 0, Step 1190, Loss: nan



Training:   3%|▎         | 1192/41976 [28:52<15:48:59,  1.40s/it][A
Training:   3%|▎         | 1193/41976 [28:54<17:11:22,  1.52s/it][A
Training:   3%|▎         | 1194/41976 [28:56<16:40:12,  1.47s/it][A
Training:   3%|▎         | 1195/41976 [28:57<16:15:08,  1.43s/it][A
Training:   3%|▎         | 1196/41976 [28:58<16:03:54,  1.42s/it][A
Training:   3%|▎         | 1197/41976 [29:00<15:42:46,  1.39s/it][A
Training:   3%|▎         | 1198/41976 [29:01<15:37:19,  1.38s/it][A
Training:   3%|▎         | 1199/41976 [29:03<17:06:52,  1.51s/it][A
Training:   3%|▎         | 1200/41976 [29:04<16:41:55,  1.47s/it][A
Training:   3%|▎         | 1201/41976 [29:06<17:38:57,  1.56s/it][A

Epoch 0, Step 1200, Loss: nan



Training:   3%|▎         | 1202/41976 [29:07<17:19:47,  1.53s/it][A
Training:   3%|▎         | 1203/41976 [29:09<16:41:14,  1.47s/it][A
Training:   3%|▎         | 1204/41976 [29:10<16:21:29,  1.44s/it][A
Training:   3%|▎         | 1205/41976 [29:12<16:05:55,  1.42s/it][A
Training:   3%|▎         | 1206/41976 [29:13<15:52:26,  1.40s/it][A
Training:   3%|▎         | 1207/41976 [29:14<15:41:03,  1.38s/it][A
Training:   3%|▎         | 1208/41976 [29:16<15:38:40,  1.38s/it][A
Training:   3%|▎         | 1209/41976 [29:17<16:03:27,  1.42s/it][A
Training:   3%|▎         | 1210/41976 [29:19<17:00:33,  1.50s/it][A
Training:   3%|▎         | 1211/41976 [29:20<16:26:10,  1.45s/it][A

Epoch 0, Step 1210, Loss: nan



Training:   3%|▎         | 1212/41976 [29:22<16:08:48,  1.43s/it][A
Training:   3%|▎         | 1213/41976 [29:23<17:24:46,  1.54s/it][A
Training:   3%|▎         | 1214/41976 [29:25<16:45:52,  1.48s/it][A
Training:   3%|▎         | 1215/41976 [29:26<16:20:52,  1.44s/it][A
Training:   3%|▎         | 1216/41976 [29:27<16:07:22,  1.42s/it][A
Training:   3%|▎         | 1217/41976 [29:29<16:13:28,  1.43s/it][A
Training:   3%|▎         | 1218/41976 [29:31<17:20:33,  1.53s/it][A
Training:   3%|▎         | 1219/41976 [29:32<16:40:34,  1.47s/it][A
Training:   3%|▎         | 1220/41976 [29:33<16:22:52,  1.45s/it][A
Training:   3%|▎         | 1221/41976 [29:35<15:56:11,  1.41s/it][A

Epoch 0, Step 1220, Loss: nan



Training:   3%|▎         | 1222/41976 [29:36<15:45:43,  1.39s/it][A
Training:   3%|▎         | 1223/41976 [29:37<15:37:08,  1.38s/it][A
Training:   3%|▎         | 1224/41976 [29:39<15:36:17,  1.38s/it][A
Training:   3%|▎         | 1225/41976 [29:40<15:26:51,  1.36s/it][A
Training:   3%|▎         | 1226/41976 [29:42<16:37:34,  1.47s/it][A
Training:   3%|▎         | 1227/41976 [29:44<18:26:33,  1.63s/it][A
Training:   3%|▎         | 1228/41976 [29:45<17:33:12,  1.55s/it][A
Training:   3%|▎         | 1229/41976 [29:46<16:42:38,  1.48s/it][A
Training:   3%|▎         | 1230/41976 [29:48<16:17:32,  1.44s/it][A
Training:   3%|▎         | 1231/41976 [29:49<15:56:59,  1.41s/it][A

Epoch 0, Step 1230, Loss: nan



Training:   3%|▎         | 1232/41976 [29:51<15:46:32,  1.39s/it][A
Training:   3%|▎         | 1233/41976 [29:52<15:32:39,  1.37s/it][A
Training:   3%|▎         | 1234/41976 [29:53<16:17:21,  1.44s/it][A
Training:   3%|▎         | 1235/41976 [29:55<16:58:36,  1.50s/it][A
Training:   3%|▎         | 1236/41976 [29:56<16:34:45,  1.47s/it][A
Training:   3%|▎         | 1237/41976 [29:58<16:05:09,  1.42s/it][A
Training:   3%|▎         | 1238/41976 [29:59<15:48:02,  1.40s/it][A
Training:   3%|▎         | 1239/41976 [30:00<15:37:48,  1.38s/it][A
Training:   3%|▎         | 1240/41976 [30:02<15:35:44,  1.38s/it][A
Training:   3%|▎         | 1241/41976 [30:04<16:58:54,  1.50s/it][A

Epoch 0, Step 1240, Loss: nan



Training:   3%|▎         | 1242/41976 [30:05<17:10:29,  1.52s/it][A
Training:   3%|▎         | 1243/41976 [30:07<17:43:24,  1.57s/it][A
Training:   3%|▎         | 1244/41976 [30:08<17:03:13,  1.51s/it][A
Training:   3%|▎         | 1245/41976 [30:10<16:29:24,  1.46s/it][A
Training:   3%|▎         | 1246/41976 [30:11<16:06:44,  1.42s/it][A
Training:   3%|▎         | 1247/41976 [30:12<15:55:51,  1.41s/it][A
Training:   3%|▎         | 1248/41976 [30:14<15:50:29,  1.40s/it][A
Training:   3%|▎         | 1249/41976 [30:15<15:35:38,  1.38s/it][A
Training:   3%|▎         | 1250/41976 [30:16<15:26:52,  1.37s/it][A
Training:   3%|▎         | 1251/41976 [30:18<16:48:25,  1.49s/it][A

Epoch 0, Step 1250, Loss: nan



Training:   3%|▎         | 1252/41976 [30:20<16:47:57,  1.49s/it][A
Training:   3%|▎         | 1253/41976 [30:21<16:19:19,  1.44s/it][A
Training:   3%|▎         | 1254/41976 [30:22<15:59:30,  1.41s/it][A
Training:   3%|▎         | 1255/41976 [30:24<17:25:13,  1.54s/it][A
Training:   3%|▎         | 1256/41976 [30:25<16:50:10,  1.49s/it][A
Training:   3%|▎         | 1257/41976 [30:27<16:18:24,  1.44s/it][A
Training:   3%|▎         | 1258/41976 [30:28<16:01:39,  1.42s/it][A
Training:   3%|▎         | 1259/41976 [30:30<17:08:12,  1.52s/it][A
Training:   3%|▎         | 1260/41976 [30:32<18:45:37,  1.66s/it][A
Training:   3%|▎         | 1261/41976 [30:33<18:05:19,  1.60s/it][A

Epoch 0, Step 1260, Loss: nan



Training:   3%|▎         | 1262/41976 [30:35<17:15:32,  1.53s/it][A
Training:   3%|▎         | 1263/41976 [30:36<16:38:38,  1.47s/it][A
Training:   3%|▎         | 1264/41976 [30:37<16:17:16,  1.44s/it][A
Training:   3%|▎         | 1265/41976 [30:39<15:55:10,  1.41s/it][A
Training:   3%|▎         | 1266/41976 [30:40<15:43:04,  1.39s/it][A
Training:   3%|▎         | 1267/41976 [30:42<15:50:47,  1.40s/it][A
Training:   3%|▎         | 1268/41976 [30:43<17:25:55,  1.54s/it][A
Training:   3%|▎         | 1269/41976 [30:45<18:22:54,  1.63s/it][A
Training:   3%|▎         | 1270/41976 [30:47<17:24:43,  1.54s/it][A
Training:   3%|▎         | 1271/41976 [30:48<16:44:30,  1.48s/it][A

Epoch 0, Step 1270, Loss: nan



Training:   3%|▎         | 1272/41976 [30:49<16:26:17,  1.45s/it][A
Training:   3%|▎         | 1273/41976 [30:51<15:58:22,  1.41s/it][A
Training:   3%|▎         | 1274/41976 [30:52<15:42:22,  1.39s/it][A
Training:   3%|▎         | 1275/41976 [30:53<15:32:07,  1.37s/it][A
Training:   3%|▎         | 1276/41976 [30:55<17:00:13,  1.50s/it][A
Training:   3%|▎         | 1277/41976 [30:56<16:34:10,  1.47s/it][A
Training:   3%|▎         | 1278/41976 [30:58<16:10:14,  1.43s/it][A
Training:   3%|▎         | 1279/41976 [30:59<15:50:25,  1.40s/it][A
Training:   3%|▎         | 1280/41976 [31:01<15:48:18,  1.40s/it][A
Training:   3%|▎         | 1281/41976 [31:02<15:30:55,  1.37s/it][A

Epoch 0, Step 1280, Loss: nan



Training:   3%|▎         | 1282/41976 [31:03<15:27:37,  1.37s/it][A
Training:   3%|▎         | 1283/41976 [31:05<16:58:46,  1.50s/it][A
Training:   3%|▎         | 1284/41976 [31:07<17:42:29,  1.57s/it][A
Training:   3%|▎         | 1285/41976 [31:08<17:29:48,  1.55s/it][A
Training:   3%|▎         | 1286/41976 [31:10<16:48:11,  1.49s/it][A
Training:   3%|▎         | 1287/41976 [31:11<16:17:40,  1.44s/it][A
Training:   3%|▎         | 1288/41976 [31:12<16:09:17,  1.43s/it][A
Training:   3%|▎         | 1289/41976 [31:14<15:50:45,  1.40s/it][A
Training:   3%|▎         | 1290/41976 [31:15<15:44:20,  1.39s/it][A
Training:   3%|▎         | 1291/41976 [31:16<15:34:56,  1.38s/it][A

Epoch 0, Step 1290, Loss: nan



Training:   3%|▎         | 1292/41976 [31:18<16:12:58,  1.43s/it][A
Training:   3%|▎         | 1293/41976 [31:20<17:05:04,  1.51s/it][A
Training:   3%|▎         | 1294/41976 [31:21<16:31:24,  1.46s/it][A
Training:   3%|▎         | 1295/41976 [31:22<16:10:07,  1.43s/it][A
Training:   3%|▎         | 1296/41976 [31:24<15:54:44,  1.41s/it][A
Training:   3%|▎         | 1297/41976 [31:26<17:10:15,  1.52s/it][A
Training:   3%|▎         | 1298/41976 [31:27<16:31:38,  1.46s/it][A
Training:   3%|▎         | 1299/41976 [31:28<16:05:16,  1.42s/it][A
Training:   3%|▎         | 1300/41976 [31:30<16:08:40,  1.43s/it][A
Training:   3%|▎         | 1301/41976 [31:31<17:18:29,  1.53s/it][A

Epoch 0, Step 1300, Loss: nan



Training:   3%|▎         | 1302/41976 [31:33<16:47:55,  1.49s/it][A
Training:   3%|▎         | 1303/41976 [31:34<16:17:05,  1.44s/it][A
Training:   3%|▎         | 1304/41976 [31:35<16:02:18,  1.42s/it][A
Training:   3%|▎         | 1305/41976 [31:37<15:42:28,  1.39s/it][A
Training:   3%|▎         | 1306/41976 [31:38<15:32:13,  1.38s/it][A
Training:   3%|▎         | 1307/41976 [31:39<15:27:01,  1.37s/it][A
Training:   3%|▎         | 1308/41976 [31:41<15:33:37,  1.38s/it][A
Training:   3%|▎         | 1309/41976 [31:42<16:15:44,  1.44s/it][A
Training:   3%|▎         | 1310/41976 [31:44<16:59:33,  1.50s/it][A
Training:   3%|▎         | 1311/41976 [31:46<17:59:26,  1.59s/it][A

Epoch 0, Step 1310, Loss: nan



Training:   3%|▎         | 1312/41976 [31:47<17:23:02,  1.54s/it][A
Training:   3%|▎         | 1313/41976 [31:49<16:36:13,  1.47s/it][A
Training:   3%|▎         | 1314/41976 [31:50<16:08:50,  1.43s/it][A
Training:   3%|▎         | 1315/41976 [31:51<15:54:57,  1.41s/it][A
Training:   3%|▎         | 1316/41976 [31:53<15:46:23,  1.40s/it][A
Training:   3%|▎         | 1317/41976 [31:54<16:06:38,  1.43s/it][A
Training:   3%|▎         | 1318/41976 [31:56<17:01:13,  1.51s/it][A
Training:   3%|▎         | 1319/41976 [31:57<16:26:57,  1.46s/it][A
Training:   3%|▎         | 1320/41976 [31:59<16:12:04,  1.43s/it][A
Training:   3%|▎         | 1321/41976 [32:00<15:47:55,  1.40s/it][A

Epoch 0, Step 1320, Loss: nan



Training:   3%|▎         | 1322/41976 [32:01<15:39:48,  1.39s/it][A
Training:   3%|▎         | 1323/41976 [32:03<15:29:53,  1.37s/it][A
Training:   3%|▎         | 1324/41976 [32:04<15:26:51,  1.37s/it][A
Training:   3%|▎         | 1325/41976 [32:06<17:14:33,  1.53s/it][A
Training:   3%|▎         | 1326/41976 [32:08<18:09:00,  1.61s/it][A
Training:   3%|▎         | 1327/41976 [32:09<17:19:17,  1.53s/it][A
Training:   3%|▎         | 1328/41976 [32:10<16:45:21,  1.48s/it][A
Training:   3%|▎         | 1329/41976 [32:12<16:10:19,  1.43s/it][A
Training:   3%|▎         | 1330/41976 [32:13<16:00:27,  1.42s/it][A
Training:   3%|▎         | 1331/41976 [32:14<15:40:54,  1.39s/it][A

Epoch 0, Step 1330, Loss: nan



Training:   3%|▎         | 1332/41976 [32:16<15:36:42,  1.38s/it][A
Training:   3%|▎         | 1333/41976 [32:17<15:19:58,  1.36s/it][A
Training:   3%|▎         | 1334/41976 [32:19<16:12:37,  1.44s/it][A
Training:   3%|▎         | 1335/41976 [32:20<16:41:23,  1.48s/it][A
Training:   3%|▎         | 1336/41976 [32:22<16:18:01,  1.44s/it][A
Training:   3%|▎         | 1337/41976 [32:23<15:53:44,  1.41s/it][A
Training:   3%|▎         | 1338/41976 [32:24<15:38:28,  1.39s/it][A
Training:   3%|▎         | 1339/41976 [32:26<17:07:28,  1.52s/it][A
Training:   3%|▎         | 1340/41976 [32:27<16:34:54,  1.47s/it][A
Training:   3%|▎         | 1341/41976 [32:29<16:02:15,  1.42s/it][A

Epoch 0, Step 1340, Loss: nan



Training:   3%|▎         | 1342/41976 [32:30<16:37:48,  1.47s/it][A
Training:   3%|▎         | 1343/41976 [32:32<17:19:11,  1.53s/it][A
Training:   3%|▎         | 1344/41976 [32:33<16:47:14,  1.49s/it][A
Training:   3%|▎         | 1345/41976 [32:35<16:10:00,  1.43s/it][A
Training:   3%|▎         | 1346/41976 [32:36<15:50:58,  1.40s/it][A
Training:   3%|▎         | 1347/41976 [32:37<15:32:51,  1.38s/it][A
Training:   3%|▎         | 1348/41976 [32:39<15:34:54,  1.38s/it][A
Training:   3%|▎         | 1349/41976 [32:40<15:28:21,  1.37s/it][A
Training:   3%|▎         | 1350/41976 [32:41<15:19:02,  1.36s/it][A
Training:   3%|▎         | 1351/41976 [32:43<16:56:31,  1.50s/it][A

Epoch 0, Step 1350, Loss: nan



Training:   3%|▎         | 1352/41976 [32:45<16:55:16,  1.50s/it][A
Training:   3%|▎         | 1353/41976 [32:47<17:52:09,  1.58s/it][A
Training:   3%|▎         | 1354/41976 [32:48<17:03:08,  1.51s/it][A
Training:   3%|▎         | 1355/41976 [32:49<16:25:41,  1.46s/it][A
Training:   3%|▎         | 1356/41976 [32:51<16:06:17,  1.43s/it][A
Training:   3%|▎         | 1357/41976 [32:52<15:42:26,  1.39s/it][A
Training:   3%|▎         | 1358/41976 [32:53<15:30:09,  1.37s/it][A
Training:   3%|▎         | 1359/41976 [32:55<16:19:50,  1.45s/it][A
Training:   3%|▎         | 1360/41976 [32:57<16:57:47,  1.50s/it][A
Training:   3%|▎         | 1361/41976 [32:58<16:23:48,  1.45s/it][A

Epoch 0, Step 1360, Loss: nan



Training:   3%|▎         | 1362/41976 [32:59<16:07:03,  1.43s/it][A
Training:   3%|▎         | 1363/41976 [33:01<15:49:16,  1.40s/it][A
Training:   3%|▎         | 1364/41976 [33:02<15:44:48,  1.40s/it][A
Training:   3%|▎         | 1365/41976 [33:03<15:28:08,  1.37s/it][A
Training:   3%|▎         | 1366/41976 [33:05<15:19:59,  1.36s/it][A
Training:   3%|▎         | 1367/41976 [33:07<17:45:04,  1.57s/it][A
Training:   3%|▎         | 1368/41976 [33:08<18:10:39,  1.61s/it][A
Training:   3%|▎         | 1369/41976 [33:10<17:12:24,  1.53s/it][A
Training:   3%|▎         | 1370/41976 [33:11<16:36:01,  1.47s/it][A
Training:   3%|▎         | 1371/41976 [33:12<16:10:06,  1.43s/it][A

Epoch 0, Step 1370, Loss: nan



Training:   3%|▎         | 1372/41976 [33:14<16:06:42,  1.43s/it][A
Training:   3%|▎         | 1373/41976 [33:15<15:44:53,  1.40s/it][A
Training:   3%|▎         | 1374/41976 [33:16<15:29:02,  1.37s/it][A
Training:   3%|▎         | 1375/41976 [33:18<15:23:10,  1.36s/it][A
Training:   3%|▎         | 1376/41976 [33:20<16:54:01,  1.50s/it][A
Training:   3%|▎         | 1377/41976 [33:21<16:28:22,  1.46s/it][A
Training:   3%|▎         | 1378/41976 [33:22<16:03:01,  1.42s/it][A
Training:   3%|▎         | 1379/41976 [33:24<15:48:18,  1.40s/it][A
Training:   3%|▎         | 1380/41976 [33:25<15:38:50,  1.39s/it][A
Training:   3%|▎         | 1381/41976 [33:27<17:00:11,  1.51s/it][A

Epoch 0, Step 1380, Loss: nan



Training:   3%|▎         | 1382/41976 [33:28<16:34:28,  1.47s/it][A
Training:   3%|▎         | 1383/41976 [33:30<16:07:26,  1.43s/it][A
Training:   3%|▎         | 1384/41976 [33:31<17:12:37,  1.53s/it][A
Training:   3%|▎         | 1385/41976 [33:33<17:05:40,  1.52s/it][A
Training:   3%|▎         | 1386/41976 [33:34<16:32:45,  1.47s/it][A
Training:   3%|▎         | 1387/41976 [33:35<16:05:19,  1.43s/it][A
Training:   3%|▎         | 1388/41976 [33:37<15:56:40,  1.41s/it][A
Training:   3%|▎         | 1389/41976 [33:38<15:41:31,  1.39s/it][A
Training:   3%|▎         | 1390/41976 [33:39<15:30:05,  1.37s/it][A
Training:   3%|▎         | 1391/41976 [33:41<15:23:54,  1.37s/it][A

Epoch 0, Step 1390, Loss: nan



Training:   3%|▎         | 1392/41976 [33:42<15:58:35,  1.42s/it][A
Training:   3%|▎         | 1393/41976 [33:44<17:03:05,  1.51s/it][A
Training:   3%|▎         | 1394/41976 [33:45<16:29:30,  1.46s/it][A
Training:   3%|▎         | 1395/41976 [33:47<17:36:46,  1.56s/it][A
Training:   3%|▎         | 1396/41976 [33:49<17:00:23,  1.51s/it][A
Training:   3%|▎         | 1397/41976 [33:50<16:19:39,  1.45s/it][A
Training:   3%|▎         | 1398/41976 [33:51<15:54:46,  1.41s/it][A
Training:   3%|▎         | 1399/41976 [33:53<15:40:27,  1.39s/it][A
Training:   3%|▎         | 1400/41976 [33:54<15:43:34,  1.40s/it][A
Training:   3%|▎         | 1401/41976 [33:56<16:56:29,  1.50s/it][A

Epoch 0, Step 1400, Loss: nan



Training:   3%|▎         | 1402/41976 [33:57<16:28:28,  1.46s/it][A
Training:   3%|▎         | 1403/41976 [33:58<16:04:15,  1.43s/it][A
Training:   3%|▎         | 1404/41976 [34:00<15:52:20,  1.41s/it][A
Training:   3%|▎         | 1405/41976 [34:01<15:35:53,  1.38s/it][A
Training:   3%|▎         | 1406/41976 [34:03<15:29:18,  1.37s/it][A
Training:   3%|▎         | 1407/41976 [34:04<15:17:31,  1.36s/it][A
Training:   3%|▎         | 1408/41976 [34:05<15:22:29,  1.36s/it][A
Training:   3%|▎         | 1409/41976 [34:07<17:57:43,  1.59s/it][A
Training:   3%|▎         | 1410/41976 [34:09<17:44:21,  1.57s/it][A
Training:   3%|▎         | 1411/41976 [34:10<16:57:11,  1.50s/it][A

Epoch 0, Step 1410, Loss: nan



Training:   3%|▎         | 1412/41976 [34:12<16:29:21,  1.46s/it][A
Training:   3%|▎         | 1413/41976 [34:13<16:01:12,  1.42s/it][A
Training:   3%|▎         | 1414/41976 [34:14<16:17:59,  1.45s/it][A
Training:   3%|▎         | 1415/41976 [34:16<17:20:09,  1.54s/it][A
Training:   3%|▎         | 1416/41976 [34:18<16:45:21,  1.49s/it][A
Training:   3%|▎         | 1417/41976 [34:19<17:16:04,  1.53s/it][A
Training:   3%|▎         | 1418/41976 [34:21<17:17:47,  1.54s/it][A
Training:   3%|▎         | 1419/41976 [34:22<16:35:31,  1.47s/it][A
Training:   3%|▎         | 1420/41976 [34:23<16:15:36,  1.44s/it][A
Training:   3%|▎         | 1421/41976 [34:25<15:46:34,  1.40s/it][A

Epoch 0, Step 1420, Loss: nan



Training:   3%|▎         | 1422/41976 [34:26<15:40:44,  1.39s/it][A
Training:   3%|▎         | 1423/41976 [34:28<17:01:06,  1.51s/it][A
Training:   3%|▎         | 1424/41976 [34:29<16:40:12,  1.48s/it][A
Training:   3%|▎         | 1425/41976 [34:31<17:06:05,  1.52s/it][A
Training:   3%|▎         | 1426/41976 [34:33<17:25:38,  1.55s/it][A
Training:   3%|▎         | 1427/41976 [34:34<16:42:36,  1.48s/it][A
Training:   3%|▎         | 1428/41976 [34:35<16:22:06,  1.45s/it][A
Training:   3%|▎         | 1429/41976 [34:37<15:55:12,  1.41s/it][A
Training:   3%|▎         | 1430/41976 [34:38<15:44:44,  1.40s/it][A
Training:   3%|▎         | 1431/41976 [34:39<15:35:58,  1.39s/it][A

Epoch 0, Step 1430, Loss: nan



Training:   3%|▎         | 1432/41976 [34:41<15:35:09,  1.38s/it][A
Training:   3%|▎         | 1433/41976 [34:42<15:21:15,  1.36s/it][A
Training:   3%|▎         | 1434/41976 [34:44<16:43:53,  1.49s/it][A
Training:   3%|▎         | 1435/41976 [34:45<16:30:49,  1.47s/it][A
Training:   3%|▎         | 1436/41976 [34:47<16:10:44,  1.44s/it][A
Training:   3%|▎         | 1437/41976 [34:48<17:25:29,  1.55s/it][A
Training:   3%|▎         | 1438/41976 [34:50<16:45:32,  1.49s/it][A
Training:   3%|▎         | 1439/41976 [34:51<16:18:58,  1.45s/it][A
Training:   3%|▎         | 1440/41976 [34:52<16:03:20,  1.43s/it][A
Training:   3%|▎         | 1441/41976 [34:54<15:41:11,  1.39s/it][A

Epoch 0, Step 1440, Loss: nan



Training:   3%|▎         | 1442/41976 [34:55<16:54:12,  1.50s/it][A
Training:   3%|▎         | 1443/41976 [34:57<16:52:03,  1.50s/it][A
Training:   3%|▎         | 1444/41976 [34:58<16:28:51,  1.46s/it][A
Training:   3%|▎         | 1445/41976 [35:00<15:57:46,  1.42s/it][A
Training:   3%|▎         | 1446/41976 [35:01<15:42:01,  1.39s/it][A
Training:   3%|▎         | 1447/41976 [35:02<15:26:09,  1.37s/it][A
Training:   3%|▎         | 1448/41976 [35:04<15:26:34,  1.37s/it][A
Training:   3%|▎         | 1449/41976 [35:05<15:13:42,  1.35s/it][A
Training:   3%|▎         | 1450/41976 [35:07<15:42:00,  1.39s/it][A
Training:   3%|▎         | 1451/41976 [35:09<18:17:44,  1.63s/it][A

Epoch 0, Step 1450, Loss: nan



Training:   3%|▎         | 1452/41976 [35:10<17:22:56,  1.54s/it][A
Training:   3%|▎         | 1453/41976 [35:11<16:36:12,  1.48s/it][A
Training:   3%|▎         | 1454/41976 [35:13<16:08:26,  1.43s/it][A
Training:   3%|▎         | 1455/41976 [35:14<15:49:16,  1.41s/it][A
Training:   3%|▎         | 1456/41976 [35:15<15:47:05,  1.40s/it][A
Training:   3%|▎         | 1457/41976 [35:17<15:28:15,  1.37s/it][A
Training:   3%|▎         | 1458/41976 [35:18<15:24:53,  1.37s/it][A
Training:   3%|▎         | 1459/41976 [35:20<16:43:01,  1.49s/it][A
Training:   3%|▎         | 1460/41976 [35:21<16:31:02,  1.47s/it][A
Training:   3%|▎         | 1461/41976 [35:23<16:02:30,  1.43s/it][A

Epoch 0, Step 1460, Loss: nan



Training:   3%|▎         | 1462/41976 [35:24<15:50:26,  1.41s/it][A
Training:   3%|▎         | 1463/41976 [35:25<15:36:35,  1.39s/it][A
Training:   3%|▎         | 1464/41976 [35:27<15:31:20,  1.38s/it][A
Training:   3%|▎         | 1465/41976 [35:28<16:51:50,  1.50s/it][A
Training:   3%|▎         | 1466/41976 [35:30<16:20:43,  1.45s/it][A
Training:   3%|▎         | 1467/41976 [35:32<17:20:15,  1.54s/it][A
Training:   3%|▎         | 1468/41976 [35:33<17:16:43,  1.54s/it][A
Training:   3%|▎         | 1469/41976 [35:34<16:33:18,  1.47s/it][A
Training:   4%|▎         | 1470/41976 [35:36<16:09:01,  1.44s/it][A
Training:   4%|▎         | 1471/41976 [35:37<15:51:03,  1.41s/it][A

Epoch 0, Step 1470, Loss: nan



Training:   4%|▎         | 1472/41976 [35:38<15:44:57,  1.40s/it][A
Training:   4%|▎         | 1473/41976 [35:40<15:33:03,  1.38s/it][A
Training:   4%|▎         | 1474/41976 [35:41<15:26:40,  1.37s/it][A
Training:   4%|▎         | 1475/41976 [35:43<15:54:27,  1.41s/it][A
Training:   4%|▎         | 1476/41976 [35:44<16:59:52,  1.51s/it][A
Training:   4%|▎         | 1477/41976 [35:46<16:26:11,  1.46s/it][A
Training:   4%|▎         | 1478/41976 [35:47<16:06:42,  1.43s/it][A
Training:   4%|▎         | 1479/41976 [35:49<17:19:51,  1.54s/it][A
Training:   4%|▎         | 1480/41976 [35:50<16:53:41,  1.50s/it][A
Training:   4%|▎         | 1481/41976 [35:52<16:15:42,  1.45s/it][A

Epoch 0, Step 1480, Loss: nan



Training:   4%|▎         | 1482/41976 [35:53<15:57:56,  1.42s/it][A
Training:   4%|▎         | 1483/41976 [35:54<16:06:15,  1.43s/it][A
Training:   4%|▎         | 1484/41976 [35:56<17:16:28,  1.54s/it][A
Training:   4%|▎         | 1485/41976 [35:58<16:31:45,  1.47s/it][A
Training:   4%|▎         | 1486/41976 [35:59<16:06:41,  1.43s/it][A
Training:   4%|▎         | 1487/41976 [36:00<15:46:39,  1.40s/it][A
Training:   4%|▎         | 1488/41976 [36:02<15:41:27,  1.40s/it][A
Training:   4%|▎         | 1489/41976 [36:03<15:25:08,  1.37s/it][A
Training:   4%|▎         | 1490/41976 [36:04<15:18:19,  1.36s/it][A
Training:   4%|▎         | 1491/41976 [36:06<15:12:20,  1.35s/it][A

Epoch 0, Step 1490, Loss: nan



Training:   4%|▎         | 1492/41976 [36:07<16:26:31,  1.46s/it][A
Training:   4%|▎         | 1493/41976 [36:09<18:23:52,  1.64s/it][A
Training:   4%|▎         | 1494/41976 [36:11<17:24:50,  1.55s/it][A
Training:   4%|▎         | 1495/41976 [36:12<16:43:56,  1.49s/it][A
Training:   4%|▎         | 1496/41976 [36:13<16:20:10,  1.45s/it][A
Training:   4%|▎         | 1497/41976 [36:15<15:50:15,  1.41s/it][A
Training:   4%|▎         | 1498/41976 [36:16<15:40:53,  1.39s/it][A
Training:   4%|▎         | 1499/41976 [36:17<15:30:49,  1.38s/it][A
Training:   4%|▎         | 1500/41976 [36:19<16:20:05,  1.45s/it][A
Training:   4%|▎         | 1501/41976 [36:21<16:53:17,  1.50s/it][A

Epoch 0, Step 1500, Loss: nan



Training:   4%|▎         | 1502/41976 [36:22<16:28:02,  1.46s/it][A
Training:   4%|▎         | 1503/41976 [36:23<16:04:57,  1.43s/it][A
Training:   4%|▎         | 1504/41976 [36:25<15:50:12,  1.41s/it][A
Training:   4%|▎         | 1505/41976 [36:26<15:30:56,  1.38s/it][A
Training:   4%|▎         | 1506/41976 [36:27<15:21:13,  1.37s/it][A
Training:   4%|▎         | 1507/41976 [36:29<16:55:50,  1.51s/it][A
Training:   4%|▎         | 1508/41976 [36:31<17:18:37,  1.54s/it][A
Training:   4%|▎         | 1509/41976 [36:32<17:40:43,  1.57s/it][A
Training:   4%|▎         | 1510/41976 [36:34<16:56:54,  1.51s/it][A
Training:   4%|▎         | 1511/41976 [36:35<16:18:26,  1.45s/it][A

Epoch 0, Step 1510, Loss: nan



Training:   4%|▎         | 1512/41976 [36:37<16:01:39,  1.43s/it][A
Training:   4%|▎         | 1513/41976 [36:38<15:38:13,  1.39s/it][A
Training:   4%|▎         | 1514/41976 [36:39<15:29:12,  1.38s/it][A
Training:   4%|▎         | 1515/41976 [36:40<15:19:17,  1.36s/it][A
Training:   4%|▎         | 1516/41976 [36:42<15:19:22,  1.36s/it][A
Training:   4%|▎         | 1517/41976 [36:44<16:24:43,  1.46s/it][A
Training:   4%|▎         | 1518/41976 [36:45<16:31:33,  1.47s/it][A
Training:   4%|▎         | 1519/41976 [36:46<16:09:10,  1.44s/it][A
Training:   4%|▎         | 1520/41976 [36:48<15:55:32,  1.42s/it][A
Training:   4%|▎         | 1521/41976 [36:50<17:08:45,  1.53s/it][A

Epoch 0, Step 1520, Loss: nan



Training:   4%|▎         | 1522/41976 [36:51<16:38:42,  1.48s/it][A
Training:   4%|▎         | 1523/41976 [36:52<16:07:49,  1.44s/it][A
Training:   4%|▎         | 1524/41976 [36:54<15:55:57,  1.42s/it][A
Training:   4%|▎         | 1525/41976 [36:55<16:30:31,  1.47s/it][A
Training:   4%|▎         | 1526/41976 [36:57<16:59:39,  1.51s/it][A
Training:   4%|▎         | 1527/41976 [36:58<16:25:24,  1.46s/it][A
Training:   4%|▎         | 1528/41976 [37:00<16:07:52,  1.44s/it][A
Training:   4%|▎         | 1529/41976 [37:01<15:47:34,  1.41s/it][A
Training:   4%|▎         | 1530/41976 [37:02<15:30:59,  1.38s/it][A
Training:   4%|▎         | 1531/41976 [37:04<15:19:34,  1.36s/it][A

Epoch 0, Step 1530, Loss: nan



Training:   4%|▎         | 1532/41976 [37:05<15:22:27,  1.37s/it][A
Training:   4%|▎         | 1533/41976 [37:06<15:22:59,  1.37s/it][A
Training:   4%|▎         | 1534/41976 [37:08<16:47:19,  1.49s/it][A
Training:   4%|▎         | 1535/41976 [37:10<17:57:16,  1.60s/it][A
Training:   4%|▎         | 1536/41976 [37:11<17:16:00,  1.54s/it][A
Training:   4%|▎         | 1537/41976 [37:13<16:28:55,  1.47s/it][A
Training:   4%|▎         | 1538/41976 [37:14<16:04:33,  1.43s/it][A
Training:   4%|▎         | 1539/41976 [37:15<15:46:58,  1.41s/it][A
Training:   4%|▎         | 1540/41976 [37:17<15:44:13,  1.40s/it][A
Training:   4%|▎         | 1541/41976 [37:18<15:25:21,  1.37s/it][A

Epoch 0, Step 1540, Loss: nan



Training:   4%|▎         | 1542/41976 [37:20<16:50:28,  1.50s/it][A
Training:   4%|▎         | 1543/41976 [37:21<16:34:22,  1.48s/it][A
Training:   4%|▎         | 1544/41976 [37:23<16:16:09,  1.45s/it][A
Training:   4%|▎         | 1545/41976 [37:24<15:51:07,  1.41s/it][A
Training:   4%|▎         | 1546/41976 [37:25<15:36:13,  1.39s/it][A
Training:   4%|▎         | 1547/41976 [37:27<15:24:35,  1.37s/it][A
Training:   4%|▎         | 1548/41976 [37:28<15:24:55,  1.37s/it][A
Training:   4%|▎         | 1549/41976 [37:30<16:47:02,  1.49s/it][A
Training:   4%|▎         | 1550/41976 [37:31<17:26:37,  1.55s/it][A
Training:   4%|▎         | 1551/41976 [37:33<17:29:21,  1.56s/it][A

Epoch 0, Step 1550, Loss: nan



Training:   4%|▎         | 1552/41976 [37:34<16:51:31,  1.50s/it][A
Training:   4%|▎         | 1553/41976 [37:36<16:13:31,  1.45s/it][A
Training:   4%|▎         | 1554/41976 [37:37<15:50:18,  1.41s/it][A
Training:   4%|▎         | 1555/41976 [37:38<15:34:41,  1.39s/it][A
Training:   4%|▎         | 1556/41976 [37:40<15:34:02,  1.39s/it][A
Training:   4%|▎         | 1557/41976 [37:41<15:17:57,  1.36s/it][A
Training:   4%|▎         | 1558/41976 [37:42<15:34:03,  1.39s/it][A
Training:   4%|▎         | 1559/41976 [37:44<16:58:38,  1.51s/it][A
Training:   4%|▎         | 1560/41976 [37:46<16:29:06,  1.47s/it][A
Training:   4%|▎         | 1561/41976 [37:47<16:04:48,  1.43s/it][A

Epoch 0, Step 1560, Loss: nan



Training:   4%|▎         | 1562/41976 [37:48<15:51:52,  1.41s/it][A
Training:   4%|▎         | 1563/41976 [37:50<17:12:52,  1.53s/it][A
Training:   4%|▎         | 1564/41976 [37:52<16:41:40,  1.49s/it][A
Training:   4%|▎         | 1565/41976 [37:53<16:15:39,  1.45s/it][A
Training:   4%|▎         | 1566/41976 [37:54<15:59:02,  1.42s/it][A
Training:   4%|▎         | 1567/41976 [37:56<17:11:27,  1.53s/it][A
Training:   4%|▎         | 1568/41976 [37:57<16:43:38,  1.49s/it][A
Training:   4%|▎         | 1569/41976 [37:59<16:10:32,  1.44s/it][A
Training:   4%|▎         | 1570/41976 [38:00<16:57:11,  1.51s/it][A
Training:   4%|▎         | 1571/41976 [38:02<16:59:34,  1.51s/it][A

Epoch 0, Step 1570, Loss: nan



Training:   4%|▎         | 1572/41976 [38:03<16:29:57,  1.47s/it][A
Training:   4%|▎         | 1573/41976 [38:05<15:57:20,  1.42s/it][A
Training:   4%|▎         | 1574/41976 [38:06<15:40:25,  1.40s/it][A
Training:   4%|▍         | 1575/41976 [38:08<16:49:21,  1.50s/it][A
Training:   4%|▍         | 1576/41976 [38:09<16:56:54,  1.51s/it][A
Training:   4%|▍         | 1577/41976 [38:11<17:52:04,  1.59s/it][A
Training:   4%|▍         | 1578/41976 [38:12<17:00:17,  1.52s/it][A
Training:   4%|▍         | 1579/41976 [38:14<16:23:38,  1.46s/it][A
Training:   4%|▍         | 1580/41976 [38:15<16:03:15,  1.43s/it][A
Training:   4%|▍         | 1581/41976 [38:16<15:46:25,  1.41s/it][A

Epoch 0, Step 1580, Loss: nan



Training:   4%|▍         | 1582/41976 [38:18<15:35:52,  1.39s/it][A
Training:   4%|▍         | 1583/41976 [38:19<16:24:46,  1.46s/it][A
Training:   4%|▍         | 1584/41976 [38:21<16:57:56,  1.51s/it][A
Training:   4%|▍         | 1585/41976 [38:22<16:18:11,  1.45s/it][A
Training:   4%|▍         | 1586/41976 [38:24<15:53:31,  1.42s/it][A
Training:   4%|▍         | 1587/41976 [38:25<15:39:05,  1.40s/it][A
Training:   4%|▍         | 1588/41976 [38:26<15:35:07,  1.39s/it][A
Training:   4%|▍         | 1589/41976 [38:28<15:18:33,  1.36s/it][A
Training:   4%|▍         | 1590/41976 [38:29<15:14:54,  1.36s/it][A
Training:   4%|▍         | 1591/41976 [38:31<15:34:05,  1.39s/it][A

Epoch 0, Step 1590, Loss: nan



Training:   4%|▍         | 1592/41976 [38:33<18:41:36,  1.67s/it][A
Training:   4%|▍         | 1593/41976 [38:34<17:27:53,  1.56s/it][A
Training:   4%|▍         | 1594/41976 [38:36<16:46:59,  1.50s/it][A
Training:   4%|▍         | 1595/41976 [38:37<16:15:00,  1.45s/it][A
Training:   4%|▍         | 1596/41976 [38:38<16:00:23,  1.43s/it][A
Training:   4%|▍         | 1597/41976 [38:40<15:39:07,  1.40s/it][A
Training:   4%|▍         | 1598/41976 [38:41<15:28:23,  1.38s/it][A
Training:   4%|▍         | 1599/41976 [38:42<15:19:52,  1.37s/it][A
Training:   4%|▍         | 1600/41976 [38:44<16:48:29,  1.50s/it][A
Training:   4%|▍         | 1601/41976 [38:45<16:29:58,  1.47s/it][A

Epoch 0, Step 1600, Loss: nan



Training:   4%|▍         | 1602/41976 [38:47<16:14:16,  1.45s/it][A
Training:   4%|▍         | 1603/41976 [38:48<15:51:03,  1.41s/it][A
Training:   4%|▍         | 1604/41976 [38:50<15:43:28,  1.40s/it][A
Training:   4%|▍         | 1605/41976 [38:51<15:26:18,  1.38s/it][A
Training:   4%|▍         | 1606/41976 [38:53<16:52:18,  1.50s/it][A
Training:   4%|▍         | 1607/41976 [38:54<16:17:10,  1.45s/it][A
Training:   4%|▍         | 1608/41976 [38:56<17:05:51,  1.52s/it][A
Training:   4%|▍         | 1609/41976 [38:57<17:03:51,  1.52s/it][A
Training:   4%|▍         | 1610/41976 [38:59<16:27:16,  1.47s/it][A
Training:   4%|▍         | 1611/41976 [39:00<16:00:33,  1.43s/it][A

Epoch 0, Step 1610, Loss: nan



Training:   4%|▍         | 1612/41976 [39:01<15:44:13,  1.40s/it][A
Training:   4%|▍         | 1613/41976 [39:03<15:26:04,  1.38s/it][A
Training:   4%|▍         | 1614/41976 [39:04<15:19:13,  1.37s/it][A
Training:   4%|▍         | 1615/41976 [39:05<15:11:08,  1.35s/it][A
Training:   4%|▍         | 1616/41976 [39:07<15:26:31,  1.38s/it][A
Training:   4%|▍         | 1617/41976 [39:08<16:43:16,  1.49s/it][A
Training:   4%|▍         | 1618/41976 [39:10<16:20:25,  1.46s/it][A
Training:   4%|▍         | 1619/41976 [39:11<15:52:20,  1.42s/it][A
Training:   4%|▍         | 1620/41976 [39:12<15:44:32,  1.40s/it][A
Training:   4%|▍         | 1621/41976 [39:14<17:02:44,  1.52s/it][A

Epoch 0, Step 1620, Loss: nan



Training:   4%|▍         | 1622/41976 [39:16<16:28:20,  1.47s/it][A
Training:   4%|▍         | 1623/41976 [39:17<16:14:09,  1.45s/it][A
Training:   4%|▍         | 1624/41976 [39:18<15:54:23,  1.42s/it][A
Training:   4%|▍         | 1625/41976 [39:20<17:03:36,  1.52s/it][A
Training:   4%|▍         | 1626/41976 [39:22<16:40:17,  1.49s/it][A
Training:   4%|▍         | 1627/41976 [39:23<16:09:30,  1.44s/it][A
Training:   4%|▍         | 1628/41976 [39:24<15:55:49,  1.42s/it][A
Training:   4%|▍         | 1629/41976 [39:26<15:33:54,  1.39s/it][A
Training:   4%|▍         | 1630/41976 [39:27<15:26:30,  1.38s/it][A
Training:   4%|▍         | 1631/41976 [39:28<15:14:31,  1.36s/it][A

Epoch 0, Step 1630, Loss: nan



Training:   4%|▍         | 1632/41976 [39:30<15:17:39,  1.36s/it][A
Training:   4%|▍         | 1633/41976 [39:31<15:52:43,  1.42s/it][A
Training:   4%|▍         | 1634/41976 [39:33<16:48:08,  1.50s/it][A
Training:   4%|▍         | 1635/41976 [39:35<17:48:36,  1.59s/it][A
Training:   4%|▍         | 1636/41976 [39:36<17:07:00,  1.53s/it][A
Training:   4%|▍         | 1637/41976 [39:37<16:24:29,  1.46s/it][A
Training:   4%|▍         | 1638/41976 [39:39<15:56:26,  1.42s/it][A
Training:   4%|▍         | 1639/41976 [39:40<15:42:33,  1.40s/it][A
Training:   4%|▍         | 1640/41976 [39:41<15:31:33,  1.39s/it][A
Training:   4%|▍         | 1641/41976 [39:43<15:40:06,  1.40s/it][A

Epoch 0, Step 1640, Loss: nan



Training:   4%|▍         | 1642/41976 [39:45<16:52:31,  1.51s/it][A
Training:   4%|▍         | 1643/41976 [39:46<16:17:36,  1.45s/it][A
Training:   4%|▍         | 1644/41976 [39:47<16:07:30,  1.44s/it][A
Training:   4%|▍         | 1645/41976 [39:49<15:41:12,  1.40s/it][A
Training:   4%|▍         | 1646/41976 [39:50<15:29:24,  1.38s/it][A
Training:   4%|▍         | 1647/41976 [39:51<15:21:35,  1.37s/it][A
Training:   4%|▍         | 1648/41976 [39:53<15:31:17,  1.39s/it][A
Training:   4%|▍         | 1649/41976 [39:55<16:59:17,  1.52s/it][A
Training:   4%|▍         | 1650/41976 [39:56<17:49:17,  1.59s/it][A
Training:   4%|▍         | 1651/41976 [39:58<17:00:42,  1.52s/it][A

Epoch 0, Step 1650, Loss: nan



Training:   4%|▍         | 1652/41976 [39:59<16:31:04,  1.47s/it][A
Training:   4%|▍         | 1653/41976 [40:00<15:58:57,  1.43s/it][A
Training:   4%|▍         | 1654/41976 [40:02<15:41:53,  1.40s/it][A
Training:   4%|▍         | 1655/41976 [40:03<15:25:14,  1.38s/it][A
Training:   4%|▍         | 1656/41976 [40:04<15:22:36,  1.37s/it][A
Training:   4%|▍         | 1657/41976 [40:06<15:08:57,  1.35s/it][A
Training:   4%|▍         | 1658/41976 [40:07<15:50:10,  1.41s/it][A
Training:   4%|▍         | 1659/41976 [40:09<16:49:31,  1.50s/it][A
Training:   4%|▍         | 1660/41976 [40:10<16:22:46,  1.46s/it][A
Training:   4%|▍         | 1661/41976 [40:12<15:52:01,  1.42s/it][A

Epoch 0, Step 1660, Loss: nan



Training:   4%|▍         | 1662/41976 [40:13<15:48:11,  1.41s/it][A
Training:   4%|▍         | 1663/41976 [40:15<17:08:23,  1.53s/it][A
Training:   4%|▍         | 1664/41976 [40:16<16:41:06,  1.49s/it][A
Training:   4%|▍         | 1665/41976 [40:18<16:09:20,  1.44s/it][A
Training:   4%|▍         | 1666/41976 [40:19<16:15:43,  1.45s/it][A
Training:   4%|▍         | 1667/41976 [40:21<17:09:10,  1.53s/it][A
Training:   4%|▍         | 1668/41976 [40:22<16:36:04,  1.48s/it][A
Training:   4%|▍         | 1669/41976 [40:23<16:06:04,  1.44s/it][A
Training:   4%|▍         | 1670/41976 [40:25<15:42:22,  1.40s/it][A
Training:   4%|▍         | 1671/41976 [40:26<15:29:07,  1.38s/it][A

Epoch 0, Step 1670, Loss: nan



Training:   4%|▍         | 1672/41976 [40:27<15:28:31,  1.38s/it][A
Training:   4%|▍         | 1673/41976 [40:29<15:16:32,  1.36s/it][A
Training:   4%|▍         | 1674/41976 [40:30<15:08:43,  1.35s/it][A
Training:   4%|▍         | 1675/41976 [40:32<16:12:01,  1.45s/it][A
Training:   4%|▍         | 1676/41976 [40:33<16:39:08,  1.49s/it][A
Training:   4%|▍         | 1677/41976 [40:35<16:04:29,  1.44s/it][A
Training:   4%|▍         | 1678/41976 [40:36<17:13:58,  1.54s/it][A
Training:   4%|▍         | 1679/41976 [40:38<16:31:48,  1.48s/it][A
Training:   4%|▍         | 1680/41976 [40:39<16:11:13,  1.45s/it][A
Training:   4%|▍         | 1681/41976 [40:40<15:46:27,  1.41s/it][A

Epoch 0, Step 1680, Loss: nan



Training:   4%|▍         | 1682/41976 [40:42<15:35:18,  1.39s/it][A
Training:   4%|▍         | 1683/41976 [40:43<16:17:34,  1.46s/it][A
Training:   4%|▍         | 1684/41976 [40:45<16:58:01,  1.52s/it][A
Training:   4%|▍         | 1685/41976 [40:46<16:19:02,  1.46s/it][A
Training:   4%|▍         | 1686/41976 [40:48<15:59:52,  1.43s/it][A
Training:   4%|▍         | 1687/41976 [40:49<15:40:41,  1.40s/it][A
Training:   4%|▍         | 1688/41976 [40:50<15:36:56,  1.40s/it][A
Training:   4%|▍         | 1689/41976 [40:52<15:19:38,  1.37s/it][A
Training:   4%|▍         | 1690/41976 [40:53<15:14:13,  1.36s/it][A
Training:   4%|▍         | 1691/41976 [40:55<17:24:17,  1.56s/it][A

Epoch 0, Step 1690, Loss: nan



Training:   4%|▍         | 1692/41976 [40:57<18:02:06,  1.61s/it][A
Training:   4%|▍         | 1693/41976 [40:58<17:04:02,  1.53s/it][A
Training:   4%|▍         | 1694/41976 [41:00<16:27:11,  1.47s/it][A
Training:   4%|▍         | 1695/41976 [41:01<16:02:13,  1.43s/it][A
Training:   4%|▍         | 1696/41976 [41:02<15:46:56,  1.41s/it][A
Training:   4%|▍         | 1697/41976 [41:04<15:27:44,  1.38s/it][A
Training:   4%|▍         | 1698/41976 [41:05<15:22:49,  1.37s/it][A
Training:   4%|▍         | 1699/41976 [41:06<15:13:03,  1.36s/it][A
Training:   4%|▍         | 1700/41976 [41:08<16:34:47,  1.48s/it][A
Training:   4%|▍         | 1701/41976 [41:10<16:32:39,  1.48s/it][A

Epoch 0, Step 1700, Loss: nan



Training:   4%|▍         | 1702/41976 [41:11<16:09:24,  1.44s/it][A
Training:   4%|▍         | 1703/41976 [41:12<15:54:53,  1.42s/it][A
Training:   4%|▍         | 1704/41976 [41:14<15:56:03,  1.42s/it][A
Training:   4%|▍         | 1705/41976 [41:15<17:13:40,  1.54s/it][A
Training:   4%|▍         | 1706/41976 [41:17<16:36:43,  1.49s/it][A
Training:   4%|▍         | 1707/41976 [41:18<16:10:49,  1.45s/it][A
Training:   4%|▍         | 1708/41976 [41:20<17:06:04,  1.53s/it][A
Training:   4%|▍         | 1709/41976 [41:21<16:56:30,  1.51s/it][A
Training:   4%|▍         | 1710/41976 [41:23<16:20:13,  1.46s/it][A
Training:   4%|▍         | 1711/41976 [41:24<15:56:12,  1.42s/it][A

Epoch 0, Step 1710, Loss: nan



Training:   4%|▍         | 1712/41976 [41:25<15:46:05,  1.41s/it][A
Training:   4%|▍         | 1713/41976 [41:27<15:29:39,  1.39s/it][A
Training:   4%|▍         | 1714/41976 [41:28<15:19:41,  1.37s/it][A
Training:   4%|▍         | 1715/41976 [41:29<15:09:37,  1.36s/it][A
Training:   4%|▍         | 1716/41976 [41:31<15:45:06,  1.41s/it][A
Training:   4%|▍         | 1717/41976 [41:33<16:48:13,  1.50s/it][A
Training:   4%|▍         | 1718/41976 [41:34<16:24:50,  1.47s/it][A
Training:   4%|▍         | 1719/41976 [41:36<17:31:46,  1.57s/it][A
Training:   4%|▍         | 1720/41976 [41:37<16:51:00,  1.51s/it][A
Training:   4%|▍         | 1721/41976 [41:39<16:13:34,  1.45s/it][A

Epoch 0, Step 1720, Loss: nan



Training:   4%|▍         | 1722/41976 [41:40<15:57:01,  1.43s/it][A
Training:   4%|▍         | 1723/41976 [41:41<15:42:15,  1.40s/it][A
Training:   4%|▍         | 1724/41976 [41:43<15:53:14,  1.42s/it][A
Training:   4%|▍         | 1725/41976 [41:45<17:32:55,  1.57s/it][A
Training:   4%|▍         | 1726/41976 [41:47<18:54:03,  1.69s/it][A
Training:   4%|▍         | 1727/41976 [41:48<17:43:45,  1.59s/it][A
Training:   4%|▍         | 1728/41976 [41:49<17:02:07,  1.52s/it][A
Training:   4%|▍         | 1729/41976 [41:51<16:22:49,  1.47s/it][A
Training:   4%|▍         | 1730/41976 [41:52<15:56:32,  1.43s/it][A
Training:   4%|▍         | 1731/41976 [41:53<15:32:11,  1.39s/it][A

Epoch 0, Step 1730, Loss: nan



Training:   4%|▍         | 1732/41976 [41:55<15:27:17,  1.38s/it][A
Training:   4%|▍         | 1733/41976 [41:57<17:20:03,  1.55s/it][A
Training:   4%|▍         | 1734/41976 [41:58<17:50:02,  1.60s/it][A
Training:   4%|▍         | 1735/41976 [42:00<16:57:03,  1.52s/it][A
Training:   4%|▍         | 1736/41976 [42:01<16:29:43,  1.48s/it][A
Training:   4%|▍         | 1737/41976 [42:02<15:55:34,  1.42s/it][A
Training:   4%|▍         | 1738/41976 [42:04<15:37:35,  1.40s/it][A
Training:   4%|▍         | 1739/41976 [42:05<15:23:56,  1.38s/it][A
Training:   4%|▍         | 1740/41976 [42:06<15:24:53,  1.38s/it][A
Training:   4%|▍         | 1741/41976 [42:08<15:12:20,  1.36s/it][A

Epoch 0, Step 1740, Loss: nan



Training:   4%|▍         | 1742/41976 [42:09<16:28:32,  1.47s/it][A
Training:   4%|▍         | 1743/41976 [42:11<16:40:07,  1.49s/it][A
Training:   4%|▍         | 1744/41976 [42:12<16:15:29,  1.45s/it][A
Training:   4%|▍         | 1745/41976 [42:14<15:49:13,  1.42s/it][A
Training:   4%|▍         | 1746/41976 [42:15<15:32:49,  1.39s/it][A
Training:   4%|▍         | 1747/41976 [42:17<16:55:01,  1.51s/it][A
Training:   4%|▍         | 1748/41976 [42:18<16:27:59,  1.47s/it][A
Training:   4%|▍         | 1749/41976 [42:20<16:03:04,  1.44s/it][A
Training:   4%|▍         | 1750/41976 [42:21<16:41:22,  1.49s/it][A
Training:   4%|▍         | 1751/41976 [42:23<17:02:36,  1.53s/it][A

Epoch 0, Step 1750, Loss: nan



Training:   4%|▍         | 1752/41976 [42:24<16:31:07,  1.48s/it][A
Training:   4%|▍         | 1753/41976 [42:25<15:56:35,  1.43s/it][A
Training:   4%|▍         | 1754/41976 [42:27<15:39:00,  1.40s/it][A
Training:   4%|▍         | 1755/41976 [42:28<15:26:48,  1.38s/it][A
Training:   4%|▍         | 1756/41976 [42:30<15:28:10,  1.38s/it][A
Training:   4%|▍         | 1757/41976 [42:31<15:15:57,  1.37s/it][A
Training:   4%|▍         | 1758/41976 [42:32<15:29:11,  1.39s/it][A
Training:   4%|▍         | 1759/41976 [42:34<17:00:39,  1.52s/it][A
Training:   4%|▍         | 1760/41976 [42:35<16:32:41,  1.48s/it][A
Training:   4%|▍         | 1761/41976 [42:37<17:32:37,  1.57s/it][A

Epoch 0, Step 1760, Loss: nan



Training:   4%|▍         | 1762/41976 [42:39<16:51:25,  1.51s/it][A
Training:   4%|▍         | 1763/41976 [42:40<16:17:00,  1.46s/it][A
Training:   4%|▍         | 1764/41976 [42:41<15:59:49,  1.43s/it][A
Training:   4%|▍         | 1765/41976 [42:43<15:35:23,  1.40s/it][A
Training:   4%|▍         | 1766/41976 [42:44<15:25:29,  1.38s/it][A
Training:   4%|▍         | 1767/41976 [42:46<16:42:43,  1.50s/it][A
Training:   4%|▍         | 1768/41976 [42:47<16:34:05,  1.48s/it][A
Training:   4%|▍         | 1769/41976 [42:49<15:57:24,  1.43s/it][A
Training:   4%|▍         | 1770/41976 [42:50<15:45:09,  1.41s/it][A
Training:   4%|▍         | 1771/41976 [42:51<15:30:29,  1.39s/it][A

Epoch 0, Step 1770, Loss: nan



Training:   4%|▍         | 1772/41976 [42:53<15:26:56,  1.38s/it][A
Training:   4%|▍         | 1773/41976 [42:54<15:21:31,  1.38s/it][A
Training:   4%|▍         | 1774/41976 [42:55<15:13:50,  1.36s/it][A
Training:   4%|▍         | 1775/41976 [42:57<17:39:28,  1.58s/it][A
Training:   4%|▍         | 1776/41976 [42:59<17:41:11,  1.58s/it][A
Training:   4%|▍         | 1777/41976 [43:00<16:51:21,  1.51s/it][A
Training:   4%|▍         | 1778/41976 [43:02<16:11:12,  1.45s/it][A
Training:   4%|▍         | 1779/41976 [43:03<15:48:21,  1.42s/it][A
Training:   4%|▍         | 1780/41976 [43:04<15:44:34,  1.41s/it][A
Training:   4%|▍         | 1781/41976 [43:06<15:26:50,  1.38s/it][A

Epoch 0, Step 1780, Loss: nan



Training:   4%|▍         | 1782/41976 [43:07<15:18:11,  1.37s/it][A
Training:   4%|▍         | 1783/41976 [43:08<15:33:34,  1.39s/it][A
Training:   4%|▍         | 1784/41976 [43:10<16:59:09,  1.52s/it][A
Training:   4%|▍         | 1785/41976 [43:12<16:18:05,  1.46s/it][A
Training:   4%|▍         | 1786/41976 [43:13<15:55:59,  1.43s/it][A
Training:   4%|▍         | 1787/41976 [43:14<15:37:57,  1.40s/it][A
Training:   4%|▍         | 1788/41976 [43:16<15:34:28,  1.40s/it][A
Training:   4%|▍         | 1789/41976 [43:17<16:54:45,  1.52s/it][A
Training:   4%|▍         | 1790/41976 [43:19<16:20:00,  1.46s/it][A
Training:   4%|▍         | 1791/41976 [43:20<15:56:07,  1.43s/it][A

Epoch 0, Step 1790, Loss: nan



Training:   4%|▍         | 1792/41976 [43:22<17:15:23,  1.55s/it][A
Training:   4%|▍         | 1793/41976 [43:23<16:39:29,  1.49s/it][A
Training:   4%|▍         | 1794/41976 [43:25<16:08:57,  1.45s/it][A
Training:   4%|▍         | 1795/41976 [43:26<15:49:44,  1.42s/it][A
Training:   4%|▍         | 1796/41976 [43:27<15:43:10,  1.41s/it][A
Training:   4%|▍         | 1797/41976 [43:29<15:23:07,  1.38s/it][A
Training:   4%|▍         | 1798/41976 [43:30<15:18:06,  1.37s/it][A
Training:   4%|▍         | 1799/41976 [43:31<15:14:11,  1.37s/it][A
Training:   4%|▍         | 1800/41976 [43:33<16:01:23,  1.44s/it][A
Training:   4%|▍         | 1801/41976 [43:35<16:41:21,  1.50s/it][A

Epoch 0, Step 1800, Loss: nan



Training:   4%|▍         | 1802/41976 [43:36<16:12:15,  1.45s/it][A
Training:   4%|▍         | 1803/41976 [43:38<17:23:24,  1.56s/it][A
Training:   4%|▍         | 1804/41976 [43:39<16:49:58,  1.51s/it][A
Training:   4%|▍         | 1805/41976 [43:41<16:11:30,  1.45s/it][A
Training:   4%|▍         | 1806/41976 [43:42<15:51:38,  1.42s/it][A
Training:   4%|▍         | 1807/41976 [43:43<15:33:55,  1.39s/it][A
Training:   4%|▍         | 1808/41976 [43:45<15:56:08,  1.43s/it][A
Training:   4%|▍         | 1809/41976 [43:46<16:53:29,  1.51s/it][A
Training:   4%|▍         | 1810/41976 [43:48<16:16:56,  1.46s/it][A
Training:   4%|▍         | 1811/41976 [43:49<15:51:52,  1.42s/it][A

Epoch 0, Step 1810, Loss: nan



Training:   4%|▍         | 1812/41976 [43:50<15:40:42,  1.41s/it][A
Training:   4%|▍         | 1813/41976 [43:52<15:23:17,  1.38s/it][A
Training:   4%|▍         | 1814/41976 [43:53<15:10:58,  1.36s/it][A
Training:   4%|▍         | 1815/41976 [43:54<15:11:14,  1.36s/it][A
Training:   4%|▍         | 1816/41976 [43:56<15:12:36,  1.36s/it][A
Training:   4%|▍         | 1817/41976 [43:58<18:06:00,  1.62s/it][A
Training:   4%|▍         | 1818/41976 [43:59<17:15:20,  1.55s/it][A
Training:   4%|▍         | 1819/41976 [44:01<16:36:25,  1.49s/it][A
Training:   4%|▍         | 1820/41976 [44:02<16:10:54,  1.45s/it][A
Training:   4%|▍         | 1821/41976 [44:03<15:42:31,  1.41s/it][A

Epoch 0, Step 1820, Loss: nan



Training:   4%|▍         | 1822/41976 [44:05<15:34:14,  1.40s/it][A
Training:   4%|▍         | 1823/41976 [44:06<15:23:36,  1.38s/it][A
Training:   4%|▍         | 1824/41976 [44:08<15:21:24,  1.38s/it][A
Training:   4%|▍         | 1825/41976 [44:09<16:07:16,  1.45s/it][A
Training:   4%|▍         | 1826/41976 [44:11<16:47:56,  1.51s/it][A
Training:   4%|▍         | 1827/41976 [44:12<16:13:52,  1.46s/it][A
Training:   4%|▍         | 1828/41976 [44:14<15:57:54,  1.43s/it][A
Training:   4%|▍         | 1829/41976 [44:15<15:36:43,  1.40s/it][A
Training:   4%|▍         | 1830/41976 [44:16<15:26:35,  1.38s/it][A
Training:   4%|▍         | 1831/41976 [44:18<16:50:19,  1.51s/it][A

Epoch 0, Step 1830, Loss: nan



Training:   4%|▍         | 1832/41976 [44:19<16:24:18,  1.47s/it][A
Training:   4%|▍         | 1833/41976 [44:21<16:25:34,  1.47s/it][A
Training:   4%|▍         | 1834/41976 [44:23<17:16:29,  1.55s/it][A
Training:   4%|▍         | 1835/41976 [44:24<16:33:57,  1.49s/it][A
Training:   4%|▍         | 1836/41976 [44:25<16:07:24,  1.45s/it][A
Training:   4%|▍         | 1837/41976 [44:27<15:42:57,  1.41s/it][A
Training:   4%|▍         | 1838/41976 [44:28<15:28:46,  1.39s/it][A
Training:   4%|▍         | 1839/41976 [44:29<15:18:22,  1.37s/it][A
Training:   4%|▍         | 1840/41976 [44:31<15:21:42,  1.38s/it][A
Training:   4%|▍         | 1841/41976 [44:32<15:14:13,  1.37s/it][A

Epoch 0, Step 1840, Loss: nan



Training:   4%|▍         | 1842/41976 [44:34<16:25:59,  1.47s/it][A
Training:   4%|▍         | 1843/41976 [44:35<16:31:35,  1.48s/it][A
Training:   4%|▍         | 1844/41976 [44:37<16:10:25,  1.45s/it][A
Training:   4%|▍         | 1845/41976 [44:38<17:23:03,  1.56s/it][A
Training:   4%|▍         | 1846/41976 [44:40<16:36:35,  1.49s/it][A
Training:   4%|▍         | 1847/41976 [44:41<16:06:31,  1.45s/it][A
Training:   4%|▍         | 1848/41976 [44:42<15:49:46,  1.42s/it][A
Training:   4%|▍         | 1849/41976 [44:44<15:24:44,  1.38s/it][A
Training:   4%|▍         | 1850/41976 [44:45<16:14:25,  1.46s/it][A
Training:   4%|▍         | 1851/41976 [44:47<16:37:40,  1.49s/it][A

Epoch 0, Step 1850, Loss: nan



Training:   4%|▍         | 1852/41976 [44:48<16:15:18,  1.46s/it][A
Training:   4%|▍         | 1853/41976 [44:50<15:46:49,  1.42s/it][A
Training:   4%|▍         | 1854/41976 [44:51<15:33:08,  1.40s/it][A
Training:   4%|▍         | 1855/41976 [44:52<15:23:33,  1.38s/it][A
Training:   4%|▍         | 1856/41976 [44:54<15:19:21,  1.37s/it][A
Training:   4%|▍         | 1857/41976 [44:55<15:05:08,  1.35s/it][A
Training:   4%|▍         | 1858/41976 [44:56<15:05:50,  1.35s/it][A
Training:   4%|▍         | 1859/41976 [44:59<18:12:18,  1.63s/it][A
Training:   4%|▍         | 1860/41976 [45:00<17:17:21,  1.55s/it][A
Training:   4%|▍         | 1861/41976 [45:01<16:29:25,  1.48s/it][A

Epoch 0, Step 1860, Loss: nan



Training:   4%|▍         | 1862/41976 [45:03<16:02:04,  1.44s/it][A
Training:   4%|▍         | 1863/41976 [45:04<15:41:42,  1.41s/it][A
Training:   4%|▍         | 1864/41976 [45:05<15:34:41,  1.40s/it][A
Training:   4%|▍         | 1865/41976 [45:07<15:17:09,  1.37s/it][A
Training:   4%|▍         | 1866/41976 [45:08<15:10:57,  1.36s/it][A
Training:   4%|▍         | 1867/41976 [45:10<16:14:30,  1.46s/it][A
Training:   4%|▍         | 1868/41976 [45:11<16:40:25,  1.50s/it][A
Training:   4%|▍         | 1869/41976 [45:13<16:06:33,  1.45s/it][A
Training:   4%|▍         | 1870/41976 [45:14<15:46:02,  1.42s/it][A
Training:   4%|▍         | 1871/41976 [45:15<15:39:55,  1.41s/it][A

Epoch 0, Step 1870, Loss: nan



Training:   4%|▍         | 1872/41976 [45:17<15:32:53,  1.40s/it][A
Training:   4%|▍         | 1873/41976 [45:19<16:51:20,  1.51s/it][A
Training:   4%|▍         | 1874/41976 [45:20<16:16:42,  1.46s/it][A
Training:   4%|▍         | 1875/41976 [45:21<16:54:59,  1.52s/it][A
Training:   4%|▍         | 1876/41976 [45:23<17:14:16,  1.55s/it][A
Training:   4%|▍         | 1877/41976 [45:24<16:26:54,  1.48s/it][A
Training:   4%|▍         | 1878/41976 [45:26<16:01:48,  1.44s/it][A
Training:   4%|▍         | 1879/41976 [45:27<15:42:58,  1.41s/it][A
Training:   4%|▍         | 1880/41976 [45:29<17:05:59,  1.54s/it][A
Training:   4%|▍         | 1881/41976 [45:30<16:32:58,  1.49s/it][A

Epoch 0, Step 1880, Loss: nan



Training:   4%|▍         | 1882/41976 [45:32<16:08:59,  1.45s/it][A
Training:   4%|▍         | 1883/41976 [45:33<16:38:13,  1.49s/it][A
Training:   4%|▍         | 1884/41976 [45:35<17:13:58,  1.55s/it][A
Training:   4%|▍         | 1885/41976 [45:36<16:27:32,  1.48s/it][A
Training:   4%|▍         | 1886/41976 [45:38<15:58:09,  1.43s/it][A
Training:   4%|▍         | 1887/41976 [45:39<17:13:37,  1.55s/it][A
Training:   4%|▍         | 1888/41976 [45:41<16:39:37,  1.50s/it][A
Training:   5%|▍         | 1889/41976 [45:42<16:05:34,  1.45s/it][A
Training:   5%|▍         | 1890/41976 [45:43<15:47:20,  1.42s/it][A
Training:   5%|▍         | 1891/41976 [45:45<16:04:09,  1.44s/it][A

Epoch 0, Step 1890, Loss: nan



Training:   5%|▍         | 1892/41976 [45:47<17:04:24,  1.53s/it][A
Training:   5%|▍         | 1893/41976 [45:48<16:19:48,  1.47s/it][A
Training:   5%|▍         | 1894/41976 [45:49<15:52:56,  1.43s/it][A
Training:   5%|▍         | 1895/41976 [45:51<15:40:21,  1.41s/it][A
Training:   5%|▍         | 1896/41976 [45:52<15:33:35,  1.40s/it][A
Training:   5%|▍         | 1897/41976 [45:53<15:13:38,  1.37s/it][A
Training:   5%|▍         | 1898/41976 [45:55<15:09:19,  1.36s/it][A
Training:   5%|▍         | 1899/41976 [45:56<15:04:33,  1.35s/it][A
Training:   5%|▍         | 1900/41976 [45:58<16:22:48,  1.47s/it][A
Training:   5%|▍         | 1901/41976 [45:59<16:22:33,  1.47s/it][A

Epoch 0, Step 1900, Loss: nan



Training:   5%|▍         | 1902/41976 [46:01<17:36:16,  1.58s/it][A
Training:   5%|▍         | 1903/41976 [46:02<16:45:29,  1.51s/it][A
Training:   5%|▍         | 1904/41976 [46:04<16:19:06,  1.47s/it][A
Training:   5%|▍         | 1905/41976 [46:05<15:50:00,  1.42s/it][A
Training:   5%|▍         | 1906/41976 [46:06<15:32:19,  1.40s/it][A
Training:   5%|▍         | 1907/41976 [46:08<15:22:42,  1.38s/it][A
Training:   5%|▍         | 1908/41976 [46:10<16:27:43,  1.48s/it][A
Training:   5%|▍         | 1909/41976 [46:11<16:43:52,  1.50s/it][A
Training:   5%|▍         | 1910/41976 [46:12<16:12:13,  1.46s/it][A
Training:   5%|▍         | 1911/41976 [46:14<15:50:48,  1.42s/it][A

Epoch 0, Step 1910, Loss: nan



Training:   5%|▍         | 1912/41976 [46:15<15:40:39,  1.41s/it][A
Training:   5%|▍         | 1913/41976 [46:16<15:17:09,  1.37s/it][A
Training:   5%|▍         | 1914/41976 [46:18<15:07:19,  1.36s/it][A
Training:   5%|▍         | 1915/41976 [46:19<15:00:05,  1.35s/it][A
Training:   5%|▍         | 1916/41976 [46:21<17:30:52,  1.57s/it][A
Training:   5%|▍         | 1917/41976 [46:23<17:47:28,  1.60s/it][A
Training:   5%|▍         | 1918/41976 [46:24<16:54:01,  1.52s/it][A
Training:   5%|▍         | 1919/41976 [46:26<16:16:46,  1.46s/it][A
Training:   5%|▍         | 1920/41976 [46:27<15:58:43,  1.44s/it][A
Training:   5%|▍         | 1921/41976 [46:28<15:32:58,  1.40s/it][A

Epoch 0, Step 1920, Loss: nan



Training:   5%|▍         | 1922/41976 [46:30<15:25:16,  1.39s/it][A
Training:   5%|▍         | 1923/41976 [46:31<15:12:58,  1.37s/it][A
Training:   5%|▍         | 1924/41976 [46:32<15:15:51,  1.37s/it][A
Training:   5%|▍         | 1925/41976 [46:34<16:24:09,  1.47s/it][A
Training:   5%|▍         | 1926/41976 [46:35<16:23:15,  1.47s/it][A
Training:   5%|▍         | 1927/41976 [46:37<15:57:19,  1.43s/it][A
Training:   5%|▍         | 1928/41976 [46:38<15:48:08,  1.42s/it][A
Training:   5%|▍         | 1929/41976 [46:40<15:33:21,  1.40s/it][A
Training:   5%|▍         | 1930/41976 [46:41<15:24:20,  1.38s/it][A
Training:   5%|▍         | 1931/41976 [46:43<16:50:34,  1.51s/it][A

Epoch 0, Step 1930, Loss: nan



Training:   5%|▍         | 1932/41976 [46:44<16:24:49,  1.48s/it][A
Training:   5%|▍         | 1933/41976 [46:46<17:00:55,  1.53s/it][A
Training:   5%|▍         | 1934/41976 [46:47<17:06:38,  1.54s/it][A
Training:   5%|▍         | 1935/41976 [46:49<16:26:52,  1.48s/it][A
Training:   5%|▍         | 1936/41976 [46:50<16:05:24,  1.45s/it][A
Training:   5%|▍         | 1937/41976 [46:51<15:43:33,  1.41s/it][A
Training:   5%|▍         | 1938/41976 [46:53<15:29:32,  1.39s/it][A
Training:   5%|▍         | 1939/41976 [46:54<15:18:11,  1.38s/it][A
Training:   5%|▍         | 1940/41976 [46:55<15:13:18,  1.37s/it][A
Training:   5%|▍         | 1941/41976 [46:57<15:27:33,  1.39s/it][A

Epoch 0, Step 1940, Loss: nan



Training:   5%|▍         | 1942/41976 [46:59<16:48:54,  1.51s/it][A
Training:   5%|▍         | 1943/41976 [47:00<16:13:11,  1.46s/it][A
Training:   5%|▍         | 1944/41976 [47:01<15:57:10,  1.43s/it][A
Training:   5%|▍         | 1945/41976 [47:03<17:07:32,  1.54s/it][A
Training:   5%|▍         | 1946/41976 [47:04<16:28:41,  1.48s/it][A
Training:   5%|▍         | 1947/41976 [47:06<15:55:27,  1.43s/it][A
Training:   5%|▍         | 1948/41976 [47:07<15:39:09,  1.41s/it][A
Training:   5%|▍         | 1949/41976 [47:08<15:24:57,  1.39s/it][A
Training:   5%|▍         | 1950/41976 [47:10<16:38:37,  1.50s/it][A
Training:   5%|▍         | 1951/41976 [47:12<16:21:07,  1.47s/it][A

Epoch 0, Step 1950, Loss: nan



Training:   5%|▍         | 1952/41976 [47:13<16:03:03,  1.44s/it][A
Training:   5%|▍         | 1953/41976 [47:14<15:41:19,  1.41s/it][A
Training:   5%|▍         | 1954/41976 [47:16<15:28:24,  1.39s/it][A
Training:   5%|▍         | 1955/41976 [47:17<15:17:51,  1.38s/it][A
Training:   5%|▍         | 1956/41976 [47:18<15:17:53,  1.38s/it][A
Training:   5%|▍         | 1957/41976 [47:20<15:08:54,  1.36s/it][A
Training:   5%|▍         | 1958/41976 [47:21<15:59:34,  1.44s/it][A
Training:   5%|▍         | 1959/41976 [47:23<18:09:18,  1.63s/it][A
Training:   5%|▍         | 1960/41976 [47:25<17:17:36,  1.56s/it][A
Training:   5%|▍         | 1961/41976 [47:26<16:35:03,  1.49s/it][A

Epoch 0, Step 1960, Loss: nan



Training:   5%|▍         | 1962/41976 [47:28<16:12:07,  1.46s/it][A
Training:   5%|▍         | 1963/41976 [47:29<15:47:51,  1.42s/it][A
Training:   5%|▍         | 1964/41976 [47:30<15:40:55,  1.41s/it][A
Training:   5%|▍         | 1965/41976 [47:32<15:21:39,  1.38s/it][A
Training:   5%|▍         | 1966/41976 [47:33<15:45:12,  1.42s/it][A
Training:   5%|▍         | 1967/41976 [47:35<16:52:17,  1.52s/it][A
Training:   5%|▍         | 1968/41976 [47:36<16:21:07,  1.47s/it][A
Training:   5%|▍         | 1969/41976 [47:38<15:47:11,  1.42s/it][A
Training:   5%|▍         | 1970/41976 [47:39<15:35:22,  1.40s/it][A
Training:   5%|▍         | 1971/41976 [47:40<15:21:09,  1.38s/it][A

Epoch 0, Step 1970, Loss: nan



Training:   5%|▍         | 1972/41976 [47:42<15:22:45,  1.38s/it][A
Training:   5%|▍         | 1973/41976 [47:43<15:11:02,  1.37s/it][A
Training:   5%|▍         | 1974/41976 [47:45<16:53:50,  1.52s/it][A
Training:   5%|▍         | 1975/41976 [47:47<17:51:40,  1.61s/it][A
Training:   5%|▍         | 1976/41976 [47:48<17:10:04,  1.55s/it][A
Training:   5%|▍         | 1977/41976 [47:49<16:25:18,  1.48s/it][A
Training:   5%|▍         | 1978/41976 [47:51<15:53:56,  1.43s/it][A
Training:   5%|▍         | 1979/41976 [47:52<15:42:30,  1.41s/it][A
Training:   5%|▍         | 1980/41976 [47:53<15:35:50,  1.40s/it][A
Training:   5%|▍         | 1981/41976 [47:55<15:18:00,  1.38s/it][A

Epoch 0, Step 1980, Loss: nan



Training:   5%|▍         | 1982/41976 [47:56<15:14:11,  1.37s/it][A
Training:   5%|▍         | 1983/41976 [47:58<16:01:42,  1.44s/it][A
Training:   5%|▍         | 1984/41976 [47:59<16:48:31,  1.51s/it][A
Training:   5%|▍         | 1985/41976 [48:01<16:09:56,  1.46s/it][A
Training:   5%|▍         | 1986/41976 [48:02<15:45:32,  1.42s/it][A
Training:   5%|▍         | 1987/41976 [48:03<15:30:18,  1.40s/it][A
Training:   5%|▍         | 1988/41976 [48:05<15:27:23,  1.39s/it][A
Training:   5%|▍         | 1989/41976 [48:07<16:47:42,  1.51s/it][A
Training:   5%|▍         | 1990/41976 [48:08<16:15:26,  1.46s/it][A
Training:   5%|▍         | 1991/41976 [48:09<16:35:51,  1.49s/it][A

Epoch 0, Step 1990, Loss: nan



Training:   5%|▍         | 1992/41976 [48:11<17:22:59,  1.57s/it][A
Training:   5%|▍         | 1993/41976 [48:13<16:36:59,  1.50s/it][A
Training:   5%|▍         | 1994/41976 [48:14<16:16:07,  1.46s/it][A
Training:   5%|▍         | 1995/41976 [48:15<15:53:18,  1.43s/it][A
Training:   5%|▍         | 1996/41976 [48:17<15:40:16,  1.41s/it][A
Training:   5%|▍         | 1997/41976 [48:18<15:23:26,  1.39s/it][A
Training:   5%|▍         | 1998/41976 [48:19<15:20:09,  1.38s/it][A
Training:   5%|▍         | 1999/41976 [48:21<15:16:13,  1.38s/it][A
Training:   5%|▍         | 2000/41976 [48:23<16:52:54,  1.52s/it][A
Training:   5%|▍         | 2001/41976 [48:24<16:29:50,  1.49s/it][A

Epoch 0, Step 2000, Loss: nan



Training:   5%|▍         | 2002/41976 [48:25<16:07:20,  1.45s/it][A
Training:   5%|▍         | 2003/41976 [48:27<17:17:26,  1.56s/it][A
Training:   5%|▍         | 2004/41976 [48:29<16:43:49,  1.51s/it][A
Training:   5%|▍         | 2005/41976 [48:30<16:04:57,  1.45s/it][A
Training:   5%|▍         | 2006/41976 [48:31<15:43:06,  1.42s/it][A
Training:   5%|▍         | 2007/41976 [48:33<15:32:48,  1.40s/it][A
Training:   5%|▍         | 2008/41976 [48:34<16:45:01,  1.51s/it][A
Training:   5%|▍         | 2009/41976 [48:36<16:42:39,  1.51s/it][A
Training:   5%|▍         | 2010/41976 [48:37<16:10:50,  1.46s/it][A
Training:   5%|▍         | 2011/41976 [48:38<15:46:54,  1.42s/it][A

Epoch 0, Step 2010, Loss: nan



Training:   5%|▍         | 2012/41976 [48:40<15:46:24,  1.42s/it][A
Training:   5%|▍         | 2013/41976 [48:41<15:25:39,  1.39s/it][A
Training:   5%|▍         | 2014/41976 [48:43<15:18:29,  1.38s/it][A
Training:   5%|▍         | 2015/41976 [48:44<15:16:03,  1.38s/it][A
Training:   5%|▍         | 2016/41976 [48:45<15:53:35,  1.43s/it][A
Training:   5%|▍         | 2017/41976 [48:48<18:15:28,  1.64s/it][A
Training:   5%|▍         | 2018/41976 [48:49<17:12:23,  1.55s/it][A
Training:   5%|▍         | 2019/41976 [48:50<16:32:40,  1.49s/it][A
Training:   5%|▍         | 2020/41976 [48:52<16:09:24,  1.46s/it][A
Training:   5%|▍         | 2021/41976 [48:53<15:51:36,  1.43s/it][A

Epoch 0, Step 2020, Loss: nan



Training:   5%|▍         | 2022/41976 [48:54<15:41:17,  1.41s/it][A
Training:   5%|▍         | 2023/41976 [48:56<15:26:03,  1.39s/it][A
Training:   5%|▍         | 2024/41976 [48:57<15:40:58,  1.41s/it][A
Training:   5%|▍         | 2025/41976 [48:59<17:01:31,  1.53s/it][A
Training:   5%|▍         | 2026/41976 [49:00<16:25:30,  1.48s/it][A
Training:   5%|▍         | 2027/41976 [49:02<16:01:44,  1.44s/it][A
Training:   5%|▍         | 2028/41976 [49:03<15:52:28,  1.43s/it][A
Training:   5%|▍         | 2029/41976 [49:04<15:29:35,  1.40s/it][A
Training:   5%|▍         | 2030/41976 [49:06<15:25:13,  1.39s/it][A
Training:   5%|▍         | 2031/41976 [49:08<16:49:10,  1.52s/it][A

Epoch 0, Step 2030, Loss: nan



Training:   5%|▍         | 2032/41976 [49:09<16:42:36,  1.51s/it][A
Training:   5%|▍         | 2033/41976 [49:11<17:40:39,  1.59s/it][A
Training:   5%|▍         | 2034/41976 [49:12<16:51:43,  1.52s/it][A
Training:   5%|▍         | 2035/41976 [49:14<16:20:49,  1.47s/it][A
Training:   5%|▍         | 2036/41976 [49:15<16:05:36,  1.45s/it][A
Training:   5%|▍         | 2037/41976 [49:17<16:14:27,  1.46s/it][A
Training:   5%|▍         | 2038/41976 [49:18<17:23:23,  1.57s/it][A
Training:   5%|▍         | 2039/41976 [49:20<16:41:10,  1.50s/it][A
Training:   5%|▍         | 2040/41976 [49:21<16:33:55,  1.49s/it][A
Training:   5%|▍         | 2041/41976 [49:23<17:46:05,  1.60s/it][A

Epoch 0, Step 2040, Loss: nan



Training:   5%|▍         | 2042/41976 [49:24<17:00:05,  1.53s/it][A
Training:   5%|▍         | 2043/41976 [49:26<16:25:58,  1.48s/it][A
Training:   5%|▍         | 2044/41976 [49:27<16:09:47,  1.46s/it][A
Training:   5%|▍         | 2045/41976 [49:28<15:43:30,  1.42s/it][A
Training:   5%|▍         | 2046/41976 [49:30<17:09:28,  1.55s/it][A
Training:   5%|▍         | 2047/41976 [49:32<16:30:26,  1.49s/it][A
Training:   5%|▍         | 2048/41976 [49:33<16:16:41,  1.47s/it][A
Training:   5%|▍         | 2049/41976 [49:35<17:12:44,  1.55s/it][A
Training:   5%|▍         | 2050/41976 [49:36<16:36:53,  1.50s/it][A
Training:   5%|▍         | 2051/41976 [49:38<16:04:15,  1.45s/it][A

Epoch 0, Step 2050, Loss: nan



Training:   5%|▍         | 2052/41976 [49:39<15:55:53,  1.44s/it][A
Training:   5%|▍         | 2053/41976 [49:40<15:34:15,  1.40s/it][A
Training:   5%|▍         | 2054/41976 [49:42<15:21:23,  1.38s/it][A
Training:   5%|▍         | 2055/41976 [49:43<15:13:23,  1.37s/it][A
Training:   5%|▍         | 2056/41976 [49:44<15:16:50,  1.38s/it][A
Training:   5%|▍         | 2057/41976 [49:46<15:53:26,  1.43s/it][A
Training:   5%|▍         | 2058/41976 [49:48<16:40:53,  1.50s/it][A
Training:   5%|▍         | 2059/41976 [49:49<16:09:05,  1.46s/it][A
Training:   5%|▍         | 2060/41976 [49:50<15:52:12,  1.43s/it][A
Training:   5%|▍         | 2061/41976 [49:52<17:05:52,  1.54s/it][A

Epoch 0, Step 2060, Loss: nan



Training:   5%|▍         | 2062/41976 [49:54<16:34:46,  1.50s/it][A
Training:   5%|▍         | 2063/41976 [49:55<16:03:21,  1.45s/it][A
Training:   5%|▍         | 2064/41976 [49:56<15:49:52,  1.43s/it][A
Training:   5%|▍         | 2065/41976 [49:58<16:14:21,  1.46s/it][A
Training:   5%|▍         | 2066/41976 [49:59<17:00:57,  1.53s/it][A
Training:   5%|▍         | 2067/41976 [50:01<16:22:16,  1.48s/it][A
Training:   5%|▍         | 2068/41976 [50:02<16:04:06,  1.45s/it][A
Training:   5%|▍         | 2069/41976 [50:04<15:45:45,  1.42s/it][A
Training:   5%|▍         | 2070/41976 [50:05<15:32:09,  1.40s/it][A
Training:   5%|▍         | 2071/41976 [50:06<15:25:21,  1.39s/it][A

Epoch 0, Step 2070, Loss: nan



Training:   5%|▍         | 2072/41976 [50:08<15:24:26,  1.39s/it][A
Training:   5%|▍         | 2073/41976 [50:09<15:12:57,  1.37s/it][A
Training:   5%|▍         | 2074/41976 [50:11<16:39:34,  1.50s/it][A
Training:   5%|▍         | 2075/41976 [50:13<18:00:35,  1.62s/it][A
Training:   5%|▍         | 2076/41976 [50:14<17:12:19,  1.55s/it][A
Training:   5%|▍         | 2077/41976 [50:15<16:27:12,  1.48s/it][A
Training:   5%|▍         | 2078/41976 [50:17<16:00:00,  1.44s/it][A
Training:   5%|▍         | 2079/41976 [50:18<15:38:58,  1.41s/it][A
Training:   5%|▍         | 2080/41976 [50:19<15:29:28,  1.40s/it][A
Training:   5%|▍         | 2081/41976 [50:21<15:13:30,  1.37s/it][A

Epoch 0, Step 2080, Loss: nan



Training:   5%|▍         | 2082/41976 [50:23<16:22:42,  1.48s/it][A
Training:   5%|▍         | 2083/41976 [50:24<16:45:24,  1.51s/it][A
Training:   5%|▍         | 2084/41976 [50:25<16:17:59,  1.47s/it][A
Training:   5%|▍         | 2085/41976 [50:27<15:46:41,  1.42s/it][A
Training:   5%|▍         | 2086/41976 [50:28<15:31:57,  1.40s/it][A
Training:   5%|▍         | 2087/41976 [50:30<15:20:43,  1.38s/it][A
Training:   5%|▍         | 2088/41976 [50:31<15:21:02,  1.39s/it][A
Training:   5%|▍         | 2089/41976 [50:33<16:43:04,  1.51s/it][A
Training:   5%|▍         | 2090/41976 [50:34<17:11:48,  1.55s/it][A
Training:   5%|▍         | 2091/41976 [50:36<17:25:15,  1.57s/it][A

Epoch 0, Step 2090, Loss: nan



Training:   5%|▍         | 2092/41976 [50:37<16:51:42,  1.52s/it][A
Training:   5%|▍         | 2093/41976 [50:39<16:21:53,  1.48s/it][A
Training:   5%|▍         | 2094/41976 [50:40<16:02:58,  1.45s/it][A
Training:   5%|▍         | 2095/41976 [50:41<15:41:53,  1.42s/it][A
Training:   5%|▍         | 2096/41976 [50:43<15:36:24,  1.41s/it][A
Training:   5%|▍         | 2097/41976 [50:44<15:18:55,  1.38s/it][A
Training:   5%|▍         | 2098/41976 [50:46<15:33:50,  1.41s/it][A
Training:   5%|▌         | 2099/41976 [50:47<16:49:37,  1.52s/it][A
Training:   5%|▌         | 2100/41976 [50:49<16:25:44,  1.48s/it][A
Training:   5%|▌         | 2101/41976 [50:50<15:54:30,  1.44s/it][A

Epoch 0, Step 2100, Loss: nan



Training:   5%|▌         | 2102/41976 [50:52<15:39:59,  1.41s/it][A
Training:   5%|▌         | 2103/41976 [50:53<17:12:22,  1.55s/it][A
Training:   5%|▌         | 2104/41976 [50:55<16:32:58,  1.49s/it][A
Training:   5%|▌         | 2105/41976 [50:56<16:00:10,  1.44s/it][A
Training:   5%|▌         | 2106/41976 [50:57<15:48:58,  1.43s/it][A
Training:   5%|▌         | 2107/41976 [50:59<17:05:36,  1.54s/it][A
Training:   5%|▌         | 2108/41976 [51:01<16:35:13,  1.50s/it][A
Training:   5%|▌         | 2109/41976 [51:02<16:01:18,  1.45s/it][A
Training:   5%|▌         | 2110/41976 [51:03<15:45:22,  1.42s/it][A
Training:   5%|▌         | 2111/41976 [51:05<15:31:45,  1.40s/it][A

Epoch 0, Step 2110, Loss: nan



Training:   5%|▌         | 2112/41976 [51:06<15:28:53,  1.40s/it][A
Training:   5%|▌         | 2113/41976 [51:07<15:15:28,  1.38s/it][A
Training:   5%|▌         | 2114/41976 [51:09<15:10:17,  1.37s/it][A
Training:   5%|▌         | 2115/41976 [51:10<15:59:37,  1.44s/it][A
Training:   5%|▌         | 2116/41976 [51:12<16:44:55,  1.51s/it][A
Training:   5%|▌         | 2117/41976 [51:13<16:10:17,  1.46s/it][A
Training:   5%|▌         | 2118/41976 [51:15<17:26:30,  1.58s/it][A
Training:   5%|▌         | 2119/41976 [51:17<16:42:10,  1.51s/it][A
Training:   5%|▌         | 2120/41976 [51:18<16:14:17,  1.47s/it][A
Training:   5%|▌         | 2121/41976 [51:19<15:42:18,  1.42s/it][A

Epoch 0, Step 2120, Loss: nan



Training:   5%|▌         | 2122/41976 [51:21<15:30:34,  1.40s/it][A
Training:   5%|▌         | 2123/41976 [51:22<16:02:48,  1.45s/it][A
Training:   5%|▌         | 2124/41976 [51:24<17:00:53,  1.54s/it][A
Training:   5%|▌         | 2125/41976 [51:25<16:17:55,  1.47s/it][A
Training:   5%|▌         | 2126/41976 [51:27<15:50:37,  1.43s/it][A
Training:   5%|▌         | 2127/41976 [51:28<15:30:43,  1.40s/it][A
Training:   5%|▌         | 2128/41976 [51:29<15:25:07,  1.39s/it][A
Training:   5%|▌         | 2129/41976 [51:31<15:10:52,  1.37s/it][A
Training:   5%|▌         | 2130/41976 [51:32<15:05:36,  1.36s/it][A
Training:   5%|▌         | 2131/41976 [51:34<17:01:46,  1.54s/it][A

Epoch 0, Step 2130, Loss: nan



Training:   5%|▌         | 2132/41976 [51:36<17:56:57,  1.62s/it][A