In [1]:
import os
import time
import inspect
import math
import pickle
from tqdm import tqdm
from contextlib import nullcontext
import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from torch.nn import Linear as nnLinear
import torch.nn as nn
import torch.nn.functional as F

from mamba_ssm.models.config_mamba import MambaConfig
from models import *
from models.ttt import TTTLinear, TTTConfig, TTTForCausalLM
from models.PreCo import PreCoNewConfig, PreCoNewModel
from models.preco_nogain import PreCoNoGainConfig, PreCoNoGainModel
# import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run-wide settings; the evaluation and training cells below read these as globals.
master_seed = 1337  # NOTE(review): not referenced by any other visible cell — confirm the seed is actually applied
out_dir = 'results/slim_results'  # NOTE(review): no visible cell writes to this directory — confirm
# The values below are noted as mirroring the actual training parameters in run.sh.
eval_interval = 200     # evaluate every N iterations (run.sh: --eval_interval=200)
log_interval = 10       # logging interval; the visible training loop prints every log_interval * 2 iterations
eval_iters = 50         # number of batches drawn per split in each evaluation
eval_only = False
always_save_checkpoint = False

In [3]:
# Selects which model is trained. The trailing list is the author's note of candidate names;
# the model-selection cell visible in this notebook only accepts "longhorn", "ttt" and "preco".
model_name = "ttt" # llama, gla, rwkv, retnet, mamba, longhorn, ttt, preco

In [4]:
# Dataset and batching settings.
dataset = 'Slim'  # load_data reads the token files from os.path.join('data', dataset)
do_eval = True
gradient_accumulation_steps = 4   # micro-batches accumulated per optimizer step (run.sh: grad_accum=4)
# The values below are noted as mirroring the actual training parameters in run.sh.
batch_size = 4          # sequences per micro-batch (run.sh: batch_size=4)
block_size = 2048       # sequence length in tokens (run.sh: block_size=2048)
use_branch_losses = False  # original note: train both branches end-to-end with CE only (recommended)
# NOTE(review): the visible PreCo calls hard-code compute_branch_loss=False, so the three
# branch-loss settings here do not appear to be consumed — confirm.
branch_longhorn_weight = 1.0
branch_ttt_weight = 1.0

### 工具函數

In [5]:
def setup_distributed():
    """Set up (optional) distributed training and describe the current process.

    Distributed mode is enabled when the ``RANK`` environment variable is set;
    ``LOCAL_RANK`` and ``WORLD_SIZE`` are then read from the environment as well.

    The original implementation read two globals, ``backend`` and ``device``,
    that are not defined before this cell (``backend`` is never defined in the
    visible notebook, ``device`` only in a later cell), so both branches could
    raise ``NameError``. A notebook-level global of either name is still
    honored when present; otherwise a sensible default is used.

    Returns:
        tuple: ``(ddp, device, master_process, seed_offset, world_size, local_rank)``.
        In single-process mode this is ``(False, device, True, 0, 1, 0)``.
    """
    ddp = int(os.environ.get('RANK', -1)) != -1
    if ddp:
        # Multi-GPU setup. This branch pins each process to a CUDA device, so
        # default to the NCCL backend unless the notebook defines `backend`.
        ddp_backend = globals().get('backend', 'nccl')
        init_process_group(backend=ddp_backend)
        ddp_rank = int(os.environ['RANK'])
        ddp_local_rank = int(os.environ['LOCAL_RANK'])
        ddp_world_size = int(os.environ['WORLD_SIZE'])
        device_name = f'cuda:{ddp_local_rank}'
        torch.cuda.set_device(device_name)
        master_process = ddp_rank == 0
        seed_offset = ddp_rank  # each rank gets a distinct offset
        return ddp, device_name, master_process, seed_offset, ddp_world_size, ddp_local_rank

    # Single-process setup.
    print("檢測到單GPU環境，使用標準訓練模式")
    single_device = globals().get('device')
    if single_device is None:
        # `device` has not been configured yet: pick one the same way the
        # configuration cell does.
        single_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return False, single_device, True, 0, 1, 0


In [6]:
def load_data(dataset_name, block_size):
    """Memory-map the train/val token files of a dataset and resolve its vocabulary size.

    Files are looked up under ``data/<dataset_name>/``: ``train.bin`` and
    ``val.bin`` are opened read-only as ``uint16`` memmaps, and ``meta.pkl``
    (optional) supplies ``vocab_size``; 50257 is used when the file or the key
    is missing.

    Args:
        dataset_name: Sub-directory of ``data`` holding the dataset.
        block_size: Sequence length; only echoed in the log output here.

    Returns:
        tuple: ``(train_data, val_data, vocab_size)``.
    """
    dataset_dir = os.path.join('data', dataset_name)

    def _open_tokens(file_name):
        # Read-only memmap: nothing is loaded until it is sliced.
        return np.memmap(os.path.join(dataset_dir, file_name), dtype=np.uint16, mode='r')

    print(f"使用 PyTorch 原生數據載入方式")

    train_tokens = _open_tokens('train.bin')
    val_tokens = _open_tokens('val.bin')

    for summary_line in (
        f" 載入數據:",
        f" Train tokens: {len(train_tokens):,}",
        f" Val tokens: {len(val_tokens):,}",
        f" 序列長度: {block_size}",
    ):
        print(summary_line)

    # Vocabulary metadata is optional.
    meta_file = os.path.join(dataset_dir, 'meta.pkl')
    if not os.path.exists(meta_file):
        vocab_size = 50257
        print(f"未找到meta.pkl，使用預設vocab_size={vocab_size}")
        return train_tokens, val_tokens, vocab_size

    # NOTE(review): pickle.load executes arbitrary code from the file — only use
    # meta.pkl files produced by a trusted preprocessing step.
    with open(meta_file, 'rb') as handle:
        meta = pickle.load(handle)
    vocab_size = meta.get('vocab_size', 50257)
    print(f"載入meta資訊: vocab_size={vocab_size}")

    return train_tokens, val_tokens, vocab_size

In [7]:
def get_batch_fn(train_data, val_data, batch_size, block_size, device_type, device):
    """Return a ``get_batch(split)`` closure that samples random training windows.

    Each call draws ``batch_size`` random start offsets and returns ``(x, y)``
    int64 tensors of shape ``(batch_size, block_size)``, where ``y`` is ``x``
    shifted one token to the right. ``split == 'train'`` samples from
    ``train_data``; any other value samples from ``val_data``.

    On CUDA the tensors are pinned and copied with ``non_blocking=True``;
    otherwise they are moved with a plain ``.to(device)``.
    """
    def _window(tokens, start):
        # One contiguous slice of block_size tokens, widened to int64.
        return torch.from_numpy((tokens[start:start + block_size]).astype(np.int64))

    def get_batch(split):
        tokens = val_data if split != 'train' else train_data
        starts = torch.randint(len(tokens) - block_size, (batch_size,))
        inputs = torch.stack([_window(tokens, start) for start in starts])
        targets = torch.stack([_window(tokens, start + 1) for start in starts])

        if device_type != 'cuda':
            return inputs.to(device), targets.to(device)

        inputs = inputs.pin_memory().to(device, non_blocking=True)
        targets = targets.pin_memory().to(device, non_blocking=True)
        return inputs, targets

    return get_batch

def calculate_ppl(loss):
    """Convert a mean cross-entropy loss into perplexity, ``exp(loss)``.

    Losses above 700, or any value for which ``math.exp`` overflows, yield
    ``float('inf')`` instead of raising.
    """
    if not loss > 700:
        try:
            return math.exp(loss)
        except OverflowError:
            pass  # fall through to the infinite-perplexity result
    return float('inf')

### 學習率調度器

In [8]:
def get_lr(iter_num, warmup_iters, lr_decay_iters, learning_rate, min_lr, decay_lr=True):
    """Main learning-rate schedule: linear warmup followed by cosine decay.

    Args:
        iter_num: Current iteration.
        warmup_iters: Iterations over which the rate ramps linearly from 0.
        lr_decay_iters: Iteration at (and after) which ``min_lr`` is returned.
        learning_rate: Peak learning rate.
        min_lr: Floor reached at the end of the decay.
        decay_lr: When False the schedule is bypassed and ``learning_rate`` is returned.

    Returns:
        float: The learning rate for ``iter_num``.
    """
    if decay_lr:
        # Linear warmup.
        if iter_num < warmup_iters:
            return learning_rate * iter_num / warmup_iters

        # Past the end of the decay: hold the minimum rate.
        if iter_num >= lr_decay_iters:
            return min_lr

        # Cosine decay between warmup_iters and lr_decay_iters.
        progress = (iter_num - warmup_iters) / (lr_decay_iters - warmup_iters)
        progress = max(0.0, min(1.0, progress))
        cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
        return min_lr + cosine_weight * (learning_rate - min_lr)

    return learning_rate

def get_ttt_lr_mult(iter_num, lr_decay_iters):
    """Multiplier schedule for the TTT learning rate.

    The original note says this is meant only for the preco_nogain and ttt models.

    Phases:
        * ``iter_num < 500``: cosine warmup from 0.1 to 1.0.
        * ``500 <= iter_num < 3000``: constant 1.0.
        * ``iter_num >= 3000``: cosine decay from 1.0 to 0.3, ending at
          ``lr_decay_iters``.

    Fix over the original: when ``lr_decay_iters <= 3000`` the decay span is
    empty or negative. The original then divided by zero
    (``lr_decay_iters == 3000``) or produced a negative, unclamped ratio that
    made the multiplier oscillate. The decay is now treated as already finished
    and the floor of 0.3 is returned. Schedules with ``lr_decay_iters > 3000``
    are unchanged.

    Args:
        iter_num: Current iteration.
        lr_decay_iters: Iteration at which the decay reaches its floor.

    Returns:
        float: Multiplier in ``[0.1, 1.0]`` for non-negative ``iter_num``.
    """
    base_mult = 1.0
    min_ttt_mult = 0.3
    warmup_steps = 500
    decay_start = 3000

    if iter_num < warmup_steps:
        # Cosine warmup: 0.1 -> 1.0
        progress = iter_num / warmup_steps
        return 0.1 + 0.9 * (1.0 - math.cos(math.pi * progress)) / 2.0

    if iter_num < decay_start:
        return base_mult

    decay_span = lr_decay_iters - decay_start
    if decay_span <= 0:
        # Nothing left to decay over: stay at the floor.
        return min_ttt_mult

    # Cosine decay: 1.0 -> 0.3
    decay_ratio = (iter_num - decay_start) / decay_span
    decay_ratio = max(0.0, min(decay_ratio, 1.0))
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_ttt_mult + coeff * (base_mult - min_ttt_mult)

### 修正後的評估函數（區分 total_loss 與 ce_loss）

In [9]:
def create_estimate_loss_fn_corrected(model, model_name, get_batch, eval_iters, device_type):
    """Build an evaluation closure that keeps the reported loss and the perplexity loss apart.

    The reported ``loss`` is the mean total loss; ``ppl`` is ``calculate_ppl`` of
    the mean cross-entropy loss. For "ttt" and every other non-"preco" model the
    single loss returned by the model serves as both.

    Changes over the original:
        * Autocast is disabled through ``torch.amp.autocast`` with the caller's
          ``device_type`` (previously unused) instead of the deprecated,
          CUDA-only ``torch.cuda.amp.autocast``.
        * ``model.train()`` is called in a ``finally`` block, so an exception or
          a notebook interrupt during evaluation no longer leaves the model in
          eval mode.
        * ``dropped_batches`` is reported for every model, matching
          ``create_estimate_loss_fn``.

    Args:
        model: Model under evaluation. Called as ``model(input_ids=x, labels=y)``
            for "ttt", ``model(x, y, compute_branch_loss=False)`` for "preco",
            and ``model(x, y)`` otherwise.
        model_name: Selects the calling convention above.
        get_batch: Callable ``get_batch(split) -> (x, y)``.
        eval_iters: Number of batches evaluated per split.
        device_type: Device type string handed to ``torch.amp.autocast``.

    Returns:
        callable: ``estimate_loss(iter_num=0)`` returning
        ``{'train': {...}, 'val': {...}}``. Each entry holds ``loss``, ``ppl``
        and ``dropped_batches``; for "preco" it also holds ``ce_loss``,
        ``longhorn_loss``, ``ttt_loss``, ``kalman_mean`` and ``kalman_std``.
        ``iter_num`` is accepted but not used.
    """
    is_preco = model_name == "preco"
    # PreCo statistics and the value reported when every batch of a split was dropped.
    preco_stat_defaults = {
        'longhorn_loss': float('inf'),
        'ttt_loss': float('inf'),
        'kalman_mean': 0.0,
        'kalman_std': 0.0,
    }

    def _as_float(value):
        """Convert a tensor-like (anything with ``.item()``) or a plain number to float."""
        return float(value.item()) if hasattr(value, 'item') else float(value)

    def _mean(values, empty_value):
        """Arithmetic mean of ``values``, or ``empty_value`` when the list is empty."""
        return float(sum(values) / len(values)) if len(values) > 0 else empty_value

    def _batch_losses(x, y):
        """Run one forward pass and return ``(total_loss, ce_loss, preco_stats_or_None)``."""
        if model_name == "ttt":
            outputs = model(input_ids=x, labels=y)
            loss = outputs[1] if isinstance(outputs, tuple) else outputs.loss
            loss_val = _as_float(loss)
            # TTT: the model loss is the cross-entropy loss.
            return loss_val, loss_val, None

        if is_preco:
            _, loss_dict = model(x, y, compute_branch_loss=False)
            stats = {key: _as_float(value) for key, value in loss_dict.items()}
            return (
                stats.get('total_loss', float('nan')),
                stats.get('ce_loss', float('nan')),
                stats,
            )

        # Other models (longhorn, ...): the model loss is the cross-entropy loss.
        _, loss = model(x, y)
        loss_val = _as_float(loss)
        return loss_val, loss_val, None

    @torch.no_grad()
    def estimate_loss(iter_num=0):
        out = {}
        model.eval()
        try:
            for split in ['train', 'val']:
                total_losses_list = []   # feeds the reported loss
                ce_losses_list = []      # feeds the perplexity
                preco_stats = {key: [] for key in preco_stat_defaults}
                bad_batches = 0

                for _ in range(eval_iters):
                    x, y = get_batch(split)

                    # Autocast is switched off, so the forward pass runs in the
                    # parameters' own dtype.
                    with torch.amp.autocast(device_type=device_type, enabled=False):
                        total_val, ce_val, stats = _batch_losses(x, y)

                    # Non-finite batches are counted and left out of every average.
                    if not (math.isfinite(total_val) and math.isfinite(ce_val)):
                        bad_batches += 1
                        continue

                    total_losses_list.append(total_val)
                    ce_losses_list.append(ce_val)
                    if stats is not None:
                        for key in preco_stats:
                            preco_stats[key].append(float(stats.get(key, 0.0)))

                mean_total_loss = _mean(total_losses_list, float('inf'))
                mean_ce_loss = _mean(ce_losses_list, float('inf'))

                result = {
                    'loss': mean_total_loss,               # reported loss: total_loss
                    'ppl': calculate_ppl(mean_ce_loss),    # perplexity: exp(ce_loss)
                    'dropped_batches': bad_batches,
                }

                if is_preco:
                    result['ce_loss'] = mean_ce_loss
                    for key, empty_value in preco_stat_defaults.items():
                        result[key] = _mean(preco_stats[key], empty_value)

                out[split] = result
        finally:
            # Always hand the model back in training mode.
            model.train()
        return out

    return estimate_loss

print("  - loss 報告: total_loss")
print("  - ppl 計算: exp(ce_loss)")

  - loss 報告: total_loss
  - ppl 計算: exp(ce_loss)


### 修正說明和使用示例


In [10]:
# # 📊 修正說明：Loss vs Perplexity 的正確計算

# print("🔧 問題分析:")
# print("  原始代碼問題：")
# print("    - losses_list 存儲的是 total_loss (可能包含 longhorn/ttt loss)")
# print("    - 但用 total_loss 計算 perplexity 是錯誤的")
# print("    - perplexity 應該基於 ce_loss 計算")
# print()

# print("✅ 修正方案:")
# print("  明確區分兩個用途：")
# print("    - loss 報告: total_loss (完整的訓練損失)")
# print("    - ppl 計算: exp(ce_loss) (語言模型的困惑度)")
# print()

# print("📈 不同模型的處理:")
# print("  TTT/Longhorn 模型:")
# print("    - loss = ce_loss (單純的交叉熵)")
# print("    - ppl = exp(ce_loss)")
# print()
# print("  PreCo 模型:")
# print("    - loss = total_loss (可能包含分支損失)")
# print("    - ppl = exp(ce_loss) (僅基於交叉熵)")
# print("    - 額外報告: ce_loss, longhorn_loss, ttt_loss")
# print()

# # 使用示例
# print("🚀 使用修正後的評估函數:")
# print("# 替換原來的評估函數")
# print("estimate_loss_corrected = create_estimate_loss_fn_corrected(model, model_name, get_batch, eval_iters, device_type)")
# print()
# print("# 評估結果示例 (PreCo 模型):")
# print("losses = estimate_loss_corrected()")
# print("print(f'Loss: {losses[\"train\"][\"loss\"]:.4f}')      # total_loss")
# print("print(f'PPL: {losses[\"train\"][\"ppl\"]:.2f}')        # exp(ce_loss)")
# print("print(f'CE Loss: {losses[\"train\"][\"ce_loss\"]:.4f}') # ce_loss")


In [11]:
# 🚀 修正後的模型創建 - 與實際訓練參數一致

from models.longhorn import LonghornLM, LonghornConfig
from models.ttt import TTTLinear, TTTConfig, TTTForCausalLM
from models.PreCo import PreCoNewConfig, PreCoNewModel

# ========================================
# 1. Longhorn model (noted as matching run.sh and train.py)
# ========================================
print("🚀 創建 Longhorn 模型 - 實際訓練配置")

# SSM block hyper-parameters (values noted as the actual train.py configuration).
longhorn_ssm_cfg = dict(
    d_state=16,  # SSM state dimension
    d_conv=4,    # convolution kernel size
    expand=4,    # inner-dimension expansion factor (original note: 4×768=3072)
)

longhorn_config = LonghornConfig(
    vocab_size=50257,   # vocabulary size
    d_model=768,        # model width (run.sh: --n_embd=768)
    n_layer=12,         # number of layers (run.sh: --n_layer=12)
    ssm_cfg=longhorn_ssm_cfg,
    # Remaining options are noted as matching train.py.
    rms_norm=True,
    residual_in_fp32=True,
    fused_add_norm=True,
    tie_embeddings=True,
)
Longhorn_model = LonghornLM(longhorn_config)

longhorn_param_count = sum(param.numel() for param in Longhorn_model.parameters())
print(f"Longhorn 模型參數量: {longhorn_param_count/1e6:.1f}M")

# ========================================
# 2. TTT model (noted as matching run.sh)
# ========================================
print("🚀 創建 TTT 模型 - 125M 配置")

# Backbone dimensions.
ttt_backbone_kwargs = dict(
    vocab_size=50257,                # vocabulary size
    hidden_size=768,                 # hidden width (run.sh: --n_embd=768)
    intermediate_size=2048,          # MLP inner width (original note: 768*2.67≈2048)
    num_hidden_layers=12,            # number of layers (run.sh: --n_layer=12)
    num_attention_heads=12,          # number of heads (run.sh: --n_head=12)
    max_position_embeddings=2048,    # maximum positions (run.sh: --block_size=2048)
)

# TTT-specific options (noted as matching run.sh).
ttt_layer_kwargs = dict(
    ttt_base_lr=1.0,                 # TTT base learning rate
    mini_batch_size=16,              # mini-batch size
    use_gate=False,                  # gating (run.sh: --use_gate=False)
    share_qk=False,                  # shared Q/K (run.sh: --share_qk=False)
    ttt_layer_type="linear",         # TTT layer type
    pre_conv=True,                   # pre-convolution (run.sh: --pre_conv=True)
    conv_kernel=4,                   # convolution kernel size (run.sh: --conv_kernel=4)
    scan_checkpoint_group_size=0,    # original note: gradient checkpointing
)

# Standard options.
ttt_common_kwargs = dict(
    dropout=0.0,                     # dropout (run.sh: --dropout=0.0)
    pad_token_id=0,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=True,
)

ttt_config = TTTConfig(**ttt_backbone_kwargs, **ttt_layer_kwargs, **ttt_common_kwargs)
TTT_model = TTTForCausalLM(ttt_config)

ttt_param_count = sum(param.numel() for param in TTT_model.parameters())
print(f"TTT 模型參數量: {ttt_param_count/1e6:.1f}M")

# ========================================
# 3. PreCo model (noted as matching run.sh)
# ========================================
print("創建 PreCo 模型 - 127M 配置")

# Shared dimensions.
preco_core_kwargs = dict(
    vocab_size=50257,        # vocabulary size
    d_model=512,             # model width (run.sh: --longhorn_d_model=512)
    n_layer=12,              # number of layers (run.sh: --longhorn_n_layer=12)
)

# Longhorn branch (noted as matching run.sh).
preco_longhorn_kwargs = dict(
    d_state=8,               # SSM state dimension (run.sh: --longhorn_d_state=8)
    d_conv=3,                # convolution kernel size (original note: fixed at 3)
    expand=6,                # SSM expansion factor (run.sh: --longhorn_ssm_expand=6)
)

# TTT branch (noted as matching run.sh).
preco_ttt_kwargs = dict(
    ttt_num_heads=8,         # TTT heads (run.sh: --ttt_num_heads=8)
    ttt_num_layers=1,        # TTT layers (run.sh: --ttt_num_layers=1)
    mini_batch_size=16,      # mini-batch size (run.sh: --mini_batch_size=16)
)

preco_config = PreCoNewConfig(
    **preco_core_kwargs,
    **preco_longhorn_kwargs,
    **preco_ttt_kwargs,
    dropout=0.1,             # dropout (run.sh: --dropout=0.1)
)
PreCo_model = PreCoNewModel(preco_config)


🚀 創建 Longhorn 模型 - 實際訓練配置
Longhorn 模型參數量: 128.5M
🚀 創建 TTT 模型 - 125M 配置
TTT 模型參數量: 124.4M
創建 PreCo 模型 - 127M 配置


In [12]:
# Print the TTT model's module hierarchy for inspection.
print(TTT_model)

TTTForCausalLM(
  (model): TTTModel(
    (embed_tokens): Embedding(50257, 768, padding_idx=0)
    (layers): ModuleList(
      (0-11): 12 x Block(
        (seq_modeling_block): TTTLinear(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): RotaryEmbedding()
          (post_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        )
        (mlp): SwiGluMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (conv): Conv(
          (norm): RMSNorm()
          (conv): Conv1d(768, 768, kernel_s

### 評估函數

In [13]:
def create_estimate_loss_fn(model, model_name, get_batch, eval_iters, device_type):
    """Build the loss-evaluation closure used by the training loop.

    For each of the 'train' and 'val' splits, ``eval_iters`` batches are run
    through the model with gradients disabled. Batches whose loss is not finite
    are counted in ``dropped_batches`` and excluded from every average.

    Changes over the original:
        * Autocast is disabled through ``torch.amp.autocast`` with the caller's
          ``device_type`` (previously unused) instead of the deprecated,
          CUDA-only ``torch.cuda.amp.autocast``.
        * ``model.train()`` is called in a ``finally`` block, so an exception or
          a notebook interrupt during evaluation no longer leaves the model in
          eval mode.

    Args:
        model: Model under evaluation. Called as ``model(input_ids=x, labels=y)``
            for "ttt", ``model(x, y, compute_branch_loss=False)`` for "preco",
            and ``model(x, y)`` otherwise.
        model_name: Selects the calling convention above.
        get_batch: Callable ``get_batch(split) -> (x, y)``.
        eval_iters: Number of batches evaluated per split.
        device_type: Device type string handed to ``torch.amp.autocast``.

    Returns:
        callable: ``estimate_loss(iter_num=0)`` returning
        ``{'train': {...}, 'val': {...}}`` with keys ``loss`` (mean total loss),
        ``ppl`` (``calculate_ppl`` of the mean cross-entropy loss) and
        ``dropped_batches``. For "preco" it also holds ``ce_loss``,
        ``longhorn_loss``, ``ttt_loss``, ``kalman_mean`` and ``kalman_std``.
        ``iter_num`` is accepted but not used.
    """
    preco_mode = model_name == "preco"
    # (statistic, value reported when no batch of the split survived)
    preco_extras = (
        ('longhorn_loss', float('inf')),
        ('ttt_loss', float('inf')),
        ('kalman_mean', 0.0),
        ('kalman_std', 0.0),
    )

    def _scalar(value):
        """Convert a tensor-like (anything with ``.item()``) or a plain number to float."""
        return float(value.item()) if hasattr(value, 'item') else float(value)

    def _average(values, empty_value):
        """Arithmetic mean of ``values``, or ``empty_value`` when the list is empty."""
        return float(sum(values) / len(values)) if len(values) > 0 else empty_value

    def _forward(x, y):
        """Run one batch and return ``(total_loss, ppl_loss, preco_stats_or_None)``."""
        if preco_mode:
            _, loss_dict = model(x, y, compute_branch_loss=False)
            stats = {key: _scalar(value) for key, value in loss_dict.items()}
            total_val = stats.get('total_loss', float('nan'))
            ce_val = stats.get('ce_loss', float('nan'))
            return total_val, ce_val, stats

        if model_name == "ttt":
            outputs = model(input_ids=x, labels=y)
            loss = outputs[1] if isinstance(outputs, tuple) else outputs.loss
        else:
            _, loss = model(x, y)
        loss_val = _scalar(loss)
        return loss_val, loss_val, None

    @torch.no_grad()
    def estimate_loss(iter_num=0):
        out = {}
        model.eval()
        try:
            for split in ['train', 'val']:
                losses_list = []
                ppl_losses_list = []
                extras = {key: [] for key, _ in preco_extras}
                bad_batches = 0

                for _ in range(eval_iters):
                    x, y = get_batch(split)

                    # Autocast is switched off, so the forward pass runs in the
                    # parameters' own dtype.
                    with torch.amp.autocast(device_type=device_type, enabled=False):
                        total_val, ppl_val, stats = _forward(x, y)

                    if not (math.isfinite(total_val) and math.isfinite(ppl_val)):
                        bad_batches += 1
                        continue

                    losses_list.append(total_val)
                    ppl_losses_list.append(ppl_val)
                    if stats is not None:
                        for key in extras:
                            extras[key].append(float(stats.get(key, 0.0)))

                mean_loss = _average(losses_list, float('inf'))
                mean_ppl_loss = _average(ppl_losses_list, float('inf'))

                result = {
                    'loss': mean_loss,
                    'ppl': calculate_ppl(mean_ppl_loss),
                    'dropped_batches': bad_batches,
                }

                if preco_mode:
                    # For PreCo the perplexity loss is the cross-entropy loss.
                    result['ce_loss'] = mean_ppl_loss
                    for key, empty_value in preco_extras:
                        result[key] = _average(extras[key], empty_value)

                out[split] = result
        finally:
            # Always hand the model back in training mode.
            model.train()
        return out

    return estimate_loss

### 主訓練函數

In [14]:
# 🚀 訓練循環函數

def train_loop(model, optimizer, scaler, ctx, estimate_loss, tokens_per_iter, use_wandb=False,
               learning_rate=1e-3, max_iters=9600, warmup_iters=960, lr_decay_iters=None,
               min_lr=1e-5, grad_clip=1.0):
    """Main training loop with gradient accumulation, LR scheduling and periodic evaluation.

    Besides its arguments the loop reads these notebook globals: ``get_batch``,
    ``get_lr``, ``model_name``, ``eval_interval``, ``log_interval`` and
    ``gradient_accumulation_steps``.

    Changes over the original:
        * The schedule and clipping hyper-parameters, previously hard-coded in
          the body, are keyword arguments with the same default values.
        * ``wandb`` is looked up at call time. Its import is commented out at
          the top of the notebook, so ``use_wandb=True`` used to raise
          ``NameError``; logging is now skipped with a message when the module
          is not available.

    Notes:
        * Iterations run for ``iter_num`` in ``0..max_iters`` inclusive.
        * Only the best validation loss is tracked; no checkpoint is written.

    Args:
        model: Model to train; called according to ``model_name``.
        optimizer: Optimizer whose param-group ``lr`` is overwritten each iteration.
        scaler: Gradient scaler used for backward, unscale, step and update.
        ctx: Context manager wrapped around each forward pass.
        estimate_loss: Callable returning
            ``{'train': {...}, 'val': {...}}`` with at least ``loss`` and ``ppl``.
        tokens_per_iter: Tokens processed per iteration; only printed.
        use_wandb: Log evaluation metrics to wandb when the module is available.
        learning_rate: Peak learning rate.
        max_iters: Last iteration index.
        warmup_iters: Linear warmup length.
        lr_decay_iters: End of the cosine decay; defaults to ``max_iters``.
        min_lr: Minimum learning rate.
        grad_clip: Gradient-norm clip value; ``0.0`` disables clipping.

    Returns:
        tuple: ``(model, best_val_loss)``.
    """
    if lr_decay_iters is None:
        lr_decay_iters = max_iters

    # Resolve wandb lazily instead of assuming the name exists in the notebook.
    wandb_module = globals().get('wandb') if use_wandb else None
    if use_wandb and wandb_module is None:
        print("wandb 未導入，跳過在線記錄")
        use_wandb = False

    # Loop bookkeeping
    iter_num = 0
    best_val_loss = 1e9
    t0 = time.time()

    # Fetch the first batch
    x, y = get_batch('train')

    print("開始訓練循環...")
    print(f"總迭代次數: {max_iters}")
    print(f"評估間隔: {eval_interval}")
    print(f"每次迭代處理 tokens: {tokens_per_iter:,}")

    # ========================================
    # Main training loop
    # ========================================
    while True:

        # ========================================
        # 1. Set the learning rate
        # ========================================
        lr = get_lr(iter_num, warmup_iters, lr_decay_iters, learning_rate, min_lr, decay_lr=True)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # ========================================
        # 2. Evaluation and best-loss tracking
        # ========================================
        if iter_num % eval_interval == 0:
            losses = estimate_loss(iter_num)
            train_loss = losses['train']['loss']
            val_loss = losses['val']['loss']

            print(f"step {iter_num}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
            print(f"train ppl {losses['train']['ppl']:.2f}, val ppl {losses['val']['ppl']:.2f}")

            # Train/val gap monitoring
            gap = val_loss - train_loss
            gap_ratio = gap / train_loss if train_loss > 0 else 0
            print(f"📈 Train/Val Gap: {gap:.4f} ({gap_ratio*100:.2f}%)")

            # PreCo detail statistics
            if model_name == "preco":
                print(f"  📊 PRECO 詳細統計:")
                print(f"    CE Loss: {losses['train']['ce_loss']:.4f} / {losses['val']['ce_loss']:.4f}")
                print(f"    Kalman 統計: mean={losses['train']['kalman_mean']:.4f}, std={losses['train']['kalman_std']:.4f}")

            # wandb logging
            if use_wandb:
                log_data = {
                    "train/loss": train_loss, "val/loss": val_loss,
                    "train/ppl": losses['train']['ppl'], "val/ppl": losses['val']['ppl'],
                    "learning_rate": lr, "iter": iter_num,
                }

                if model_name == "preco":
                    log_data.update({
                        "preco/adaptive_ttt_lr": "native",
                        "preco/kalman_mean": losses['train']['kalman_mean'],
                        "preco/kalman_std": losses['train']['kalman_std'],
                    })

                wandb_module.log(log_data)

            # Track the best validation loss (nothing is saved to disk here)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                print(f"新的最佳驗證損失: {best_val_loss:.4f}")

        # ========================================
        # 3. Forward and backward passes
        # ========================================
        for micro_step in range(gradient_accumulation_steps):
            with ctx:
                if model_name == "ttt":
                    outputs = model(input_ids=x, labels=y)
                    if isinstance(outputs, tuple):
                        logits, loss = outputs[0], outputs[1]
                    else:
                        logits, loss = outputs.logits, outputs.loss
                    loss = loss / gradient_accumulation_steps

                elif model_name == "preco":
                    logits, loss_dict = model(x, y, compute_branch_loss=False)
                    loss = loss_dict['total_loss'] / gradient_accumulation_steps

                else:
                    logits, loss = model(x, y)
                    loss = loss / gradient_accumulation_steps

            # Fetch the next batch before backward (described in the original
            # as an asynchronous prefetch)
            x, y = get_batch('train')
            scaler.scale(loss).backward()

        # ========================================
        # 4. Gradient clipping and optimizer step
        # ========================================
        if grad_clip != 0.0:
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        # ========================================
        # 5. Logging
        # ========================================
        t1 = time.time()
        dt = t1 - t0
        t0 = t1

        if iter_num % (log_interval * 2) == 0:
            # `loss` is the last micro-step's loss, already divided by the
            # accumulation steps; multiply back for the per-batch value.
            lossf = loss.item() * gradient_accumulation_steps
            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")

            if model_name == "preco":
                print(f"TTT 自適應學習率: 原生機制")

        iter_num += 1

        # ========================================
        # 6. Termination
        # ========================================
        if iter_num > max_iters:
            break

    # Cleanup
    if use_wandb:
        wandb_module.finish()

    print("🎉 訓練完成！")
    return model, best_val_loss

print("訓練循環函數已定義")


訓練循環函數已定義


### 開始訓練

In [15]:
# ========================================
# 1. Basic configuration
# ========================================

# Training hyper-parameters (originally noted as matching run.sh)
learning_rate = 1e-3        # NOTE(review): the original note cites run.sh lr=2.5e-3, which differs from this value — confirm
weight_decay = 0.1          # run.sh: wd=0.1
max_iters = 9600            # run.sh: max_iters=9600
warmup_iters = 960          # run.sh: warmup=960
lr_decay_iters = max_iters  # run.sh: --lr_decay_iters=$max_iters
min_lr = 1e-5               # minimum learning rate
beta1, beta2 = 0.9, 0.95    # Adam betas
grad_clip = 1.0             # gradient-norm clip value

# Device and precision
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device_type = 'cuda' if 'cuda' in device else 'cpu'
if device_type == 'cuda':
    dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16'
else:
    # Without CUDA the autocast context below is a no-op. The original fell back
    # to 'float16' here, which made the later `model.to(ptdtype)` cast the TTT
    # model to half precision on CPU; stay in float32 instead.
    dtype = 'float32'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# print(f"🚀 訓練配置:")
# print(f"  - 設備: {device}")
# print(f"  - 精度: {dtype}")
# print(f"  - 學習率: {learning_rate}")
# print(f"  - 最大迭代: {max_iters}")
# print(f"  - 批次大小: {batch_size}")
# print(f"  - 序列長度: {block_size}")

# ========================================
# 2. Data loading
# ========================================
train_data, val_data, vocab_size = load_data(dataset_name=dataset, block_size=block_size)
get_batch = get_batch_fn(
    train_data=train_data,
    val_data=val_data,
    batch_size=batch_size,
    block_size=block_size,
    device_type=device_type,
    device=device,
)
# ========================================
# 3. Model selection and initialization
# ========================================
print(f"創建模型: {model_name}")

# Lazy lookups, so only the selected model's global name is evaluated.
model_lookup = {
    "longhorn": lambda: Longhorn_model,
    "ttt": lambda: TTT_model,
    "preco": lambda: PreCo_model,
}
if model_name not in model_lookup:
    raise ValueError(f"不支持的模型類型: {model_name}")
model = model_lookup[model_name]()

model.to(device)

# Only the TTT model is cast to the training dtype.
if model_name == "ttt":
    model = model.to(ptdtype)

# ========================================
# 4. Optimizer setup
# ========================================
param_dict = dict((name, param) for name, param in model.named_parameters() if param.requires_grad)

# Tensors with two or more dimensions get weight decay; 0-D/1-D tensors do not.
decay_params, nodecay_params = [], []
for param in param_dict.values():
    (decay_params if param.dim() >= 2 else nodecay_params).append(param)

optim_groups = [
    dict(params=decay_params, weight_decay=weight_decay),
    dict(params=nodecay_params, weight_decay=0.0),
]

total_params = sum(param.numel() for param in decay_params + nodecay_params)
print(f"模型參數總量: {total_params/1e6:.1f}M")

# Use the fused AdamW kernel when this torch build exposes it and we are on CUDA.
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and device_type == 'cuda'
extra_args = {'fused': True} if use_fused else {}

optimizer = torch.optim.AdamW(
    optim_groups,
    lr=learning_rate,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
    **extra_args,
)
print(f"使用融合 AdamW: {use_fused}")

# Gradient scaler: active only when training in float16.
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# ========================================
# 5. Initialize wandb (optional)
# ========================================
# Run name: <dataset>_<total training tokens in billions>_block<block_size>_<model_name>
experiment_name = f"{dataset}_{block_size * batch_size * gradient_accumulation_steps * max_iters / 1e9:.1f}_block{block_size}_{model_name}"

wandb_config = {
    "model_name": model_name, "dataset": dataset, "block_size": block_size,
    "batch_size": batch_size, "gradient_accumulation_steps": gradient_accumulation_steps,
    "learning_rate": learning_rate, "max_iters": max_iters,
    "vocab_size": vocab_size, "total_params": total_params,
}

# Model-specific additions
if model_name == "preco":
    wandb_config.update({
        "longhorn_d_model": 512,
        "ttt_num_heads": 8,
        "training_objective": "end_to_end_cross_entropy",
    })

try:
    wandb.init(project="2048", name=experiment_name, config=wandb_config)
    use_wandb = True
    print("wandb 初始化成功")
except Exception as wandb_error:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate. The cause is printed because it was previously
    # hidden: with `import wandb` commented out at the top of the notebook, the
    # failure here is a NameError, not a connection problem.
    use_wandb = False
    print("wandb 初始化失敗，跳過在線記錄")
    print(f"  ({type(wandb_error).__name__}: {wandb_error})")

# ========================================
# 6. Build the evaluation function
# ========================================
estimate_loss = create_estimate_loss_fn(
    model=model,
    model_name=model_name,
    get_batch=get_batch,
    eval_iters=eval_iters,
    device_type=device_type,
)

# Tokens consumed by one optimizer step (all accumulated micro-batches).
tokens_per_iter = block_size * batch_size * gradient_accumulation_steps
print(f"每次迭代處理 tokens: {tokens_per_iter:,}")

print("初始化完成，準備開始訓練！")


使用 PyTorch 原生數據載入方式
 載入數據:
 Train tokens: 943,635,020
 Val tokens: 9,101,108
 序列長度: 2048
載入meta資訊: vocab_size=50257
創建模型: ttt
模型參數總量: 124.4M
使用融合 AdamW: True
wandb 初始化失敗，跳過在線記錄
每次迭代處理 tokens: 32,768
初始化完成，準備開始訓練！


In [16]:
# 🚀 執行訓練循環

# Loop bookkeeping
iter_num = 0
best_val_loss = 1e9
t0 = time.time()

# Fetch the first training batch before entering the loop
x, y = get_batch('train')

# "\n" rather than the escaped "\\n", so a blank line is printed instead of a
# literal backslash-n in front of the message.
print("\n🚀 開始訓練循環...")
print(f"總迭代次數: {max_iters}")
print(f"評估間隔: {eval_interval}")
print(f"每次迭代處理 tokens: {tokens_per_iter:,}")

# ========================================
# 主訓練循環
# ========================================
while True:
    
    # ========================================
    # 1. 設置學習率
    # ========================================
    lr = get_lr(iter_num, warmup_iters, lr_decay_iters, learning_rate, min_lr, decay_lr=True)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
    # ========================================
    # 2. 評估和檢查點保存
    # ========================================
    if iter_num % eval_interval == 0:
        losses = estimate_loss(iter_num)
        train_loss = losses['train']['loss']
        val_loss = losses['val']['loss']
        
        print(f"step {iter_num}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
        print(f"train ppl {losses['train']['ppl']:.2f}, val ppl {losses['val']['ppl']:.2f}")
        
        # Train/Val gap 監控
        gap = val_loss - train_loss
        gap_ratio = gap / train_loss if train_loss > 0 else 0
        print(f"📈 Train/Val Gap: {gap:.4f} ({gap_ratio*100:.2f}%)")
        
        # PreCo 詳細統計
        if model_name == "preco":
            print(f"PRECO 詳細統計:")
            print(f"CE Loss: {losses['train']['ce_loss']:.4f} / {losses['val']['ce_loss']:.4f}")
            print(f"Kalman 統計: mean={losses['train']['kalman_mean']:.4f}, std={losses['train']['kalman_std']:.4f}")
        
        # wandb 記錄
        if use_wandb:
            log_data = {
                "train/loss": train_loss, "val/loss": val_loss,
                "train/ppl": losses['train']['ppl'], "val/ppl": losses['val']['ppl'],
                "learning_rate": lr, "iter": iter_num,
            }
            
            if model_name == "preco":
                log_data.update({
                    "preco/adaptive_ttt_lr": "native",
                    "preco/kalman_mean": losses['train']['kalman_mean'],
                    "preco/kalman_std": losses['train']['kalman_std'],
                })
            
            wandb.log(log_data)
        
        # 保存最佳檢查點
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"✅ 新的最佳驗證損失: {best_val_loss:.4f}")
    
    # ========================================
    # 3. 前向和反向傳播
    # ========================================
    for micro_step in tqdm(range(gradient_accumulation_steps)):
        with ctx:
            if model_name == "ttt":
                outputs = model(input_ids=x, labels=y)
                if isinstance(outputs, tuple):
                    logits, loss = outputs[0], outputs[1]
                else:
                    logits, loss = outputs.logits, outputs.loss
                loss = loss / gradient_accumulation_steps
                
            elif model_name == "preco":
                logits, loss_dict = model(x, y, compute_branch_loss=False)
                loss = loss_dict['total_loss'] / gradient_accumulation_steps
                
            else:
                logits, loss = model(x, y)
                loss = loss / gradient_accumulation_steps
        
        # 異步預取下一個批次
        x, y = get_batch('train')
        scaler.scale(loss).backward()
    
    # ========================================
    # 4. 梯度裁剪和優化器步驟
    # ========================================
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
    
    # ========================================
    # 5. 日誌記錄
    # ========================================
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    
    if iter_num % (log_interval * 2) == 0:
        lossf = loss.item() * gradient_accumulation_steps
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
        
        if model_name == "preco":
            print(f"TTT 自適應學習率: 原生機制")
    
    iter_num += 1
    
    # ========================================
    # 6. 終止條件
    # ========================================
    if iter_num > max_iters:
        break

# 清理
if use_wandb:
    wandb.finish()

print("訓練完成！")
print(f"最佳驗證損失: {best_val_loss:.4f}")


\n🚀 開始訓練循環...
總迭代次數: 9600
評估間隔: 200
每次迭代處理 tokens: 32,768
step 0: train loss 10.9541, val loss 10.9543
train ppl 57189.97, val ppl 57201.75
📈 Train/Val Gap: 0.0002 (0.00%)
✅ 新的最佳驗證損失: 10.9543


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 0: loss 10.9652, time 108140.81ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.09s/it]
100%|██████████| 4/4 [00:16<00:00,  4.08s/it]
100%|██████████| 4/4 [00:16<00:00,  4.08s/it]
100%|██████████| 4/4 [00:16<00:00,  4.08s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.08s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 20: loss 10.6862, time 16099.57ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 40: loss 9.8822, time 16119.67ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 60: loss 9.2872, time 15864.54ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]


iter 80: loss 8.5172, time 16040.32ms


100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 100: loss 8.1979, time 15919.87ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 120: loss 7.8115, time 15901.73ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 140: loss 7.7183, time 16148.68ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 160: loss 7.7637, time 16156.72ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 180: loss 7.8173, time 16162.77ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


step 200: train loss 7.6533, val loss 7.7148
train ppl 2107.58, val ppl 2241.19
📈 Train/Val Gap: 0.0615 (0.80%)
✅ 新的最佳驗證損失: 7.7148


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 200: loss 7.5518, time 111601.34ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 220: loss 7.6178, time 15772.34ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 240: loss 7.3625, time 15754.16ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 260: loss 7.4465, time 15765.89ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 280: loss 7.2218, time 15769.90ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 300: loss 6.9354, time 15768.58ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 320: loss 7.4519, time 16099.07ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 340: loss 7.2291, time 15756.43ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 360: loss 7.4587, time 15766.05ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 380: loss 7.1775, time 15719.02ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


step 400: train loss 7.2521, val loss 7.2463
train ppl 1411.04, val ppl 1402.84
📈 Train/Val Gap: -0.0058 (-0.08%)
✅ 新的最佳驗證損失: 7.2463


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 400: loss 7.5290, time 110782.56ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 420: loss 7.2707, time 15716.54ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 440: loss 7.0560, time 15769.95ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 460: loss 7.3213, time 15769.31ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 480: loss 7.0003, time 15751.53ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 500: loss 6.6653, time 15768.01ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 520: loss 7.3619, time 15763.40ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 540: loss 7.1213, time 15767.63ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 560: loss 7.3305, time 15763.29ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 580: loss 7.0252, time 15764.56ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


step 600: train loss 6.9816, val loss 6.9723
train ppl 1076.60, val ppl 1066.72
📈 Train/Val Gap: -0.0092 (-0.13%)
✅ 新的最佳驗證損失: 6.9723


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 600: loss 7.1145, time 111357.42ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 620: loss 6.5837, time 15760.43ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 640: loss 6.8013, time 15765.74ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 660: loss 6.8658, time 16110.18ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 680: loss 6.8905, time 16096.72ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 700: loss 7.0616, time 16111.97ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 720: loss 6.8853, time 16120.23ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 740: loss 7.2581, time 16155.39ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 760: loss 7.0125, time 16153.81ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 780: loss 6.8102, time 16151.72ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


step 800: train loss 6.8392, val loss 6.8415
train ppl 933.70, val ppl 935.94
📈 Train/Val Gap: 0.0024 (0.03%)
✅ 新的最佳驗證損失: 6.8415


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 800: loss 7.0394, time 111475.61ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 820: loss 6.6769, time 16110.59ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 840: loss 6.5802, time 15659.52ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 860: loss 6.7221, time 15700.27ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 880: loss 6.5069, time 15675.07ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 900: loss 6.9779, time 15689.26ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 920: loss 6.5570, time 15682.43ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 940: loss 6.6994, time 15679.36ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 960: loss 6.7365, time 15696.66ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 980: loss 6.4125, time 15684.29ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


step 1000: train loss 6.7078, val loss 6.6995
train ppl 818.75, val ppl 812.01
📈 Train/Val Gap: -0.0083 (-0.12%)
✅ 新的最佳驗證損失: 6.6995


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1000: loss 6.1864, time 103002.85ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1020: loss 6.2809, time 15672.89ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1040: loss 6.8519, time 15673.73ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1060: loss 6.4986, time 15672.77ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 1080: loss 6.7525, time 15691.47ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 1100: loss 6.4358, time 15694.15ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1120: loss 6.4127, time 15675.53ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1140: loss 6.7191, time 15672.38ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 1160: loss 6.6972, time 15672.50ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 1180: loss 6.5424, time 15698.58ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


step 1200: train loss 6.5716, val loss 6.6152
train ppl 714.51, val ppl 746.32
📈 Train/Val Gap: 0.0436 (0.66%)
✅ 新的最佳驗證損失: 6.6152


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 1200: loss 6.6385, time 102821.30ms


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 1220: loss 6.8314, time 15284.03ms


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 1240: loss 6.3912, time 15770.48ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1260: loss 6.6049, time 15764.21ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1280: loss 6.7771, time 15758.11ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 1300: loss 6.2479, time 15767.84ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 1320: loss 6.7328, time 15277.70ms


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 1340: loss 6.8734, time 15773.79ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1360: loss 6.4185, time 15748.08ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1380: loss 6.1745, time 15760.09ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


step 1400: train loss 6.5118, val loss 6.5025
train ppl 673.00, val ppl 666.81
📈 Train/Val Gap: -0.0092 (-0.14%)
✅ 新的最佳驗證損失: 6.5025


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1400: loss 6.3333, time 111369.09ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 1420: loss 6.6158, time 16146.95ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 1440: loss 6.5986, time 16136.27ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 1460: loss 6.3862, time 16161.88ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]


iter 1480: loss 6.3904, time 16179.61ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 1500: loss 6.6508, time 16156.78ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 1520: loss 6.4057, time 16161.46ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1540: loss 6.4570, time 15895.48ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1560: loss 6.6470, time 15890.49ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1580: loss 6.2627, time 15898.04ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


step 1600: train loss 6.3806, val loss 6.4371
train ppl 590.28, val ppl 624.57
📈 Train/Val Gap: 0.0565 (0.88%)
✅ 新的最佳驗證損失: 6.4371


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1600: loss 6.0148, time 111488.31ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1620: loss 6.7012, time 15917.96ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1640: loss 6.5715, time 15893.27ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1660: loss 6.2890, time 15899.15ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1680: loss 6.3485, time 15907.32ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1700: loss 6.4042, time 15905.48ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1720: loss 6.4019, time 15903.20ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 1740: loss 5.9140, time 15885.18ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1760: loss 6.6776, time 15910.41ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1780: loss 6.5664, time 15897.98ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


step 1800: train loss 6.3767, val loss 6.3563
train ppl 588.01, val ppl 576.10
📈 Train/Val Gap: -0.0205 (-0.32%)
✅ 新的最佳驗證損失: 6.3563


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 1800: loss 6.3617, time 111588.51ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1820: loss 6.2642, time 15760.86ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1840: loss 6.6279, time 15762.74ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 1860: loss 6.4486, time 15774.01ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 1880: loss 6.3205, time 15778.68ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 1900: loss 6.3159, time 15766.20ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1920: loss 6.4698, time 15763.67ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 1940: loss 6.2408, time 15808.71ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 1960: loss 6.5753, time 15758.11ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 1980: loss 6.2177, time 15713.43ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


step 2000: train loss 6.2972, val loss 6.2942
train ppl 543.07, val ppl 541.43
📈 Train/Val Gap: -0.0030 (-0.05%)
✅ 新的最佳驗證損失: 6.2942


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 2000: loss 6.9975, time 111301.48ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 2020: loss 6.4140, time 15761.22ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2040: loss 5.7661, time 15706.90ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 2060: loss 6.6033, time 15782.08ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 2080: loss 6.5736, time 15769.84ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 2100: loss 6.2598, time 15754.88ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2120: loss 6.5209, time 15914.46ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2140: loss 6.4913, time 15908.34ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2160: loss 6.4890, time 15858.25ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2180: loss 6.6929, time 15865.89ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


step 2200: train loss 6.2046, val loss 6.2727
train ppl 495.03, val ppl 529.92
📈 Train/Val Gap: 0.0681 (1.10%)
✅ 新的最佳驗證損失: 6.2727


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 2200: loss 6.2327, time 111132.24ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2220: loss 6.1467, time 15856.57ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2240: loss 6.0063, time 15851.48ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 2260: loss 5.9096, time 15836.14ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2280: loss 6.2416, time 15882.88ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2300: loss 6.1128, time 15902.83ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2320: loss 6.5131, time 15895.71ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2340: loss 6.4275, time 15898.78ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2360: loss 5.9667, time 15911.24ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2380: loss 5.8666, time 15904.96ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


step 2400: train loss 6.2577, val loss 6.2334
train ppl 522.02, val ppl 509.50
📈 Train/Val Gap: -0.0243 (-0.39%)
✅ 新的最佳驗證損失: 6.2334


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2400: loss 6.2147, time 111338.97ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2420: loss 6.1322, time 15911.51ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2440: loss 6.1150, time 15882.14ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2460: loss 6.2065, time 15888.29ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2480: loss 6.1018, time 15891.51ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2500: loss 5.6498, time 15893.98ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2520: loss 6.2781, time 15916.60ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2540: loss 6.3228, time 15891.10ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 2560: loss 6.1556, time 15928.31ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 2580: loss 6.5414, time 15936.43ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


step 2600: train loss 6.1243, val loss 6.1294
train ppl 456.84, val ppl 459.16
📈 Train/Val Gap: 0.0051 (0.08%)
✅ 新的最佳驗證損失: 6.1294


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2600: loss 6.0919, time 111398.10ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 2620: loss 6.3552, time 15901.15ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 2640: loss 6.3670, time 15584.53ms


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 2660: loss 6.3134, time 15569.08ms


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]


iter 2680: loss 5.8030, time 15561.04ms


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 2700: loss 6.1043, time 15575.95ms


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 2720: loss 6.6389, time 15848.86ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 2740: loss 6.1456, time 15838.90ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2760: loss 6.0592, time 15703.21ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 2780: loss 6.3840, time 15726.11ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


step 2800: train loss 6.0965, val loss 6.1478
train ppl 444.30, val ppl 467.68
📈 Train/Val Gap: 0.0513 (0.84%)


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2800: loss 5.9921, time 110983.20ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2820: loss 5.6553, time 15711.39ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2840: loss 6.1797, time 15719.24ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2860: loss 6.2865, time 15706.10ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2880: loss 6.0737, time 15720.77ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 2900: loss 5.7284, time 15747.84ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 2920: loss 6.1116, time 15714.45ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 2940: loss 6.3310, time 15609.96ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 2960: loss 6.0348, time 15619.84ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 2980: loss 5.7535, time 15620.66ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


step 3000: train loss 6.1138, val loss 6.0605
train ppl 452.05, val ppl 428.61
📈 Train/Val Gap: -0.0532 (-0.87%)
✅ 新的最佳驗證損失: 6.0605


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 3000: loss 5.9563, time 106419.29ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 3020: loss 6.1065, time 15620.87ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3040: loss 6.3552, time 15652.59ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 3060: loss 6.0808, time 15636.42ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 3080: loss 6.3979, time 15485.93ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 3100: loss 6.5091, time 15475.96ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 3120: loss 6.1551, time 15487.88ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 3140: loss 5.8287, time 15461.16ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 3160: loss 5.9815, time 15482.53ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 3180: loss 5.9488, time 15496.40ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


step 3200: train loss 6.0558, val loss 6.0877
train ppl 426.60, val ppl 440.42
📈 Train/Val Gap: 0.0319 (0.53%)


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 3200: loss 5.5599, time 106338.00ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 3220: loss 5.8718, time 15626.06ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 3240: loss 6.0744, time 15505.30ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 3260: loss 5.9520, time 15523.93ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3280: loss 6.2099, time 15678.41ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3300: loss 5.9882, time 15671.67ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 3320: loss 6.1623, time 15566.70ms


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]


iter 3340: loss 5.9925, time 15552.37ms


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3360: loss 5.6256, time 15812.52ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3380: loss 6.1372, time 15818.16ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


step 3400: train loss 5.9590, val loss 6.1039
train ppl 387.22, val ppl 447.58
📈 Train/Val Gap: 0.1449 (2.43%)


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3400: loss 6.2265, time 103086.18ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3420: loss 6.2257, time 15803.34ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3440: loss 5.6932, time 15836.89ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3460: loss 6.1359, time 15822.10ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3480: loss 5.9700, time 15824.28ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3500: loss 6.1355, time 15801.71ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3520: loss 6.3472, time 15836.80ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3540: loss 6.1553, time 15798.46ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3560: loss 5.8825, time 15820.44ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3580: loss 6.2156, time 15791.89ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


step 3600: train loss 5.9617, val loss 6.0428
train ppl 388.25, val ppl 421.09
📈 Train/Val Gap: 0.0812 (1.36%)
✅ 新的最佳驗證損失: 6.0428


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 3600: loss 6.1778, time 102531.98ms


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3620: loss 6.1810, time 15671.98ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3640: loss 5.3467, time 15668.55ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3660: loss 5.5373, time 15665.58ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3680: loss 6.0197, time 15664.84ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 3700: loss 5.5654, time 15664.81ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 3720: loss 6.1644, time 15756.84ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3740: loss 6.0262, time 15775.18ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 3760: loss 5.6009, time 15751.47ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 3780: loss 6.1376, time 15757.86ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


step 3800: train loss 5.9372, val loss 6.0073
train ppl 378.87, val ppl 406.40
📈 Train/Val Gap: 0.0701 (1.18%)
✅ 新的最佳驗證損失: 6.0073


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 3800: loss 6.0470, time 111615.09ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 3820: loss 5.9435, time 15895.73ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 3840: loss 6.1363, time 15917.12ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 3860: loss 5.7236, time 15909.36ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 3880: loss 5.7700, time 15902.37ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 3900: loss 6.1048, time 15928.35ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3920: loss 6.0159, time 15790.79ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3940: loss 5.6257, time 15805.05ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 3960: loss 6.0917, time 15797.97ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 3980: loss 6.0447, time 15815.32ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


step 4000: train loss 5.9643, val loss 6.0411
train ppl 389.27, val ppl 420.34
📈 Train/Val Gap: 0.0768 (1.29%)


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4000: loss 5.9948, time 109791.48ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4020: loss 5.9998, time 15797.61ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 4040: loss 6.3492, time 15903.13ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 4060: loss 6.0030, time 15928.80ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 4080: loss 5.2970, time 15917.60ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 4100: loss 5.4980, time 15902.94ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 4120: loss 6.2163, time 15668.45ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 4140: loss 6.0015, time 15682.46ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4160: loss 5.4546, time 15794.83ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4180: loss 6.0139, time 15771.49ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


step 4200: train loss 5.9119, val loss 5.9696
train ppl 369.42, val ppl 391.36
📈 Train/Val Gap: 0.0577 (0.98%)
✅ 新的最佳驗證損失: 5.9696


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 4200: loss 5.7219, time 111462.29ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 4220: loss 6.0915, time 15739.82ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 4240: loss 6.1990, time 15762.11ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4260: loss 6.1937, time 15772.50ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4280: loss 5.9830, time 15785.60ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4300: loss 6.0023, time 15766.47ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 4320: loss 6.2153, time 15754.97ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4340: loss 5.7905, time 15768.05ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 4360: loss 5.8381, time 15752.71ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 4380: loss 6.2888, time 15892.45ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


step 4400: train loss 5.8115, val loss 5.8644
train ppl 334.13, val ppl 352.25
📈 Train/Val Gap: 0.0528 (0.91%)
✅ 新的最佳驗證損失: 5.8644


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 4400: loss 5.7271, time 111303.05ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 4420: loss 6.1511, time 15905.77ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4440: loss 6.1942, time 15813.52ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4460: loss 5.7366, time 15799.91ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 4480: loss 6.0591, time 15804.27ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4500: loss 6.1439, time 15812.46ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4520: loss 5.9318, time 15823.07ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4540: loss 6.2822, time 15817.53ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4560: loss 5.4870, time 15807.63ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4580: loss 6.0140, time 15829.81ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


step 4600: train loss 5.8441, val loss 5.8469
train ppl 345.19, val ppl 346.16
📈 Train/Val Gap: 0.0028 (0.05%)
✅ 新的最佳驗證損失: 5.8469


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4600: loss 5.9412, time 111094.05ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 4620: loss 5.1430, time 15843.43ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 4640: loss 5.7681, time 15848.80ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 4660: loss 5.8778, time 15611.61ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 4680: loss 6.0894, time 15626.74ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 4700: loss 5.7167, time 15644.64ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 4720: loss 5.5966, time 15621.73ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 4740: loss 6.1155, time 15617.79ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 4760: loss 5.7952, time 15616.48ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 4780: loss 5.7280, time 15623.22ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


step 4800: train loss 5.8318, val loss 5.8646
train ppl 340.99, val ppl 352.35
📈 Train/Val Gap: 0.0328 (0.56%)


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 4800: loss 6.3957, time 106370.99ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 4820: loss 5.7878, time 15478.43ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 4840: loss 5.6361, time 15492.75ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 4860: loss 5.6085, time 15484.45ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 4880: loss 5.8004, time 15484.09ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 4900: loss 5.7110, time 15486.84ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 4920: loss 6.0577, time 15467.08ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 4940: loss 5.9572, time 15476.15ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 4960: loss 6.0868, time 15471.84ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 4980: loss 5.6704, time 15513.86ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


step 5000: train loss 5.7512, val loss 5.7315
train ppl 314.56, val ppl 308.44
📈 Train/Val Gap: -0.0196 (-0.34%)
✅ 新的最佳驗證損失: 5.7315


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5000: loss 5.0139, time 106275.65ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 5020: loss 5.8510, time 15491.95ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5040: loss 5.9947, time 15471.66ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5060: loss 5.5010, time 15471.80ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5080: loss 5.8796, time 15442.98ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5100: loss 6.0343, time 15458.58ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]


iter 5120: loss 5.0457, time 15158.55ms


100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5140: loss 5.0893, time 15428.02ms


100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5160: loss 5.6403, time 15433.81ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5180: loss 5.9327, time 15434.67ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


step 5200: train loss 5.8563, val loss 5.7643
train ppl 349.44, val ppl 318.71
📈 Train/Val Gap: -0.0920 (-1.57%)


100%|██████████| 4/4 [00:15<00:00,  3.84s/it]


iter 5200: loss 6.1779, time 102877.07ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 5220: loss 5.9411, time 15886.01ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 5240: loss 5.8985, time 15934.99ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 5260: loss 5.8752, time 15904.39ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 5280: loss 5.0747, time 15913.94ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]


iter 5300: loss 6.0744, time 16025.40ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 5320: loss 5.9490, time 15895.56ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 5340: loss 5.9243, time 15882.93ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 5360: loss 5.6305, time 15931.16ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]


iter 5380: loss 5.9732, time 15906.67ms


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


step 5400: train loss 5.7287, val loss 5.7022
train ppl 307.58, val ppl 299.54
📈 Train/Val Gap: -0.0265 (-0.46%)
✅ 新的最佳驗證損失: 5.7022


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 5400: loss 5.9247, time 111090.72ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5420: loss 5.4072, time 15480.11ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5440: loss 6.0422, time 15431.83ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5460: loss 5.6115, time 15425.00ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5480: loss 5.7392, time 15425.03ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5500: loss 5.7925, time 15427.70ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5520: loss 5.0350, time 15474.65ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5540: loss 6.0897, time 15471.34ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5560: loss 5.6078, time 15470.69ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5580: loss 5.5209, time 15455.01ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


step 5600: train loss 5.7243, val loss 5.6876
train ppl 306.22, val ppl 295.18
📈 Train/Val Gap: -0.0367 (-0.64%)
✅ 新的最佳驗證損失: 5.6876


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5600: loss 6.0609, time 110831.49ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5620: loss 5.2107, time 15453.70ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5640: loss 5.7197, time 15461.15ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 5660: loss 5.8741, time 15464.67ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 5680: loss 5.4288, time 15498.27ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 5700: loss 6.0602, time 15522.05ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 5720: loss 6.2693, time 15520.18ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5740: loss 4.9162, time 15437.36ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 5760: loss 5.4620, time 15444.10ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]


iter 5780: loss 5.7832, time 15168.33ms


100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]


step 5800: train loss 5.7130, val loss 5.6707
train ppl 302.77, val ppl 290.24
📈 Train/Val Gap: -0.0423 (-0.74%)
✅ 新的最佳驗證損失: 5.6707


100%|██████████| 4/4 [00:14<00:00,  3.75s/it]


iter 5800: loss 4.9747, time 103700.74ms


100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.77s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]


iter 5820: loss 5.8321, time 15051.92ms


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.77s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]


iter 5840: loss 6.0211, time 15060.71ms


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.77s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]


iter 5860: loss 5.8233, time 15054.81ms


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]


iter 5880: loss 5.9719, time 15022.20ms


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]


iter 5900: loss 5.6944, time 15035.75ms


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]


iter 5920: loss 5.9085, time 15031.09ms


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
100%|██████████| 4/4 [00:15<00:00,  3.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.75s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 5940: loss 5.8341, time 15337.69ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 5960: loss 5.7072, time 15326.53ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 5980: loss 5.8288, time 15341.74ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:15<00:00,  3.77s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


step 6000: train loss 5.6614, val loss 5.7024
train ppl 287.56, val ppl 299.58
📈 Train/Val Gap: 0.0410 (0.72%)


100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 6000: loss 5.1006, time 102739.93ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]


iter 6020: loss 5.9979, time 15367.86ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 6040: loss 5.7310, time 15796.43ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 6060: loss 5.8667, time 15806.91ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 6080: loss 5.7481, time 15789.33ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]


iter 6100: loss 5.7764, time 16180.50ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6120: loss 6.1974, time 16146.07ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6140: loss 6.1029, time 16162.38ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6160: loss 5.7572, time 16138.71ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6180: loss 6.0238, time 16166.19ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


step 6200: train loss 5.6727, val loss 5.6852
train ppl 290.83, val ppl 294.47
📈 Train/Val Gap: 0.0124 (0.22%)


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6200: loss 5.7814, time 111792.80ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6220: loss 5.7528, time 16160.05ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6240: loss 5.7890, time 16156.54ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6260: loss 5.6703, time 16159.02ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6280: loss 5.7033, time 16150.24ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6300: loss 5.7651, time 16153.41ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6320: loss 5.7703, time 16165.60ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6340: loss 5.2085, time 16156.40ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6360: loss 5.2410, time 16151.09ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6380: loss 5.5699, time 16157.58ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


step 6400: train loss 5.6765, val loss 5.6356
train ppl 291.92, val ppl 280.22
📈 Train/Val Gap: -0.0409 (-0.72%)
✅ 新的最佳驗證損失: 5.6356


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6400: loss 5.6863, time 111653.71ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6420: loss 5.5966, time 16154.72ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 6440: loss 5.9702, time 15502.21ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6460: loss 5.4315, time 16125.31ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6480: loss 5.6507, time 16108.65ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6500: loss 5.7186, time 16104.86ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6520: loss 5.5514, time 16123.67ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6540: loss 5.9285, time 16124.33ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6560: loss 5.8792, time 16150.70ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6580: loss 5.6958, time 16157.51ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


step 6600: train loss 5.5060, val loss 5.6322
train ppl 246.17, val ppl 279.27
📈 Train/Val Gap: 0.1262 (2.29%)
✅ 新的最佳驗證損失: 5.6322


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]


iter 6600: loss 5.8858, time 110454.86ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6620: loss 5.9298, time 16105.06ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6640: loss 5.6967, time 16112.42ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6660: loss 5.6618, time 16108.60ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6680: loss 6.0206, time 16099.84ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6700: loss 4.9569, time 16139.68ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6720: loss 5.8378, time 16101.48ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 6740: loss 5.8509, time 16111.17ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6760: loss 5.4946, time 16151.13ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6780: loss 5.4645, time 16148.97ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


step 6800: train loss 5.6166, val loss 5.5586
train ppl 274.95, val ppl 259.45
📈 Train/Val Gap: -0.0580 (-1.03%)
✅ 新的最佳驗證損失: 5.5586


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6800: loss 5.6080, time 111953.26ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6820: loss 5.9202, time 16161.72ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6840: loss 6.0354, time 16136.99ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 6860: loss 6.0583, time 16164.95ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]


iter 6880: loss 5.7186, time 16165.38ms


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 6900: loss 5.5290, time 16269.36ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 6920: loss 5.1491, time 16278.86ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 6940: loss 5.2195, time 16268.26ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 6960: loss 5.1447, time 16283.50ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 6980: loss 5.8080, time 16258.96ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


step 7000: train loss 5.5023, val loss 5.6430
train ppl 245.26, val ppl 282.31
📈 Train/Val Gap: 0.1407 (2.56%)


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 7000: loss 5.8001, time 111976.89ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7020: loss 5.8631, time 16099.58ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 7040: loss 5.3764, time 16163.07ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7060: loss 5.6726, time 16087.25ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7080: loss 5.6957, time 16102.58ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7100: loss 5.8355, time 16109.42ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7120: loss 5.7826, time 16102.83ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7140: loss 5.3405, time 16107.01ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7160: loss 6.2135, time 15786.22ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7180: loss 5.1415, time 15767.11ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


step 7200: train loss 5.5761, val loss 5.5946
train ppl 264.04, val ppl 268.96
📈 Train/Val Gap: 0.0185 (0.33%)


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7200: loss 5.0281, time 105788.90ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7220: loss 5.2597, time 15768.60ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7240: loss 4.9599, time 15769.42ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7260: loss 5.5544, time 15799.59ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7280: loss 5.5821, time 15767.24ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7300: loss 5.5769, time 15769.57ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7320: loss 5.9896, time 15784.22ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7340: loss 5.7844, time 15777.00ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7360: loss 5.8601, time 15782.35ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7380: loss 5.5400, time 15775.51ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


step 7400: train loss 5.5685, val loss 5.4677
train ppl 262.03, val ppl 236.92
📈 Train/Val Gap: -0.1007 (-1.81%)
✅ 新的最佳驗證損失: 5.4677


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 7400: loss 5.4306, time 105685.70ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7420: loss 5.6979, time 15770.52ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7440: loss 5.8069, time 15772.24ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7460: loss 5.7323, time 15779.18ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7480: loss 5.6434, time 15764.69ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7500: loss 5.7466, time 15788.62ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7520: loss 5.3091, time 15791.94ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7540: loss 5.7748, time 15784.92ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7560: loss 5.7457, time 15769.71ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7580: loss 5.7022, time 15792.12ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


step 7600: train loss 5.4548, val loss 5.4915
train ppl 233.89, val ppl 242.63
📈 Train/Val Gap: 0.0367 (0.67%)


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 7600: loss 5.9425, time 105568.63ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7620: loss 5.6375, time 15767.28ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 7640: loss 5.8062, time 15805.58ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 7660: loss 5.7369, time 15775.91ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7680: loss 4.7477, time 16105.49ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7700: loss 5.5840, time 16114.31ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7720: loss 5.3683, time 16113.48ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 7740: loss 5.6852, time 16110.90ms


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.08s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 7760: loss 5.8309, time 16274.38ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 7780: loss 5.6827, time 16256.78ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


step 7800: train loss 5.5931, val loss 5.6079
train ppl 268.56, val ppl 272.58
📈 Train/Val Gap: 0.0149 (0.27%)


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


iter 7800: loss 5.7831, time 111816.57ms


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.07s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.06s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


iter 7820: loss 5.1261, time 15994.75ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 7840: loss 5.4030, time 15940.69ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 7860: loss 6.1359, time 15937.57ms


100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 7880: loss 5.3182, time 15827.24ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 7900: loss 5.6167, time 15840.02ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 7920: loss 5.4195, time 15829.93ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 7940: loss 5.7471, time 15827.17ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 7960: loss 5.5319, time 15824.18ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 7980: loss 5.1155, time 15465.42ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


step 8000: train loss 5.5639, val loss 5.6689
train ppl 260.85, val ppl 289.71
📈 Train/Val Gap: 0.1049 (1.89%)


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 8000: loss 4.1350, time 111021.95ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 8020: loss 5.3131, time 15774.46ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 8040: loss 5.4061, time 15739.73ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 8060: loss 5.7900, time 15745.83ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]


iter 8080: loss 6.2723, time 15759.26ms


100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 8100: loss 6.1321, time 15676.68ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 8120: loss 5.9392, time 15826.22ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 8140: loss 5.6486, time 15807.09ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 8160: loss 5.8584, time 15851.66ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 8180: loss 5.5254, time 15854.49ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


step 8200: train loss 5.5587, val loss 5.5480
train ppl 259.48, val ppl 256.73
📈 Train/Val Gap: -0.0107 (-0.19%)


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 8200: loss 5.8994, time 106756.48ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 8220: loss 5.0746, time 15821.58ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 8240: loss 5.6633, time 15824.42ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 8260: loss 5.6624, time 15833.99ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]


iter 8280: loss 5.3984, time 15532.78ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 8300: loss 5.1278, time 15471.06ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 8320: loss 4.8869, time 15587.62ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 8340: loss 5.4573, time 15721.89ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


iter 8360: loss 5.7486, time 15483.19ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 8380: loss 5.0639, time 15496.47ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


step 8400: train loss 5.4839, val loss 5.6306
train ppl 240.77, val ppl 278.84
📈 Train/Val Gap: 0.1468 (2.68%)


100%|██████████| 4/4 [00:15<00:00,  3.79s/it]


iter 8400: loss 5.9566, time 106005.56ms


100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.79s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 8420: loss 5.0150, time 15505.60ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]


iter 8440: loss 5.7114, time 15534.96ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]


iter 8460: loss 6.0732, time 15524.91ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 8480: loss 5.4109, time 15514.97ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 8500: loss 5.0653, time 15512.68ms


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


iter 8520: loss 5.9583, time 15510.46ms


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 8540: loss 5.1063, time 15767.50ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 8560: loss 5.0365, time 15655.99ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]


iter 8580: loss 5.7856, time 15366.53ms


100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]


step 8600: train loss 5.5921, val loss 5.6245
train ppl 268.30, val ppl 277.13
📈 Train/Val Gap: 0.0324 (0.58%)


100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 8600: loss 5.6303, time 109006.29ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 8620: loss 4.8029, time 15679.30ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 8640: loss 5.8587, time 15718.03ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 8660: loss 5.7308, time 15701.36ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 8680: loss 5.6890, time 15355.33ms


100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.04s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]


iter 8700: loss 5.9375, time 16019.99ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


iter 8720: loss 5.2169, time 15968.30ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


iter 8740: loss 5.7046, time 15607.52ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


iter 8760: loss 6.1786, time 15595.57ms


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


iter 8780: loss 5.8205, time 15286.39ms


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]


step 8800: train loss 5.5652, val loss 5.5314
train ppl 261.18, val ppl 252.49
📈 Train/Val Gap: -0.0339 (-0.61%)


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]


iter 8800: loss 5.4310, time 106564.19ms


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 8820: loss 5.1082, time 15251.63ms


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]


iter 8840: loss 5.5405, time 15238.99ms


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]


iter 8860: loss 5.9455, time 15241.08ms


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 8880: loss 5.7669, time 15248.67ms


100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]


iter 8900: loss 5.9733, time 15271.55ms


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 8920: loss 5.1362, time 15434.02ms


100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]


iter 8940: loss 4.6876, time 15428.93ms


100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 8960: loss 5.6398, time 15680.28ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.88s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


iter 8980: loss 5.6620, time 15315.63ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.85s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


step 9000: train loss 5.4626, val loss 5.5231
train ppl 235.71, val ppl 250.41
📈 Train/Val Gap: 0.0605 (1.11%)


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 9000: loss 5.8760, time 106258.80ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 9020: loss 5.5194, time 15852.03ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9040: loss 5.2353, time 15682.79ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 9060: loss 5.5142, time 15703.67ms


100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.89s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]


iter 9080: loss 5.7077, time 15326.44ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


iter 9100: loss 5.2230, time 15312.56ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


iter 9120: loss 4.3986, time 15310.41ms


100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.84s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 9140: loss 5.1878, time 15848.72ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9160: loss 5.3356, time 15671.84ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9180: loss 5.7924, time 15683.49ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


step 9200: train loss 5.4532, val loss 5.5282
train ppl 233.51, val ppl 251.70
📈 Train/Val Gap: 0.0750 (1.38%)


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9200: loss 5.7936, time 103119.11ms


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


iter 9220: loss 5.4721, time 15697.18ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9240: loss 5.7246, time 15676.51ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9260: loss 4.6999, time 15673.08ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


iter 9280: loss 5.2080, time 15679.47ms


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.91s/it]
100%|██████████| 4/4 [00:15<00:00,  3.97s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


iter 9300: loss 5.6051, time 15847.59ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


iter 9320: loss 5.4032, time 15839.32ms


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


iter 9340: loss 5.4830, time 15800.63ms


100%|██████████| 4/4 [00:15<00:00,  3.94s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


iter 9360: loss 5.4361, time 15984.50ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


iter 9380: loss 5.1316, time 15971.71ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


step 9400: train loss 5.5280, val loss 5.4976
train ppl 251.65, val ppl 244.10
📈 Train/Val Gap: -0.0305 (-0.55%)


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 9400: loss 5.8633, time 106345.00ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 9420: loss 5.6864, time 15947.64ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


iter 9440: loss 5.9789, time 15959.94ms


100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]


iter 9460: loss 5.1273, time 16006.99ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


iter 9480: loss 5.3823, time 15965.99ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


iter 9500: loss 5.6172, time 15971.20ms


100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.99s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
100%|██████████| 4/4 [00:15<00:00,  4.00s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


iter 9520: loss 5.7782, time 16127.37ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]


iter 9540: loss 5.7875, time 16073.77ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]


iter 9560: loss 5.6105, time 16074.94ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.05s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


iter 9580: loss 5.5412, time 16119.12ms


100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.03s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.02s/it]
100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.96s/it]
100%|██████████| 4/4 [00:15<00:00,  3.95s/it]


step 9600: train loss 5.5777, val loss 5.5689
train ppl 264.47, val ppl 262.13
📈 Train/Val Gap: -0.0089 (-0.16%)


100%|██████████| 4/4 [00:15<00:00,  3.95s/it]

iter 9600: loss 5.7583, time 106191.17ms
訓練完成！
最佳驗證損失: 5.4677



