In [1]:
import subprocess
import os
from types import SimpleNamespace
import torch
# Move the working directory up to the parent folder
os.chdir(os.pardir)
# Order GPU device numbers by PCI bus ID and use only the first GPU
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

## GPU가 할당되었는지 확인하기
GPU를 설정한 뒤, 아래 코드로 `device`에 cuda(GPU)를 지정하고 `device`와 `torch.cuda.current_device()`를 출력해 GPU가 제대로 할당되었는지 확인합니다.

In [None]:

# Select the GPU when CUDA is usable; otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

print('Device:', device)
if cuda_available:
    # current_device() raises on a machine without CUDA, so only query it
    # when a GPU is actually available.
    print('Current cuda device:', torch.cuda.current_device())
else:
    print('Current cuda device: none (CUDA is not available)')
print('Count of using GPUs:', torch.cuda.device_count())

### parameter 설정
딥러닝 모델 훈련에 필요한 설정값을 정의하는 방법에 대해 설명하겠습니다. 이 설정은 모델을 훈련할 때 중요한 역할을 하며, 성능을 최적화하는 데 도움이 됩니다. 

1. model 리스트 설정(models)

    우리는 사전학습된 모델을 이용해서 성능을 높일 것입니다. 각각의 모델은 고유한 구조와 특성을 가지고, 이 모델들은 Huggingface에서 확인하여 사용할 수 있습니다.
    또한 여러 가지 모델을 사용해 봄으로써 어떤 모델의 성능이 좋게 나오는지 확인해 볼 것입니다.


2. 시드값 설정(seeds)

    시드값은 모델 훈련의 재현성을 보장하기 위해 설정됩니다. 시드를 고정하면 매번 동일한 조건에서 실험이 진행되므로, 결과 비교가 용이해집니다. 또한 seed 값마다 성능이 다르게 나오므로, 마지막에는 seed별 결과를 앙상블하여 성능을 높이는 것까지 확인해 볼 예정입니다.

3. 배치사이즈 설정(batchsize)

    배치 사이즈는 한 번의 훈련 단계에서 처리할 데이터 샘플의 수를 의미합니다. 배치 사이즈가 크면 훈련 속도가 빨라지지만, 메모리 사용량(VRAM)이 증가할 수 있습니다. 일반적으로 배치 사이즈가 너무 크면 성능이 감소한다고 알려져 있습니다. 또한 배치 사이즈가 작을수록(<128) 학습률의 영향을 많이 받습니다.

4. 학습률 설정(lr)

    학습률(Learning Rate)은 모델이 얼마나 빠르게 또는 천천히 학습할지를 결정하는 중요한 하이퍼파라미터입니다. 학습률이 너무 크면 훈련이 불안정해지고, 너무 작으면 훈련 속도가 매우 느려질 수 있습니다. 또한 일반적으로 배치 사이즈와 학습률은 양의 상관관계에 있습니다. 즉, 배치 사이즈를 키우면 학습률도 함께 키우는 것이 일반적입니다.

    

5. weight decay 설정

    Weight decay는 기계 학습, 특히 딥러닝에서 모델의 과적합(overfitting)을 방지하기 위해 자주 사용되는 정규화 기법입니다. 이는 모델의 학습 과정에서 가중치(weight)가 지나치게 커지는 것을 억제하여, 모델이 학습 데이터에 지나치게 의존하지 않도록 도와줍니다.

    Weight decay는 기본적으로 비용 함수(cost function)에 가중치의 크기를 제어하는 항(term)을 추가하여 구현됩니다. 이 항은 L2 정규화로도 알려져 있으며, 주로 가중치의 제곱합(squared sum)을 비용 함수에 추가하는 방식입니다. 즉, 모델의 비용 함수는 다음과 같이 변형됩니다:

    
    $$L(w) = L_0(w) + \lambda \sum_i w_i^2$$
    

    여기서,
    - $L(w)$ 는 정규화된 비용 함수입니다.
    - $L_0(w)$는 원래의 비용 함수(예: 교차 엔트로피, MSE 등)입니다.
    - $w_i$는 모델의 가중치입니다.
    - $\lambda$는 weight decay의 강도를 조절하는 하이퍼파라미터입니다. 이 값이 크면 가중치에 더 큰 페널티를 주고, 작으면 페널티가 줄어듭니다.


6. 추천 파라미터
    `bs, lr, wd = (32, 0.03, 1e-5)`


- convnext small fp16 기준 17698MB VRAM 필요


In [10]:
# Config parameters for the training runs
config = dict(
    models=["facebook/convnext-small-224"],  # Hugging Face model identifiers
    seeds=[11, 22, 33],
    batchsize=32,
    epochs=15,
    early_stopping_epoch=3,
    dataset_dir="data/hfdataset/train_valid",
    project_name="cifar100",
)


### simplenamespace
Python에서 딕셔너리는 키-값 쌍을 저장하는 데 자주 사용됩니다. 하지만 설정 값이나 객체를 다룰 때는 클래스 속성처럼 점 표기법(예: `config.models`)으로 값에 접근하는 것이 더 편리할 수 있습니다. Python의 `types.SimpleNamespace`는 이러한 기능을 제공하여 딕셔너리의 키를 객체 속성처럼 사용할 수 있게 해줍니다.

In [11]:
# Convert the config dict to a SimpleNamespace so its keys can be read as
# attributes (e.g. config.models).
# The isinstance guard keeps this cell safe to re-run: unpacking an
# already-converted namespace with ** would raise a TypeError.
if not isinstance(config, SimpleNamespace):
    config = SimpleNamespace(**config)

## py 파일과 .ipynb 파일의 비교
train.py와 같은 .py 파일과 Jupyter Notebook의 .ipynb 파일은 둘 다 Python 코드를 작성하고 실행할 수 있지만, 목적과 사용 방식이 다릅니다.

1) 주요 목적
- py 파일: Python 스크립트를 작성하는 가장 일반적인 파일 형식입니다. 주로 독립적으로 실행할 수 있는 코드 작성에 사용되며, 대규모 프로젝트에서 많이 사용됩니다. 코드가 직관적이고 파일 단위로 구성되어 있어서 유지보수가 쉽고, 배포 시에도 매우 적합합니다. 학습 및 훈련 자동화 작업에 많이 사용됩니다.

- ipynb 파일: Jupyter Notebook 파일로, 대화형 코드 실행을 위해 설계되었습니다. 한 셀(Cell) 단위로 코드를 작성하고 바로 실행할 수 있어 데이터 분석, 실험, 시각화 등의 작업에 적합합니다. 주로 학습 과정에서 즉각적인 피드백을 받을 수 있는 환경을 제공합니다.

2) 사용 용도
- py 파일:

    자동화: 반복적으로 실행해야 하는 훈련이나 배포 작업에서 .py 파일이 유용합니다. 학습 파이프라인을 자동화하거나, 여러 모델과 하이퍼파라미터를 반복적으로 실험할 때 주로 사용됩니다.

    대규모 프로젝트: 협업이 필요한 대규모 프로젝트에서는 .py 파일을 사용하는 것이 훨씬 효율적입니다. 모듈화된 코드 관리와 배포가 쉽습니다.
- ipynb 파일:

    연구 및 실험: 중간 결과를 즉시 확인할 수 있고, 데이터를 시각화하면서 실험할 때 유리합니다. 데이터 전처리, 탐색적 데이터 분석(EDA), 모델 성능 확인 등을 단계별로 세밀하게 수행할 수 있습니다.

    강의 및 데모: 시각적이고 설명을 곁들인 코드 셀을 만들 수 있어서, 강의나 실습에 매우 적합합니다.

3) 재사용성과 유지보수
- py 파일: .py 파일은 독립적으로 실행되기 때문에 배포 후에도 쉽게 수정 및 재사용이 가능합니다. 특히 대규모 코드 베이스에서는 각 기능을 모듈화하여 여러 곳에서 재사용할 수 있습니다.

- ipynb 파일: .ipynb 파일은 실험적이고 탐색적인 작업에 더 적합하며, 특정 실험 결과에 집중할 때는 편리합니다. 하지만 대규모 프로젝트에서는 파일 관리와 코드 유지보수가 어렵고, 자동화된 작업이나 재사용성에서는 제한적입니다.



우리는 오늘 .py 파일을 이용해서 실험을 돌려 볼 것입니다.

In [12]:
# Launch one train.py run for every (model, seed) combination.
# The GPU selection is passed through the child's environment instead of a
# shell "VAR=value" prefix, so the command can run as an argument list with no
# shell involved; arguments containing spaces or shell metacharacters are then
# passed to train.py verbatim.
run_env = {**os.environ, "CUDA_VISIBLE_DEVICES": "0,1"}

# NOTE(review): config.early_stopping_epoch is not forwarded to train.py here;
# confirm whether train.py expects a flag for it.
for model in config.models:
    # Replace "/" in the model id before embedding it in the project name
    project_model_name = model.replace("/", "_")
    for seed in config.seeds:
        command = [
            "python3", "train.py",
            "-b", str(config.batchsize),
            "-e", str(config.epochs),
            "--use-v2",
            "--seed", str(seed),
            "--model", model,
            "--project-name", f"{config.project_name}_{project_model_name}",
            "--dataset-dir", config.dataset_dir,
            "--fp16",
            "--torch-compile",
        ]

        # check=True raises CalledProcessError if a run exits with a non-zero
        # status, which stops the remaining runs.
        subprocess.run(command, env=run_env, check=True)

wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: nwirandx. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.18.1
wandb: Run data is saved locally in /tmp/wandb/run-20241021_024728-94lo0e1c
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run convnext-small-224_20241021_0247
wandb: ⭐️ View project at https://wandb.ai/nwirandx/cifar100_facebook_convnext-small-224
wandb: 🚀 View run at https://wandb.ai/nwirandx/cifar100_facebook_convnext-small-224/runs/94lo0e1c
Some weights of ConvNextForImageClassification were not initialized from the model checkpoint at facebook/convnext-small-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100

{'loss': 4.411, 'grad_norm': 1.3581597805023193, 'learning_rate': 0.0009998986144924252, 'epoch': 0.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 15.96it/s][A
  3%|▎         | 4/157 [00:00<00:11, 12.81it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.12it/s][A
  5%|▌         | 8/157 [00:00<00:13, 10.91it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.57it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.73it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.09it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.20it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.23it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.68it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.97it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.18it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 11.98it/s][A
 18%|█▊        | 28/157 [00:02<00:12, 10.14it/s][A
 19%|█▉        | 30/157 [00:02<00:12, 10.02it/s][A
 20%|██        | 32/157 [00:02<00:12,  9.83it/s][A
 22%|██▏       | 34/157 [00:03<00:12,  9.95it/s][A
 23%|██▎       | 36/157 [00:03<00:12,  9.72it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 3.827998399734497, 'eval_acc': 0.2621, 'eval_acc2': 0.3719, 'eval_f1': 0.20649925929979712, 'eval_roc_auc_micro': 0.9079793587373737, 'eval_precision': 0.2621, 'eval_recall': 0.2621, 'eval_runtime': 18.8683, 'eval_samples_per_second': 529.989, 'eval_steps_per_second': 8.321, 'epoch': 0.06}


  1%|▏         | 20/1560 [00:43<28:32,  1.11s/it]  

{'loss': 3.4851, 'grad_norm': 3.1959052085876465, 'learning_rate': 0.0009995944990857848, 'epoch': 0.13}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:17,  8.80it/s][A
  2%|▏         | 3/157 [00:00<00:16,  9.22it/s][A
  3%|▎         | 5/157 [00:00<00:14, 10.72it/s][A
  4%|▍         | 7/157 [00:00<00:13, 11.50it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.95it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.21it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.16it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.37it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.49it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.57it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.39it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.57it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.66it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.65it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.69it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.72it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.63it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.66it/s][A
 24%|██▎       | 37/157 [

{'eval_loss': 2.7571372985839844, 'eval_acc': 0.41, 'eval_acc2': 0.5726, 'eval_f1': 0.3749869291021614, 'eval_roc_auc_micro': 0.966732197121212, 'eval_precision': 0.41, 'eval_recall': 0.41, 'eval_runtime': 15.8103, 'eval_samples_per_second': 632.498, 'eval_steps_per_second': 9.93, 'epoch': 0.13}


  2%|▏         | 30/1560 [01:08<27:12,  1.07s/it]  

{'loss': 2.6721, 'grad_norm': 3.6387195587158203, 'learning_rate': 0.0009990877771116587, 'epoch': 0.19}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:10, 14.85it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.58it/s][A
  4%|▍         | 6/157 [00:00<00:11, 13.18it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.82it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.91it/s][A
  8%|▊         | 12/157 [00:01<00:21,  6.80it/s][A
  9%|▉         | 14/157 [00:01<00:18,  7.67it/s][A
 10%|▉         | 15/157 [00:01<00:17,  8.00it/s][A
 11%|█         | 17/157 [00:01<00:15,  8.83it/s][A
 12%|█▏        | 19/157 [00:02<00:14,  9.27it/s][A
 13%|█▎        | 21/157 [00:02<00:14,  9.67it/s][A
 15%|█▍        | 23/157 [00:02<00:13,  9.72it/s][A
 16%|█▌        | 25/157 [00:02<00:13, 10.13it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.21it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.42it/s][A
 20%|█▉        | 31/157 [00:03<00:11, 10.55it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.72it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.80it/s][A
 24%|██▎       | 37/157 

{'eval_loss': 2.0121726989746094, 'eval_acc': 0.5522, 'eval_acc2': 0.7204, 'eval_f1': 0.5272348703214194, 'eval_roc_auc_micro': 0.981817742878788, 'eval_precision': 0.5522, 'eval_recall': 0.5522, 'eval_runtime': 17.6896, 'eval_samples_per_second': 565.305, 'eval_steps_per_second': 8.875, 'epoch': 0.19}


  3%|▎         | 40/1560 [01:35<27:41,  1.09s/it]  

{'loss': 2.1318, 'grad_norm': 3.858973741531372, 'learning_rate': 0.000998378654067105, 'epoch': 0.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:13, 11.67it/s][A
  3%|▎         | 4/157 [00:00<00:12, 11.79it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.21it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.18it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.28it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.39it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.43it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.52it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.58it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.66it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.72it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.70it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.53it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.47it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.59it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.65it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.72it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.76it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 1.6079806089401245, 'eval_acc': 0.621, 'eval_acc2': 0.7672, 'eval_f1': 0.6024008304428871, 'eval_roc_auc_micro': 0.9868723093939393, 'eval_precision': 0.621, 'eval_recall': 0.621, 'eval_runtime': 15.7563, 'eval_samples_per_second': 634.668, 'eval_steps_per_second': 9.964, 'epoch': 0.26}


  3%|▎         | 50/1560 [02:01<29:17,  1.16s/it]  

{'loss': 1.8632, 'grad_norm': 6.229714870452881, 'learning_rate': 0.0009974674175313228, 'epoch': 0.32}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:12, 12.08it/s][A
  3%|▎         | 4/157 [00:00<00:13, 11.41it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.30it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.04it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.71it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.68it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.59it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.39it/s][A
 11%|█▏        | 18/157 [00:01<00:13, 10.01it/s][A
 13%|█▎        | 20/157 [00:01<00:13, 10.25it/s][A
 14%|█▍        | 22/157 [00:02<00:13, 10.25it/s][A
 15%|█▌        | 24/157 [00:02<00:13,  9.58it/s][A
 17%|█▋        | 26/157 [00:02<00:13,  9.77it/s][A
 18%|█▊        | 28/157 [00:02<00:13,  9.86it/s][A
 18%|█▊        | 29/157 [00:02<00:13,  9.69it/s][A
 19%|█▉        | 30/157 [00:02<00:13,  9.63it/s][A
 20%|█▉        | 31/157 [00:03<00:12,  9.70it/s][A
 20%|██        | 32/157 [00:03<00:12,  9.66it/s][A
 21%|██        | 33/157 

{'eval_loss': 1.3305442333221436, 'eval_acc': 0.6638, 'eval_acc2': 0.8045, 'eval_f1': 0.6468784113848189, 'eval_roc_auc_micro': 0.9907962522222222, 'eval_precision': 0.6638, 'eval_recall': 0.6638, 'eval_runtime': 17.2749, 'eval_samples_per_second': 578.874, 'eval_steps_per_second': 9.088, 'epoch': 0.32}


  4%|▍         | 60/1560 [02:28<27:42,  1.11s/it]  

{'loss': 1.6819, 'grad_norm': 4.592185974121094, 'learning_rate': 0.000996354437049027, 'epoch': 0.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.75it/s][A
  3%|▎         | 5/157 [00:00<00:18,  8.12it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.42it/s][A
  6%|▌         | 9/157 [00:00<00:14, 10.42it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.97it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.09it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.31it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.46it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.62it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.91it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.87it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.97it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.17it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.20it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.30it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.30it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.10it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.02it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.1371392011642456, 'eval_acc': 0.7075, 'eval_acc2': 0.8361, 'eval_f1': 0.6947491205368113, 'eval_roc_auc_micro': 0.9928865273232323, 'eval_precision': 0.7075, 'eval_recall': 0.7075, 'eval_runtime': 16.2903, 'eval_samples_per_second': 613.864, 'eval_steps_per_second': 9.638, 'epoch': 0.38}


  4%|▍         | 70/1560 [02:55<31:03,  1.25s/it]  

{'loss': 1.5661, 'grad_norm': 4.130903244018555, 'learning_rate': 0.0009950401639805821, 'epoch': 0.45}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.58it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.22it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.25it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.64it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.54it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.33it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.27it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.30it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.15it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.18it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.11it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.09it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.16it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.21it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.06it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.97it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.03it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.01it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.0727957487106323, 'eval_acc': 0.7104, 'eval_acc2': 0.842, 'eval_f1': 0.7042030728245321, 'eval_roc_auc_micro': 0.9936041966161617, 'eval_precision': 0.7104, 'eval_recall': 0.7104, 'eval_runtime': 18.6146, 'eval_samples_per_second': 537.212, 'eval_steps_per_second': 8.434, 'epoch': 0.45}


  5%|▌         | 80/1560 [03:29<43:50,  1.78s/it]  

{'loss': 1.4845, 'grad_norm': 5.290602684020996, 'learning_rate': 0.0009935251313189565, 'epoch': 0.51}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:19,  7.97it/s][A
  2%|▏         | 3/157 [00:00<00:17,  8.66it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.40it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.30it/s][A
  4%|▍         | 7/157 [00:00<00:16,  8.99it/s][A
  5%|▌         | 8/157 [00:00<00:17,  8.70it/s][A
  6%|▌         | 9/157 [00:01<00:17,  8.30it/s][A
  6%|▋         | 10/157 [00:01<00:18,  8.08it/s][A
  7%|▋         | 11/157 [00:01<00:18,  8.03it/s][A
  8%|▊         | 12/157 [00:01<00:17,  8.20it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.45it/s][A
  9%|▉         | 14/157 [00:01<00:16,  8.53it/s][A
 10%|▉         | 15/157 [00:01<00:16,  8.69it/s][A
 10%|█         | 16/157 [00:01<00:15,  8.86it/s][A
 11%|█         | 17/157 [00:01<00:15,  9.11it/s][A
 11%|█▏        | 18/157 [00:02<00:15,  9.13it/s][A
 12%|█▏        | 19/157 [00:02<00:14,  9.21it/s][A
 13%|█▎        | 20/157 [00:02<00:14,  9.38it/s][A
 13%|█▎        | 21/157 [00

{'eval_loss': 0.9566525220870972, 'eval_acc': 0.7351, 'eval_acc2': 0.8571, 'eval_f1': 0.724289361259219, 'eval_roc_auc_micro': 0.9947399481818182, 'eval_precision': 0.7351, 'eval_recall': 0.7351, 'eval_runtime': 19.2902, 'eval_samples_per_second': 518.398, 'eval_steps_per_second': 8.139, 'epoch': 0.51}


  6%|▌         | 90/1560 [04:00<31:58,  1.31s/it]  

{'loss': 1.4507, 'grad_norm': 4.69521427154541, 'learning_rate': 0.0009918099534735718, 'epoch': 0.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.51it/s][A
  3%|▎         | 5/157 [00:00<00:19,  7.70it/s][A
  4%|▍         | 7/157 [00:00<00:16,  8.83it/s][A
  6%|▌         | 9/157 [00:00<00:15,  9.55it/s][A
  7%|▋         | 11/157 [00:01<00:14, 10.21it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.69it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.05it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.37it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.56it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.71it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.82it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.90it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.00it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.13it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.09it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.75it/s][A
 22%|██▏       | 35/157 [00:03<00:12,  9.90it/s][A
 24%|██▎       | 37/157 [00:03<00:13,  9.17it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.9966722726821899, 'eval_acc': 0.7227, 'eval_acc2': 0.8444, 'eval_f1': 0.7156142350429745, 'eval_roc_auc_micro': 0.9940808633838384, 'eval_precision': 0.7227, 'eval_recall': 0.7227, 'eval_runtime': 17.3073, 'eval_samples_per_second': 577.79, 'eval_steps_per_second': 9.071, 'epoch': 0.58}


  6%|▋         | 100/1560 [04:29<30:11,  1.24s/it] 

{'loss': 1.4582, 'grad_norm': 4.2350568771362305, 'learning_rate': 0.0009898953260211339, 'epoch': 0.64}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 17.44it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.51it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.43it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.10it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.19it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.32it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.85it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.92it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.85it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.82it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.96it/s][A
 15%|█▌        | 24/157 [00:01<00:11, 11.93it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.07it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.04it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.94it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.03it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.09it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.20it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.9440015554428101, 'eval_acc': 0.7284, 'eval_acc2': 0.8559, 'eval_f1': 0.7215113097531706, 'eval_roc_auc_micro': 0.9946183721717172, 'eval_precision': 0.7284, 'eval_recall': 0.7284, 'eval_runtime': 15.8186, 'eval_samples_per_second': 632.169, 'eval_steps_per_second': 9.925, 'epoch': 0.64}


  7%|▋         | 110/1560 [04:55<27:28,  1.14s/it]  

{'loss': 1.3599, 'grad_norm': 5.519344806671143, 'learning_rate': 0.000987782025423547, 'epoch': 0.7}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:17,  9.03it/s][A
  3%|▎         | 4/157 [00:00<00:14, 10.80it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.38it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.41it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.58it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.75it/s][A
  9%|▉         | 14/157 [00:01<00:11, 11.98it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.12it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.16it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.20it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.24it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.38it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.48it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.36it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.28it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.31it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 11.94it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 12.02it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.8927861452102661, 'eval_acc': 0.7502, 'eval_acc2': 0.865, 'eval_f1': 0.7448763761251926, 'eval_roc_auc_micro': 0.9949830294949495, 'eval_precision': 0.7502, 'eval_recall': 0.7502, 'eval_runtime': 17.2854, 'eval_samples_per_second': 578.523, 'eval_steps_per_second': 9.083, 'epoch': 0.7}


  8%|▊         | 120/1560 [05:22<26:50,  1.12s/it]  

{'loss': 1.3428, 'grad_norm': 3.357283592224121, 'learning_rate': 0.000985470908713026, 'epoch': 0.77}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.90it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.00it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.02it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.61it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.12it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.30it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.44it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.46it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.51it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.51it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.53it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.42it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.17it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.23it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.38it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.44it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.823374330997467, 'eval_acc': 0.7512, 'eval_acc2': 0.8748, 'eval_f1': 0.7434401148773755, 'eval_roc_auc_micro': 0.9959983074747475, 'eval_precision': 0.7512, 'eval_recall': 0.7512, 'eval_runtime': 15.6973, 'eval_samples_per_second': 637.051, 'eval_steps_per_second': 10.002, 'epoch': 0.77}


  8%|▊         | 130/1560 [05:47<25:56,  1.09s/it]  

{'loss': 1.3201, 'grad_norm': 5.160700798034668, 'learning_rate': 0.0009829629131445341, 'epoch': 0.83}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.03it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.59it/s][A
  4%|▍         | 7/157 [00:00<00:13, 11.52it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.98it/s][A
  7%|▋         | 11/157 [00:00<00:13, 10.93it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.31it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.62it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.90it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.00it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.13it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.08it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.91it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.93it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.04it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.96it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.56it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.57it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.51it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8673505783081055, 'eval_acc': 0.7509, 'eval_acc2': 0.8648, 'eval_f1': 0.746697204845908, 'eval_roc_auc_micro': 0.9953696475757576, 'eval_precision': 0.7509, 'eval_recall': 0.7509, 'eval_runtime': 17.3802, 'eval_samples_per_second': 575.367, 'eval_steps_per_second': 9.033, 'epoch': 0.83}


  9%|▉         | 140/1560 [06:14<26:07,  1.10s/it]  

{'loss': 1.3048, 'grad_norm': 3.6757359504699707, 'learning_rate': 0.000980259055815686, 'epoch': 0.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:17,  9.10it/s][A
  3%|▎         | 4/157 [00:00<00:14, 10.88it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.56it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.95it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.22it/s][A
  8%|▊         | 12/157 [00:01<00:11, 12.44it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.47it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.55it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.62it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.65it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.67it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.43it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.26it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.43it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.52it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.61it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.64it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.60it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.870690107345581, 'eval_acc': 0.748, 'eval_acc2': 0.8664, 'eval_f1': 0.7419996643917748, 'eval_roc_auc_micro': 0.9949399294444444, 'eval_precision': 0.748, 'eval_recall': 0.748, 'eval_runtime': 15.4534, 'eval_samples_per_second': 647.105, 'eval_steps_per_second': 10.16, 'epoch': 0.9}


 10%|▉         | 150/1560 [06:40<26:28,  1.13s/it]  

{'loss': 1.3176, 'grad_norm': 5.798245906829834, 'learning_rate': 0.0009773604332542728, 'epoch': 0.96}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 15.62it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.01it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.93it/s][A
  6%|▌         | 9/157 [00:01<00:23,  6.36it/s][A
  7%|▋         | 11/157 [00:01<00:19,  7.41it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.35it/s][A
 10%|▉         | 15/157 [00:01<00:15,  9.01it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.66it/s][A
 12%|█▏        | 19/157 [00:02<00:13, 10.08it/s][A
 13%|█▎        | 21/157 [00:02<00:13, 10.14it/s][A
 15%|█▍        | 23/157 [00:02<00:13, 10.05it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.36it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.56it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.46it/s][A
 20%|█▉        | 31/157 [00:03<00:11, 10.78it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.83it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.91it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.88it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8491628766059875, 'eval_acc': 0.7514, 'eval_acc2': 0.8701, 'eval_f1': 0.748223202471523, 'eval_roc_auc_micro': 0.9956663690404041, 'eval_precision': 0.7514, 'eval_recall': 0.7514, 'eval_runtime': 17.0613, 'eval_samples_per_second': 586.122, 'eval_steps_per_second': 9.202, 'epoch': 0.96}


 10%|█         | 160/1560 [07:09<32:11,  1.38s/it]  

{'loss': 1.2069, 'grad_norm': 3.603344202041626, 'learning_rate': 0.0009742682209735727, 'epoch': 1.02}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.83it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.15it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.03it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.43it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.21it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.98it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.86it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.77it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.65it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.60it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.66it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.73it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.74it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.74it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.65it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.36it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.31it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.29it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.801152765750885, 'eval_acc': 0.7686, 'eval_acc2': 0.8793, 'eval_f1': 0.7644157177613828, 'eval_roc_auc_micro': 0.9957612812121213, 'eval_precision': 0.7686, 'eval_recall': 0.7686, 'eval_runtime': 15.9319, 'eval_samples_per_second': 627.673, 'eval_steps_per_second': 9.854, 'epoch': 1.02}


 11%|█         | 170/1560 [07:35<26:26,  1.14s/it]  

{'loss': 1.1171, 'grad_norm': 3.555765390396118, 'learning_rate': 0.0009709836729956326, 'epoch': 1.09}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.05it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.14it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.98it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.53it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.00it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.67it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.70it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.73it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.71it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.74it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.58it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.28it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.31it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.57it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.59it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.65it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.66it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7929334044456482, 'eval_acc': 0.7653, 'eval_acc2': 0.8798, 'eval_f1': 0.7625802090401252, 'eval_roc_auc_micro': 0.9959629537878788, 'eval_precision': 0.7653, 'eval_recall': 0.7653, 'eval_runtime': 15.358, 'eval_samples_per_second': 651.128, 'eval_steps_per_second': 10.223, 'epoch': 1.09}


 12%|█▏        | 180/1560 [08:00<24:39,  1.07s/it]  

{'loss': 1.1534, 'grad_norm': 4.420194149017334, 'learning_rate': 0.0009675081213427075, 'epoch': 1.15}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:25,  6.10it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.82it/s][A
  4%|▍         | 6/157 [00:00<00:14, 10.24it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.03it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.48it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.48it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.85it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.10it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.27it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.42it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.53it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.26it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.37it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.18it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.33it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.35it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.40it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.44it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.8143727779388428, 'eval_acc': 0.7553, 'eval_acc2': 0.8762, 'eval_f1': 0.7516259977252293, 'eval_roc_auc_micro': 0.9956549132828283, 'eval_precision': 0.7553, 'eval_recall': 0.7553, 'eval_runtime': 16.4159, 'eval_samples_per_second': 609.166, 'eval_steps_per_second': 9.564, 'epoch': 1.15}


 12%|█▏        | 190/1560 [08:26<25:16,  1.11s/it]  

{'loss': 1.0715, 'grad_norm': 3.298725128173828, 'learning_rate': 0.0009638429754970715, 'epoch': 1.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.05it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.87it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.97it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.93it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.81it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.51it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.49it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.49it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.53it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.63it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.70it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.77it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.78it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.72it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.76it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.69it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.60it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7495627999305725, 'eval_acc': 0.784, 'eval_acc2': 0.885, 'eval_f1': 0.7822027328084753, 'eval_roc_auc_micro': 0.9963737836868687, 'eval_precision': 0.784, 'eval_recall': 0.784, 'eval_runtime': 15.3919, 'eval_samples_per_second': 649.694, 'eval_steps_per_second': 10.2, 'epoch': 1.22}


 13%|█▎        | 200/1560 [08:52<24:43,  1.09s/it]  

{'loss': 1.1072, 'grad_norm': 3.0798277854919434, 'learning_rate': 0.0009599897218294122, 'epoch': 1.28}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.71it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.10it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.97it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.52it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.24it/s][A
  8%|▊         | 13/157 [00:00<00:11, 13.03it/s][A
 10%|▉         | 15/157 [00:01<00:10, 12.97it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.83it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.80it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.75it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.64it/s][A
 16%|█▌        | 25/157 [00:01<00:11, 11.99it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.21it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.35it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.36it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.47it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.52it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.59it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7440969944000244, 'eval_acc': 0.7806, 'eval_acc2': 0.89, 'eval_f1': 0.7801974324285018, 'eval_roc_auc_micro': 0.9963379537373738, 'eval_precision': 0.7806, 'eval_recall': 0.7806, 'eval_runtime': 16.6556, 'eval_samples_per_second': 600.398, 'eval_steps_per_second': 9.426, 'epoch': 1.28}


 13%|█▎        | 210/1560 [09:19<25:44,  1.14s/it]  

{'loss': 1.0878, 'grad_norm': 4.2520575523376465, 'learning_rate': 0.0009559499229960451, 'epoch': 1.34}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 15.67it/s][A
  3%|▎         | 4/157 [00:00<00:15,  9.94it/s][A
  4%|▍         | 6/157 [00:00<00:13, 10.97it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.49it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.80it/s][A
  8%|▊         | 12/157 [00:01<00:11, 12.11it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.13it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.24it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.40it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.49it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.49it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.53it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.54it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.50it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.53it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.52it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.63it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.72it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.7459242939949036, 'eval_acc': 0.7802, 'eval_acc2': 0.8888, 'eval_f1': 0.7773461480513473, 'eval_roc_auc_micro': 0.9963266664141414, 'eval_precision': 0.7802, 'eval_recall': 0.7802, 'eval_runtime': 15.198, 'eval_samples_per_second': 657.98, 'eval_steps_per_second': 10.33, 'epoch': 1.34}


 14%|█▍        | 220/1560 [09:43<23:59,  1.07s/it]  

{'loss': 1.0337, 'grad_norm': 3.443748950958252, 'learning_rate': 0.0009517252173051911, 'epoch': 1.41}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:36,  4.23it/s][A
  3%|▎         | 4/157 [00:00<00:21,  6.98it/s][A
  4%|▍         | 6/157 [00:00<00:17,  8.66it/s][A
  5%|▌         | 8/157 [00:00<00:15,  9.75it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.63it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.29it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.70it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.01it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.09it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.92it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 12.12it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.12it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.77it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.12it/s][A
 19%|█▉        | 30/157 [00:02<00:11, 10.64it/s][A
 20%|██        | 32/157 [00:03<00:11, 10.75it/s][A
 22%|██▏       | 34/157 [00:03<00:11, 10.76it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.84it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.7298774123191833, 'eval_acc': 0.7817, 'eval_acc2': 0.8887, 'eval_f1': 0.7780550478741064, 'eval_roc_auc_micro': 0.9964774634848484, 'eval_precision': 0.7817, 'eval_recall': 0.7817, 'eval_runtime': 17.0989, 'eval_samples_per_second': 584.834, 'eval_steps_per_second': 9.182, 'epoch': 1.41}


 15%|█▍        | 230/1560 [10:10<24:28,  1.10s/it]  

{'loss': 1.0807, 'grad_norm': 4.526337146759033, 'learning_rate': 0.0009473173180525737, 'epoch': 1.47}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.30it/s][A
  3%|▎         | 5/157 [00:00<00:20,  7.34it/s][A
  4%|▍         | 7/157 [00:00<00:16,  8.86it/s][A
  6%|▌         | 9/157 [00:00<00:14,  9.95it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.73it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.29it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.44it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.67it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.93it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.05it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.20it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.22it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.30it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.38it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.43it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.53it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.54it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.58it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7683348655700684, 'eval_acc': 0.7713, 'eval_acc2': 0.8831, 'eval_f1': 0.7699989848120404, 'eval_roc_auc_micro': 0.9961263488888888, 'eval_precision': 0.7713, 'eval_recall': 0.7713, 'eval_runtime': 15.3108, 'eval_samples_per_second': 653.133, 'eval_steps_per_second': 10.254, 'epoch': 1.47}


 15%|█▌        | 240/1560 [10:35<24:55,  1.13s/it]  

{'loss': 1.102, 'grad_norm': 3.4161341190338135, 'learning_rate': 0.0009427280128266049, 'epoch': 1.54}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.01it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.03it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.04it/s][A
  6%|▌         | 9/157 [00:00<00:13, 11.29it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.28it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.32it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.11it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.12it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.20it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.05it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.18it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.17it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.20it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.21it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 11.16it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.13it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 11.07it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.04it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7191524505615234, 'eval_acc': 0.7916, 'eval_acc2': 0.8931, 'eval_f1': 0.7883302646474805, 'eval_roc_auc_micro': 0.9966365435858586, 'eval_precision': 0.7916, 'eval_recall': 0.7916, 'eval_runtime': 17.4192, 'eval_samples_per_second': 574.079, 'eval_steps_per_second': 9.013, 'epoch': 1.54}


 16%|█▌        | 250/1560 [11:02<24:16,  1.11s/it]  

{'loss': 1.0465, 'grad_norm': 6.009819507598877, 'learning_rate': 0.000937959162783444, 'epoch': 1.6}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.69it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.98it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.77it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.28it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.00it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.78it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.28it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.40it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.50it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.47it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.51it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.12it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.26it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.41it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.32it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.43it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.10it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7102870941162109, 'eval_acc': 0.7883, 'eval_acc2': 0.8936, 'eval_f1': 0.7862318481780164, 'eval_roc_auc_micro': 0.9967163402020203, 'eval_precision': 0.7883, 'eval_recall': 0.7883, 'eval_runtime': 15.7529, 'eval_samples_per_second': 634.803, 'eval_steps_per_second': 9.966, 'epoch': 1.6}


 17%|█▋        | 260/1560 [11:28<24:53,  1.15s/it]  

{'loss': 1.0797, 'grad_norm': 3.1930043697357178, 'learning_rate': 0.0009330127018922195, 'epoch': 1.66}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.41it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.01it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.05it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.46it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.47it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.29it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.17it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.83it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.78it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.74it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.68it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.64it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.36it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.12it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 11.11it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.18it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.10it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.06it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7359907031059265, 'eval_acc': 0.776, 'eval_acc2': 0.8858, 'eval_f1': 0.775968097483678, 'eval_roc_auc_micro': 0.9964117972727273, 'eval_precision': 0.776, 'eval_recall': 0.776, 'eval_runtime': 18.7907, 'eval_samples_per_second': 532.178, 'eval_steps_per_second': 8.355, 'epoch': 1.66}


 17%|█▋        | 270/1560 [11:56<23:59,  1.12s/it]  

{'loss': 1.085, 'grad_norm': 4.185244083404541, 'learning_rate': 0.0009278906361507238, 'epoch': 1.73}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.08it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.94it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.33it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.77it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.66it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.56it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.30it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.34it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.39it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.94it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.11it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.30it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.40it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.50it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.39it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.52it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7158687710762024, 'eval_acc': 0.7839, 'eval_acc2': 0.8929, 'eval_f1': 0.7799353228084558, 'eval_roc_auc_micro': 0.9966198951010101, 'eval_precision': 0.7839, 'eval_recall': 0.7839, 'eval_runtime': 17.2654, 'eval_samples_per_second': 579.194, 'eval_steps_per_second': 9.093, 'epoch': 1.73}


 18%|█▊        | 280/1560 [12:23<24:59,  1.17s/it]  

{'loss': 1.0298, 'grad_norm': 3.3942668437957764, 'learning_rate': 0.0009225950427718975, 'epoch': 1.79}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.31it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.53it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.56it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.18it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.22it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.73it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.52it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.61it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.66it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.60it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.66it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.77it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.89it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.07it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.23it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.36it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.33it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.43it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7103710174560547, 'eval_acc': 0.7854, 'eval_acc2': 0.8897, 'eval_f1': 0.7836451382865041, 'eval_roc_auc_micro': 0.9967426113131314, 'eval_precision': 0.7854, 'eval_recall': 0.7854, 'eval_runtime': 17.3045, 'eval_samples_per_second': 577.886, 'eval_steps_per_second': 9.073, 'epoch': 1.79}


 19%|█▊        | 290/1560 [12:50<23:33,  1.11s/it]  

{'loss': 0.9693, 'grad_norm': 3.281850814819336, 'learning_rate': 0.0009171280693414306, 'epoch': 1.86}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.60it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.06it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.32it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.96it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.83it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.78it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.67it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.49it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.17it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.20it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.31it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.41it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.46it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.51it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.39it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.46it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6748632788658142, 'eval_acc': 0.7949, 'eval_acc2': 0.8974, 'eval_f1': 0.7896510558786952, 'eval_roc_auc_micro': 0.9970838932323232, 'eval_precision': 0.7949, 'eval_recall': 0.7949, 'eval_runtime': 15.9235, 'eval_samples_per_second': 628.003, 'eval_steps_per_second': 9.86, 'epoch': 1.86}


 19%|█▉        | 300/1560 [13:17<25:02,  1.19s/it]  

{'loss': 1.0238, 'grad_norm': 3.393048048019409, 'learning_rate': 0.0009114919329468282, 'epoch': 1.92}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.88it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.26it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.47it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.15it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.98it/s][A
 10%|▉         | 15/157 [00:01<00:10, 12.94it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.87it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.79it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.46it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.53it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.56it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.50it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.48it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.52it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.40it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7000189423561096, 'eval_acc': 0.7944, 'eval_acc2': 0.8962, 'eval_f1': 0.7929741524381759, 'eval_roc_auc_micro': 0.9967042288383838, 'eval_precision': 0.7944, 'eval_recall': 0.7944, 'eval_runtime': 15.3554, 'eval_samples_per_second': 651.235, 'eval_steps_per_second': 10.224, 'epoch': 1.92}


 20%|█▉        | 310/1560 [13:41<22:28,  1.08s/it]  

{'loss': 1.0032, 'grad_norm': 3.1523501873016357, 'learning_rate': 0.0009056889192782866, 'epoch': 1.98}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.65it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.98it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.51it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.09it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.89it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.74it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.23it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.31it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.19it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.32it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.43it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.49it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.26it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.39it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.74it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.99it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.27it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6388049721717834, 'eval_acc': 0.8073, 'eval_acc2': 0.9051, 'eval_f1': 0.8034188247843885, 'eval_roc_auc_micro': 0.9974423170707071, 'eval_precision': 0.8073, 'eval_recall': 0.8073, 'eval_runtime': 16.8709, 'eval_samples_per_second': 592.736, 'eval_steps_per_second': 9.306, 'epoch': 1.98}


 21%|██        | 320/1560 [14:11<24:28,  1.18s/it]  

{'loss': 0.9005, 'grad_norm': 3.6607911586761475, 'learning_rate': 0.0008997213817017506, 'epoch': 2.05}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.65it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.11it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.94it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.30it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.00it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.90it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.47it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.55it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.56it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.50it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.40it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.48it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.06it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.97it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.17it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.31it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.40it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.52it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6816002726554871, 'eval_acc': 0.7956, 'eval_acc2': 0.8992, 'eval_f1': 0.7942849840828832, 'eval_roc_auc_micro': 0.9969856100505051, 'eval_precision': 0.7956, 'eval_recall': 0.7956, 'eval_runtime': 17.0763, 'eval_samples_per_second': 585.608, 'eval_steps_per_second': 9.194, 'epoch': 2.05}


 21%|██        | 330/1560 [14:38<25:00,  1.22s/it]  

{'loss': 0.8568, 'grad_norm': 2.8544564247131348, 'learning_rate': 0.000893591740304525, 'epoch': 2.11}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:23,  6.52it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.64it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.77it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.48it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.97it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.25it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.46it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.48it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.34it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.45it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.48it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.68it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.69it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.71it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.65it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.65it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.43it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.44it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6797187328338623, 'eval_acc': 0.7965, 'eval_acc2': 0.9003, 'eval_f1': 0.7931305184985963, 'eval_roc_auc_micro': 0.9969530616161617, 'eval_precision': 0.7965, 'eval_recall': 0.7965, 'eval_runtime': 17.6498, 'eval_samples_per_second': 566.579, 'eval_steps_per_second': 8.895, 'epoch': 2.11}


 22%|██▏       | 340/1560 [15:06<22:49,  1.12s/it]  

{'loss': 0.9176, 'grad_norm': 3.100801467895508, 'learning_rate': 0.0008873024809138273, 'epoch': 2.18}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.42it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.20it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.53it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.29it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.69it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.48it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.59it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.27it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.35it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.44it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.65it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.61it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.86it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.97it/s][A
 20%|█▉        | 31/157 [00:02<00:16,  7.50it/s][A
 20%|██        | 32/157 [00:03<00:15,  7.83it/s][A
 22%|██▏       | 34/157 [00:03<00:14,  8.63it/s][A
 23%|██▎       | 36/157 [00:03<00:13,  9.27it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6433478593826294, 'eval_acc': 0.8063, 'eval_acc2': 0.908, 'eval_f1': 0.8059238395552958, 'eval_roc_auc_micro': 0.9973213327272727, 'eval_precision': 0.8063, 'eval_recall': 0.8063, 'eval_runtime': 17.0066, 'eval_samples_per_second': 588.007, 'eval_steps_per_second': 9.232, 'epoch': 2.18}


 22%|██▏       | 350/1560 [15:33<22:53,  1.13s/it]  

{'loss': 0.8405, 'grad_norm': 3.1269845962524414, 'learning_rate': 0.0008808561540886796, 'epoch': 2.24}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.79it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.36it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.49it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.44it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.39it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.42it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.45it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.48it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.38it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.41it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.48it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.54it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.58it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.30it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.37it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.44it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6405052542686462, 'eval_acc': 0.8062, 'eval_acc2': 0.908, 'eval_f1': 0.804273810490754, 'eval_roc_auc_micro': 0.9974369862626262, 'eval_precision': 0.8062, 'eval_recall': 0.8062, 'eval_runtime': 15.856, 'eval_samples_per_second': 630.677, 'eval_steps_per_second': 9.902, 'epoch': 2.24}


 23%|██▎       | 360/1560 [15:59<24:07,  1.21s/it]  

{'loss': 0.8859, 'grad_norm': 5.601583003997803, 'learning_rate': 0.0008742553740855505, 'epoch': 2.3}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.22it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.14it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.09it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.56it/s][A
  7%|▋         | 11/157 [00:00<00:13, 10.95it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.65it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.74it/s][A
 11%|█         | 17/157 [00:01<00:12, 10.86it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.74it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.69it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.70it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.67it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.53it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.62it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.65it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.63it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.52it/s][A
 25%|██▍       | 39/157 [00:03<00:11, 10.70it/s][A
 26%|██▌       | 41/157 

{'eval_loss': 0.7553624510765076, 'eval_acc': 0.7765, 'eval_acc2': 0.8855, 'eval_f1': 0.7754347319914136, 'eval_roc_auc_micro': 0.996104626060606, 'eval_precision': 0.7765, 'eval_recall': 0.7765, 'eval_runtime': 16.6506, 'eval_samples_per_second': 600.579, 'eval_steps_per_second': 9.429, 'epoch': 2.3}


 24%|██▎       | 370/1560 [16:25<21:52,  1.10s/it]  

{'loss': 0.8659, 'grad_norm': 2.7699108123779297, 'learning_rate': 0.0008675028177981643, 'epoch': 2.37}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:21,  7.25it/s][A
  3%|▎         | 4/157 [00:00<00:16,  9.16it/s][A
  4%|▍         | 6/157 [00:00<00:14, 10.17it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.01it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.48it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.61it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.63it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.57it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.73it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.06it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.26it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.35it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.51it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.57it/s][A
 19%|█▉        | 30/157 [00:02<00:14,  8.73it/s][A
 20%|██        | 32/157 [00:02<00:12,  9.65it/s][A
 22%|██▏       | 34/157 [00:03<00:11, 10.40it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.83it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6578086018562317, 'eval_acc': 0.802, 'eval_acc2': 0.9004, 'eval_f1': 0.7995593898734488, 'eval_roc_auc_micro': 0.9970724694444445, 'eval_precision': 0.802, 'eval_recall': 0.802, 'eval_runtime': 15.402, 'eval_samples_per_second': 649.268, 'eval_steps_per_second': 10.194, 'epoch': 2.37}


 24%|██▍       | 380/1560 [16:51<22:55,  1.17s/it]  

{'loss': 0.9041, 'grad_norm': 4.146464824676514, 'learning_rate': 0.0008606012236719073, 'epoch': 2.43}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:36,  4.22it/s][A
  3%|▎         | 4/157 [00:00<00:22,  6.93it/s][A
  4%|▍         | 6/157 [00:00<00:17,  8.69it/s][A
  5%|▌         | 8/157 [00:00<00:15,  9.89it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.70it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.24it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.64it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.79it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.99it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.19it/s][A
 14%|█▍        | 22/157 [00:02<00:10, 12.34it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.27it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.17it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.33it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.41it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.48it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.45it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6536246538162231, 'eval_acc': 0.8054, 'eval_acc2': 0.9037, 'eval_f1': 0.8025150410999741, 'eval_roc_auc_micro': 0.9972048517171717, 'eval_precision': 0.8054, 'eval_recall': 0.8054, 'eval_runtime': 15.8936, 'eval_samples_per_second': 629.184, 'eval_steps_per_second': 9.878, 'epoch': 2.43}


 25%|██▌       | 390/1560 [17:16<21:38,  1.11s/it]  

{'loss': 0.7969, 'grad_norm': 3.6109724044799805, 'learning_rate': 0.0008535533905932737, 'epoch': 2.5}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.12it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.65it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.66it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.23it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.00it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.92it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.84it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.55it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.59it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.56it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.54it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.36it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.28it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.53it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.58it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.65it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.66it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6254217624664307, 'eval_acc': 0.81, 'eval_acc2': 0.9059, 'eval_f1': 0.8081844507158029, 'eval_roc_auc_micro': 0.9974154785353535, 'eval_precision': 0.81, 'eval_recall': 0.81, 'eval_runtime': 15.9144, 'eval_samples_per_second': 628.36, 'eval_steps_per_second': 9.865, 'epoch': 2.5}


 26%|██▌       | 400/1560 [17:42<21:13,  1.10s/it]  

{'loss': 0.8092, 'grad_norm': 3.5384316444396973, 'learning_rate': 0.0008463621767547997, 'epoch': 2.56}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.88it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.03it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.00it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.38it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.08it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.96it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.33it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.33it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.45it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.52it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.58it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.64it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.63it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.59it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.61it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.62it/s][A
 22%|██▏       | 35/157 [00:02<00:13,  8.82it/s][A
 24%|██▎       | 37/157 [00:03<00:12,  9.72it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6376038193702698, 'eval_acc': 0.8068, 'eval_acc2': 0.9071, 'eval_f1': 0.8050585186786183, 'eval_roc_auc_micro': 0.9972131683333333, 'eval_precision': 0.8068, 'eval_recall': 0.8068, 'eval_runtime': 15.4459, 'eval_samples_per_second': 647.42, 'eval_steps_per_second': 10.165, 'epoch': 2.56}


 26%|██▋       | 410/1560 [18:07<20:42,  1.08s/it]  

{'loss': 0.8582, 'grad_norm': 4.55311393737793, 'learning_rate': 0.0008390304984959455, 'epoch': 2.62}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.48it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.56it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.08it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.71it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.53it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.61it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.64it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.23it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.70it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.65it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.29it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.23it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.22it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 11.16it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.06it/s][A
 22%|██▏       | 35/157 [00:02<00:11, 11.00it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.83it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6576540470123291, 'eval_acc': 0.8047, 'eval_acc2': 0.9021, 'eval_f1': 0.8040298217158708, 'eval_roc_auc_micro': 0.9970190020707071, 'eval_precision': 0.8047, 'eval_recall': 0.8047, 'eval_runtime': 17.0852, 'eval_samples_per_second': 585.3, 'eval_steps_per_second': 9.189, 'epoch': 2.62}


 27%|██▋       | 420/1560 [18:34<21:16,  1.12s/it]  

{'loss': 0.8201, 'grad_norm': 3.5072827339172363, 'learning_rate': 0.0008315613291203976, 'epoch': 2.69}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.67it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.08it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.37it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.59it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.72it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.48it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.41it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.11it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.08it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.02it/s][A
 15%|█▍        | 23/157 [00:01<00:12, 10.74it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.62it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.75it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 10.70it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.63it/s][A
 21%|██        | 33/157 [00:02<00:12, 10.26it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.48it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.79it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6138383746147156, 'eval_acc': 0.8144, 'eval_acc2': 0.9119, 'eval_f1': 0.8130705278641523, 'eval_roc_auc_micro': 0.9975326205050505, 'eval_precision': 0.8144, 'eval_recall': 0.8144, 'eval_runtime': 16.9891, 'eval_samples_per_second': 588.613, 'eval_steps_per_second': 9.241, 'epoch': 2.69}


 28%|██▊       | 430/1560 [19:00<20:33,  1.09s/it]  

{'loss': 0.7906, 'grad_norm': 2.7943999767303467, 'learning_rate': 0.0008239576976902694, 'epoch': 2.75}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.45it/s][A
  3%|▎         | 5/157 [00:00<00:12, 11.98it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.05it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.77it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.55it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.80it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.06it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.25it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.25it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.30it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.33it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.23it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.21it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.33it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.46it/s][A
 21%|██        | 33/157 [00:02<00:14,  8.67it/s][A
 22%|██▏       | 35/157 [00:03<00:12,  9.59it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.24it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.621157705783844, 'eval_acc': 0.8112, 'eval_acc2': 0.9102, 'eval_f1': 0.8091548695767014, 'eval_roc_auc_micro': 0.9975177377777777, 'eval_precision': 0.8112, 'eval_recall': 0.8112, 'eval_runtime': 15.4794, 'eval_samples_per_second': 646.021, 'eval_steps_per_second': 10.143, 'epoch': 2.75}


 28%|██▊       | 440/1560 [19:26<20:27,  1.10s/it]  

{'loss': 0.8708, 'grad_norm': 2.275099277496338, 'learning_rate': 0.0008162226877976886, 'epoch': 2.82}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.08it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.92it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.90it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.02it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.88it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.72it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.45it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.75it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.80it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.87it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.89it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.81it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.93it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.89it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.90it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.89it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.79it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.41it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6226443648338318, 'eval_acc': 0.8107, 'eval_acc2': 0.9089, 'eval_f1': 0.809913951221705, 'eval_roc_auc_micro': 0.9973969401010101, 'eval_precision': 0.8107, 'eval_recall': 0.8107, 'eval_runtime': 17.3422, 'eval_samples_per_second': 576.627, 'eval_steps_per_second': 9.053, 'epoch': 2.82}


 29%|██▉       | 450/1560 [19:52<20:38,  1.12s/it]  

{'loss': 0.8328, 'grad_norm': 3.4218950271606445, 'learning_rate': 0.0008083594363142716, 'epoch': 2.88}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:10, 15.03it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.87it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.28it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.86it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.81it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.69it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.64it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.45it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.53it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.62it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.70it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.73it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.67it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.59it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.61it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.47it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.39it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6346942186355591, 'eval_acc': 0.8059, 'eval_acc2': 0.909, 'eval_f1': 0.8040485547169787, 'eval_roc_auc_micro': 0.9974204567676767, 'eval_precision': 0.8059, 'eval_recall': 0.8059, 'eval_runtime': 15.2473, 'eval_samples_per_second': 655.854, 'eval_steps_per_second': 10.297, 'epoch': 2.88}


 29%|██▉       | 460/1560 [20:17<20:26,  1.12s/it]  

{'loss': 0.8125, 'grad_norm': 2.526024580001831, 'learning_rate': 0.0008003711321189895, 'epoch': 2.94}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.32it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.37it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.29it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.52it/s][A
  7%|▋         | 11/157 [00:00<00:13, 11.09it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.74it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.76it/s][A
 11%|█         | 17/157 [00:01<00:12, 10.78it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.84it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.55it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.59it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.21it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.32it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.29it/s][A
 20%|█▉        | 31/157 [00:02<00:12, 10.35it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.48it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.57it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.81it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5997201800346375, 'eval_acc': 0.8197, 'eval_acc2': 0.9125, 'eval_f1': 0.8173994777864833, 'eval_roc_auc_micro': 0.9976279544949495, 'eval_precision': 0.8197, 'eval_recall': 0.8197, 'eval_runtime': 16.4947, 'eval_samples_per_second': 606.256, 'eval_steps_per_second': 9.518, 'epoch': 2.94}


 30%|███       | 470/1560 [20:47<31:51,  1.75s/it]  

{'loss': 0.7923, 'grad_norm': 3.0902202129364014, 'learning_rate': 0.0007922610148049445, 'epoch': 3.01}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.00it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.21it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.98it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.41it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.14it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.98it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.81it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.72it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.68it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.71it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.79it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.58it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.41it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.31it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.43it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.53it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.40it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6125423908233643, 'eval_acc': 0.8127, 'eval_acc2': 0.9105, 'eval_f1': 0.8120329532275701, 'eval_roc_auc_micro': 0.997554333989899, 'eval_precision': 0.8127, 'eval_recall': 0.8127, 'eval_runtime': 15.9844, 'eval_samples_per_second': 625.61, 'eval_steps_per_second': 9.822, 'epoch': 3.01}


 31%|███       | 480/1560 [21:13<21:19,  1.18s/it]  

{'loss': 0.6123, 'grad_norm': 3.440985679626465, 'learning_rate': 0.0007840323733655779, 'epoch': 3.07}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.20it/s][A
  3%|▎         | 5/157 [00:00<00:13, 11.54it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.99it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.24it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.76it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.05it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.08it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.26it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.32it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.42it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.46it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.50it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.18it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.16it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.96it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.16it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.37it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.45it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6238719820976257, 'eval_acc': 0.8104, 'eval_acc2': 0.9063, 'eval_f1': 0.8105905204699696, 'eval_roc_auc_micro': 0.9975224752020202, 'eval_precision': 0.8104, 'eval_recall': 0.8104, 'eval_runtime': 15.2653, 'eval_samples_per_second': 655.08, 'eval_steps_per_second': 10.285, 'epoch': 3.07}


 31%|███▏      | 490/1560 [21:38<19:20,  1.08s/it]  

{'loss': 0.6651, 'grad_norm': 2.9261655807495117, 'learning_rate': 0.000775688544860846, 'epoch': 3.14}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.82it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.62it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.28it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.96it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.80it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.67it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.62it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.54it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.57it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.58it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.63it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.65it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.60it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.58it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.59it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.37it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.46it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.49it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6110853552818298, 'eval_acc': 0.8189, 'eval_acc2': 0.9107, 'eval_f1': 0.8176246700257621, 'eval_roc_auc_micro': 0.9974444931313132, 'eval_precision': 0.8189, 'eval_recall': 0.8189, 'eval_runtime': 16.3808, 'eval_samples_per_second': 610.471, 'eval_steps_per_second': 9.584, 'epoch': 3.14}


 32%|███▏      | 500/1560 [22:04<19:41,  1.11s/it]  

{'loss': 0.6454, 'grad_norm': 2.3556764125823975, 'learning_rate': 0.0007672329130639005, 'epoch': 3.2}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.69it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.71it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.16it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.70it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.44it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.47it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.47it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.47it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.25it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.29it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.34it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.28it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.33it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.37it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.41it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.96it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.19it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5850012898445129, 'eval_acc': 0.824, 'eval_acc2': 0.9178, 'eval_f1': 0.8238293890325545, 'eval_roc_auc_micro': 0.9977302616161616, 'eval_precision': 0.824, 'eval_recall': 0.824, 'eval_runtime': 15.7085, 'eval_samples_per_second': 636.6, 'eval_steps_per_second': 9.995, 'epoch': 3.2}


 33%|███▎      | 510/1560 [22:30<20:11,  1.15s/it]  

{'loss': 0.6358, 'grad_norm': 3.1613657474517822, 'learning_rate': 0.0007586689070888284, 'epoch': 3.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 15.96it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.10it/s][A
  4%|▍         | 7/157 [00:00<00:13, 11.06it/s][A
  6%|▌         | 9/157 [00:00<00:13, 11.07it/s][A
  7%|▋         | 11/157 [00:00<00:13, 11.08it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.99it/s][A
 10%|▉         | 15/157 [00:01<00:12, 10.94it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.77it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.84it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.98it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.83it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.94it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 10.90it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 10.80it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.93it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.07it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.99it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.90it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6045922636985779, 'eval_acc': 0.8169, 'eval_acc2': 0.9116, 'eval_f1': 0.8156570980807679, 'eval_roc_auc_micro': 0.9974833018181818, 'eval_precision': 0.8169, 'eval_recall': 0.8169, 'eval_runtime': 17.3348, 'eval_samples_per_second': 576.874, 'eval_steps_per_second': 9.057, 'epoch': 3.26}


 33%|███▎      | 520/1560 [22:56<19:15,  1.11s/it]  

{'loss': 0.6613, 'grad_norm': 3.288064479827881, 'learning_rate': 0.00075, 'epoch': 3.33}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.27it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.56it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.08it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.87it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.85it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.80it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.77it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.67it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.65it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.42it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.21it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.41it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.49it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.49it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.53it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.43it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.46it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.58it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6146724820137024, 'eval_acc': 0.813, 'eval_acc2': 0.9097, 'eval_f1': 0.8112758923195902, 'eval_roc_auc_micro': 0.9974993501515151, 'eval_precision': 0.813, 'eval_recall': 0.813, 'eval_runtime': 15.4463, 'eval_samples_per_second': 647.403, 'eval_steps_per_second': 10.164, 'epoch': 3.33}


 34%|███▍      | 530/1560 [23:22<19:39,  1.15s/it]  

{'loss': 0.6273, 'grad_norm': 2.993682622909546, 'learning_rate': 0.0007412297074035968, 'epoch': 3.39}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:10, 15.04it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.13it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.38it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.06it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.53it/s][A
  8%|▊         | 13/157 [00:01<00:13, 11.03it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.04it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.74it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.79it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.54it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.75it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.67it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.68it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 10.72it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.80it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.79it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.72it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.73it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5965828895568848, 'eval_acc': 0.8188, 'eval_acc2': 0.9142, 'eval_f1': 0.8169238707900063, 'eval_roc_auc_micro': 0.9975936118686868, 'eval_precision': 0.8188, 'eval_recall': 0.8188, 'eval_runtime': 16.7522, 'eval_samples_per_second': 596.935, 'eval_steps_per_second': 9.372, 'epoch': 3.39}


 35%|███▍      | 540/1560 [23:48<18:39,  1.10s/it]  

{'loss': 0.6974, 'grad_norm': 3.411116123199463, 'learning_rate': 0.0007323615860218843, 'epoch': 3.46}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:12, 12.01it/s][A
  3%|▎         | 5/157 [00:00<00:13, 11.53it/s][A
  4%|▍         | 7/157 [00:00<00:19,  7.64it/s][A
  6%|▌         | 9/157 [00:00<00:16,  8.78it/s][A
  7%|▋         | 11/157 [00:01<00:14,  9.77it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.53it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.09it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.41it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.73it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.99it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.16it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.27it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.49it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.45it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.44it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.49it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.57it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6069386005401611, 'eval_acc': 0.8146, 'eval_acc2': 0.913, 'eval_f1': 0.8149643267505116, 'eval_roc_auc_micro': 0.9975756639898989, 'eval_precision': 0.8146, 'eval_recall': 0.8146, 'eval_runtime': 15.5009, 'eval_samples_per_second': 645.123, 'eval_steps_per_second': 10.128, 'epoch': 3.46}


 35%|███▌      | 550/1560 [24:14<19:19,  1.15s/it]  

{'loss': 0.6641, 'grad_norm': 5.273350238800049, 'learning_rate': 0.000723399232250813, 'epoch': 3.52}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.46it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.53it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.29it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.24it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.36it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.45it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.49it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.56it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.57it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.56it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.58it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.61it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.66it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.69it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.71it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.65it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.70it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.72it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.622794508934021, 'eval_acc': 0.813, 'eval_acc2': 0.9051, 'eval_f1': 0.8124634013768329, 'eval_roc_auc_micro': 0.9975193553535354, 'eval_precision': 0.813, 'eval_recall': 0.813, 'eval_runtime': 16.3623, 'eval_samples_per_second': 611.16, 'eval_steps_per_second': 9.595, 'epoch': 3.52}


 36%|███▌      | 560/1560 [24:40<18:22,  1.10s/it]  

{'loss': 0.65, 'grad_norm': 3.2915711402893066, 'learning_rate': 0.000714346280701527, 'epoch': 3.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:16,  9.37it/s][A
  3%|▎         | 4/157 [00:00<00:14, 10.92it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.56it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.46it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.50it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.80it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.90it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.02it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.19it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.12it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.91it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.90it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.00it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.18it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.31it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.19it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.34it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.37it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5718729496002197, 'eval_acc': 0.8271, 'eval_acc2': 0.9166, 'eval_f1': 0.8267627112167937, 'eval_roc_auc_micro': 0.9978966551515152, 'eval_precision': 0.8271, 'eval_recall': 0.8271, 'eval_runtime': 16.2471, 'eval_samples_per_second': 615.493, 'eval_steps_per_second': 9.663, 'epoch': 3.58}


 37%|███▋      | 570/1560 [25:06<18:35,  1.13s/it]  

{'loss': 0.6605, 'grad_norm': 3.3264732360839844, 'learning_rate': 0.0007052064027263785, 'epoch': 3.65}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:17,  8.64it/s][A
  3%|▎         | 4/157 [00:00<00:14, 10.27it/s][A
  4%|▍         | 6/157 [00:00<00:21,  7.12it/s][A
  5%|▌         | 8/157 [00:00<00:17,  8.59it/s][A
  6%|▋         | 10/157 [00:01<00:15,  9.67it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.51it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.06it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.52it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.92it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.03it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 12.16it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.34it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.38it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.39it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.45it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.36it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.50it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.60it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5603836178779602, 'eval_acc': 0.8299, 'eval_acc2': 0.9216, 'eval_f1': 0.8295840456687298, 'eval_roc_auc_micro': 0.9979898919696971, 'eval_precision': 0.8299, 'eval_recall': 0.8299, 'eval_runtime': 15.5275, 'eval_samples_per_second': 644.02, 'eval_steps_per_second': 10.111, 'epoch': 3.65}


 37%|███▋      | 580/1560 [25:31<17:38,  1.08s/it]  

{'loss': 0.6709, 'grad_norm': 3.8408825397491455, 'learning_rate': 0.0006959833049300376, 'epoch': 3.71}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.02it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.02it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.87it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.08it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.89it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.79it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.78it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.75it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.77it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.78it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.73it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.46it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.59it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.52it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.79it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.54it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.589067816734314, 'eval_acc': 0.8233, 'eval_acc2': 0.9163, 'eval_f1': 0.8217491329028132, 'eval_roc_auc_micro': 0.99775143989899, 'eval_precision': 0.8233, 'eval_recall': 0.8233, 'eval_runtime': 16.8572, 'eval_samples_per_second': 593.217, 'eval_steps_per_second': 9.314, 'epoch': 3.71}


 38%|███▊      | 590/1560 [25:58<17:50,  1.10s/it]  

{'loss': 0.6453, 'grad_norm': 3.042384624481201, 'learning_rate': 0.0006866807276663105, 'epoch': 3.78}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 17.11it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.36it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.68it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.23it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.87it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.77it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.79it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.78it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.76it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.65it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.49it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.32it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.13it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.06it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.24it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.27it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.29it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5549894571304321, 'eval_acc': 0.8318, 'eval_acc2': 0.9209, 'eval_f1': 0.8309228419661262, 'eval_roc_auc_micro': 0.9980708287878788, 'eval_precision': 0.8318, 'eval_recall': 0.8318, 'eval_runtime': 15.5041, 'eval_samples_per_second': 644.991, 'eval_steps_per_second': 10.126, 'epoch': 3.78}


 38%|███▊      | 600/1560 [26:23<17:21,  1.08s/it]  

{'loss': 0.638, 'grad_norm': 4.731353282928467, 'learning_rate': 0.0006773024435212678, 'epoch': 3.84}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.16it/s][A
  3%|▎         | 5/157 [00:00<00:23,  6.36it/s][A
  4%|▍         | 7/157 [00:00<00:19,  7.74it/s][A
  6%|▌         | 9/157 [00:01<00:17,  8.69it/s][A
  7%|▋         | 11/157 [00:01<00:15,  9.40it/s][A
  8%|▊         | 13/157 [00:01<00:14,  9.92it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.21it/s][A
 11%|█         | 17/157 [00:01<00:12, 10.82it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.30it/s][A
 13%|█▎        | 21/157 [00:02<00:11, 11.56it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.81it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.96it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.96it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.85it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.87it/s][A
 21%|██        | 33/157 [00:03<00:10, 11.73it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.62it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.82it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5836527347564697, 'eval_acc': 0.8253, 'eval_acc2': 0.9162, 'eval_f1': 0.8237652666780719, 'eval_roc_auc_micro': 0.9977871045959595, 'eval_precision': 0.8253, 'eval_recall': 0.8253, 'eval_runtime': 17.5018, 'eval_samples_per_second': 571.369, 'eval_steps_per_second': 8.97, 'epoch': 3.84}


 39%|███▉      | 610/1560 [26:50<17:29,  1.10s/it]  

{'loss': 0.7036, 'grad_norm': 2.716865062713623, 'learning_rate': 0.0006678522557833024, 'epoch': 3.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.05it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.37it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.11it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.85it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.79it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.53it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.50it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.56it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.48it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.50it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.57it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.59it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.59it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.56it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.56it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.57it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.60it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.63it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5291991233825684, 'eval_acc': 0.8368, 'eval_acc2': 0.9252, 'eval_f1': 0.8360310762034773, 'eval_roc_auc_micro': 0.9982066806565656, 'eval_precision': 0.8368, 'eval_recall': 0.8368, 'eval_runtime': 15.4656, 'eval_samples_per_second': 646.597, 'eval_steps_per_second': 10.152, 'epoch': 3.9}


 40%|███▉      | 620/1560 [27:15<17:35,  1.12s/it]  

{'loss': 0.6565, 'grad_norm': 2.3335087299346924, 'learning_rate': 0.0006583339969007363, 'epoch': 3.97}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:26,  5.88it/s][A
  3%|▎         | 4/157 [00:00<00:19,  7.99it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.12it/s][A
  5%|▌         | 8/157 [00:00<00:15,  9.68it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.02it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.36it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.49it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.52it/s][A
 11%|█▏        | 18/157 [00:01<00:13, 10.67it/s][A
 13%|█▎        | 20/157 [00:02<00:12, 10.66it/s][A
 14%|█▍        | 22/157 [00:02<00:12, 10.60it/s][A
 15%|█▌        | 24/157 [00:02<00:12, 10.68it/s][A
 17%|█▋        | 26/157 [00:02<00:12, 10.72it/s][A
 18%|█▊        | 28/157 [00:02<00:12, 10.65it/s][A
 19%|█▉        | 30/157 [00:02<00:11, 10.63it/s][A
 20%|██        | 32/157 [00:03<00:11, 10.51it/s][A
 22%|██▏       | 34/157 [00:03<00:11, 10.68it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.75it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5388972759246826, 'eval_acc': 0.836, 'eval_acc2': 0.926, 'eval_f1': 0.8349348558150071, 'eval_roc_auc_micro': 0.998098459090909, 'eval_precision': 0.836, 'eval_recall': 0.836, 'eval_runtime': 16.6062, 'eval_samples_per_second': 602.183, 'eval_steps_per_second': 9.454, 'epoch': 3.97}


 40%|████      | 630/1560 [27:44<19:41,  1.27s/it]  

{'loss': 0.6315, 'grad_norm': 2.8175764083862305, 'learning_rate': 0.0006487515269276015, 'epoch': 4.03}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.67it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.39it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.71it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.43it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.09it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.92it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.67it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.61it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.47it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.56it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.56it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.60it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.65it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.62it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.67it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.66it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.45it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.49it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.574783205986023, 'eval_acc': 0.8252, 'eval_acc2': 0.9157, 'eval_f1': 0.8246628580616536, 'eval_roc_auc_micro': 0.9979300230808081, 'eval_precision': 0.8252, 'eval_recall': 0.8252, 'eval_runtime': 16.266, 'eval_samples_per_second': 614.778, 'eval_steps_per_second': 9.652, 'epoch': 4.03}


 41%|████      | 640/1560 [28:10<17:12,  1.12s/it]  

{'loss': 0.5228, 'grad_norm': 2.679290294647217, 'learning_rate': 0.0006391087319582263, 'epoch': 4.1}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.65it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.63it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.75it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.28it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.68it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.67it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.67it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.53it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.63it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.70it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.75it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.72it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.70it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.52it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.59it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.44it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.52it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5441035032272339, 'eval_acc': 0.8338, 'eval_acc2': 0.923, 'eval_f1': 0.8333114279087728, 'eval_roc_auc_micro': 0.9981252406565657, 'eval_precision': 0.8338, 'eval_recall': 0.8338, 'eval_runtime': 15.4878, 'eval_samples_per_second': 645.67, 'eval_steps_per_second': 10.137, 'epoch': 4.1}


 42%|████▏     | 650/1560 [28:35<16:29,  1.09s/it]  

{'loss': 0.5072, 'grad_norm': 3.254708766937256, 'learning_rate': 0.0006294095225512603, 'epoch': 4.16}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:14, 10.74it/s][A
  3%|▎         | 4/157 [00:00<00:13, 11.36it/s][A
  4%|▍         | 6/157 [00:00<00:12, 11.64it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.63it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.76it/s][A
  8%|▊         | 12/157 [00:01<00:12, 12.06it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.85it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.10it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.14it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.31it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.40it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.47it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.50it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.37it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.43it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.38it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 11.91it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.52it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5941540002822876, 'eval_acc': 0.8239, 'eval_acc2': 0.918, 'eval_f1': 0.823581127034945, 'eval_roc_auc_micro': 0.9975914875252525, 'eval_precision': 0.8239, 'eval_recall': 0.8239, 'eval_runtime': 16.8389, 'eval_samples_per_second': 593.861, 'eval_steps_per_second': 9.324, 'epoch': 4.16}


 42%|████▏     | 660/1560 [29:02<16:28,  1.10s/it]  

{'loss': 0.5245, 'grad_norm': 2.523192882537842, 'learning_rate': 0.0006196578321437789, 'epoch': 4.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:12, 12.25it/s][A
  3%|▎         | 4/157 [00:00<00:22,  6.75it/s][A
  4%|▍         | 6/157 [00:00<00:17,  8.44it/s][A
  5%|▌         | 8/157 [00:00<00:15,  9.33it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.27it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.89it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.42it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.73it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.02it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.61it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.73it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.87it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.15it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.27it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.34it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.40it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.48it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.52it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5576910376548767, 'eval_acc': 0.8339, 'eval_acc2': 0.918, 'eval_f1': 0.8324161482718738, 'eval_roc_auc_micro': 0.9979393262121212, 'eval_precision': 0.8339, 'eval_recall': 0.8339, 'eval_runtime': 15.545, 'eval_samples_per_second': 643.294, 'eval_steps_per_second': 10.1, 'epoch': 4.22}


 43%|████▎     | 670/1560 [29:27<16:06,  1.09s/it]  

{'loss': 0.5062, 'grad_norm': 3.6709957122802734, 'learning_rate': 0.0006098576154561086, 'epoch': 4.29}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.46it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.89it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.77it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.13it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.86it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.59it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.61it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.46it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.13it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.94it/s][A
 15%|█▍        | 23/157 [00:01<00:12, 11.02it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.30it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.59it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.81it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.99it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.04it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.08it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.02it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5661942362785339, 'eval_acc': 0.8316, 'eval_acc2': 0.9237, 'eval_f1': 0.8313341268818294, 'eval_roc_auc_micro': 0.997906879949495, 'eval_precision': 0.8316, 'eval_recall': 0.8316, 'eval_runtime': 17.2764, 'eval_samples_per_second': 578.825, 'eval_steps_per_second': 9.088, 'epoch': 4.29}


 44%|████▎     | 680/1560 [29:54<16:14,  1.11s/it]  

{'loss': 0.556, 'grad_norm': 4.266321659088135, 'learning_rate': 0.0006000128468880223, 'epoch': 4.35}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:15, 10.28it/s][A
  3%|▎         | 4/157 [00:00<00:13, 11.25it/s][A
  4%|▍         | 6/157 [00:00<00:12, 11.85it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.08it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.20it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.37it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.30it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.37it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.47it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.46it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.46it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.55it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.45it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.50it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.56it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.62it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.71it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.71it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5626367926597595, 'eval_acc': 0.8308, 'eval_acc2': 0.9185, 'eval_f1': 0.8304477930222178, 'eval_roc_auc_micro': 0.9980141035353536, 'eval_precision': 0.8308, 'eval_recall': 0.8308, 'eval_runtime': 15.4182, 'eval_samples_per_second': 648.583, 'eval_steps_per_second': 10.183, 'epoch': 4.35}


 44%|████▍     | 690/1560 [30:19<17:01,  1.17s/it]  

{'loss': 0.5275, 'grad_norm': 3.5551483631134033, 'learning_rate': 0.000590127518906953, 'epoch': 4.42}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:42,  3.63it/s][A
  3%|▎         | 4/157 [00:00<00:25,  5.92it/s][A
  4%|▍         | 6/157 [00:00<00:20,  7.55it/s][A
  5%|▌         | 8/157 [00:01<00:17,  8.59it/s][A
  6%|▋         | 10/157 [00:01<00:15,  9.20it/s][A
  8%|▊         | 12/157 [00:01<00:14,  9.67it/s][A
  9%|▉         | 14/157 [00:01<00:14,  9.98it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.12it/s][A
 11%|█▏        | 18/157 [00:02<00:13, 10.16it/s][A
 13%|█▎        | 20/157 [00:02<00:13, 10.14it/s][A
 14%|█▍        | 22/157 [00:02<00:12, 10.49it/s][A
 15%|█▌        | 24/157 [00:02<00:12, 10.52it/s][A
 17%|█▋        | 26/157 [00:02<00:12, 10.65it/s][A
 18%|█▊        | 28/157 [00:02<00:12, 10.44it/s][A
 19%|█▉        | 30/157 [00:03<00:11, 10.59it/s][A
 20%|██        | 32/157 [00:03<00:11, 10.72it/s][A
 22%|██▏       | 34/157 [00:03<00:11, 10.78it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.75it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5653361678123474, 'eval_acc': 0.8257, 'eval_acc2': 0.9201, 'eval_f1': 0.8248240620894336, 'eval_roc_auc_micro': 0.9980093144949495, 'eval_precision': 0.8257, 'eval_recall': 0.8257, 'eval_runtime': 16.4245, 'eval_samples_per_second': 608.846, 'eval_steps_per_second': 9.559, 'epoch': 4.42}


 45%|████▍     | 700/1560 [30:45<15:40,  1.09s/it]  

{'loss': 0.5305, 'grad_norm': 3.49045729637146, 'learning_rate': 0.0005802056404288802, 'epoch': 4.48}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 16.39it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.25it/s][A
  4%|▍         | 6/157 [00:00<00:11, 12.66it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.41it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.42it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.38it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.35it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.40it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.43it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.45it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.52it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.58it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.57it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.58it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.50it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.47it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.40it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.45it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5380717515945435, 'eval_acc': 0.8367, 'eval_acc2': 0.9253, 'eval_f1': 0.8355354322642883, 'eval_roc_auc_micro': 0.9981231932323233, 'eval_precision': 0.8367, 'eval_recall': 0.8367, 'eval_runtime': 15.8795, 'eval_samples_per_second': 629.741, 'eval_steps_per_second': 9.887, 'epoch': 4.48}


 46%|████▌     | 710/1560 [31:12<16:13,  1.14s/it]  

{'loss': 0.5381, 'grad_norm': 2.773787498474121, 'learning_rate': 0.0005702512351925465, 'epoch': 4.54}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.44it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.03it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.91it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.24it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.52it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.45it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.44it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.43it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.37it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.39it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.42it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.12it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.11it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.18it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.37it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.33it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.17it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.26it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5434057712554932, 'eval_acc': 0.8371, 'eval_acc2': 0.9185, 'eval_f1': 0.8365357359934944, 'eval_roc_auc_micro': 0.9980318938888888, 'eval_precision': 0.8371, 'eval_recall': 0.8371, 'eval_runtime': 15.5758, 'eval_samples_per_second': 642.021, 'eval_steps_per_second': 10.08, 'epoch': 4.54}


 46%|████▌     | 720/1560 [31:37<15:19,  1.09s/it]  

{'loss': 0.5003, 'grad_norm': 2.5247175693511963, 'learning_rate': 0.0005602683401276614, 'epoch': 4.61}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.58it/s][A
  3%|▎         | 5/157 [00:00<00:18,  8.18it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.56it/s][A
  6%|▌         | 9/157 [00:00<00:14, 10.48it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.14it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.31it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.50it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.85it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.95it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.09it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.03it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.17it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.24it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.29it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.28it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.41it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.54it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.35it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5511701107025146, 'eval_acc': 0.8371, 'eval_acc2': 0.9222, 'eval_f1': 0.8361222969089814, 'eval_roc_auc_micro': 0.9979908426767676, 'eval_precision': 0.8371, 'eval_recall': 0.8371, 'eval_runtime': 16.2313, 'eval_samples_per_second': 616.095, 'eval_steps_per_second': 9.673, 'epoch': 4.61}


 47%|████▋     | 730/1560 [32:02<15:06,  1.09s/it]  

{'loss': 0.5118, 'grad_norm': 3.281435489654541, 'learning_rate': 0.0005502610037177585, 'epoch': 4.67}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.10it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.64it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.02it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.63it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.60it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.29it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.36it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.48it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.54it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.63it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.65it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.51it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.41it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.27it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.02it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.22it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.28it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.23it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5290099382400513, 'eval_acc': 0.8423, 'eval_acc2': 0.9259, 'eval_f1': 0.8422530646448018, 'eval_roc_auc_micro': 0.9981534383838384, 'eval_precision': 0.8423, 'eval_recall': 0.8423, 'eval_runtime': 15.5718, 'eval_samples_per_second': 642.187, 'eval_steps_per_second': 10.082, 'epoch': 4.67}


 47%|████▋     | 740/1560 [32:28<15:26,  1.13s/it]  

{'loss': 0.5119, 'grad_norm': 2.1762144565582275, 'learning_rate': 0.000540233284358363, 'epoch': 4.74}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.75it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.50it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.25it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.85it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.26it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.25it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.34it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.37it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.04it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.14it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.62it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.93it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.12it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.21it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.34it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.43it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.32it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5401949286460876, 'eval_acc': 0.8424, 'eval_acc2': 0.923, 'eval_f1': 0.8408966524057422, 'eval_roc_auc_micro': 0.998054312878788, 'eval_precision': 0.8424, 'eval_recall': 0.8424, 'eval_runtime': 16.4436, 'eval_samples_per_second': 608.14, 'eval_steps_per_second': 9.548, 'epoch': 4.74}


 48%|████▊     | 750/1560 [32:54<14:46,  1.09s/it]  

{'loss': 0.5112, 'grad_norm': 3.049009084701538, 'learning_rate': 0.0005301892487111431, 'epoch': 4.8}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:24,  6.34it/s][A
  3%|▎         | 5/157 [00:00<00:18,  8.31it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.60it/s][A
  6%|▌         | 9/157 [00:01<00:20,  7.34it/s][A
  7%|▋         | 11/157 [00:01<00:17,  8.58it/s][A
  8%|▊         | 13/157 [00:01<00:15,  9.53it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.25it/s][A
 11%|█         | 17/157 [00:01<00:12, 10.85it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.39it/s][A
 13%|█▎        | 21/157 [00:02<00:11, 11.74it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.98it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.88it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.96it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.12it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.28it/s][A
 21%|██        | 33/157 [00:03<00:09, 12.40it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.45it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.49it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5238322615623474, 'eval_acc': 0.8451, 'eval_acc2': 0.9282, 'eval_f1': 0.8443427637686327, 'eval_roc_auc_micro': 0.998156143030303, 'eval_precision': 0.8451, 'eval_recall': 0.8451, 'eval_runtime': 16.2587, 'eval_samples_per_second': 615.057, 'eval_steps_per_second': 9.656, 'epoch': 4.8}


 49%|████▊     | 760/1560 [33:20<14:47,  1.11s/it]  

{'loss': 0.5008, 'grad_norm': 2.9343960285186768, 'learning_rate': 0.0005201329700547076, 'epoch': 4.86}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.70it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.99it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.95it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.38it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.09it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.82it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.74it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.70it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.65it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.64it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.43it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.31it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.50it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.54it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.61it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.62it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.63it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.63it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5333725810050964, 'eval_acc': 0.8383, 'eval_acc2': 0.9235, 'eval_f1': 0.8371318049048161, 'eval_roc_auc_micro': 0.9981695103030302, 'eval_precision': 0.8383, 'eval_recall': 0.8383, 'eval_runtime': 15.5651, 'eval_samples_per_second': 642.461, 'eval_steps_per_second': 10.087, 'epoch': 4.86}


 49%|████▉     | 770/1560 [33:45<14:18,  1.09s/it]  

{'loss': 0.5071, 'grad_norm': 3.9468183517456055, 'learning_rate': 0.0005100685266327202, 'epoch': 4.93}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 19.05it/s][A
  3%|▎         | 4/157 [00:00<00:10, 14.52it/s][A
  4%|▍         | 6/157 [00:00<00:11, 13.50it/s][A
  5%|▌         | 8/157 [00:00<00:11, 13.14it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.90it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.80it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.72it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.67it/s][A
 11%|█▏        | 18/157 [00:01<00:10, 12.68it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.39it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.87it/s][A
 15%|█▌        | 24/157 [00:01<00:11, 11.43it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.44it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.39it/s][A
 19%|█▉        | 30/157 [00:02<00:11, 11.27it/s][A
 20%|██        | 32/157 [00:02<00:11, 11.19it/s][A
 22%|██▏       | 34/157 [00:02<00:11, 11.13it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.03it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5005475878715515, 'eval_acc': 0.8478, 'eval_acc2': 0.931, 'eval_f1': 0.8479276114879711, 'eval_roc_auc_micro': 0.9983508354545454, 'eval_precision': 0.8478, 'eval_recall': 0.8478, 'eval_runtime': 16.2304, 'eval_samples_per_second': 616.129, 'eval_steps_per_second': 9.673, 'epoch': 4.93}


 50%|█████     | 780/1560 [34:11<14:05,  1.08s/it]  

{'loss': 0.5045, 'grad_norm': 2.667097568511963, 'learning_rate': 0.0005, 'epoch': 4.99}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:24,  6.37it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.51it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.78it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.54it/s][A
  6%|▋         | 10/157 [00:00<00:13, 11.11it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.47it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.86it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.14it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.29it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.39it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.45it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.46it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.19it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.10it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.25it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.40it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.40it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.45it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4930117130279541, 'eval_acc': 0.8519, 'eval_acc2': 0.9318, 'eval_f1': 0.8511474433965074, 'eval_roc_auc_micro': 0.998368716010101, 'eval_precision': 0.8519, 'eval_recall': 0.8519, 'eval_runtime': 15.6683, 'eval_samples_per_second': 638.232, 'eval_steps_per_second': 10.02, 'epoch': 4.99}


 51%|█████     | 790/1560 [34:39<15:16,  1.19s/it]  

{'loss': 0.38, 'grad_norm': 2.274550199508667, 'learning_rate': 0.0004899314733672799, 'epoch': 5.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.00it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.97it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.37it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.92it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.52it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.44it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.39it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.75it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.96it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.13it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.20it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.30it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.35it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.41it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.48it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.46it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.44it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.88it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5057158470153809, 'eval_acc': 0.8459, 'eval_acc2': 0.9275, 'eval_f1': 0.8450674743683773, 'eval_roc_auc_micro': 0.9983856031818181, 'eval_precision': 0.8459, 'eval_recall': 0.8459, 'eval_runtime': 16.6096, 'eval_samples_per_second': 602.06, 'eval_steps_per_second': 9.452, 'epoch': 5.06}


 51%|█████▏    | 800/1560 [35:06<14:07,  1.12s/it]  

{'loss': 0.3837, 'grad_norm': 2.447523593902588, 'learning_rate': 0.0004798670299452926, 'epoch': 5.12}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.05it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.14it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.95it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.38it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.06it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.88it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.80it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.75it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.69it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.62it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.59it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.07it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.22it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.33it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.41it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.27it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5166809558868408, 'eval_acc': 0.8465, 'eval_acc2': 0.9266, 'eval_f1': 0.8459209144563352, 'eval_roc_auc_micro': 0.9982322840909091, 'eval_precision': 0.8465, 'eval_recall': 0.8465, 'eval_runtime': 16.168, 'eval_samples_per_second': 618.504, 'eval_steps_per_second': 9.711, 'epoch': 5.12}


 52%|█████▏    | 810/1560 [35:32<13:55,  1.11s/it]  

{'loss': 0.405, 'grad_norm': 2.048755168914795, 'learning_rate': 0.0004698107512888569, 'epoch': 5.18}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.72it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.00it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.48it/s][A
  6%|▌         | 9/157 [00:00<00:19,  7.42it/s][A
  7%|▋         | 11/157 [00:01<00:17,  8.27it/s][A
  8%|▊         | 13/157 [00:01<00:16,  8.83it/s][A
 10%|▉         | 15/157 [00:01<00:15,  9.46it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.01it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.43it/s][A
 13%|█▎        | 21/157 [00:02<00:12, 10.61it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.68it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.45it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.49it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.44it/s][A
 20%|█▉        | 31/157 [00:03<00:12, 10.44it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.52it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.66it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.48it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5073953866958618, 'eval_acc': 0.846, 'eval_acc2': 0.9301, 'eval_f1': 0.8445659937442628, 'eval_roc_auc_micro': 0.998349363989899, 'eval_precision': 0.846, 'eval_recall': 0.846, 'eval_runtime': 17.4645, 'eval_samples_per_second': 572.592, 'eval_steps_per_second': 8.99, 'epoch': 5.18}


 53%|█████▎    | 820/1560 [35:59<13:47,  1.12s/it]  

{'loss': 0.4022, 'grad_norm': 2.0190343856811523, 'learning_rate': 0.00045976671564163706, 'epoch': 5.25}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.48it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.09it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.01it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.14it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.69it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.64it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.59it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.61it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.66it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.56it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.11it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.20it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.26it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.37it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.34it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.35it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.37it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5206233263015747, 'eval_acc': 0.8445, 'eval_acc2': 0.9288, 'eval_f1': 0.8428349057379666, 'eval_roc_auc_micro': 0.9982347557070708, 'eval_precision': 0.8445, 'eval_recall': 0.8445, 'eval_runtime': 15.8627, 'eval_samples_per_second': 630.408, 'eval_steps_per_second': 9.897, 'epoch': 5.25}


 53%|█████▎    | 830/1560 [36:25<13:52,  1.14s/it]  

{'loss': 0.4179, 'grad_norm': 2.818976879119873, 'learning_rate': 0.00044973899628224153, 'epoch': 5.31}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.18it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.40it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.11it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.18it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.84it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.55it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.43it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.28it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.13it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.19it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.22it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.15it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.81it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 10.78it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.56it/s][A
 21%|██        | 33/157 [00:02<00:12, 10.20it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.85it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.33it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5117075443267822, 'eval_acc': 0.8473, 'eval_acc2': 0.9285, 'eval_f1': 0.8474635407460223, 'eval_roc_auc_micro': 0.9983146030303031, 'eval_precision': 0.8473, 'eval_recall': 0.8473, 'eval_runtime': 16.1083, 'eval_samples_per_second': 620.797, 'eval_steps_per_second': 9.747, 'epoch': 5.31}


 54%|█████▍    | 840/1560 [36:51<13:06,  1.09s/it]  

{'loss': 0.4047, 'grad_norm': 2.379117250442505, 'learning_rate': 0.00043973165987233853, 'epoch': 5.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.60it/s][A
  3%|▎         | 5/157 [00:00<00:20,  7.42it/s][A
  4%|▍         | 7/157 [00:00<00:16,  8.89it/s][A
  6%|▌         | 9/157 [00:00<00:14,  9.93it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.65it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.18it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.42it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.65it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.87it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.10it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.21it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.23it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.35it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.44it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.52it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.55it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.17it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.29it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5289676785469055, 'eval_acc': 0.8434, 'eval_acc2': 0.9265, 'eval_f1': 0.842774024685144, 'eval_roc_auc_micro': 0.9981096421717173, 'eval_precision': 0.8434, 'eval_recall': 0.8434, 'eval_runtime': 16.2378, 'eval_samples_per_second': 615.846, 'eval_steps_per_second': 9.669, 'epoch': 5.38}


 54%|█████▍    | 850/1560 [37:17<13:22,  1.13s/it]  

{'loss': 0.4483, 'grad_norm': 3.6744308471679688, 'learning_rate': 0.0004297487648074538, 'epoch': 5.44}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.80it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.08it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.96it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.43it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.06it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.90it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.49it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.38it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.38it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.49it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.50it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.20it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.36it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.45it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.54it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.55it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.58it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5067611932754517, 'eval_acc': 0.8489, 'eval_acc2': 0.9291, 'eval_f1': 0.8481402058793593, 'eval_roc_auc_micro': 0.9982832971717173, 'eval_precision': 0.8489, 'eval_recall': 0.8489, 'eval_runtime': 15.4737, 'eval_samples_per_second': 646.256, 'eval_steps_per_second': 10.146, 'epoch': 5.44}


 55%|█████▌    | 860/1560 [37:42<12:40,  1.09s/it]  

{'loss': 0.4228, 'grad_norm': 2.3862993717193604, 'learning_rate': 0.0004197943595711198, 'epoch': 5.5}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.58it/s][A
  3%|▎         | 5/157 [00:00<00:20,  7.49it/s][A
  4%|▍         | 7/157 [00:00<00:16,  8.97it/s][A
  6%|▌         | 9/157 [00:00<00:14,  9.92it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.74it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.31it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.65it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.95it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.14it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.32it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.22it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.34it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.40it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.36it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.27it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.44it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.48it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.55it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5113240480422974, 'eval_acc': 0.8454, 'eval_acc2': 0.9282, 'eval_f1': 0.8443924653755004, 'eval_roc_auc_micro': 0.9983370624242425, 'eval_precision': 0.8454, 'eval_recall': 0.8454, 'eval_runtime': 16.4259, 'eval_samples_per_second': 608.796, 'eval_steps_per_second': 9.558, 'epoch': 5.5}


 56%|█████▌    | 870/1560 [38:09<12:40,  1.10s/it]  

{'loss': 0.4209, 'grad_norm': 2.994689702987671, 'learning_rate': 0.00040987248109304716, 'epoch': 5.57}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.68it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.10it/s][A
  4%|▍         | 7/157 [00:00<00:20,  7.42it/s][A
  6%|▌         | 9/157 [00:00<00:16,  8.75it/s][A
  7%|▋         | 11/157 [00:01<00:14,  9.74it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.55it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.15it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.57it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.88it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.03it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.14it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.21it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.16it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.11it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.22it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.19it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.32it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.35it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5196578502655029, 'eval_acc': 0.8437, 'eval_acc2': 0.9259, 'eval_f1': 0.8430004990650701, 'eval_roc_auc_micro': 0.9982375295959596, 'eval_precision': 0.8437, 'eval_recall': 0.8437, 'eval_runtime': 16.7754, 'eval_samples_per_second': 596.111, 'eval_steps_per_second': 9.359, 'epoch': 5.57}


 56%|█████▋    | 880/1560 [38:35<12:57,  1.14s/it]  

{'loss': 0.445, 'grad_norm': 2.3070971965789795, 'learning_rate': 0.0003999871531119779, 'epoch': 5.63}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:19,  8.06it/s][A
  3%|▎         | 4/157 [00:00<00:15, 10.16it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.14it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.58it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.65it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.80it/s][A
  9%|▉         | 14/157 [00:01<00:11, 11.94it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.86it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.06it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.14it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.37it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.40it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.40it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.24it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.34it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.22it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.30it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4996539056301117, 'eval_acc': 0.8497, 'eval_acc2': 0.9309, 'eval_f1': 0.8490467424765771, 'eval_roc_auc_micro': 0.9983250099494949, 'eval_precision': 0.8497, 'eval_recall': 0.8497, 'eval_runtime': 16.0078, 'eval_samples_per_second': 624.695, 'eval_steps_per_second': 9.808, 'epoch': 5.63}


 56%|█████▋    | 880/1560 [38:52<30:02,  2.65s/it]


{'train_runtime': 2332.527, 'train_samples_per_second': 171.488, 'train_steps_per_second': 0.669, 'train_loss': 0.931338568167253, 'epoch': 5.63}


100%|██████████| 157/157 [00:13<00:00, 11.71it/s]
wandb:                                                                                
wandb: 
wandb: Run history:
wandb:                eval/acc ▁▃▃▄▄▄▅▅▄▆▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇██▇▇█████████
wandb:               eval/acc2 ▁▄▆▇▇▇▇▇▇▇▇▇▇▇██▇███████████████████████
wandb:                 eval/f1 ▁▃▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇████████████████████
wandb:               eval/loss █▇▆▆▆▅▅▄▄▄▄▃▃▃▃▃▂▃▃▂▂▂▃▂▂▂▁▂▂▂▂▁▂▁▁▁▁▁▁▁
wandb:          eval/precision ▁▃▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇██████████████
wandb:             eval/recall ▁▄▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█▇█▇███████████
wandb:      eval/roc_auc_micro ▁▇███████��██████████████████████████████
wandb:            eval/runtime ▇▅▂█▅▂▄▂▁▃▃▁▁▇▅▁▄▄▂▁▃▁▃▂▁▂▁▃▃▂▃▂▂▃▂▂▃▁▃▂
wandb: eval/samples_per_second ▁▇▄▆▇██▆█▅▄▁▄▄▇▄▃▇▅▇▄▄▅▆▇██▅▆▅█▆▇▇▇▆▅▆▃▅
wandb:   eval/steps_per_second ▂▇▇▄▆▁▄▄▆▆▅█▄▇▄▅▄▃▄▇▇█▅▇▇▇▅▆▅▄▇▆▇▆▇▄▆▆▇▆
wandb:                test/acc ▁
wandb:               test/acc2 ▁
wandb:                 test/f1 ▁
wandb:         

{'loss': 4.3534, 'grad_norm': 1.4045732021331787, 'learning_rate': 0.0009998986144924252, 'epoch': 0.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:10, 14.86it/s][A
  3%|▎         | 5/157 [00:00<00:12, 11.95it/s][A
  4%|▍         | 7/157 [00:00<00:13, 10.91it/s][A
  6%|▌         | 9/157 [00:00<00:13, 11.19it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.33it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.67it/s][A
 10%|▉         | 15/157 [00:01<00:14,  9.97it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.83it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.64it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.21it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.50it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.81it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.06it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.27it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.34it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.52it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.48it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.57it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 3.7710418701171875, 'eval_acc': 0.2661, 'eval_acc2': 0.3937, 'eval_f1': 0.22800571409468207, 'eval_roc_auc_micro': 0.9115869521212121, 'eval_precision': 0.2661, 'eval_recall': 0.2661, 'eval_runtime': 15.947, 'eval_samples_per_second': 627.077, 'eval_steps_per_second': 9.845, 'epoch': 0.06}


  1%|▏         | 20/1560 [00:38<28:17,  1.10s/it]  

{'loss': 3.3913, 'grad_norm': 2.8589279651641846, 'learning_rate': 0.0009995944990857848, 'epoch': 0.13}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:11, 13.63it/s][A
  3%|▎         | 4/157 [00:00<00:11, 12.93it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.58it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.31it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.31it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.13it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.12it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.26it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.30it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.33it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.10it/s][A
 15%|█▌        | 24/157 [00:01<00:11, 12.09it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.17it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.22it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.30it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.21it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.27it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.31it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 2.5744569301605225, 'eval_acc': 0.4585, 'eval_acc2': 0.6136, 'eval_f1': 0.4186601089091933, 'eval_roc_auc_micro': 0.9738249905050506, 'eval_precision': 0.4585, 'eval_recall': 0.4585, 'eval_runtime': 16.2101, 'eval_samples_per_second': 616.901, 'eval_steps_per_second': 9.685, 'epoch': 0.13}


  2%|▏         | 30/1560 [01:12<53:57,  2.12s/it]  

{'loss': 2.5558, 'grad_norm': 4.059248924255371, 'learning_rate': 0.0009990877771116587, 'epoch': 0.19}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.42it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.46it/s][A
  4%|▍         | 7/157 [00:00<00:14, 10.64it/s][A
  6%|▌         | 9/157 [00:00<00:14,  9.88it/s][A
  7%|▋         | 11/157 [00:01<00:21,  6.89it/s][A
  8%|▊         | 12/157 [00:01<00:20,  7.24it/s][A
  8%|▊         | 13/157 [00:01<00:19,  7.53it/s][A
  9%|▉         | 14/157 [00:01<00:18,  7.86it/s][A
 10%|▉         | 15/157 [00:01<00:17,  8.04it/s][A
 10%|█         | 16/157 [00:01<00:17,  8.28it/s][A
 11%|█         | 17/157 [00:01<00:16,  8.47it/s][A
 11%|█▏        | 18/157 [00:02<00:16,  8.56it/s][A
 12%|█▏        | 19/157 [00:02<00:15,  8.72it/s][A
 13%|█▎        | 20/157 [00:02<00:15,  8.78it/s][A
 13%|█▎        | 21/157 [00:02<00:15,  8.73it/s][A
 14%|█▍        | 22/157 [00:02<00:15,  8.82it/s][A
 15%|█▍        | 23/157 [00:02<00:15,  8.75it/s][A
 15%|█▌        | 24/157 [00:02<00:15,  8.66it/s][A
 16%|█▌        | 25/157 

{'eval_loss': 1.92024827003479, 'eval_acc': 0.5516, 'eval_acc2': 0.7085, 'eval_f1': 0.5204265241068243, 'eval_roc_auc_micro': 0.9832170511111111, 'eval_precision': 0.5516, 'eval_recall': 0.5516, 'eval_runtime': 29.3188, 'eval_samples_per_second': 341.078, 'eval_steps_per_second': 5.355, 'epoch': 0.19}


  3%|▎         | 40/1560 [02:04<55:13,  2.18s/it]  

{'loss': 2.0841, 'grad_norm': 3.0197620391845703, 'learning_rate': 0.000998378654067105, 'epoch': 0.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.52it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.96it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.94it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.57it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.39it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.49it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.62it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.77it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.71it/s][A
 13%|█▎        | 21/157 [00:01<00:13, 10.21it/s][A
 15%|█▍        | 23/157 [00:02<00:13,  9.89it/s][A
 16%|█▌        | 25/157 [00:02<00:13,  9.69it/s][A
 17%|█▋        | 26/157 [00:02<00:13,  9.67it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.60it/s][A
 18%|█▊        | 28/157 [00:02<00:13,  9.48it/s][A
 18%|█▊        | 29/157 [00:02<00:13,  9.53it/s][A
 20%|█▉        | 31/157 [00:02<00:12,  9.72it/s][A
 21%|██        | 33/157 [00:03<00:12,  9.97it/s][A
 22%|██▏       | 35/157 

{'eval_loss': 1.5340222120285034, 'eval_acc': 0.6207, 'eval_acc2': 0.7715, 'eval_f1': 0.5939917591180746, 'eval_roc_auc_micro': 0.988715671010101, 'eval_precision': 0.6207, 'eval_recall': 0.6207, 'eval_runtime': 16.8295, 'eval_samples_per_second': 594.195, 'eval_steps_per_second': 9.329, 'epoch': 0.26}


  3%|▎         | 50/1560 [02:31<29:48,  1.18s/it]  

{'loss': 1.7845, 'grad_norm': 5.1053900718688965, 'learning_rate': 0.0009974674175313228, 'epoch': 0.32}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:12, 11.92it/s][A
  3%|▎         | 5/157 [00:00<00:12, 11.91it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.85it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.98it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.88it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.81it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.86it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.70it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.77it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.88it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.80it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.82it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.70it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.74it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.76it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.75it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.81it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.93it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.328546404838562, 'eval_acc': 0.6502, 'eval_acc2': 0.797, 'eval_f1': 0.6285251194307628, 'eval_roc_auc_micro': 0.9903941966161616, 'eval_precision': 0.6502, 'eval_recall': 0.6502, 'eval_runtime': 16.0888, 'eval_samples_per_second': 621.551, 'eval_steps_per_second': 9.758, 'epoch': 0.32}


  4%|▍         | 60/1560 [02:57<28:19,  1.13s/it]  

{'loss': 1.6981, 'grad_norm': 5.185550212860107, 'learning_rate': 0.000996354437049027, 'epoch': 0.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.52it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.03it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.95it/s][A
  6%|▌         | 9/157 [00:00<00:18,  7.92it/s][A
  7%|▋         | 11/157 [00:01<00:16,  8.98it/s][A
  8%|▊         | 13/157 [00:01<00:14,  9.93it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.62it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.08it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.48it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.67it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.93it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.01it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.99it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.11it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.22it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.23it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.14it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.08it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.146562099456787, 'eval_acc': 0.6889, 'eval_acc2': 0.8289, 'eval_f1': 0.676905682753771, 'eval_roc_auc_micro': 0.9927249009595961, 'eval_precision': 0.6889, 'eval_recall': 0.6889, 'eval_runtime': 16.1819, 'eval_samples_per_second': 617.976, 'eval_steps_per_second': 9.702, 'epoch': 0.38}


  4%|▍         | 70/1560 [03:23<27:32,  1.11s/it]  

{'loss': 1.5497, 'grad_norm': 4.098487854003906, 'learning_rate': 0.0009950401639805821, 'epoch': 0.45}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:10, 14.51it/s][A
  3%|▎         | 4/157 [00:00<00:12, 12.26it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.57it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.27it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.37it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.55it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.66it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.45it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.80it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.02it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.16it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.94it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.03it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.07it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.14it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.20it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.03it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.19it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 1.073900580406189, 'eval_acc': 0.704, 'eval_acc2': 0.8407, 'eval_f1': 0.6988377454615045, 'eval_roc_auc_micro': 0.9933972701515151, 'eval_precision': 0.704, 'eval_recall': 0.704, 'eval_runtime': 15.4355, 'eval_samples_per_second': 647.856, 'eval_steps_per_second': 10.171, 'epoch': 0.45}


  5%|▌         | 80/1560 [03:48<27:03,  1.10s/it]  

{'loss': 1.4591, 'grad_norm': 3.648939371109009, 'learning_rate': 0.0009935251313189565, 'epoch': 0.51}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:12, 12.82it/s][A
  3%|▎         | 4/157 [00:00<00:12, 12.63it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.49it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.34it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.07it/s][A
  8%|▊         | 12/157 [00:00<00:12, 11.91it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.87it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.89it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.92it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.89it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.00it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.11it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.09it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 11.98it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.05it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.76it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 11.92it/s][A
 23%|██▎       | 36/157 [00:02<00:10, 11.95it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.9890788793563843, 'eval_acc': 0.7235, 'eval_acc2': 0.8527, 'eval_f1': 0.7159972228967822, 'eval_roc_auc_micro': 0.994094697929293, 'eval_precision': 0.7235, 'eval_recall': 0.7235, 'eval_runtime': 15.2788, 'eval_samples_per_second': 654.5, 'eval_steps_per_second': 10.276, 'epoch': 0.51}


  6%|▌         | 90/1560 [04:13<26:28,  1.08s/it]  

{'loss': 1.4016, 'grad_norm': 4.595393180847168, 'learning_rate': 0.0009918099534735718, 'epoch': 0.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.08it/s][A
  3%|▎         | 5/157 [00:00<00:22,  6.78it/s][A
  4%|▍         | 7/157 [00:00<00:18,  8.15it/s][A
  6%|▌         | 9/157 [00:00<00:16,  9.21it/s][A
  7%|▋         | 11/157 [00:01<00:14,  9.95it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.55it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.00it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.28it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.41it/s][A
 13%|█▎        | 21/157 [00:02<00:11, 11.64it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.93it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.12it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.19it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.25it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.24it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.30it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.41it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.47it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.9476460814476013, 'eval_acc': 0.7332, 'eval_acc2': 0.8518, 'eval_f1': 0.7253962280920586, 'eval_roc_auc_micro': 0.9945875185353537, 'eval_precision': 0.7332, 'eval_recall': 0.7332, 'eval_runtime': 15.8835, 'eval_samples_per_second': 629.582, 'eval_steps_per_second': 9.884, 'epoch': 0.58}


  6%|▋         | 100/1560 [04:38<26:19,  1.08s/it] 

{'loss': 1.3703, 'grad_norm': 3.9414706230163574, 'learning_rate': 0.0009898953260211339, 'epoch': 0.64}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:11, 13.06it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.75it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.50it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.17it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.86it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.60it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.68it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.60it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.79it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.90it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.76it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.95it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.11it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.23it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.09it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.18it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.884806752204895, 'eval_acc': 0.7468, 'eval_acc2': 0.863, 'eval_f1': 0.7406739422968706, 'eval_roc_auc_micro': 0.9952611766161616, 'eval_precision': 0.7468, 'eval_recall': 0.7468, 'eval_runtime': 15.2276, 'eval_samples_per_second': 656.704, 'eval_steps_per_second': 10.31, 'epoch': 0.64}


  7%|▋         | 110/1560 [05:03<25:58,  1.08s/it]  

{'loss': 1.2841, 'grad_norm': 5.750749111175537, 'learning_rate': 0.000987782025423547, 'epoch': 0.7}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 18.18it/s][A
  3%|▎         | 4/157 [00:00<00:10, 14.31it/s][A
  4%|▍         | 6/157 [00:00<00:11, 13.52it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.76it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.16it/s][A
  8%|▊         | 12/157 [00:00<00:12, 12.00it/s][A
  9%|▉         | 14/157 [00:01<00:11, 11.93it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.88it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.92it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.97it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.05it/s][A
 15%|█▌        | 24/157 [00:01<00:11, 11.88it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.35it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.59it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.76it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.98it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.05it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.20it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.8701398372650146, 'eval_acc': 0.7497, 'eval_acc2': 0.8739, 'eval_f1': 0.7464673347494225, 'eval_roc_auc_micro': 0.9953194576262626, 'eval_precision': 0.7497, 'eval_recall': 0.7497, 'eval_runtime': 15.1565, 'eval_samples_per_second': 659.782, 'eval_steps_per_second': 10.359, 'epoch': 0.7}


  8%|▊         | 120/1560 [05:28<25:48,  1.08s/it]  

{'loss': 1.315, 'grad_norm': 4.027154445648193, 'learning_rate': 0.000985470908713026, 'epoch': 0.77}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.85it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.21it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.40it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.39it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.29it/s][A
  8%|▊         | 13/157 [00:01<00:12, 12.00it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.83it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.88it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.98it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.02it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.08it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.98it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.97it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.04it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.12it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.31it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.43it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8649842739105225, 'eval_acc': 0.7544, 'eval_acc2': 0.8703, 'eval_f1': 0.7502444552562313, 'eval_roc_auc_micro': 0.995152830858586, 'eval_precision': 0.7544, 'eval_recall': 0.7544, 'eval_runtime': 15.7668, 'eval_samples_per_second': 634.242, 'eval_steps_per_second': 9.958, 'epoch': 0.77}


  8%|▊         | 130/1560 [05:53<25:52,  1.09s/it]  

{'loss': 1.3186, 'grad_norm': 4.0918097496032715, 'learning_rate': 0.0009829629131445341, 'epoch': 0.83}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.02it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.83it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.84it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.10it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.90it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.81it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.67it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.62it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.59it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.56it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.49it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.49it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.52it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.22it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.33it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.38it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.38it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.36it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8180635571479797, 'eval_acc': 0.7618, 'eval_acc2': 0.8772, 'eval_f1': 0.7600565173716071, 'eval_roc_auc_micro': 0.9958597442424243, 'eval_precision': 0.7618, 'eval_recall': 0.7618, 'eval_runtime': 32.0456, 'eval_samples_per_second': 312.056, 'eval_steps_per_second': 4.899, 'epoch': 0.83}


  9%|▉         | 140/1560 [06:35<30:15,  1.28s/it]  

{'loss': 1.2352, 'grad_norm': 4.080450057983398, 'learning_rate': 0.000980259055815686, 'epoch': 0.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 15.61it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.22it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.98it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.80it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.65it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.68it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.66it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.65it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.69it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.59it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.55it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.37it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.37it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.46it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.43it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.48it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.61it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.63it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8399779200553894, 'eval_acc': 0.7499, 'eval_acc2': 0.8743, 'eval_f1': 0.7395806815901395, 'eval_roc_auc_micro': 0.9956113307070708, 'eval_precision': 0.7499, 'eval_recall': 0.7499, 'eval_runtime': 26.184, 'eval_samples_per_second': 381.912, 'eval_steps_per_second': 5.996, 'epoch': 0.9}


 10%|▉         | 150/1560 [07:10<28:59,  1.23s/it]  

{'loss': 1.2819, 'grad_norm': 7.193608283996582, 'learning_rate': 0.0009773604332542728, 'epoch': 0.96}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.10it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.26it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.03it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.83it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.56it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.53it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.54it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.54it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.58it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.42it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.12it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.12it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.97it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.00it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.90it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.81it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.96it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8475467562675476, 'eval_acc': 0.7565, 'eval_acc2': 0.8675, 'eval_f1': 0.7522382383187283, 'eval_roc_auc_micro': 0.9954391140404041, 'eval_precision': 0.7565, 'eval_recall': 0.7565, 'eval_runtime': 16.4234, 'eval_samples_per_second': 608.889, 'eval_steps_per_second': 9.56, 'epoch': 0.96}


 10%|█         | 160/1560 [07:39<32:15,  1.38s/it]  

{'loss': 1.2457, 'grad_norm': 2.753074884414673, 'learning_rate': 0.0009742682209735727, 'epoch': 1.02}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.94it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.16it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.90it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.11it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.78it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.69it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.24it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.01it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.13it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.31it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.38it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.32it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.29it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.31it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.31it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.08it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.07it/s][A
 24%|██▎       | 37/157 [00:03<00:19,  6.14it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7719382047653198, 'eval_acc': 0.7698, 'eval_acc2': 0.8844, 'eval_f1': 0.7690246675416074, 'eval_roc_auc_micro': 0.9961682292929293, 'eval_precision': 0.7698, 'eval_recall': 0.7698, 'eval_runtime': 15.785, 'eval_samples_per_second': 633.511, 'eval_steps_per_second': 9.946, 'epoch': 1.02}


 11%|█         | 170/1560 [08:05<28:00,  1.21s/it]  

{'loss': 1.0951, 'grad_norm': 3.946941375732422, 'learning_rate': 0.0009709836729956326, 'epoch': 1.09}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.04it/s][A
  3%|▎         | 5/157 [00:00<00:22,  6.85it/s][A
  4%|▍         | 6/157 [00:00<00:25,  5.99it/s][A
  4%|▍         | 7/157 [00:01<00:26,  5.72it/s][A
  5%|▌         | 8/157 [00:01<00:26,  5.73it/s][A
  6%|▌         | 9/157 [00:01<00:24,  5.96it/s][A
  6%|▋         | 10/157 [00:01<00:23,  6.29it/s][A
  7%|▋         | 11/157 [00:01<00:22,  6.56it/s][A
  8%|▊         | 12/157 [00:01<00:21,  6.89it/s][A
  8%|▊         | 13/157 [00:01<00:19,  7.34it/s][A
  9%|▉         | 14/157 [00:02<00:18,  7.65it/s][A
 10%|▉         | 15/157 [00:02<00:17,  7.90it/s][A
 10%|█         | 16/157 [00:02<00:17,  8.04it/s][A
 11%|█         | 17/157 [00:02<00:17,  8.13it/s][A
 11%|█▏        | 18/157 [00:02<00:16,  8.28it/s][A
 12%|█▏        | 19/157 [00:02<00:16,  8.51it/s][A
 13%|█▎        | 20/157 [00:02<00:15,  8.82it/s][A
 13%|█▎        | 21/157 [00:02<00:14,  9.11it/s][A
 14%|█▍        | 22/157 [0

{'eval_loss': 0.7859816551208496, 'eval_acc': 0.7662, 'eval_acc2': 0.8745, 'eval_f1': 0.7613706205456897, 'eval_roc_auc_micro': 0.9957880580303031, 'eval_precision': 0.7662, 'eval_recall': 0.7662, 'eval_runtime': 24.3081, 'eval_samples_per_second': 411.386, 'eval_steps_per_second': 6.459, 'epoch': 1.09}


 12%|█▏        | 180/1560 [08:49<45:05,  1.96s/it]  

{'loss': 1.0446, 'grad_norm': 7.503972053527832, 'learning_rate': 0.0009675081213427075, 'epoch': 1.15}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.43it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.80it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.28it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.04it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.65it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.32it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.33it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.20it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.05it/s][A
 13%|█▎        | 21/157 [00:02<00:19,  7.13it/s][A
 14%|█▍        | 22/157 [00:02<00:21,  6.16it/s][A
 15%|█▌        | 24/157 [00:02<00:18,  7.24it/s][A
 17%|█▋        | 26/157 [00:02<00:15,  8.31it/s][A
 18%|█▊        | 28/157 [00:02<00:13,  9.27it/s][A
 19%|█▉        | 30/157 [00:02<00:12, 10.04it/s][A
 20%|██        | 32/157 [00:03<00:11, 10.60it/s][A
 22%|██▏       | 34/157 [00:03<00:11, 10.78it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.90it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.815513014793396, 'eval_acc': 0.7622, 'eval_acc2': 0.8733, 'eval_f1': 0.7580877208622866, 'eval_roc_auc_micro': 0.9955198227777778, 'eval_precision': 0.7622, 'eval_recall': 0.7622, 'eval_runtime': 17.5912, 'eval_samples_per_second': 568.465, 'eval_steps_per_second': 8.925, 'epoch': 1.15}


 12%|█▏        | 190/1560 [09:18<29:55,  1.31s/it]  

{'loss': 1.0793, 'grad_norm': 6.419629096984863, 'learning_rate': 0.0009638429754970715, 'epoch': 1.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.40it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.82it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.83it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.59it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.24it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.21it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.28it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.34it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.34it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.18it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.26it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.22it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.01it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.09it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.04it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.90it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.90it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.83it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7566003799438477, 'eval_acc': 0.7728, 'eval_acc2': 0.882, 'eval_f1': 0.7727812079378664, 'eval_roc_auc_micro': 0.9963696145454546, 'eval_precision': 0.7728, 'eval_recall': 0.7728, 'eval_runtime': 16.0648, 'eval_samples_per_second': 622.477, 'eval_steps_per_second': 9.773, 'epoch': 1.22}


 13%|█▎        | 200/1560 [09:44<25:48,  1.14s/it]  

{'loss': 1.0429, 'grad_norm': 4.645038604736328, 'learning_rate': 0.0009599897218294122, 'epoch': 1.28}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.31it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.39it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.17it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.78it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.44it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.41it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.34it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.25it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.25it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.14it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.17it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.92it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.94it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.44it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.52it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.43it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.26it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.41it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7303251028060913, 'eval_acc': 0.7835, 'eval_acc2': 0.8899, 'eval_f1': 0.7793047699782163, 'eval_roc_auc_micro': 0.996496318989899, 'eval_precision': 0.7835, 'eval_recall': 0.7835, 'eval_runtime': 15.6826, 'eval_samples_per_second': 637.651, 'eval_steps_per_second': 10.011, 'epoch': 1.28}


 13%|█▎        | 210/1560 [10:09<25:42,  1.14s/it]  

{'loss': 1.1378, 'grad_norm': 3.4901392459869385, 'learning_rate': 0.0009559499229960451, 'epoch': 1.34}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.39it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.72it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.65it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.14it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.63it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.30it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.12it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.23it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.17it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.19it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.08it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.18it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.32it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.33it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.33it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.37it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7288394570350647, 'eval_acc': 0.7878, 'eval_acc2': 0.8929, 'eval_f1': 0.7855960660044461, 'eval_roc_auc_micro': 0.9964393964646465, 'eval_precision': 0.7878, 'eval_recall': 0.7878, 'eval_runtime': 15.3085, 'eval_samples_per_second': 653.232, 'eval_steps_per_second': 10.256, 'epoch': 1.34}


 14%|█▍        | 220/1560 [10:34<24:10,  1.08s/it]  

{'loss': 0.9921, 'grad_norm': 4.282233238220215, 'learning_rate': 0.0009517252173051911, 'epoch': 1.41}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.09it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.26it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.72it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.01it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.92it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.85it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.91it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.00it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.99it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.05it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.10it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.15it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.11it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.21it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.29it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.32it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.33it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7071507573127747, 'eval_acc': 0.7914, 'eval_acc2': 0.8915, 'eval_f1': 0.7898944111013864, 'eval_roc_auc_micro': 0.9966936262626263, 'eval_precision': 0.7914, 'eval_recall': 0.7914, 'eval_runtime': 15.4539, 'eval_samples_per_second': 647.085, 'eval_steps_per_second': 10.159, 'epoch': 1.41}


 15%|█▍        | 230/1560 [10:59<24:07,  1.09s/it]  

{'loss': 1.034, 'grad_norm': 3.8420331478118896, 'learning_rate': 0.0009473173180525737, 'epoch': 1.47}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 17.41it/s][A
  3%|▎         | 4/157 [00:00<00:22,  6.95it/s][A
  4%|▍         | 6/157 [00:00<00:18,  8.38it/s][A
  5%|▌         | 8/157 [00:00<00:16,  9.18it/s][A
  6%|▋         | 10/157 [00:01<00:15,  9.60it/s][A
  8%|▊         | 12/157 [00:01<00:14,  9.78it/s][A
  9%|▉         | 14/157 [00:01<00:14,  9.62it/s][A
 10%|█         | 16/157 [00:01<00:14,  9.68it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.66it/s][A
 11%|█▏        | 18/157 [00:01<00:14,  9.64it/s][A
 12%|█▏        | 19/157 [00:02<00:14,  9.60it/s][A
 13%|█▎        | 21/157 [00:02<00:13,  9.97it/s][A
 15%|█▍        | 23/157 [00:02<00:13, 10.29it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.28it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.41it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.57it/s][A
 20%|█▉        | 31/157 [00:03<00:11, 10.53it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.55it/s][A
 22%|██▏       | 35/157 

{'eval_loss': 0.7191241979598999, 'eval_acc': 0.7833, 'eval_acc2': 0.8918, 'eval_f1': 0.7826992154201476, 'eval_roc_auc_micro': 0.9966939018181818, 'eval_precision': 0.7833, 'eval_recall': 0.7833, 'eval_runtime': 31.8543, 'eval_samples_per_second': 313.93, 'eval_steps_per_second': 4.929, 'epoch': 1.47}


 15%|█▌        | 240/1560 [11:41<28:45,  1.31s/it]  

{'loss': 1.0133, 'grad_norm': 4.384955406188965, 'learning_rate': 0.0009427280128266049, 'epoch': 1.54}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:31,  4.98it/s][A
  3%|▎         | 4/157 [00:00<00:19,  7.72it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.40it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.33it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.86it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.27it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.44it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.57it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.70it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.81it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.91it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.93it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.04it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.08it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.30it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.39it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.45it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.44it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.7402344346046448, 'eval_acc': 0.781, 'eval_acc2': 0.8847, 'eval_f1': 0.7795562294874943, 'eval_roc_auc_micro': 0.9964319201010101, 'eval_precision': 0.781, 'eval_recall': 0.781, 'eval_runtime': 53.7823, 'eval_samples_per_second': 185.935, 'eval_steps_per_second': 2.919, 'epoch': 1.54}


 16%|█▌        | 250/1560 [12:44<33:28,  1.53s/it]  

{'loss': 1.04, 'grad_norm': 3.793153762817383, 'learning_rate': 0.000937959162783444, 'epoch': 1.6}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:17,  8.82it/s][A
  3%|▎         | 4/157 [00:00<00:14, 10.68it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.50it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.94it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.08it/s][A
  8%|▊         | 12/157 [00:01<00:11, 12.27it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.41it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.45it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.59it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.69it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.74it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.69it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.71it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.79it/s][A
 19%|█▉        | 30/157 [00:02<00:09, 12.81it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.75it/s][A
 22%|██▏       | 34/157 [00:03<00:14,  8.27it/s][A
 23%|██▎       | 36/157 [00:03<00:13,  9.21it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6902532577514648, 'eval_acc': 0.7925, 'eval_acc2': 0.9002, 'eval_f1': 0.7919073182366096, 'eval_roc_auc_micro': 0.9969098284343434, 'eval_precision': 0.7925, 'eval_recall': 0.7925, 'eval_runtime': 38.5068, 'eval_samples_per_second': 259.695, 'eval_steps_per_second': 4.077, 'epoch': 1.6}


 17%|█▋        | 260/1560 [13:32<29:37,  1.37s/it]  

{'loss': 1.0572, 'grad_norm': 3.1044867038726807, 'learning_rate': 0.0009330127018922195, 'epoch': 1.66}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.87it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.54it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.38it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.06it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.92it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.74it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.56it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.36it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.04it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.96it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.03it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.03it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.01it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.07it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.65it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.83it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.05it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.19it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6842555403709412, 'eval_acc': 0.7924, 'eval_acc2': 0.8979, 'eval_f1': 0.7921850618735391, 'eval_roc_auc_micro': 0.996948624949495, 'eval_precision': 0.7924, 'eval_recall': 0.7924, 'eval_runtime': 15.476, 'eval_samples_per_second': 646.161, 'eval_steps_per_second': 10.145, 'epoch': 1.66}


 17%|█▋        | 270/1560 [13:58<25:13,  1.17s/it]  

{'loss': 1.0293, 'grad_norm': 3.277191162109375, 'learning_rate': 0.0009278906361507238, 'epoch': 1.73}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:23,  6.59it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.66it/s][A
  4%|▍         | 6/157 [00:00<00:15, 10.00it/s][A
  5%|▌         | 8/157 [00:00<00:13, 10.79it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.92it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.28it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.38it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.51it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.67it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.92it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.97it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.85it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.07it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.10it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.18it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.18it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.05it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.16it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6900432705879211, 'eval_acc': 0.7921, 'eval_acc2': 0.8962, 'eval_f1': 0.7907904339897694, 'eval_roc_auc_micro': 0.996977140909091, 'eval_precision': 0.7921, 'eval_recall': 0.7921, 'eval_runtime': 46.0918, 'eval_samples_per_second': 216.958, 'eval_steps_per_second': 3.406, 'epoch': 1.73}


 18%|█▊        | 280/1560 [14:54<32:07,  1.51s/it]  

{'loss': 1.0408, 'grad_norm': 5.3846611976623535, 'learning_rate': 0.0009225950427718975, 'epoch': 1.79}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:24,  6.28it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.70it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.83it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.42it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.49it/s][A
  8%|▊         | 12/157 [00:01<00:14,  9.86it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.23it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.52it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 10.89it/s][A
 13%|█▎        | 20/157 [00:01<00:12, 11.09it/s][A
 14%|█▍        | 22/157 [00:02<00:12, 11.24it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.44it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.57it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.61it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.70it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.70it/s][A
 22%|██▏       | 34/157 [00:03<00:16,  7.68it/s][A
 23%|██▎       | 36/157 [00:03<00:14,  8.55it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.7155510783195496, 'eval_acc': 0.7812, 'eval_acc2': 0.8932, 'eval_f1': 0.7784995933667342, 'eval_roc_auc_micro': 0.9967167475252525, 'eval_precision': 0.7812, 'eval_recall': 0.7812, 'eval_runtime': 18.969, 'eval_samples_per_second': 527.177, 'eval_steps_per_second': 8.277, 'epoch': 1.79}


 19%|█▊        | 290/1560 [15:34<53:03,  2.51s/it]  

{'loss': 0.9539, 'grad_norm': 3.037106990814209, 'learning_rate': 0.0009171280693414306, 'epoch': 1.86}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:07, 19.55it/s][A
  3%|▎         | 4/157 [00:00<00:23,  6.45it/s][A
  4%|▍         | 6/157 [00:00<00:18,  8.07it/s][A
  5%|▌         | 8/157 [00:00<00:16,  8.88it/s][A
  6%|▋         | 10/157 [00:01<00:15,  9.74it/s][A
  8%|▊         | 12/157 [00:01<00:22,  6.45it/s][A
  8%|▊         | 13/157 [00:01<00:25,  5.62it/s][A
  9%|▉         | 14/157 [00:02<00:26,  5.36it/s][A
 10%|▉         | 15/157 [00:02<00:26,  5.39it/s][A
 10%|█         | 16/157 [00:02<00:25,  5.45it/s][A
 11%|█         | 17/157 [00:02<00:25,  5.51it/s][A
 11%|█▏        | 18/157 [00:02<00:24,  5.59it/s][A
 12%|█▏        | 19/157 [00:02<00:23,  5.79it/s][A
 13%|█▎        | 20/157 [00:03<00:23,  5.94it/s][A
 13%|█▎        | 21/157 [00:03<00:21,  6.22it/s][A
 14%|█▍        | 22/157 [00:03<00:20,  6.43it/s][A
 15%|█▍        | 23/157 [00:03<00:20,  6.62it/s][A
 15%|█▌        | 24/157 [00:03<00:19,  6.89it/s][A
 16%|█▌        | 25/157 

{'eval_loss': 0.6806042194366455, 'eval_acc': 0.7966, 'eval_acc2': 0.8987, 'eval_f1': 0.7967278923143369, 'eval_roc_auc_micro': 0.9970002179292929, 'eval_precision': 0.7966, 'eval_recall': 0.7966, 'eval_runtime': 49.9817, 'eval_samples_per_second': 200.073, 'eval_steps_per_second': 3.141, 'epoch': 1.86}


 19%|█▉        | 300/1560 [16:34<32:56,  1.57s/it]  

{'loss': 0.9729, 'grad_norm': 3.546905994415283, 'learning_rate': 0.0009114919329468282, 'epoch': 1.92}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.79it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.94it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.81it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.05it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.83it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.91it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.98it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.16it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.23it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.31it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.35it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.41it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.50it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.49it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.58it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.59it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.53it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.37it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6838566660881042, 'eval_acc': 0.798, 'eval_acc2': 0.8963, 'eval_f1': 0.7974579759903236, 'eval_roc_auc_micro': 0.9968355629292929, 'eval_precision': 0.798, 'eval_recall': 0.798, 'eval_runtime': 15.5082, 'eval_samples_per_second': 644.821, 'eval_steps_per_second': 10.124, 'epoch': 1.92}


 20%|█▉        | 310/1560 [16:59<22:42,  1.09s/it]  

{'loss': 1.0033, 'grad_norm': 3.486158847808838, 'learning_rate': 0.0009056889192782866, 'epoch': 1.98}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.86it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.27it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.99it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.79it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.67it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.60it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.59it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.43it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.47it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.46it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.28it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.32it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.31it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.33it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.15it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.24it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6560379862785339, 'eval_acc': 0.8056, 'eval_acc2': 0.9022, 'eval_f1': 0.8033675570632917, 'eval_roc_auc_micro': 0.9972000562121213, 'eval_precision': 0.8056, 'eval_recall': 0.8056, 'eval_runtime': 42.885, 'eval_samples_per_second': 233.182, 'eval_steps_per_second': 3.661, 'epoch': 1.98}


 21%|██        | 320/1560 [17:54<30:40,  1.48s/it]  

{'loss': 0.8423, 'grad_norm': 3.494546413421631, 'learning_rate': 0.0008997213817017506, 'epoch': 2.05}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.89it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.08it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.77it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.68it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.50it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.49it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.10it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.88it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.96it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.93it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.02it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.25it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.40it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.50it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.56it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.60it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.47it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6604793071746826, 'eval_acc': 0.801, 'eval_acc2': 0.8981, 'eval_f1': 0.8002020040996459, 'eval_roc_auc_micro': 0.9970759890404041, 'eval_precision': 0.801, 'eval_recall': 0.801, 'eval_runtime': 36.5452, 'eval_samples_per_second': 273.634, 'eval_steps_per_second': 4.296, 'epoch': 2.05}


 21%|██        | 330/1560 [18:40<28:10,  1.37s/it]  

{'loss': 0.8231, 'grad_norm': 3.4576222896575928, 'learning_rate': 0.000893591740304525, 'epoch': 2.11}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:44,  3.51it/s][A
  3%|▎         | 4/157 [00:00<00:25,  6.09it/s][A
  4%|▍         | 6/157 [00:00<00:19,  7.93it/s][A
  5%|▌         | 8/157 [00:01<00:16,  9.25it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.22it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.94it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.27it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.55it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.89it/s][A
 13%|█▎        | 20/157 [00:02<00:11, 11.95it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 12.08it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.26it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.36it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.40it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.36it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.42it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.48it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.57it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6816920042037964, 'eval_acc': 0.8, 'eval_acc2': 0.8928, 'eval_f1': 0.7988221441236921, 'eval_roc_auc_micro': 0.9968711302020203, 'eval_precision': 0.8, 'eval_recall': 0.8, 'eval_runtime': 18.8517, 'eval_samples_per_second': 530.455, 'eval_steps_per_second': 8.328, 'epoch': 2.11}


 22%|██▏       | 340/1560 [19:09<23:01,  1.13s/it]  

{'loss': 0.8687, 'grad_norm': 3.9004595279693604, 'learning_rate': 0.0008873024809138273, 'epoch': 2.18}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.46it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.94it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.75it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.81it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.15it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.19it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.19it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.25it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.27it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.29it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.29it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.14it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.07it/s][A
 18%|█▊        | 29/157 [00:02<00:16,  7.70it/s][A
 20%|█▉        | 31/157 [00:02<00:14,  8.54it/s][A
 21%|██        | 33/157 [00:02<00:13,  9.10it/s][A
 22%|██▏       | 35/157 [00:03<00:12,  9.58it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.01it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6741493940353394, 'eval_acc': 0.8005, 'eval_acc2': 0.9019, 'eval_f1': 0.7987584734957861, 'eval_roc_auc_micro': 0.9967779511616162, 'eval_precision': 0.8005, 'eval_recall': 0.8005, 'eval_runtime': 16.053, 'eval_samples_per_second': 622.938, 'eval_steps_per_second': 9.78, 'epoch': 2.18}


 22%|██▏       | 350/1560 [19:36<27:56,  1.39s/it]  

{'loss': 0.8747, 'grad_norm': 4.229421615600586, 'learning_rate': 0.0008808561540886796, 'epoch': 2.24}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.58it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.15it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.00it/s][A
  6%|▌         | 9/157 [00:00<00:15,  9.40it/s][A
  7%|▋         | 11/157 [00:01<00:25,  5.68it/s][A
  8%|▊         | 12/157 [00:01<00:28,  5.15it/s][A
  8%|▊         | 13/157 [00:01<00:28,  5.00it/s][A
  9%|▉         | 14/157 [00:02<00:28,  5.08it/s][A
 10%|▉         | 15/157 [00:02<00:26,  5.30it/s][A
 10%|█         | 16/157 [00:02<00:25,  5.63it/s][A
 11%|█         | 17/157 [00:02<00:23,  6.02it/s][A
 11%|█▏        | 18/157 [00:02<00:21,  6.35it/s][A
 12%|█▏        | 19/157 [00:02<00:20,  6.74it/s][A
 13%|█▎        | 20/157 [00:02<00:19,  7.19it/s][A
 13%|█▎        | 21/157 [00:03<00:18,  7.54it/s][A
 14%|█▍        | 22/157 [00:03<00:17,  7.83it/s][A
 15%|█▍        | 23/157 [00:03<00:16,  8.10it/s][A
 15%|█▌        | 24/157 [00:03<00:16,  8.26it/s][A
 16%|█▌        | 25/157 

{'eval_loss': 0.681240439414978, 'eval_acc': 0.7977, 'eval_acc2': 0.8966, 'eval_f1': 0.7986286288941066, 'eval_roc_auc_micro': 0.9968156060606062, 'eval_precision': 0.7977, 'eval_recall': 0.7977, 'eval_runtime': 49.0832, 'eval_samples_per_second': 203.736, 'eval_steps_per_second': 3.199, 'epoch': 2.24}


 23%|██▎       | 360/1560 [20:36<33:19,  1.67s/it]  

{'loss': 0.8869, 'grad_norm': 5.196685314178467, 'learning_rate': 0.0008742553740855505, 'epoch': 2.3}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:23,  6.68it/s][A
  2%|▏         | 3/157 [00:00<00:33,  4.55it/s][A
  3%|▎         | 5/157 [00:00<00:22,  6.76it/s][A
  4%|▍         | 7/157 [00:00<00:17,  8.36it/s][A
  6%|▌         | 9/157 [00:01<00:15,  9.36it/s][A
  7%|▋         | 11/157 [00:01<00:14, 10.12it/s][A
  8%|▊         | 13/157 [00:01<00:21,  6.68it/s][A
  9%|▉         | 14/157 [00:02<00:23,  6.01it/s][A
 10%|▉         | 15/157 [00:02<00:24,  5.79it/s][A
 10%|█         | 16/157 [00:02<00:24,  5.83it/s][A
 11%|█         | 17/157 [00:02<00:23,  5.98it/s][A
 11%|█▏        | 18/157 [00:02<00:22,  6.26it/s][A
 12%|█▏        | 19/157 [00:02<00:21,  6.55it/s][A
 13%|█▎        | 20/157 [00:02<00:19,  6.91it/s][A
 13%|█▎        | 21/157 [00:03<00:18,  7.28it/s][A
 14%|█▍        | 22/157 [00:03<00:17,  7.56it/s][A
 15%|█▍        | 23/157 [00:03<00:16,  7.98it/s][A
 15%|█▌        | 24/157 [00:03<00:15,  8.33it/s][A
 16%|█▌        | 25/157 [

{'eval_loss': 0.6942543983459473, 'eval_acc': 0.7936, 'eval_acc2': 0.8967, 'eval_f1': 0.791463709978157, 'eval_roc_auc_micro': 0.9967709013636363, 'eval_precision': 0.7936, 'eval_recall': 0.7936, 'eval_runtime': 19.7437, 'eval_samples_per_second': 506.49, 'eval_steps_per_second': 7.952, 'epoch': 2.3}


 24%|██▎       | 370/1560 [21:07<26:29,  1.34s/it]  

{'loss': 0.8411, 'grad_norm': 2.30686092376709, 'learning_rate': 0.0008675028177981643, 'epoch': 2.37}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.58it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.50it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.32it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.61it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.24it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.87it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.44it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.50it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.64it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.62it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.66it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.69it/s][A
 17%|█▋        | 27/157 [00:02<00:16,  7.90it/s][A
 18%|█▊        | 29/157 [00:02<00:14,  8.74it/s][A
 20%|█▉        | 31/157 [00:02<00:13,  9.42it/s][A
 21%|██        | 33/157 [00:03<00:12, 10.07it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.58it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 10.96it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6232584118843079, 'eval_acc': 0.8142, 'eval_acc2': 0.9081, 'eval_f1': 0.8128479835865222, 'eval_roc_auc_micro': 0.9973963604545454, 'eval_precision': 0.8142, 'eval_recall': 0.8142, 'eval_runtime': 38.2758, 'eval_samples_per_second': 261.261, 'eval_steps_per_second': 4.102, 'epoch': 2.37}


 24%|██▍       | 380/1560 [21:56<27:03,  1.38s/it]  

{'loss': 0.8719, 'grad_norm': 3.639350414276123, 'learning_rate': 0.0008606012236719073, 'epoch': 2.43}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 17.00it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.10it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.48it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.08it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.88it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.60it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.54it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.52it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.22it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.33it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.17it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.24it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.97it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.79it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.98it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.99it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6658890843391418, 'eval_acc': 0.8013, 'eval_acc2': 0.9002, 'eval_f1': 0.7998190298974164, 'eval_roc_auc_micro': 0.997027808030303, 'eval_precision': 0.8013, 'eval_recall': 0.8013, 'eval_runtime': 19.2897, 'eval_samples_per_second': 518.413, 'eval_steps_per_second': 8.139, 'epoch': 2.43}


 25%|██▌       | 390/1560 [22:25<22:51,  1.17s/it]  

{'loss': 0.8356, 'grad_norm': 3.230999708175659, 'learning_rate': 0.0008535533905932737, 'epoch': 2.5}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:21,  7.37it/s][A
  3%|▎         | 4/157 [00:00<00:16,  9.11it/s][A
  4%|▍         | 6/157 [00:00<00:14, 10.23it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.10it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.40it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.76it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.04it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.18it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.28it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.38it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.34it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.47it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.53it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.53it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.60it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.64it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.67it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.68it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6269935369491577, 'eval_acc': 0.8122, 'eval_acc2': 0.9083, 'eval_f1': 0.8111613502706855, 'eval_roc_auc_micro': 0.9973275246969697, 'eval_precision': 0.8122, 'eval_recall': 0.8122, 'eval_runtime': 39.3943, 'eval_samples_per_second': 253.844, 'eval_steps_per_second': 3.985, 'epoch': 2.5}


 26%|██▌       | 400/1560 [23:14<26:37,  1.38s/it]  

{'loss': 0.8369, 'grad_norm': 3.3983042240142822, 'learning_rate': 0.0008463621767547997, 'epoch': 2.56}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 18.33it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.91it/s][A
  4%|▍         | 6/157 [00:00<00:11, 13.25it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.96it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.75it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.10it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.08it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.21it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.37it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.56it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.65it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.68it/s][A
 17%|█▋        | 26/157 [00:02<00:16,  8.00it/s][A
 18%|█▊        | 28/157 [00:02<00:14,  9.00it/s][A
 19%|█▉        | 30/157 [00:02<00:12,  9.86it/s][A
 20%|██        | 32/157 [00:02<00:11, 10.51it/s][A
 22%|██▏       | 34/157 [00:02<00:11, 10.90it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.35it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6256312131881714, 'eval_acc': 0.8122, 'eval_acc2': 0.9075, 'eval_f1': 0.8108493332025457, 'eval_roc_auc_micro': 0.9973430185858585, 'eval_precision': 0.8122, 'eval_recall': 0.8122, 'eval_runtime': 15.4363, 'eval_samples_per_second': 647.824, 'eval_steps_per_second': 10.171, 'epoch': 2.56}


 26%|██▋       | 410/1560 [23:39<20:53,  1.09s/it]  

{'loss': 0.8328, 'grad_norm': 3.3804831504821777, 'learning_rate': 0.0008390304984959455, 'epoch': 2.62}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.03it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.07it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.66it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.91it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.61it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.27it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.05it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.05it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.14it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.11it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.00it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.08it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.05it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.13it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.25it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.33it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.36it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.45it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6185318231582642, 'eval_acc': 0.8151, 'eval_acc2': 0.9064, 'eval_f1': 0.8144913083840436, 'eval_roc_auc_micro': 0.997456518939394, 'eval_precision': 0.8151, 'eval_recall': 0.8151, 'eval_runtime': 41.5513, 'eval_samples_per_second': 240.666, 'eval_steps_per_second': 3.778, 'epoch': 2.62}


 27%|██▋       | 420/1560 [24:30<27:02,  1.42s/it]  

{'loss': 0.8242, 'grad_norm': 4.424212455749512, 'learning_rate': 0.0008315613291203976, 'epoch': 2.69}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:07, 19.27it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.21it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.70it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.03it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.48it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.35it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.95it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.84it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.86it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.83it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.83it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.97it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.05it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.15it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.21it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.17it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.72it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6059023141860962, 'eval_acc': 0.8148, 'eval_acc2': 0.9119, 'eval_f1': 0.814262870979737, 'eval_roc_auc_micro': 0.997627567070707, 'eval_precision': 0.8148, 'eval_recall': 0.8148, 'eval_runtime': 18.5024, 'eval_samples_per_second': 540.472, 'eval_steps_per_second': 8.485, 'epoch': 2.69}


 28%|██▊       | 430/1560 [25:00<26:33,  1.41s/it]  

{'loss': 0.7473, 'grad_norm': 4.150497913360596, 'learning_rate': 0.0008239576976902694, 'epoch': 2.75}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:23,  6.48it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.51it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.68it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.09it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.56it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.61it/s][A
  9%|▉         | 14/157 [00:01<00:14,  9.93it/s][A
 10%|█         | 16/157 [00:01<00:14,  9.70it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.75it/s][A
 12%|█▏        | 19/157 [00:01<00:13,  9.89it/s][A
 13%|█▎        | 21/157 [00:02<00:13, 10.00it/s][A
 15%|█▍        | 23/157 [00:02<00:13, 10.13it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.32it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.59it/s][A
 18%|█▊        | 29/157 [00:03<00:16,  7.78it/s][A
 20%|█▉        | 31/157 [00:03<00:14,  8.66it/s][A
 21%|██        | 33/157 [00:03<00:13,  9.41it/s][A
 22%|██▏       | 35/157 [00:03<00:12,  9.99it/s][A
 24%|██▎       | 37/157 

{'eval_loss': 0.6203445792198181, 'eval_acc': 0.8136, 'eval_acc2': 0.9064, 'eval_f1': 0.8119174364811697, 'eval_roc_auc_micro': 0.9974224776767677, 'eval_precision': 0.8136, 'eval_recall': 0.8136, 'eval_runtime': 19.0996, 'eval_samples_per_second': 523.571, 'eval_steps_per_second': 8.22, 'epoch': 2.75}


 28%|██▊       | 440/1560 [25:35<35:50,  1.92s/it]  

{'loss': 0.7883, 'grad_norm': 2.5026016235351562, 'learning_rate': 0.0008162226877976886, 'epoch': 2.82}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:12, 12.76it/s][A
  3%|▎         | 4/157 [00:00<00:13, 11.36it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.41it/s][A
  4%|▍         | 7/157 [00:00<00:19,  7.85it/s][A
  5%|▌         | 8/157 [00:00<00:20,  7.15it/s][A
  6%|▌         | 9/157 [00:01<00:21,  6.94it/s][A
  6%|▋         | 10/157 [00:01<00:21,  6.84it/s][A
  7%|▋         | 11/157 [00:01<00:21,  6.76it/s][A
  8%|▊         | 12/157 [00:01<00:21,  6.74it/s][A
  8%|▊         | 13/157 [00:01<00:21,  6.68it/s][A
  9%|▉         | 14/157 [00:01<00:21,  6.68it/s][A
 10%|▉         | 15/157 [00:02<00:21,  6.64it/s][A
 10%|█         | 16/157 [00:02<00:21,  6.59it/s][A
 11%|█         | 17/157 [00:02<00:21,  6.58it/s][A
 11%|█▏        | 18/157 [00:02<00:21,  6.59it/s][A
 12%|█▏        | 19/157 [00:02<00:20,  6.63it/s][A
 13%|█▎        | 20/157 [00:02<00:20,  6.60it/s][A
 13%|█▎        | 21/157 [00:02<00:20,  6.61it/s][A
 14%|█▍        | 22/157 [0

{'eval_loss': 0.6182556748390198, 'eval_acc': 0.816, 'eval_acc2': 0.9097, 'eval_f1': 0.8144703925105666, 'eval_roc_auc_micro': 0.9975329432828283, 'eval_precision': 0.816, 'eval_recall': 0.816, 'eval_runtime': 30.7989, 'eval_samples_per_second': 324.687, 'eval_steps_per_second': 5.098, 'epoch': 2.82}


 29%|██▉       | 450/1560 [26:23<33:35,  1.82s/it]  

{'loss': 0.8099, 'grad_norm': 4.3009114265441895, 'learning_rate': 0.0008083594363142716, 'epoch': 2.88}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 17.80it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.40it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.13it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.04it/s][A
  6%|▋         | 10/157 [00:00<00:15,  9.33it/s][A
  8%|▊         | 12/157 [00:01<00:21,  6.65it/s][A
  8%|▊         | 13/157 [00:01<00:22,  6.40it/s][A
  9%|▉         | 14/157 [00:01<00:22,  6.41it/s][A
 10%|█         | 16/157 [00:01<00:18,  7.67it/s][A
 11%|█▏        | 18/157 [00:02<00:15,  8.71it/s][A
 13%|█▎        | 20/157 [00:02<00:14,  9.57it/s][A
 14%|█▍        | 22/157 [00:02<00:13, 10.33it/s][A
 15%|█▌        | 24/157 [00:02<00:12, 10.83it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.21it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.31it/s][A
 19%|█▉        | 30/157 [00:03<00:11, 11.42it/s][A
 20%|██        | 32/157 [00:03<00:13,  9.55it/s][A
 22%|██▏       | 34/157 [00:03<00:17,  6.98it/s][A
 23%|██▎       | 36/157 

{'eval_loss': 0.627607524394989, 'eval_acc': 0.8087, 'eval_acc2': 0.9068, 'eval_f1': 0.806522040682762, 'eval_roc_auc_micro': 0.9973683042929293, 'eval_precision': 0.8087, 'eval_recall': 0.8087, 'eval_runtime': 40.9955, 'eval_samples_per_second': 243.929, 'eval_steps_per_second': 3.83, 'epoch': 2.88}


 29%|██▉       | 460/1560 [27:13<25:45,  1.40s/it]  

{'loss': 0.7875, 'grad_norm': 3.2317850589752197, 'learning_rate': 0.0008003711321189895, 'epoch': 2.94}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:43,  3.54it/s][A
  3%|▎         | 4/157 [00:00<00:25,  6.09it/s][A
  4%|▍         | 6/157 [00:00<00:19,  7.90it/s][A
  5%|▌         | 8/157 [00:01<00:16,  9.06it/s][A
  6%|▋         | 10/157 [00:01<00:14,  9.86it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.40it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.78it/s][A
 10%|█         | 16/157 [00:01<00:12, 10.98it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 10.98it/s][A
 13%|█▎        | 20/157 [00:02<00:12, 11.14it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.53it/s][A
 15%|█▌        | 24/157 [00:02<00:17,  7.58it/s][A
 17%|█▋        | 26/157 [00:02<00:15,  8.62it/s][A
 18%|█▊        | 28/157 [00:03<00:13,  9.51it/s][A
 19%|█▉        | 30/157 [00:03<00:12, 10.17it/s][A
 20%|██        | 32/157 [00:03<00:11, 10.71it/s][A
 22%|██▏       | 34/157 [00:03<00:11, 11.13it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.51it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5873494148254395, 'eval_acc': 0.8212, 'eval_acc2': 0.9166, 'eval_f1': 0.8181913803125538, 'eval_roc_auc_micro': 0.9977295306060606, 'eval_precision': 0.8212, 'eval_recall': 0.8212, 'eval_runtime': 18.1872, 'eval_samples_per_second': 549.838, 'eval_steps_per_second': 8.632, 'epoch': 2.94}


 30%|███       | 470/1560 [27:44<30:03,  1.65s/it]  

{'loss': 0.8031, 'grad_norm': 3.1117327213287354, 'learning_rate': 0.0007922610148049445, 'epoch': 3.01}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.87it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.51it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.71it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.22it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.19it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.08it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.99it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.19it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.34it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.44it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.49it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.33it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.41it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.50it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.54it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.57it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.46it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6337178945541382, 'eval_acc': 0.8139, 'eval_acc2': 0.9073, 'eval_f1': 0.8126555948462566, 'eval_roc_auc_micro': 0.9972334178787879, 'eval_precision': 0.8139, 'eval_recall': 0.8139, 'eval_runtime': 37.5541, 'eval_samples_per_second': 266.282, 'eval_steps_per_second': 4.181, 'epoch': 3.01}


 31%|███       | 480/1560 [28:31<24:58,  1.39s/it]  

{'loss': 0.6874, 'grad_norm': 2.6690635681152344, 'learning_rate': 0.0007840323733655779, 'epoch': 3.07}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.74it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.17it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.91it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.10it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.63it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.28it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.14it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.12it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.96it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.02it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.15it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.17it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.37it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.53it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.52it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.42it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.48it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6385941505432129, 'eval_acc': 0.8056, 'eval_acc2': 0.9081, 'eval_f1': 0.803183781411048, 'eval_roc_auc_micro': 0.9973809159090908, 'eval_precision': 0.8056, 'eval_recall': 0.8056, 'eval_runtime': 19.1102, 'eval_samples_per_second': 523.281, 'eval_steps_per_second': 8.216, 'epoch': 3.07}


 31%|███▏      | 490/1560 [29:00<20:03,  1.13s/it]  

{'loss': 0.6888, 'grad_norm': 2.904775381088257, 'learning_rate': 0.000775688544860846, 'epoch': 3.14}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.00it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.86it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.39it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.92it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.62it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.26it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.18it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.00it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.97it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.97it/s][A
 15%|█▍        | 23/157 [00:02<00:17,  7.79it/s][A
 16%|█▌        | 25/157 [00:02<00:15,  8.74it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.51it/s][A
 18%|█▊        | 29/157 [00:02<00:12,  9.98it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.59it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.96it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.38it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.67it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6077675223350525, 'eval_acc': 0.8163, 'eval_acc2': 0.9139, 'eval_f1': 0.8152310338264037, 'eval_roc_auc_micro': 0.9974891971212121, 'eval_precision': 0.8163, 'eval_recall': 0.8163, 'eval_runtime': 42.249, 'eval_samples_per_second': 236.692, 'eval_steps_per_second': 3.716, 'epoch': 3.14}


 32%|███▏      | 500/1560 [29:51<24:57,  1.41s/it]  

{'loss': 0.6622, 'grad_norm': 4.283827781677246, 'learning_rate': 0.0007672329130639005, 'epoch': 3.2}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:23,  6.59it/s][A
  2%|▏         | 3/157 [00:00<00:23,  6.51it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.77it/s][A
  4%|▍         | 7/157 [00:00<00:14, 10.02it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.59it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.90it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.87it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.11it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.43it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.61it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.71it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.87it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.82it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.86it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.84it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.03it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.19it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.20it/s][A
 24%|██▎       | 37/157 [

{'eval_loss': 0.5900854468345642, 'eval_acc': 0.8247, 'eval_acc2': 0.9152, 'eval_f1': 0.8235683278635504, 'eval_roc_auc_micro': 0.9977274573737375, 'eval_precision': 0.8247, 'eval_recall': 0.8247, 'eval_runtime': 20.2556, 'eval_samples_per_second': 493.689, 'eval_steps_per_second': 7.751, 'epoch': 3.2}


 33%|███▎      | 510/1560 [30:23<24:23,  1.39s/it]  

{'loss': 0.6738, 'grad_norm': 3.478165864944458, 'learning_rate': 0.0007586689070888284, 'epoch': 3.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.95it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.43it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.12it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.48it/s][A
  7%|▋         | 11/157 [00:01<00:16,  8.96it/s][A
  8%|▊         | 13/157 [00:01<00:24,  5.80it/s][A
  9%|▉         | 14/157 [00:01<00:26,  5.42it/s][A
 10%|▉         | 15/157 [00:02<00:26,  5.35it/s][A
 10%|█         | 16/157 [00:02<00:25,  5.45it/s][A
 11%|█         | 17/157 [00:02<00:24,  5.73it/s][A
 11%|█▏        | 18/157 [00:02<00:23,  6.04it/s][A
 12%|█▏        | 19/157 [00:02<00:21,  6.38it/s][A
 13%|█▎        | 20/157 [00:02<00:20,  6.79it/s][A
 13%|█▎        | 21/157 [00:02<00:18,  7.17it/s][A
 14%|█▍        | 22/157 [00:02<00:17,  7.59it/s][A
 15%|█▍        | 23/157 [00:03<00:16,  7.97it/s][A
 15%|█▌        | 24/157 [00:03<00:16,  8.23it/s][A
 16%|█▌        | 25/157 [00:03<00:15,  8.65it/s][A
 17%|█▋        | 26/157 

{'eval_loss': 0.6016741394996643, 'eval_acc': 0.8205, 'eval_acc2': 0.9105, 'eval_f1': 0.8189408429458099, 'eval_roc_auc_micro': 0.9975145048989899, 'eval_precision': 0.8205, 'eval_recall': 0.8205, 'eval_runtime': 19.1559, 'eval_samples_per_second': 522.033, 'eval_steps_per_second': 8.196, 'epoch': 3.26}


 33%|███▎      | 520/1560 [31:02<47:42,  2.75s/it]  

{'loss': 0.6754, 'grad_norm': 2.645423650741577, 'learning_rate': 0.00075, 'epoch': 3.33}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:17,  8.80it/s][A
  3%|▎         | 5/157 [00:00<00:15,  9.52it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.86it/s][A
  6%|▌         | 9/157 [00:00<00:14,  9.88it/s][A
  6%|▋         | 10/157 [00:01<00:19,  7.51it/s][A
  7%|▋         | 11/157 [00:01<00:25,  5.70it/s][A
  8%|▊         | 12/157 [00:01<00:30,  4.76it/s][A
  8%|▊         | 13/157 [00:02<00:34,  4.22it/s][A
  9%|▉         | 14/157 [00:02<00:36,  3.88it/s][A
 10%|▉         | 15/157 [00:02<00:38,  3.69it/s][A
 10%|█         | 16/157 [00:03<00:38,  3.66it/s][A
 11%|█         | 17/157 [00:03<00:36,  3.88it/s][A
 11%|█▏        | 18/157 [00:03<00:34,  4.05it/s][A
 12%|█▏        | 19/157 [00:03<00:33,  4.17it/s][A
 13%|█▎        | 20/157 [00:03<00:32,  4.28it/s][A
 13%|█▎        | 21/157 [00:04<00:31,  4.35it/s][A
 14%|█▍        | 22/157 [00:04<00:30,  4.40it/s][A
 15%|█▍        | 23/157 [00:04<00:30,  4.41it/s][A
 15%|█▌        | 24/157 

{'eval_loss': 0.5960604548454285, 'eval_acc': 0.8237, 'eval_acc2': 0.9148, 'eval_f1': 0.8218011346884921, 'eval_roc_auc_micro': 0.9974847206060606, 'eval_precision': 0.8237, 'eval_recall': 0.8237, 'eval_runtime': 24.098, 'eval_samples_per_second': 414.972, 'eval_steps_per_second': 6.515, 'epoch': 3.33}


 34%|███▍      | 530/1560 [31:40<27:55,  1.63s/it]  

{'loss': 0.6923, 'grad_norm': 3.240302562713623, 'learning_rate': 0.0007412297074035968, 'epoch': 3.39}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:33,  4.65it/s][A
  3%|▎         | 4/157 [00:00<00:21,  7.20it/s][A
  4%|▍         | 6/157 [00:00<00:17,  8.46it/s][A
  5%|▌         | 8/157 [00:00<00:16,  9.11it/s][A
  6%|▋         | 10/157 [00:01<00:15,  9.78it/s][A
  8%|▊         | 12/157 [00:01<00:14, 10.26it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.66it/s][A
 10%|█         | 16/157 [00:01<00:12, 10.89it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.09it/s][A
 13%|█▎        | 20/157 [00:02<00:12, 11.32it/s][A
 14%|█▍        | 22/157 [00:02<00:19,  7.06it/s][A
 15%|█▍        | 23/157 [00:02<00:21,  6.22it/s][A
 15%|█▌        | 24/157 [00:03<00:22,  5.86it/s][A
 16%|█▌        | 25/157 [00:03<00:20,  6.31it/s][A
 17%|█▋        | 27/157 [00:03<00:16,  7.76it/s][A
 18%|█▊        | 29/157 [00:03<00:14,  8.96it/s][A
 20%|█▉        | 31/157 [00:03<00:12,  9.79it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.41it/s][A
 22%|██▏       | 35/157 

{'eval_loss': 0.5816601514816284, 'eval_acc': 0.826, 'eval_acc2': 0.9145, 'eval_f1': 0.8259364822149274, 'eval_roc_auc_micro': 0.9976807802525252, 'eval_precision': 0.826, 'eval_recall': 0.826, 'eval_runtime': 22.9115, 'eval_samples_per_second': 436.462, 'eval_steps_per_second': 6.852, 'epoch': 3.39}


 35%|███▍      | 540/1560 [32:14<21:25,  1.26s/it]  

{'loss': 0.6437, 'grad_norm': 2.657573699951172, 'learning_rate': 0.0007323615860218843, 'epoch': 3.46}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:29,  5.26it/s][A
  3%|▎         | 4/157 [00:00<00:19,  7.88it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.21it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.05it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.54it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.92it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.96it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.78it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.25it/s][A
 13%|█▎        | 20/157 [00:01<00:12, 11.25it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.41it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.54it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.72it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 11.77it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.75it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.81it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 12.01it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 12.03it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5844616293907166, 'eval_acc': 0.8272, 'eval_acc2': 0.916, 'eval_f1': 0.8249867088072904, 'eval_roc_auc_micro': 0.9976954013636363, 'eval_precision': 0.8272, 'eval_recall': 0.8272, 'eval_runtime': 19.0877, 'eval_samples_per_second': 523.896, 'eval_steps_per_second': 8.225, 'epoch': 3.46}


 35%|███▌      | 550/1560 [32:43<19:41,  1.17s/it]  

{'loss': 0.6696, 'grad_norm': 2.4872899055480957, 'learning_rate': 0.000723399232250813, 'epoch': 3.52}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.95it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.38it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.55it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.08it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.80it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.33it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.26it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.27it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.11it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.14it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.14it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.21it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.20it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.25it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.77it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.94it/s][A
 22%|██▏       | 35/157 [00:03<00:16,  7.28it/s][A
 24%|██▎       | 37/157 [00:03<00:14,  8.36it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5820586681365967, 'eval_acc': 0.8227, 'eval_acc2': 0.918, 'eval_f1': 0.8224562521843524, 'eval_roc_auc_micro': 0.9976214052020203, 'eval_precision': 0.8227, 'eval_recall': 0.8227, 'eval_runtime': 15.6415, 'eval_samples_per_second': 639.324, 'eval_steps_per_second': 10.037, 'epoch': 3.52}


 36%|███▌      | 560/1560 [33:08<18:22,  1.10s/it]  

{'loss': 0.6823, 'grad_norm': 2.6282105445861816, 'learning_rate': 0.000714346280701527, 'epoch': 3.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 16.44it/s][A
  3%|▎         | 4/157 [00:00<00:10, 14.01it/s][A
  4%|▍         | 6/157 [00:00<00:11, 13.32it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.58it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.57it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.57it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.45it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.54it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.40it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.30it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.40it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.45it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.49it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.30it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.33it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.38it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.44it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.50it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5885379910469055, 'eval_acc': 0.8237, 'eval_acc2': 0.914, 'eval_f1': 0.8211818527048954, 'eval_roc_auc_micro': 0.9976879154545455, 'eval_precision': 0.8237, 'eval_recall': 0.8237, 'eval_runtime': 38.3596, 'eval_samples_per_second': 260.691, 'eval_steps_per_second': 4.093, 'epoch': 3.58}


 37%|███▋      | 570/1560 [33:56<22:43,  1.38s/it]  

{'loss': 0.6661, 'grad_norm': 2.4638044834136963, 'learning_rate': 0.0007052064027263785, 'epoch': 3.65}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.92it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.18it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.03it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.49it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.23it/s][A
  8%|▊         | 13/157 [00:00<00:10, 13.10it/s][A
 10%|▉         | 15/157 [00:01<00:10, 13.01it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.92it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.87it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.87it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.85it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.79it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.79it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.79it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.66it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.68it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.64it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.67it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5828297138214111, 'eval_acc': 0.8256, 'eval_acc2': 0.9144, 'eval_f1': 0.8252072978222584, 'eval_roc_auc_micro': 0.9976880703535354, 'eval_precision': 0.8256, 'eval_recall': 0.8256, 'eval_runtime': 16.1345, 'eval_samples_per_second': 619.789, 'eval_steps_per_second': 9.731, 'epoch': 3.65}


 37%|███▋      | 580/1560 [34:21<17:38,  1.08s/it]  

{'loss': 0.701, 'grad_norm': 3.565800905227661, 'learning_rate': 0.0006959833049300376, 'epoch': 3.71}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.19it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.38it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.60it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.66it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.41it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.25it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.09it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.99it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.07it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.10it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.16it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.14it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.84it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.04it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.24it/s][A
 21%|██        | 33/157 [00:02<00:15,  8.23it/s][A
 22%|██▏       | 35/157 [00:03<00:13,  9.07it/s][A
 24%|██▎       | 37/157 [00:03<00:12,  9.86it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5839572548866272, 'eval_acc': 0.8247, 'eval_acc2': 0.9157, 'eval_f1': 0.8243506994157446, 'eval_roc_auc_micro': 0.9977670380303031, 'eval_precision': 0.8247, 'eval_recall': 0.8247, 'eval_runtime': 15.2368, 'eval_samples_per_second': 656.307, 'eval_steps_per_second': 10.304, 'epoch': 3.71}


 38%|███▊      | 590/1560 [34:46<17:13,  1.07s/it]  

{'loss': 0.668, 'grad_norm': 3.084927797317505, 'learning_rate': 0.0006866807276663105, 'epoch': 3.78}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.75it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.01it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.64it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.90it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.36it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.20it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.08it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.05it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.13it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.87it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.93it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.08it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.20it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.27it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.34it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.41it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.45it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.40it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5880886912345886, 'eval_acc': 0.8245, 'eval_acc2': 0.9142, 'eval_f1': 0.8240740425276224, 'eval_roc_auc_micro': 0.9975793450505049, 'eval_precision': 0.8245, 'eval_recall': 0.8245, 'eval_runtime': 15.3953, 'eval_samples_per_second': 649.549, 'eval_steps_per_second': 10.198, 'epoch': 3.78}


 38%|███▊      | 600/1560 [35:11<17:41,  1.11s/it]  

{'loss': 0.6729, 'grad_norm': 3.4999196529388428, 'learning_rate': 0.0006773024435212678, 'epoch': 3.84}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.72it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.10it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.84it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.82it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.60it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.26it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.18it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.93it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.76it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.93it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.09it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.17it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.21it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.27it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.30it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.20it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.32it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.33it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5906975865364075, 'eval_acc': 0.8232, 'eval_acc2': 0.9165, 'eval_f1': 0.8216630560130725, 'eval_roc_auc_micro': 0.9976362823737375, 'eval_precision': 0.8232, 'eval_recall': 0.8232, 'eval_runtime': 20.0897, 'eval_samples_per_second': 497.767, 'eval_steps_per_second': 7.815, 'epoch': 3.84}


 39%|███▉      | 610/1560 [35:40<17:53,  1.13s/it]  

{'loss': 0.6404, 'grad_norm': 2.965864896774292, 'learning_rate': 0.0006678522557833024, 'epoch': 3.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.15it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.51it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.80it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.64it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.22it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.64it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.93it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.97it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.23it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.33it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.22it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.35it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.38it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.42it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.57it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.56it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.54it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.52it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5744173526763916, 'eval_acc': 0.8251, 'eval_acc2': 0.9179, 'eval_f1': 0.8251123716958612, 'eval_roc_auc_micro': 0.9978285940404041, 'eval_precision': 0.8251, 'eval_recall': 0.8251, 'eval_runtime': 37.3312, 'eval_samples_per_second': 267.873, 'eval_steps_per_second': 4.206, 'epoch': 3.9}


 40%|███▉      | 620/1560 [36:28<21:05,  1.35s/it]  

{'loss': 0.6479, 'grad_norm': 2.180448293685913, 'learning_rate': 0.0006583339969007363, 'epoch': 3.97}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.51it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.76it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.25it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.67it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.11it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.03it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.03it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.99it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.96it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.75it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.87it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.00it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.13it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.25it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.22it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.22it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.32it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.35it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5482177138328552, 'eval_acc': 0.8344, 'eval_acc2': 0.9225, 'eval_f1': 0.8340904596930379, 'eval_roc_auc_micro': 0.9980483288888888, 'eval_precision': 0.8344, 'eval_recall': 0.8344, 'eval_runtime': 15.7494, 'eval_samples_per_second': 634.943, 'eval_steps_per_second': 9.969, 'epoch': 3.97}


 40%|████      | 630/1560 [36:56<21:12,  1.37s/it]  

{'loss': 0.6011, 'grad_norm': 2.791102647781372, 'learning_rate': 0.0006487515269276015, 'epoch': 4.03}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.76it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.16it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.13it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.57it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.32it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.13it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.05it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.09it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.03it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.95it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.02it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.01it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.09it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.12it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.17it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.11it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.98it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 12.00it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5517567992210388, 'eval_acc': 0.8339, 'eval_acc2': 0.924, 'eval_f1': 0.8334905020000797, 'eval_roc_auc_micro': 0.9979627996969698, 'eval_precision': 0.8339, 'eval_recall': 0.8339, 'eval_runtime': 45.4144, 'eval_samples_per_second': 220.194, 'eval_steps_per_second': 3.457, 'epoch': 4.03}


 41%|████      | 640/1560 [37:51<22:47,  1.49s/it]  

{'loss': 0.5396, 'grad_norm': 2.7412281036376953, 'learning_rate': 0.0006391087319582263, 'epoch': 4.1}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.92it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.58it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.42it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.81it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.65it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.54it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.42it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.57it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.57it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.54it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.70it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.83it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.45it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.60it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.52it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.44it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.60it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5685362219810486, 'eval_acc': 0.8307, 'eval_acc2': 0.9191, 'eval_f1': 0.8291722860019092, 'eval_roc_auc_micro': 0.9978469223232322, 'eval_precision': 0.8307, 'eval_recall': 0.8307, 'eval_runtime': 24.8669, 'eval_samples_per_second': 402.141, 'eval_steps_per_second': 6.314, 'epoch': 4.1}


 42%|████▏     | 650/1560 [38:35<34:06,  2.25s/it]  

{'loss': 0.477, 'grad_norm': 3.246076822280884, 'learning_rate': 0.0006294095225512603, 'epoch': 4.16}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.19it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.35it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.62it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.35it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.24it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.23it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.71it/s][A
 11%|█         | 17/157 [00:01<00:18,  7.76it/s][A
 11%|█▏        | 18/157 [00:01<00:19,  7.00it/s][A
 12%|█▏        | 19/157 [00:02<00:21,  6.43it/s][A
 13%|█▎        | 20/157 [00:02<00:22,  6.06it/s][A
 13%|█▎        | 21/157 [00:02<00:23,  5.71it/s][A
 14%|█▍        | 22/157 [00:02<00:24,  5.50it/s][A
 15%|█▍        | 23/157 [00:02<00:25,  5.34it/s][A
 15%|█▌        | 24/157 [00:03<00:25,  5.28it/s][A
 16%|█▌        | 25/157 [00:03<00:25,  5.18it/s][A
 17%|█▋        | 26/157 [00:03<00:25,  5.10it/s][A
 17%|█▋        | 27/157 [00:03<00:25,  5.07it/s][A
 18%|█▊        | 28/157 

{'eval_loss': 0.570350706577301, 'eval_acc': 0.8312, 'eval_acc2': 0.9176, 'eval_f1': 0.8307714272461397, 'eval_roc_auc_micro': 0.9978090047474748, 'eval_precision': 0.8312, 'eval_recall': 0.8312, 'eval_runtime': 21.5189, 'eval_samples_per_second': 464.709, 'eval_steps_per_second': 7.296, 'epoch': 4.16}


 42%|████▏     | 660/1560 [39:08<20:40,  1.38s/it]  

{'loss': 0.4985, 'grad_norm': 2.0778915882110596, 'learning_rate': 0.0006196578321437789, 'epoch': 4.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:24,  6.36it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.67it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.77it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.40it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.72it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.92it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.09it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.24it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.28it/s][A
 13%|█▎        | 20/157 [00:01<00:12, 11.30it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.43it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.33it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.31it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.46it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.56it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.59it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.68it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.75it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.568821370601654, 'eval_acc': 0.8356, 'eval_acc2': 0.9168, 'eval_f1': 0.8346222855205561, 'eval_roc_auc_micro': 0.9977242508080808, 'eval_precision': 0.8356, 'eval_recall': 0.8356, 'eval_runtime': 20.124, 'eval_samples_per_second': 496.918, 'eval_steps_per_second': 7.802, 'epoch': 4.22}


 43%|████▎     | 670/1560 [39:38<17:38,  1.19s/it]  

{'loss': 0.5005, 'grad_norm': 6.936652660369873, 'learning_rate': 0.0006098576154561086, 'epoch': 4.29}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.49it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.16it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.62it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.91it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.18it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.88it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.73it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.64it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.73it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.84it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.71it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.81it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.71it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.85it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.82it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.97it/s][A
 22%|██▏       | 35/157 [00:03<00:15,  7.71it/s][A
 24%|██▎       | 37/157 [00:03<00:13,  8.73it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5693998336791992, 'eval_acc': 0.8276, 'eval_acc2': 0.9157, 'eval_f1': 0.8262263318589418, 'eval_roc_auc_micro': 0.9978768146969696, 'eval_precision': 0.8276, 'eval_recall': 0.8276, 'eval_runtime': 20.5621, 'eval_samples_per_second': 486.331, 'eval_steps_per_second': 7.635, 'epoch': 4.29}


 44%|████▎     | 680/1560 [40:08<16:55,  1.15s/it]  

{'loss': 0.549, 'grad_norm': 3.0375709533691406, 'learning_rate': 0.0006000128468880223, 'epoch': 4.35}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.92it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.04it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.90it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.16it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.62it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.31it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.09it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.98it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.94it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.78it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.94it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.76it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.87it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.95it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.98it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.99it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.97it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.06it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5572705268859863, 'eval_acc': 0.8331, 'eval_acc2': 0.921, 'eval_f1': 0.8330223201833035, 'eval_roc_auc_micro': 0.9979268939393939, 'eval_precision': 0.8331, 'eval_recall': 0.8331, 'eval_runtime': 37.525, 'eval_samples_per_second': 266.489, 'eval_steps_per_second': 4.184, 'epoch': 4.35}


 44%|████▍     | 690/1560 [40:55<19:56,  1.38s/it]  

{'loss': 0.5727, 'grad_norm': 2.4269368648529053, 'learning_rate': 0.000590127518906953, 'epoch': 4.42}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:11, 12.94it/s][A
  3%|▎         | 4/157 [00:00<00:24,  6.12it/s][A
  4%|▍         | 6/157 [00:00<00:18,  7.96it/s][A
  5%|▌         | 8/157 [00:00<00:15,  9.32it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.09it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.50it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.02it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.21it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.48it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.77it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.89it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.93it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.01it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.14it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.26it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.23it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.35it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.36it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5256785154342651, 'eval_acc': 0.8409, 'eval_acc2': 0.924, 'eval_f1': 0.8396119837992781, 'eval_roc_auc_micro': 0.9980553086868689, 'eval_precision': 0.8409, 'eval_recall': 0.8409, 'eval_runtime': 21.3783, 'eval_samples_per_second': 467.763, 'eval_steps_per_second': 7.344, 'epoch': 4.42}


 45%|████▍     | 700/1560 [41:26<16:28,  1.15s/it]  

{'loss': 0.5342, 'grad_norm': 3.0577046871185303, 'learning_rate': 0.0005802056404288802, 'epoch': 4.48}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:07, 19.96it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.71it/s][A
  4%|▍         | 6/157 [00:00<00:12, 11.78it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.12it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.39it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.47it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.52it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.57it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.61it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.63it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.70it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.62it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.69it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.68it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.66it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.69it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.67it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.54it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.559129536151886, 'eval_acc': 0.8307, 'eval_acc2': 0.9186, 'eval_f1': 0.8309430302967618, 'eval_roc_auc_micro': 0.9978031694949494, 'eval_precision': 0.8307, 'eval_recall': 0.8307, 'eval_runtime': 16.3462, 'eval_samples_per_second': 611.763, 'eval_steps_per_second': 9.605, 'epoch': 4.48}


 46%|████▌     | 710/1560 [41:52<15:27,  1.09s/it]  

{'loss': 0.5513, 'grad_norm': 3.3353028297424316, 'learning_rate': 0.0005702512351925465, 'epoch': 4.54}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.56it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.84it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.76it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.04it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.63it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.20it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.00it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.97it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.00it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.10it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.19it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.11it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.94it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.16it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.23it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.18it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.23it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.30it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5494104623794556, 'eval_acc': 0.8355, 'eval_acc2': 0.9232, 'eval_f1': 0.8350357254033676, 'eval_roc_auc_micro': 0.9978792161616162, 'eval_precision': 0.8355, 'eval_recall': 0.8355, 'eval_runtime': 53.099, 'eval_samples_per_second': 188.328, 'eval_steps_per_second': 2.957, 'epoch': 4.54}


 46%|████▌     | 720/1560 [42:55<22:26,  1.60s/it]  

{'loss': 0.4968, 'grad_norm': 3.454883337020874, 'learning_rate': 0.0005602683401276614, 'epoch': 4.61}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:41,  3.76it/s][A
  3%|▎         | 4/157 [00:00<00:24,  6.34it/s][A
  4%|▍         | 6/157 [00:00<00:18,  8.27it/s][A
  5%|▌         | 8/157 [00:01<00:15,  9.54it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.32it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.77it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.32it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.65it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.99it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.23it/s][A
 14%|█▍        | 22/157 [00:02<00:10, 12.31it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.40it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.47it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.52it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.41it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.36it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.42it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.46it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.537118673324585, 'eval_acc': 0.8378, 'eval_acc2': 0.9235, 'eval_f1': 0.8361877282102818, 'eval_roc_auc_micro': 0.9980716970707071, 'eval_precision': 0.8378, 'eval_recall': 0.8378, 'eval_runtime': 38.74, 'eval_samples_per_second': 258.131, 'eval_steps_per_second': 4.053, 'epoch': 4.61}


 47%|████▋     | 730/1560 [43:43<19:13,  1.39s/it]  

{'loss': 0.5452, 'grad_norm': 3.492022752761841, 'learning_rate': 0.0005502610037177585, 'epoch': 4.67}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:12, 12.53it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.39it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.34it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.10it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.99it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.77it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.66it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.72it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.85it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.61it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.27it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.50it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.49it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.71it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.84it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.91it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.01it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.95it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.549993634223938, 'eval_acc': 0.8345, 'eval_acc2': 0.9241, 'eval_f1': 0.8342762786325023, 'eval_roc_auc_micro': 0.9978997838383838, 'eval_precision': 0.8345, 'eval_recall': 0.8345, 'eval_runtime': 41.7645, 'eval_samples_per_second': 239.438, 'eval_steps_per_second': 3.759, 'epoch': 4.67}


 47%|████▋     | 740/1560 [44:35<19:48,  1.45s/it]  

{'loss': 0.4975, 'grad_norm': 2.6300740242004395, 'learning_rate': 0.000540233284358363, 'epoch': 4.74}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.99it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.77it/s][A
  4%|▍         | 7/157 [00:00<00:15, 10.00it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.70it/s][A
  7%|▋         | 11/157 [00:01<00:12, 11.26it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.61it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.62it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.76it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.80it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.66it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.94it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.97it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.97it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.09it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.04it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.07it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.08it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.02it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5241796970367432, 'eval_acc': 0.8419, 'eval_acc2': 0.9269, 'eval_f1': 0.8410666653661283, 'eval_roc_auc_micro': 0.9980617668686869, 'eval_precision': 0.8419, 'eval_recall': 0.8419, 'eval_runtime': 20.1256, 'eval_samples_per_second': 496.879, 'eval_steps_per_second': 7.801, 'epoch': 4.74}


 48%|████▊     | 750/1560 [45:10<24:36,  1.82s/it]  

{'loss': 0.5102, 'grad_norm': 2.409785032272339, 'learning_rate': 0.0005301892487111431, 'epoch': 4.8}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.80it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.17it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.01it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.86it/s][A
  7%|▋         | 11/157 [00:01<00:16,  8.74it/s][A
  8%|▊         | 12/157 [00:01<00:17,  8.34it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.17it/s][A
  9%|▉         | 14/157 [00:01<00:17,  8.07it/s][A
 10%|▉         | 15/157 [00:01<00:17,  7.99it/s][A
 10%|█         | 16/157 [00:01<00:17,  8.08it/s][A
 11%|█         | 17/157 [00:01<00:17,  8.17it/s][A
 11%|█▏        | 18/157 [00:01<00:16,  8.37it/s][A
 12%|█▏        | 19/157 [00:02<00:16,  8.57it/s][A
 13%|█▎        | 20/157 [00:02<00:15,  8.74it/s][A
 13%|█▎        | 21/157 [00:02<00:15,  8.83it/s][A
 14%|█▍        | 22/157 [00:02<00:15,  8.88it/s][A
 15%|█▍        | 23/157 [00:02<00:14,  8.96it/s][A
 15%|█▌        | 24/157 [00:02<00:14,  8.99it/s][A
 16%|█▌        | 25/157 

{'eval_loss': 0.5263120532035828, 'eval_acc': 0.842, 'eval_acc2': 0.9251, 'eval_f1': 0.841907200225835, 'eval_roc_auc_micro': 0.998121625, 'eval_precision': 0.842, 'eval_recall': 0.842, 'eval_runtime': 24.536, 'eval_samples_per_second': 407.564, 'eval_steps_per_second': 6.399, 'epoch': 4.8}


 49%|████▊     | 760/1560 [46:06<40:33,  3.04s/it]  

{'loss': 0.5233, 'grad_norm': 3.4599685668945312, 'learning_rate': 0.0005201329700547076, 'epoch': 4.86}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.73it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.58it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.47it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.03it/s][A
  7%|▋         | 11/157 [00:00<00:14, 10.23it/s][A
  8%|▊         | 13/157 [00:01<00:15,  9.37it/s][A
  9%|▉         | 14/157 [00:01<00:15,  9.19it/s][A
 10%|▉         | 15/157 [00:01<00:15,  9.13it/s][A
 10%|█         | 16/157 [00:01<00:15,  9.07it/s][A
 11%|█         | 17/157 [00:01<00:15,  9.10it/s][A
 11%|█▏        | 18/157 [00:01<00:15,  9.03it/s][A
 12%|█▏        | 19/157 [00:01<00:15,  9.12it/s][A
 13%|█▎        | 20/157 [00:01<00:15,  9.11it/s][A
 13%|█▎        | 21/157 [00:02<00:14,  9.28it/s][A
 14%|█▍        | 22/157 [00:02<00:14,  9.24it/s][A
 15%|█▍        | 23/157 [00:02<00:14,  9.29it/s][A
 16%|█▌        | 25/157 [00:02<00:13,  9.59it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.89it/s][A
 18%|█▊        | 29/157 

{'eval_loss': 0.5532627701759338, 'eval_acc': 0.8343, 'eval_acc2': 0.9203, 'eval_f1': 0.8320388101174145, 'eval_roc_auc_micro': 0.9978946702020202, 'eval_precision': 0.8343, 'eval_recall': 0.8343, 'eval_runtime': 17.9431, 'eval_samples_per_second': 557.317, 'eval_steps_per_second': 8.75, 'epoch': 4.86}


 49%|████▉     | 770/1560 [46:34<16:18,  1.24s/it]  

{'loss': 0.5248, 'grad_norm': 2.74430775642395, 'learning_rate': 0.0005100685266327202, 'epoch': 4.93}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.94it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.11it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.59it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.45it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.39it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.95it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.72it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.87it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.86it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.90it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.08it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.21it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.55it/s][A
 20%|█▉        | 31/157 [00:02<00:12, 10.46it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.84it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.15it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.26it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.531029999256134, 'eval_acc': 0.8398, 'eval_acc2': 0.9261, 'eval_f1': 0.839530846174064, 'eval_roc_auc_micro': 0.9981198716666666, 'eval_precision': 0.8398, 'eval_recall': 0.8398, 'eval_runtime': 15.8594, 'eval_samples_per_second': 630.539, 'eval_steps_per_second': 9.899, 'epoch': 4.93}


 50%|█████     | 780/1560 [46:59<14:48,  1.14s/it]  

{'loss': 0.4769, 'grad_norm': 3.4329733848571777, 'learning_rate': 0.0005, 'epoch': 4.99}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.72it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.09it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.17it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.59it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.48it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.37it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.35it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.23it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.28it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.35it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.34it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.44it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.37it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.25it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.10it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.12it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5334154367446899, 'eval_acc': 0.8401, 'eval_acc2': 0.9248, 'eval_f1': 0.838894005811964, 'eval_roc_auc_micro': 0.9981383222727273, 'eval_precision': 0.8401, 'eval_recall': 0.8401, 'eval_runtime': 14.9375, 'eval_samples_per_second': 669.457, 'eval_steps_per_second': 10.51, 'epoch': 4.99}


 51%|█████     | 790/1560 [47:26<14:21,  1.12s/it]  

{'loss': 0.4187, 'grad_norm': 2.4025449752807617, 'learning_rate': 0.0004899314733672799, 'epoch': 5.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:20,  7.52it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.30it/s][A
  4%|▍         | 7/157 [00:00<00:14, 10.41it/s][A
  6%|▌         | 9/157 [00:00<00:13, 11.11it/s][A
  7%|▋         | 11/157 [00:01<00:12, 11.59it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.94it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.15it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.36it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.46it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.58it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.53it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.58it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.49it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:14,  8.75it/s][A
 21%|██        | 33/157 [00:03<00:12,  9.62it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.19it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.58it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5198392868041992, 'eval_acc': 0.845, 'eval_acc2': 0.9289, 'eval_f1': 0.844512098787281, 'eval_roc_auc_micro': 0.9981937336363635, 'eval_precision': 0.845, 'eval_recall': 0.845, 'eval_runtime': 15.1284, 'eval_samples_per_second': 661.008, 'eval_steps_per_second': 10.378, 'epoch': 5.06}


 51%|█████▏    | 800/1560 [47:51<13:36,  1.07s/it]  

{'loss': 0.3841, 'grad_norm': 2.318310499191284, 'learning_rate': 0.0004798670299452926, 'epoch': 5.12}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.97it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.82it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.66it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.19it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.99it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.86it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.77it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.74it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.66it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.60it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.53it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.35it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.48it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.42it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.51it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.61it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.56it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.27it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5165827870368958, 'eval_acc': 0.8461, 'eval_acc2': 0.9295, 'eval_f1': 0.8461493517240071, 'eval_roc_auc_micro': 0.9982292984848484, 'eval_precision': 0.8461, 'eval_recall': 0.8461, 'eval_runtime': 14.9067, 'eval_samples_per_second': 670.839, 'eval_steps_per_second': 10.532, 'epoch': 5.12}


 52%|█████▏    | 810/1560 [48:16<13:46,  1.10s/it]  

{'loss': 0.4261, 'grad_norm': 2.0162270069122314, 'learning_rate': 0.0004698107512888569, 'epoch': 5.18}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:13, 11.23it/s][A
  3%|▎         | 4/157 [00:00<00:12, 12.10it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.37it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.46it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.37it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.49it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.64it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.64it/s][A
 11%|█▏        | 18/157 [00:01<00:10, 12.65it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.69it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.78it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.82it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.82it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.60it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.62it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.62it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.47it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.50it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5306369066238403, 'eval_acc': 0.8421, 'eval_acc2': 0.9268, 'eval_f1': 0.8421270732327021, 'eval_roc_auc_micro': 0.9980528449999999, 'eval_precision': 0.8421, 'eval_recall': 0.8421, 'eval_runtime': 14.8057, 'eval_samples_per_second': 675.417, 'eval_steps_per_second': 10.604, 'epoch': 5.18}


 53%|█████▎    | 820/1560 [48:40<13:07,  1.06s/it]  

{'loss': 0.4261, 'grad_norm': 1.8545204401016235, 'learning_rate': 0.00045976671564163706, 'epoch': 5.25}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.09it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.35it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.47it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.15it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.98it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.82it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.86it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.85it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.82it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.79it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.74it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.54it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.61it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.52it/s][A
 21%|██        | 33/157 [00:02<00:13,  9.01it/s][A
 22%|██▏       | 35/157 [00:02<00:12,  9.90it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.62it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5183253288269043, 'eval_acc': 0.8434, 'eval_acc2': 0.9268, 'eval_f1': 0.8429567668046327, 'eval_roc_auc_micro': 0.998207316010101, 'eval_precision': 0.8434, 'eval_recall': 0.8434, 'eval_runtime': 14.7901, 'eval_samples_per_second': 676.126, 'eval_steps_per_second': 10.615, 'epoch': 5.25}


 53%|█████▎    | 830/1560 [49:04<12:52,  1.06s/it]  

{'loss': 0.4265, 'grad_norm': 2.4050486087799072, 'learning_rate': 0.00044973899628224153, 'epoch': 5.31}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 15.79it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.81it/s][A
  4%|▍         | 6/157 [00:00<00:11, 13.27it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.93it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.73it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.75it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.68it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.69it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.52it/s][A
 13%|█▎        | 20/157 [00:01<00:10, 12.59it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.68it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.71it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.69it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.74it/s][A
 19%|█▉        | 30/157 [00:02<00:09, 12.80it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.79it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.78it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.80it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5362663865089417, 'eval_acc': 0.8388, 'eval_acc2': 0.925, 'eval_f1': 0.8385753052717849, 'eval_roc_auc_micro': 0.9980783133333333, 'eval_precision': 0.8388, 'eval_recall': 0.8388, 'eval_runtime': 14.8487, 'eval_samples_per_second': 673.458, 'eval_steps_per_second': 10.573, 'epoch': 5.31}


 54%|█████▍    | 840/1560 [49:28<13:03,  1.09s/it]  

{'loss': 0.3955, 'grad_norm': 3.0566704273223877, 'learning_rate': 0.00043973165987233853, 'epoch': 5.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.09it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.98it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.56it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.90it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.43it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.47it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.39it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.56it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.68it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.72it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.78it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.47it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.59it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.66it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.72it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.45it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.53it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.53it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5157256126403809, 'eval_acc': 0.8462, 'eval_acc2': 0.93, 'eval_f1': 0.8457016862460925, 'eval_roc_auc_micro': 0.9982157155555555, 'eval_precision': 0.8462, 'eval_recall': 0.8462, 'eval_runtime': 14.9136, 'eval_samples_per_second': 670.527, 'eval_steps_per_second': 10.527, 'epoch': 5.38}


 54%|█████▍    | 850/1560 [49:53<12:35,  1.06s/it]  

{'loss': 0.4006, 'grad_norm': 2.650606632232666, 'learning_rate': 0.0004297487648074538, 'epoch': 5.44}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.03it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.42it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.66it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.26it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.91it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.83it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.86it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.72it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.75it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.80it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.83it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.82it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.69it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.55it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.61it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.65it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.69it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5147303938865662, 'eval_acc': 0.8459, 'eval_acc2': 0.927, 'eval_f1': 0.8449523074356188, 'eval_roc_auc_micro': 0.9982033138888888, 'eval_precision': 0.8459, 'eval_recall': 0.8459, 'eval_runtime': 15.2392, 'eval_samples_per_second': 656.202, 'eval_steps_per_second': 10.302, 'epoch': 5.44}


 55%|█████▌    | 860/1560 [50:17<12:34,  1.08s/it]  

{'loss': 0.4242, 'grad_norm': 2.51873517036438, 'learning_rate': 0.0004197943595711198, 'epoch': 5.5}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.10it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.29it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.98it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.16it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.88it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.67it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.69it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.38it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.52it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.62it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.61it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.57it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.63it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.58it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.47it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.54it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.58it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5045496225357056, 'eval_acc': 0.8485, 'eval_acc2': 0.9315, 'eval_f1': 0.8482208179608839, 'eval_roc_auc_micro': 0.9982873751010101, 'eval_precision': 0.8485, 'eval_recall': 0.8485, 'eval_runtime': 15.0411, 'eval_samples_per_second': 664.847, 'eval_steps_per_second': 10.438, 'epoch': 5.5}


 56%|█████▌    | 870/1560 [50:42<12:26,  1.08s/it]  

{'loss': 0.39, 'grad_norm': 2.4911465644836426, 'learning_rate': 0.00040987248109304716, 'epoch': 5.57}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.14it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.36it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.87it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.20it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.71it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.45it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.27it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.28it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.37it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.48it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.60it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.66it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.33it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.36it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.40it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.85it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.12it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5422713160514832, 'eval_acc': 0.8409, 'eval_acc2': 0.9245, 'eval_f1': 0.8402460454951233, 'eval_roc_auc_micro': 0.9980483403535354, 'eval_precision': 0.8409, 'eval_recall': 0.8409, 'eval_runtime': 15.121, 'eval_samples_per_second': 661.33, 'eval_steps_per_second': 10.383, 'epoch': 5.57}


 56%|█████▋    | 880/1560 [51:07<12:08,  1.07s/it]  

{'loss': 0.375, 'grad_norm': 1.8246468305587769, 'learning_rate': 0.0003999871531119779, 'epoch': 5.63}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:23,  6.51it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.53it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.81it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.69it/s][A
  7%|▋         | 11/157 [00:01<00:12, 11.27it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.64it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.75it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.05it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.29it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.42it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.45it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.43it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.53it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.58it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.38it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.41it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.45it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5029392838478088, 'eval_acc': 0.851, 'eval_acc2': 0.9312, 'eval_f1': 0.8512875534626102, 'eval_roc_auc_micro': 0.9983018131818182, 'eval_precision': 0.851, 'eval_recall': 0.851, 'eval_runtime': 15.068, 'eval_samples_per_second': 663.658, 'eval_steps_per_second': 10.419, 'epoch': 5.63}


 57%|█████▋    | 890/1560 [51:31<12:07,  1.09s/it]  

{'loss': 0.3901, 'grad_norm': 2.3925065994262695, 'learning_rate': 0.0003901423845438916, 'epoch': 5.7}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:13, 11.66it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.09it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.28it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.41it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.46it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.43it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.16it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.33it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.42it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.54it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.57it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.64it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.65it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.68it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.45it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.32it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.38it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.27it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5087773203849792, 'eval_acc': 0.8504, 'eval_acc2': 0.9283, 'eval_f1': 0.8501210052507601, 'eval_roc_auc_micro': 0.9982285678282828, 'eval_precision': 0.8504, 'eval_recall': 0.8504, 'eval_runtime': 16.2637, 'eval_samples_per_second': 614.868, 'eval_steps_per_second': 9.653, 'epoch': 5.7}


 58%|█████▊    | 900/1560 [51:57<12:04,  1.10s/it]  

{'loss': 0.3977, 'grad_norm': 2.7313601970672607, 'learning_rate': 0.00038034216785622126, 'epoch': 5.76}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.97it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.21it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.02it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.41it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.11it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.88it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.65it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.47it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.39it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.44it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.49it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.58it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.22it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.23it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.26it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.46it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.53it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.52it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5137879848480225, 'eval_acc': 0.8492, 'eval_acc2': 0.9297, 'eval_f1': 0.8489744128788731, 'eval_roc_auc_micro': 0.9982185755555556, 'eval_precision': 0.8492, 'eval_recall': 0.8492, 'eval_runtime': 15.466, 'eval_samples_per_second': 646.58, 'eval_steps_per_second': 10.151, 'epoch': 5.76}


 58%|█████▊    | 910/1560 [52:22<11:40,  1.08s/it]  

{'loss': 0.3886, 'grad_norm': 2.814476251602173, 'learning_rate': 0.0003705904774487396, 'epoch': 5.82}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.14it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.01it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.87it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.43it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.10it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.95it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.81it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.79it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.66it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.56it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.63it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.69it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.49it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.56it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.60it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.62it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5090830326080322, 'eval_acc': 0.8476, 'eval_acc2': 0.9324, 'eval_f1': 0.8468378126729714, 'eval_roc_auc_micro': 0.9982407695959595, 'eval_precision': 0.8476, 'eval_recall': 0.8476, 'eval_runtime': 15.0499, 'eval_samples_per_second': 664.454, 'eval_steps_per_second': 10.432, 'epoch': 5.82}


 59%|█████▉    | 920/1560 [52:47<11:26,  1.07s/it]  

{'loss': 0.3728, 'grad_norm': 3.5507631301879883, 'learning_rate': 0.0003608912680417737, 'epoch': 5.89}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.95it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.32it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.81it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.39it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.09it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.88it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.75it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.74it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.66it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.64it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.58it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.58it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.46it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.50it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.39it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.25it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.36it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.49947983026504517, 'eval_acc': 0.8515, 'eval_acc2': 0.9352, 'eval_f1': 0.8512973386911034, 'eval_roc_auc_micro': 0.998246095909091, 'eval_precision': 0.8515, 'eval_recall': 0.8515, 'eval_runtime': 15.0853, 'eval_samples_per_second': 662.895, 'eval_steps_per_second': 10.407, 'epoch': 5.89}


 60%|█████▉    | 930/1560 [53:11<11:48,  1.12s/it]  

{'loss': 0.3798, 'grad_norm': 3.1957263946533203, 'learning_rate': 0.0003512484730723986, 'epoch': 5.95}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:12, 12.65it/s][A
  3%|▎         | 5/157 [00:00<00:14, 10.83it/s][A
  4%|▍         | 7/157 [00:00<00:13, 11.49it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.85it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.01it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.21it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.27it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.36it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.48it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.60it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.67it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.67it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.78it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.74it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.70it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.69it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.60it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.67it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.49410519003868103, 'eval_acc': 0.8525, 'eval_acc2': 0.932, 'eval_f1': 0.851959449812488, 'eval_roc_auc_micro': 0.9983354412121213, 'eval_precision': 0.8525, 'eval_recall': 0.8525, 'eval_runtime': 14.5856, 'eval_samples_per_second': 685.61, 'eval_steps_per_second': 10.764, 'epoch': 5.95}


 60%|██████    | 940/1560 [53:38<15:00,  1.45s/it]

{'loss': 0.357, 'grad_norm': 3.9451088905334473, 'learning_rate': 0.00034166600309926387, 'epoch': 6.02}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:27,  5.58it/s][A
  3%|▎         | 5/157 [00:00<00:19,  7.66it/s][A
  4%|▍         | 7/157 [00:00<00:16,  9.12it/s][A
  6%|▌         | 9/157 [00:01<00:14, 10.16it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.85it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.34it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.65it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.98it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.13it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.24it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.32it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.33it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.38it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.19it/s][A
 20%|█▉        | 31/157 [00:03<00:16,  7.83it/s][A
 21%|██        | 33/157 [00:03<00:14,  8.82it/s][A
 22%|██▏       | 35/157 [00:03<00:12,  9.57it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.21it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.50197434425354, 'eval_acc': 0.852, 'eval_acc2': 0.932, 'eval_f1': 0.8517213577516844, 'eval_roc_auc_micro': 0.998280356010101, 'eval_precision': 0.852, 'eval_recall': 0.852, 'eval_runtime': 19.406, 'eval_samples_per_second': 515.304, 'eval_steps_per_second': 8.09, 'epoch': 6.02}


 61%|██████    | 950/1560 [54:07<11:40,  1.15s/it]  

{'loss': 0.3215, 'grad_norm': 2.4268417358398438, 'learning_rate': 0.00033214774421669774, 'epoch': 6.08}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.96it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.25it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.50it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.15it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.95it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.81it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.76it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.75it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.74it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.65it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.64it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.64it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.63it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.64it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.59it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.57it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.57it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.51it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5118877291679382, 'eval_acc': 0.8486, 'eval_acc2': 0.9306, 'eval_f1': 0.848009355409641, 'eval_roc_auc_micro': 0.9982173196969697, 'eval_precision': 0.8486, 'eval_recall': 0.8486, 'eval_runtime': 15.608, 'eval_samples_per_second': 640.695, 'eval_steps_per_second': 10.059, 'epoch': 6.08}


 62%|██████▏   | 960/1560 [54:32<11:16,  1.13s/it]  

{'loss': 0.2604, 'grad_norm': 2.6804378032684326, 'learning_rate': 0.00032269755647873217, 'epoch': 6.14}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.05it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.33it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.07it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.04it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.93it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.87it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.53it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.60it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.23it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.45it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.62it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.62it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.67it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.33it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.28it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.40it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.55it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4967518150806427, 'eval_acc': 0.854, 'eval_acc2': 0.9356, 'eval_f1': 0.8534210913973479, 'eval_roc_auc_micro': 0.9983686393434344, 'eval_precision': 0.854, 'eval_recall': 0.854, 'eval_runtime': 18.676, 'eval_samples_per_second': 535.448, 'eval_steps_per_second': 8.407, 'epoch': 6.14}


 62%|██████▏   | 970/1560 [55:01<11:04,  1.13s/it]  

{'loss': 0.3016, 'grad_norm': 2.4478445053100586, 'learning_rate': 0.0003133192723336895, 'epoch': 6.21}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.17it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.21it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.02it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.80it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.58it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.52it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.57it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.62it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.57it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.57it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.67it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.72it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.73it/s][A
 20%|█▉        | 31/157 [00:02<00:14,  8.84it/s][A
 21%|██        | 33/157 [00:02<00:12,  9.63it/s][A
 22%|██▏       | 35/157 [00:02<00:11, 10.29it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 10.94it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5175423622131348, 'eval_acc': 0.8473, 'eval_acc2': 0.9302, 'eval_f1': 0.8469072072580033, 'eval_roc_auc_micro': 0.9982265820202021, 'eval_precision': 0.8473, 'eval_recall': 0.8473, 'eval_runtime': 41.1166, 'eval_samples_per_second': 243.211, 'eval_steps_per_second': 3.818, 'epoch': 6.21}


 63%|██████▎   | 980/1560 [55:51<13:25,  1.39s/it]  

{'loss': 0.2798, 'grad_norm': 2.042588233947754, 'learning_rate': 0.0003040166950699625, 'epoch': 6.27}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.90it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.27it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.10it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.09it/s][A
  8%|▊         | 13/157 [00:00<00:11, 13.03it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.67it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.46it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.42it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.52it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.59it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.58it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.29it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.45it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.56it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.54it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.485455185174942, 'eval_acc': 0.8596, 'eval_acc2': 0.9375, 'eval_f1': 0.8594860325792643, 'eval_roc_auc_micro': 0.998395233939394, 'eval_precision': 0.8596, 'eval_recall': 0.8596, 'eval_runtime': 19.8121, 'eval_samples_per_second': 504.741, 'eval_steps_per_second': 7.924, 'epoch': 6.27}


 63%|██████▎   | 990/1560 [56:21<11:22,  1.20s/it]  

{'loss': 0.2833, 'grad_norm': 1.7779461145401, 'learning_rate': 0.0002947935972736217, 'epoch': 6.34}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.17it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.87it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.14it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.62it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.19it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.28it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.18it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.24it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.29it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.41it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.41it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.27it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.15it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.09it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.16it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.30it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.29it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.32it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5008781552314758, 'eval_acc': 0.8531, 'eval_acc2': 0.935, 'eval_f1': 0.8527832782804322, 'eval_roc_auc_micro': 0.9983630983333334, 'eval_precision': 0.8531, 'eval_recall': 0.8531, 'eval_runtime': 21.1142, 'eval_samples_per_second': 473.615, 'eval_steps_per_second': 7.436, 'epoch': 6.34}


 64%|██████▍   | 1000/1560 [56:59<17:50,  1.91s/it] 

{'loss': 0.3063, 'grad_norm': 2.98540997505188, 'learning_rate': 0.00028565371929847286, 'epoch': 6.4}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.78it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.68it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.78it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.20it/s][A
  7%|▋         | 11/157 [00:00<00:14,  9.79it/s][A
  8%|▊         | 13/157 [00:01<00:20,  7.13it/s][A
  9%|▉         | 14/157 [00:01<00:22,  6.43it/s][A
 10%|▉         | 15/157 [00:01<00:24,  5.91it/s][A
 10%|█         | 16/157 [00:02<00:25,  5.51it/s][A
 11%|█         | 17/157 [00:02<00:26,  5.24it/s][A
 11%|█▏        | 18/157 [00:02<00:27,  5.04it/s][A
 12%|█▏        | 19/157 [00:02<00:28,  4.91it/s][A
 13%|█▎        | 20/157 [00:02<00:28,  4.82it/s][A
 13%|█▎        | 21/157 [00:03<00:28,  4.75it/s][A
 14%|█▍        | 22/157 [00:03<00:28,  4.72it/s][A
 15%|█▍        | 23/157 [00:03<00:28,  4.69it/s][A
 15%|█▌        | 24/157 [00:03<00:28,  4.66it/s][A
 16%|█▌        | 25/157 [00:04<00:28,  4.63it/s][A
 17%|█▋        | 26/157 

{'eval_loss': 0.4930467903614044, 'eval_acc': 0.8593, 'eval_acc2': 0.9342, 'eval_f1': 0.8585545371735823, 'eval_roc_auc_micro': 0.9983825003030302, 'eval_precision': 0.8593, 'eval_recall': 0.8593, 'eval_runtime': 37.5996, 'eval_samples_per_second': 265.96, 'eval_steps_per_second': 4.176, 'epoch': 6.4}


 65%|██████▍   | 1010/1560 [57:53<17:45,  1.94s/it]  

{'loss': 0.3136, 'grad_norm': 1.850750207901001, 'learning_rate': 0.0002766007677491871, 'epoch': 6.46}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.64it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.12it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.26it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.79it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.48it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.30it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.15it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.11it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.31it/s][A
 13%|█▎        | 21/157 [00:01<00:16,  8.04it/s][A
 14%|█▍        | 22/157 [00:02<00:17,  7.66it/s][A
 15%|█▍        | 23/157 [00:02<00:17,  7.45it/s][A
 15%|█▌        | 24/157 [00:02<00:18,  7.34it/s][A
 16%|█▌        | 25/157 [00:02<00:17,  7.53it/s][A
 17%|█▋        | 26/157 [00:02<00:17,  7.69it/s][A
 17%|█▋        | 27/157 [00:02<00:16,  7.93it/s][A
 18%|█▊        | 28/157 [00:02<00:15,  8.14it/s][A
 18%|█▊        | 29/157 [00:03<00:15,  8.53it/s][A
 19%|█▉        | 30/157 

{'eval_loss': 0.49303770065307617, 'eval_acc': 0.8542, 'eval_acc2': 0.9338, 'eval_f1': 0.853559089591144, 'eval_roc_auc_micro': 0.9984675241919192, 'eval_precision': 0.8542, 'eval_recall': 0.8542, 'eval_runtime': 16.9156, 'eval_samples_per_second': 591.17, 'eval_steps_per_second': 9.281, 'epoch': 6.46}


 65%|██████▌   | 1020/1560 [58:20<10:39,  1.18s/it]  

{'loss': 0.3054, 'grad_norm': 1.786624789237976, 'learning_rate': 0.00026763841397811573, 'epoch': 6.53}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.29it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.41it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.04it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.28it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.16it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.02it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.06it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.00it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.99it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.07it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.03it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.13it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.12it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.04it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.11it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.04it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.09it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.14it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5006928443908691, 'eval_acc': 0.855, 'eval_acc2': 0.9335, 'eval_f1': 0.8550535839181254, 'eval_roc_auc_micro': 0.9983594951515151, 'eval_precision': 0.855, 'eval_recall': 0.855, 'eval_runtime': 39.5114, 'eval_samples_per_second': 253.091, 'eval_steps_per_second': 3.974, 'epoch': 6.53}


 66%|██████▌   | 1030/1560 [59:08<12:01,  1.36s/it]  

{'loss': 0.2971, 'grad_norm': 2.2288689613342285, 'learning_rate': 0.0002587702925964034, 'epoch': 6.59}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.83it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.26it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.01it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.53it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.27it/s][A
  8%|▊         | 13/157 [00:00<00:11, 13.06it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.77it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.76it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.80it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.77it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.78it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.72it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.64it/s][A
 18%|█▊        | 29/157 [00:02<00:14,  8.93it/s][A
 20%|█▉        | 31/157 [00:02<00:12,  9.87it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.60it/s][A
 22%|██▏       | 35/157 [00:02<00:11, 11.05it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.57it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48954713344573975, 'eval_acc': 0.8553, 'eval_acc2': 0.9363, 'eval_f1': 0.8552865490818098, 'eval_roc_auc_micro': 0.9983961127777778, 'eval_precision': 0.8553, 'eval_recall': 0.8553, 'eval_runtime': 19.4963, 'eval_samples_per_second': 512.917, 'eval_steps_per_second': 8.053, 'epoch': 6.59}


 67%|██████▋   | 1040/1560 [59:38<09:45,  1.13s/it]  

{'loss': 0.277, 'grad_norm': 2.119033098220825, 'learning_rate': 0.0002500000000000001, 'epoch': 6.66}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.01it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.09it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.02it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.06it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.03it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.96it/s][A
 10%|▉         | 15/157 [00:01<00:10, 12.94it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.96it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.86it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.80it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.74it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.64it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.58it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.56it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.52it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.53it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.62it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.64it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48121464252471924, 'eval_acc': 0.8582, 'eval_acc2': 0.9362, 'eval_f1': 0.8574623448546926, 'eval_roc_auc_micro': 0.9984865197474748, 'eval_precision': 0.8582, 'eval_recall': 0.8582, 'eval_runtime': 18.8848, 'eval_samples_per_second': 529.526, 'eval_steps_per_second': 8.314, 'epoch': 6.66}


 67%|██████▋   | 1050/1560 [1:00:06<09:36,  1.13s/it]

{'loss': 0.2907, 'grad_norm': 2.1202805042266846, 'learning_rate': 0.00024133109291117155, 'epoch': 6.72}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.12it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.18it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.08it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.56it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.22it/s][A
  8%|▊         | 13/157 [00:00<00:10, 13.20it/s][A
 10%|▉         | 15/157 [00:01<00:10, 13.10it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.97it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.95it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.91it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.91it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.85it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.78it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.68it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.66it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.79it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.59it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.62it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4890627861022949, 'eval_acc': 0.8564, 'eval_acc2': 0.935, 'eval_f1': 0.8556546548285425, 'eval_roc_auc_micro': 0.9984082351515151, 'eval_precision': 0.8564, 'eval_recall': 0.8564, 'eval_runtime': 15.1765, 'eval_samples_per_second': 658.914, 'eval_steps_per_second': 10.345, 'epoch': 6.72}


 68%|██████▊   | 1060/1560 [1:00:31<08:55,  1.07s/it]

{'loss': 0.2687, 'grad_norm': 3.334587574005127, 'learning_rate': 0.00023276708693609945, 'epoch': 6.78}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:12, 12.58it/s][A
  3%|▎         | 4/157 [00:00<00:12, 12.10it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.17it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.33it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.44it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.59it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.51it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.55it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.34it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.39it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.60it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.67it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.74it/s][A
 18%|█▊        | 28/157 [00:02<00:14,  8.77it/s][A
 19%|█▉        | 30/157 [00:02<00:13,  9.65it/s][A
 20%|██        | 32/157 [00:02<00:12, 10.29it/s][A
 22%|██▏       | 34/157 [00:02<00:11, 10.83it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.23it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.48356035351753235, 'eval_acc': 0.8619, 'eval_acc2': 0.934, 'eval_f1': 0.8613479122875962, 'eval_roc_auc_micro': 0.9984204757070707, 'eval_precision': 0.8619, 'eval_recall': 0.8619, 'eval_runtime': 18.7037, 'eval_samples_per_second': 534.653, 'eval_steps_per_second': 8.394, 'epoch': 6.78}


 69%|██████▊   | 1070/1560 [1:00:59<09:15,  1.13s/it]

{'loss': 0.2659, 'grad_norm': 2.3305985927581787, 'learning_rate': 0.0002243114551391542, 'epoch': 6.85}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.70it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.19it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.08it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.49it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.18it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.90it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.80it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.75it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.75it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.73it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.74it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.73it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.68it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.65it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.56it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.53it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.53it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4868922233581543, 'eval_acc': 0.8583, 'eval_acc2': 0.9354, 'eval_f1': 0.8583406568616496, 'eval_roc_auc_micro': 0.9983998693434344, 'eval_precision': 0.8583, 'eval_recall': 0.8583, 'eval_runtime': 15.4474, 'eval_samples_per_second': 647.358, 'eval_steps_per_second': 10.164, 'epoch': 6.85}


 69%|██████▉   | 1080/1560 [1:01:24<08:56,  1.12s/it]

{'loss': 0.2927, 'grad_norm': 2.7385456562042236, 'learning_rate': 0.00021596762663442215, 'epoch': 6.91}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.99it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.12it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.45it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.60it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.68it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.66it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.67it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.64it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.66it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.52it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.37it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.51it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.55it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.56it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.38it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.50it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.45it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48537519574165344, 'eval_acc': 0.86, 'eval_acc2': 0.9374, 'eval_f1': 0.8601962089001501, 'eval_roc_auc_micro': 0.9984485377272727, 'eval_precision': 0.86, 'eval_recall': 0.86, 'eval_runtime': 18.3831, 'eval_samples_per_second': 543.978, 'eval_steps_per_second': 8.54, 'epoch': 6.91}


 70%|██████▉   | 1090/1560 [1:01:52<08:45,  1.12s/it]

{'loss': 0.2746, 'grad_norm': 2.2893309593200684, 'learning_rate': 0.00020773898519505567, 'epoch': 6.98}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.61it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.57it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.51it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.05it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.95it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.75it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.73it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.70it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.67it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.66it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.61it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.56it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.62it/s][A
 18%|█▊        | 29/157 [00:02<00:14,  8.84it/s][A
 20%|█▉        | 31/157 [00:02<00:12,  9.74it/s][A
 21%|██        | 33/157 [00:02<00:12, 10.21it/s][A
 22%|██▏       | 35/157 [00:02<00:11, 10.81it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.27it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4837382733821869, 'eval_acc': 0.8601, 'eval_acc2': 0.9369, 'eval_f1': 0.8594818889163874, 'eval_roc_auc_micro': 0.9984038479797979, 'eval_precision': 0.8601, 'eval_recall': 0.8601, 'eval_runtime': 15.352, 'eval_samples_per_second': 651.38, 'eval_steps_per_second': 10.227, 'epoch': 6.98}


 71%|███████   | 1100/1560 [1:02:20<09:04,  1.18s/it]

{'loss': 0.2645, 'grad_norm': 3.762627363204956, 'learning_rate': 0.00019962886788101047, 'epoch': 7.04}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 15.84it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.49it/s][A
  4%|▍         | 6/157 [00:00<00:11, 12.90it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.49it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.42it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.31it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.31it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.41it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.37it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.35it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.44it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.45it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.46it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.50it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.50it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.51it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.48it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.52it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4786907136440277, 'eval_acc': 0.8594, 'eval_acc2': 0.9371, 'eval_f1': 0.8590419558479254, 'eval_roc_auc_micro': 0.9984676676767676, 'eval_precision': 0.8594, 'eval_recall': 0.8594, 'eval_runtime': 19.9446, 'eval_samples_per_second': 501.388, 'eval_steps_per_second': 7.872, 'epoch': 7.04}


 71%|███████   | 1110/1560 [1:02:49<08:40,  1.16s/it]

{'loss': 0.2436, 'grad_norm': 1.860051155090332, 'learning_rate': 0.00019164056368572847, 'epoch': 7.1}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.35it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.83it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.79it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.11it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.90it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.75it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.66it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.59it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.57it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.46it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.45it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.49it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.55it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.57it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.58it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.59it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.477934867143631, 'eval_acc': 0.8586, 'eval_acc2': 0.9376, 'eval_f1': 0.8582008187438918, 'eval_roc_auc_micro': 0.9984852283333333, 'eval_precision': 0.8586, 'eval_recall': 0.8586, 'eval_runtime': 39.2351, 'eval_samples_per_second': 254.874, 'eval_steps_per_second': 4.002, 'epoch': 7.1}


 72%|███████▏  | 1120/1560 [1:03:38<10:02,  1.37s/it]  

{'loss': 0.2172, 'grad_norm': 1.8331931829452515, 'learning_rate': 0.0001837773122023114, 'epoch': 7.17}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:17,  8.84it/s][A
  3%|▎         | 4/157 [00:00<00:15, 10.15it/s][A
  4%|▍         | 6/157 [00:00<00:13, 10.82it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.20it/s][A
  6%|▋         | 10/157 [00:00<00:13, 11.24it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.44it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.62it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.67it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.81it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.96it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.94it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.66it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.86it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 11.95it/s][A
 19%|█▉        | 30/157 [00:02<00:15,  8.18it/s][A
 20%|██        | 32/157 [00:03<00:13,  9.14it/s][A
 22%|██▏       | 34/157 [00:03<00:12,  9.90it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.61it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4759122133255005, 'eval_acc': 0.8606, 'eval_acc2': 0.9372, 'eval_f1': 0.8601791180964031, 'eval_roc_auc_micro': 0.9984972775757575, 'eval_precision': 0.8606, 'eval_recall': 0.8606, 'eval_runtime': 16.5244, 'eval_samples_per_second': 605.167, 'eval_steps_per_second': 9.501, 'epoch': 7.17}


 72%|███████▏  | 1130/1560 [1:04:04<08:09,  1.14s/it]

{'loss': 0.2088, 'grad_norm': 3.1077425479888916, 'learning_rate': 0.00017604230230973067, 'epoch': 7.23}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.44it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.39it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.24it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.31it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.15it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.95it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.05it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.96it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.08it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.07it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.07it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.12it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.04it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.04it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.99it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.02it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.06it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.10it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4848586320877075, 'eval_acc': 0.8589, 'eval_acc2': 0.9374, 'eval_f1': 0.8586210342719905, 'eval_roc_auc_micro': 0.9984717671212122, 'eval_precision': 0.8589, 'eval_recall': 0.8589, 'eval_runtime': 20.2572, 'eval_samples_per_second': 493.652, 'eval_steps_per_second': 7.75, 'epoch': 7.23}


 73%|███████▎  | 1140/1560 [1:04:39<12:44,  1.82s/it]

{'loss': 0.2186, 'grad_norm': 2.2180469036102295, 'learning_rate': 0.00016843867087960252, 'epoch': 7.3}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.42it/s][A
  3%|▎         | 5/157 [00:00<00:23,  6.60it/s][A
  4%|▍         | 6/157 [00:00<00:25,  5.90it/s][A
  4%|▍         | 7/157 [00:01<00:26,  5.72it/s][A
  5%|▌         | 8/157 [00:01<00:25,  5.78it/s][A
  6%|▌         | 9/157 [00:01<00:25,  5.84it/s][A
  6%|▋         | 10/157 [00:01<00:24,  6.03it/s][A
  7%|▋         | 11/157 [00:01<00:23,  6.17it/s][A
  8%|▊         | 12/157 [00:01<00:22,  6.32it/s][A
  8%|▊         | 13/157 [00:02<00:22,  6.44it/s][A
  9%|▉         | 14/157 [00:02<00:21,  6.68it/s][A
 10%|▉         | 15/157 [00:02<00:20,  6.85it/s][A
 10%|█         | 16/157 [00:02<00:20,  6.97it/s][A
 11%|█         | 17/157 [00:02<00:19,  7.07it/s][A
 11%|█▏        | 18/157 [00:02<00:19,  7.22it/s][A
 12%|█▏        | 19/157 [00:02<00:19,  7.26it/s][A
 13%|█▎        | 20/157 [00:02<00:18,  7.31it/s][A
 13%|█▎        | 21/157 [00:03<00:18,  7.39it/s][A
 14%|█▍        | 22/157 [0

{'eval_loss': 0.48281434178352356, 'eval_acc': 0.8626, 'eval_acc2': 0.936, 'eval_f1': 0.8621292897042675, 'eval_roc_auc_micro': 0.998483812929293, 'eval_precision': 0.8626, 'eval_recall': 0.8626, 'eval_runtime': 47.8054, 'eval_samples_per_second': 209.182, 'eval_steps_per_second': 3.284, 'epoch': 7.3}


 74%|███████▎  | 1150/1560 [1:05:36<10:26,  1.53s/it]  

{'loss': 0.2575, 'grad_norm': 1.5224308967590332, 'learning_rate': 0.00016096950150405455, 'epoch': 7.36}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.51it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.96it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.91it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.22it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.54it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.34it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.54it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.61it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.58it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.56it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.64it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.68it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.81it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.76it/s][A
 20%|█▉        | 31/157 [00:02<00:16,  7.71it/s][A
 21%|██        | 33/157 [00:03<00:14,  8.70it/s][A
 22%|██▏       | 35/157 [00:03<00:12,  9.44it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.02it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48836904764175415, 'eval_acc': 0.8603, 'eval_acc2': 0.934, 'eval_f1': 0.8599555779215665, 'eval_roc_auc_micro': 0.9984210673232323, 'eval_precision': 0.8603, 'eval_recall': 0.8603, 'eval_runtime': 16.1859, 'eval_samples_per_second': 617.821, 'eval_steps_per_second': 9.7, 'epoch': 7.36}


 74%|███████▍  | 1160/1560 [1:06:04<09:26,  1.42s/it]

{'loss': 0.2258, 'grad_norm': 1.970708966255188, 'learning_rate': 0.00015363782324520031, 'epoch': 7.42}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:12, 12.64it/s][A
  3%|▎         | 4/157 [00:00<00:13, 11.49it/s][A
  4%|▍         | 6/157 [00:00<00:12, 11.62it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.74it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.76it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.41it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.99it/s][A
 10%|█         | 16/157 [00:01<00:12, 10.96it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.00it/s][A
 13%|█▎        | 20/157 [00:01<00:14,  9.28it/s][A
 13%|█▎        | 21/157 [00:02<00:17,  7.77it/s][A
 14%|█▍        | 22/157 [00:02<00:18,  7.14it/s][A
 15%|█▌        | 24/157 [00:02<00:16,  8.24it/s][A
 17%|█▋        | 26/157 [00:02<00:14,  9.19it/s][A
 18%|█▊        | 28/157 [00:02<00:12,  9.93it/s][A
 19%|█▉        | 30/157 [00:02<00:11, 10.59it/s][A
 20%|██        | 32/157 [00:03<00:11, 10.84it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.29it/s][A
 23%|██▎       | 36/157 

{'eval_loss': 0.4829813539981842, 'eval_acc': 0.8614, 'eval_acc2': 0.9366, 'eval_f1': 0.8611465731195677, 'eval_roc_auc_micro': 0.9984473368181819, 'eval_precision': 0.8614, 'eval_recall': 0.8614, 'eval_runtime': 17.396, 'eval_samples_per_second': 574.845, 'eval_steps_per_second': 9.025, 'epoch': 7.42}


 75%|███████▌  | 1170/1560 [1:06:32<08:01,  1.23s/it]

{'loss': 0.2119, 'grad_norm': 2.1769118309020996, 'learning_rate': 0.00014644660940672628, 'epoch': 7.49}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.10it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.98it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.80it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.95it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.88it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.88it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.84it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.67it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.58it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.49it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.46it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.60it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.63it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.70it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.83it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.88it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.68it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.85it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.47288280725479126, 'eval_acc': 0.8635, 'eval_acc2': 0.9384, 'eval_f1': 0.862985101922736, 'eval_roc_auc_micro': 0.9984887479292929, 'eval_precision': 0.8635, 'eval_recall': 0.8635, 'eval_runtime': 40.8971, 'eval_samples_per_second': 244.516, 'eval_steps_per_second': 3.839, 'epoch': 7.49}


 76%|███████▌  | 1180/1560 [1:07:23<08:46,  1.39s/it]  

{'loss': 0.2722, 'grad_norm': 3.90692138671875, 'learning_rate': 0.0001393987763280928, 'epoch': 7.55}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:23,  6.54it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.59it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.85it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.70it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.02it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.27it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.40it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.57it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.63it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.56it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.79it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.94it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.03it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.08it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.19it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.29it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.33it/s][A
 24%|██▎       | 37/157 [00:03<00:14,  8.12it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4797546863555908, 'eval_acc': 0.8602, 'eval_acc2': 0.9372, 'eval_f1': 0.8596234054466673, 'eval_roc_auc_micro': 0.9984604366666667, 'eval_precision': 0.8602, 'eval_recall': 0.8602, 'eval_runtime': 18.5053, 'eval_samples_per_second': 540.385, 'eval_steps_per_second': 8.484, 'epoch': 7.55}


 76%|███████▋  | 1190/1560 [1:07:51<06:56,  1.13s/it]

{'loss': 0.2398, 'grad_norm': 2.2049684524536133, 'learning_rate': 0.00013249718220183582, 'epoch': 7.62}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.02it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.93it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.36it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.12it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.98it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.87it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.82it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.70it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.73it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.72it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.64it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.57it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.61it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.62it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.69it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.70it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.62it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48523056507110596, 'eval_acc': 0.8574, 'eval_acc2': 0.9354, 'eval_f1': 0.8572825708697391, 'eval_roc_auc_micro': 0.998410665909091, 'eval_precision': 0.8574, 'eval_recall': 0.8574, 'eval_runtime': 16.3561, 'eval_samples_per_second': 611.391, 'eval_steps_per_second': 9.599, 'epoch': 7.62}


 77%|███████▋  | 1200/1560 [1:08:17<06:55,  1.15s/it]

{'loss': 0.2353, 'grad_norm': 1.9910014867782593, 'learning_rate': 0.0001257446259144494, 'epoch': 7.68}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.11it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.31it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.13it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.45it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.97it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.83it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.77it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.66it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.68it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.68it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.66it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.41it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.45it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.52it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.44it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.54it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.63it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.63it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4702223539352417, 'eval_acc': 0.8654, 'eval_acc2': 0.9389, 'eval_f1': 0.8656121152465978, 'eval_roc_auc_micro': 0.9985418802525252, 'eval_precision': 0.8654, 'eval_recall': 0.8654, 'eval_runtime': 15.4003, 'eval_samples_per_second': 649.336, 'eval_steps_per_second': 10.195, 'epoch': 7.68}


 78%|███████▊  | 1210/1560 [1:08:42<06:21,  1.09s/it]

{'loss': 0.2218, 'grad_norm': 2.2945919036865234, 'learning_rate': 0.00011914384591132044, 'epoch': 7.74}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.53it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.46it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.17it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.62it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.30it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.11it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.11it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.10it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.08it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.10it/s][A
 15%|█▍        | 23/157 [00:02<00:17,  7.79it/s][A
 16%|█▌        | 25/157 [00:02<00:14,  8.81it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.68it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.18it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.71it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.46it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.99it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.40it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.46449506282806396, 'eval_acc': 0.8676, 'eval_acc2': 0.9393, 'eval_f1': 0.8673946219864447, 'eval_roc_auc_micro': 0.9985706744444445, 'eval_precision': 0.8676, 'eval_recall': 0.8676, 'eval_runtime': 44.7671, 'eval_samples_per_second': 223.378, 'eval_steps_per_second': 3.507, 'epoch': 7.74}


 78%|███████▊  | 1220/1560 [1:09:36<08:09,  1.44s/it]  

{'loss': 0.2357, 'grad_norm': 2.2795426845550537, 'learning_rate': 0.00011269751908617276, 'epoch': 7.81}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.03it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.03it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.35it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.12it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.98it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.89it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.77it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.73it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.71it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.59it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.52it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.59it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.68it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.67it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.73it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.72it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.72it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4669131338596344, 'eval_acc': 0.8675, 'eval_acc2': 0.939, 'eval_f1': 0.8669707380511993, 'eval_roc_auc_micro': 0.9985514539393939, 'eval_precision': 0.8675, 'eval_recall': 0.8675, 'eval_runtime': 37.7677, 'eval_samples_per_second': 264.777, 'eval_steps_per_second': 4.157, 'epoch': 7.81}


 79%|███████▉  | 1230/1560 [1:10:24<07:45,  1.41s/it]  

{'loss': 0.2083, 'grad_norm': 1.936719536781311, 'learning_rate': 0.00010640825969547497, 'epoch': 7.87}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:14, 10.83it/s][A
  3%|▎         | 5/157 [00:00<00:13, 11.57it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.83it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.83it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.69it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.34it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.70it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.84it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.97it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.99it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.10it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.14it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.25it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.20it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.16it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.14it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.01it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.04it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.46964871883392334, 'eval_acc': 0.8665, 'eval_acc2': 0.9389, 'eval_f1': 0.8661308174005169, 'eval_roc_auc_micro': 0.9985619280303031, 'eval_precision': 0.8665, 'eval_recall': 0.8665, 'eval_runtime': 19.897, 'eval_samples_per_second': 502.589, 'eval_steps_per_second': 7.891, 'epoch': 7.87}


 79%|███████▉  | 1240/1560 [1:10:55<07:02,  1.32s/it]

{'loss': 0.2004, 'grad_norm': 1.2449294328689575, 'learning_rate': 0.00010027861829824952, 'epoch': 7.94}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 17.07it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.12it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.00it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.73it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.48it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.20it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.43it/s][A
 11%|█         | 17/157 [00:01<00:16,  8.39it/s][A
 11%|█▏        | 18/157 [00:01<00:17,  8.00it/s][A
 12%|█▏        | 19/157 [00:01<00:17,  7.84it/s][A
 13%|█▎        | 20/157 [00:02<00:17,  7.89it/s][A
 13%|█▎        | 21/157 [00:02<00:17,  7.90it/s][A
 14%|█▍        | 22/157 [00:02<00:16,  8.07it/s][A
 15%|█▍        | 23/157 [00:02<00:16,  8.20it/s][A
 15%|█▌        | 24/157 [00:02<00:15,  8.49it/s][A
 16%|█▌        | 25/157 [00:02<00:15,  8.76it/s][A
 17%|█▋        | 26/157 [00:02<00:14,  8.99it/s][A
 17%|█▋        | 27/157 [00:03<00:24,  5.27it/s][A
 18%|█▊        | 29/157 

{'eval_loss': 0.4790729582309723, 'eval_acc': 0.8622, 'eval_acc2': 0.939, 'eval_f1': 0.8618776977315948, 'eval_roc_auc_micro': 0.9985381508585858, 'eval_precision': 0.8622, 'eval_recall': 0.8622, 'eval_runtime': 19.5088, 'eval_samples_per_second': 512.59, 'eval_steps_per_second': 8.048, 'epoch': 7.94}


 80%|████████  | 1250/1560 [1:11:32<10:36,  2.05s/it]

{'loss': 0.1995, 'grad_norm': 2.2531540393829346, 'learning_rate': 9.431108072171346e-05, 'epoch': 8.0}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 18.97it/s][A
  3%|▎         | 4/157 [00:00<00:31,  4.85it/s][A
  3%|▎         | 5/157 [00:00<00:33,  4.61it/s][A
  4%|▍         | 6/157 [00:01<00:34,  4.33it/s][A
  4%|▍         | 7/157 [00:01<00:36,  4.15it/s][A
  5%|▌         | 8/157 [00:01<00:35,  4.15it/s][A
  6%|▌         | 9/157 [00:01<00:36,  4.10it/s][A
  6%|▋         | 10/157 [00:02<00:36,  4.00it/s][A
  7%|▋         | 11/157 [00:02<00:37,  3.93it/s][A
  8%|▊         | 12/157 [00:02<00:36,  3.95it/s][A
  8%|▊         | 13/157 [00:03<00:36,  3.91it/s][A
  9%|▉         | 14/157 [00:03<00:36,  3.87it/s][A
 10%|▉         | 15/157 [00:03<00:36,  3.86it/s][A
 10%|█         | 16/157 [00:03<00:35,  3.95it/s][A
 11%|█         | 17/157 [00:04<00:35,  3.98it/s][A
 11%|█▏        | 18/157 [00:04<00:35,  3.94it/s][A
 12%|█▏        | 19/157 [00:04<00:35,  3.91it/s][A
 13%|█▎        | 20/157 [00:04<00:35,  3.87it/s][A
 13%|█▎        | 21/157 [00

{'eval_loss': 0.4749569892883301, 'eval_acc': 0.8624, 'eval_acc2': 0.9379, 'eval_f1': 0.862255503371108, 'eval_roc_auc_micro': 0.9985767232828283, 'eval_precision': 0.8624, 'eval_recall': 0.8624, 'eval_runtime': 41.098, 'eval_samples_per_second': 243.321, 'eval_steps_per_second': 3.82, 'epoch': 8.0}


 81%|████████  | 1260/1560 [1:12:29<08:52,  1.78s/it]  

{'loss': 0.1755, 'grad_norm': 1.1377122402191162, 'learning_rate': 8.850806705317183e-05, 'epoch': 8.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:20,  7.66it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.28it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.98it/s][A
  6%|▌         | 9/157 [00:00<00:14, 10.31it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.80it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.22it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.42it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.62it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.56it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.67it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.83it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.02it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.94it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.31it/s][A
 20%|█▉        | 31/157 [00:02<00:13,  9.48it/s][A
 20%|██        | 32/157 [00:03<00:13,  9.28it/s][A
 21%|██        | 33/157 [00:03<00:13,  9.25it/s][A
 22%|██▏       | 34/157 [00:03<00:13,  9.19it/s][A
 22%|██▏       | 35/157 

{'eval_loss': 0.4691101610660553, 'eval_acc': 0.8645, 'eval_acc2': 0.9376, 'eval_f1': 0.8641798599769991, 'eval_roc_auc_micro': 0.9986076764141414, 'eval_precision': 0.8645, 'eval_recall': 0.8645, 'eval_runtime': 44.08, 'eval_samples_per_second': 226.86, 'eval_steps_per_second': 3.562, 'epoch': 8.06}


 81%|████████▏ | 1270/1560 [1:13:23<07:06,  1.47s/it]  

{'loss': 0.2029, 'grad_norm': 3.1754403114318848, 'learning_rate': 8.287193065856935e-05, 'epoch': 8.13}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 15.93it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.21it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.34it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.26it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.24it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.31it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.31it/s][A
 10%|█         | 16/157 [00:01<00:18,  7.77it/s][A
 11%|█▏        | 18/157 [00:01<00:16,  8.65it/s][A
 13%|█▎        | 20/157 [00:01<00:14,  9.57it/s][A
 14%|█▍        | 22/157 [00:02<00:13, 10.22it/s][A
 15%|█▌        | 24/157 [00:02<00:12, 10.70it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.10it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.39it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.66it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.87it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.93it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.88it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4736596643924713, 'eval_acc': 0.8614, 'eval_acc2': 0.9382, 'eval_f1': 0.8610745567219074, 'eval_roc_auc_micro': 0.998586263989899, 'eval_precision': 0.8614, 'eval_recall': 0.8614, 'eval_runtime': 16.2129, 'eval_samples_per_second': 616.794, 'eval_steps_per_second': 9.684, 'epoch': 8.13}


 82%|████████▏ | 1280/1560 [1:13:49<05:13,  1.12s/it]

{'loss': 0.1874, 'grad_norm': 2.4422519207000732, 'learning_rate': 7.74049572281027e-05, 'epoch': 8.19}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.86it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.31it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.08it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.36it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.01it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.85it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.82it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.80it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.74it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.62it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.60it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.60it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.60it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.55it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.60it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.48it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.48it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.48it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.47911375761032104, 'eval_acc': 0.8616, 'eval_acc2': 0.937, 'eval_f1': 0.8615855727151492, 'eval_roc_auc_micro': 0.9985396816666667, 'eval_precision': 0.8616, 'eval_recall': 0.8616, 'eval_runtime': 19.0696, 'eval_samples_per_second': 524.396, 'eval_steps_per_second': 8.233, 'epoch': 8.19}


 83%|████████▎ | 1290/1560 [1:14:18<05:06,  1.14s/it]

{'loss': 0.1861, 'grad_norm': 1.8556535243988037, 'learning_rate': 7.21093638492763e-05, 'epoch': 8.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.60it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.71it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.19it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.95it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.83it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.58it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.55it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.62it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.62it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.61it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.61it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.62it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.60it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.62it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.61it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.58it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.63it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4686000943183899, 'eval_acc': 0.8645, 'eval_acc2': 0.9393, 'eval_f1': 0.8644984696955864, 'eval_roc_auc_micro': 0.9985850404040404, 'eval_precision': 0.8645, 'eval_recall': 0.8645, 'eval_runtime': 15.3858, 'eval_samples_per_second': 649.951, 'eval_steps_per_second': 10.204, 'epoch': 8.26}


 83%|████████▎ | 1300/1560 [1:14:43<04:41,  1.08s/it]

{'loss': 0.1949, 'grad_norm': 1.428512454032898, 'learning_rate': 6.698729810778065e-05, 'epoch': 8.32}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.91it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.25it/s][A
  4%|▍         | 7/157 [00:00<00:14, 10.37it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.61it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.02it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.28it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.40it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.55it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.52it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.70it/s][A
 15%|█▍        | 23/157 [00:02<00:17,  7.46it/s][A
 16%|█▌        | 25/157 [00:02<00:15,  8.49it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.38it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.11it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.66it/s][A
 21%|██        | 33/157 [00:03<00:11, 11.00it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.39it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.73it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4627034664154053, 'eval_acc': 0.8664, 'eval_acc2': 0.9413, 'eval_f1': 0.8664243709039018, 'eval_roc_auc_micro': 0.9986084134343434, 'eval_precision': 0.8664, 'eval_recall': 0.8664, 'eval_runtime': 40.8766, 'eval_samples_per_second': 244.639, 'eval_steps_per_second': 3.841, 'epoch': 8.32}


 84%|████████▍ | 1310/1560 [1:15:33<05:45,  1.38s/it]

{'loss': 0.1854, 'grad_norm': 2.464507818222046, 'learning_rate': 6.204083721655607e-05, 'epoch': 8.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.95it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.16it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.45it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.57it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.21it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.98it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.06it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.11it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.72it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.85it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.04it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.19it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.28it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.34it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.51it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.60it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.55it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.44it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4637092649936676, 'eval_acc': 0.8656, 'eval_acc2': 0.9423, 'eval_f1': 0.8654300095660301, 'eval_roc_auc_micro': 0.9985918465151515, 'eval_precision': 0.8656, 'eval_recall': 0.8656, 'eval_runtime': 41.8344, 'eval_samples_per_second': 239.038, 'eval_steps_per_second': 3.753, 'epoch': 8.38}


 84%|████████▍ | 1310/1560 [1:16:16<14:33,  3.49s/it]


{'train_runtime': 4576.3742, 'train_samples_per_second': 87.405, 'train_steps_per_second': 0.341, 'train_loss': 0.703006292117461, 'epoch': 8.38}


100%|██████████| 157/157 [00:13<00:00, 12.00it/s]
wandb:                                                                                
wandb: 
wandb: Run history:
wandb:                eval/acc ▁▃▄▆▇▇▇▇▇▇▇▇▇▇▇██▇██████████████████████
wandb:               eval/acc2 ▁▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇███████████████████████
wandb:                 eval/f1 ▁▅▅▆▆▆▇▆▇▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇█▇███▇██████████
wandb:               eval/loss █▆▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
wandb:          eval/precision ▁▃▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇████████████████████
wandb:             eval/recall ▁▃▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
wandb:      eval/roc_auc_micro ▁▄▄▆▇▇▇▇▇��▇▇█▇▇█████████████████████████
wandb:            eval/runtime ▁▁▁▁▄▃▂▇▆▂▂▇▅▇▂▃▂▁▆█▂▂▇▁▁▁▁▁▁▁▂▂▆▂▂▇▁▂▁▇
wandb: eval/samples_per_second ▇▇▇▇▇▇▇▇▁▇▆▁▂▆▂▇█▇▇▁▅▇████▇█▆▆▂▇█▇▅▂▅▆▂▂
wandb:   eval/steps_per_second ▇▇▇█▄▇█▁█▁▆█▂▆▂▅▂▅▅▂▁▄▆████▇█▂▅▂▆▅▂▅▇▇▆▂
wandb:                test/acc ▁
wandb:               test/acc2 ▁
wandb:                 test/f1 ▁
wandb:         

{'loss': 4.4797, 'grad_norm': 1.358915090560913, 'learning_rate': 0.0009998986144924252, 'epoch': 0.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:42,  3.67it/s][A
  3%|▎         | 4/157 [00:00<00:26,  5.81it/s][A
  3%|▎         | 5/157 [00:00<00:24,  6.19it/s][A
  4%|▍         | 6/157 [00:01<00:25,  6.01it/s][A
  5%|▌         | 8/157 [00:01<00:21,  7.00it/s][A
  6%|▌         | 9/157 [00:01<00:22,  6.55it/s][A
  6%|▋         | 10/157 [00:01<00:22,  6.47it/s][A
  7%|▋         | 11/157 [00:01<00:22,  6.56it/s][A
  8%|▊         | 12/157 [00:01<00:22,  6.40it/s][A
  8%|▊         | 13/157 [00:02<00:20,  6.91it/s][A
  9%|▉         | 14/157 [00:02<00:21,  6.60it/s][A
 10%|▉         | 15/157 [00:02<00:23,  6.10it/s][A
 10%|█         | 16/157 [00:02<00:22,  6.39it/s][A
 11%|█         | 17/157 [00:02<00:20,  6.73it/s][A
 11%|█▏        | 18/157 [00:02<00:19,  7.31it/s][A
 12%|█▏        | 19/157 [00:02<00:19,  6.92it/s][A
 13%|█▎        | 20/157 [00:03<00:18,  7.23it/s][A
 13%|█▎        | 21/157 [00:03<00:18,  7.45it/s][A
 15%|█▍        | 23/157 [0

{'eval_loss': 4.072371006011963, 'eval_acc': 0.1947, 'eval_acc2': 0.2859, 'eval_f1': 0.15336226415129067, 'eval_roc_auc_micro': 0.8563114183838383, 'eval_precision': 0.1947, 'eval_recall': 0.1947, 'eval_runtime': 17.1111, 'eval_samples_per_second': 584.417, 'eval_steps_per_second': 9.175, 'epoch': 0.06}


  1%|▏         | 20/1560 [00:39<28:21,  1.10s/it]  

{'loss': 3.6906, 'grad_norm': 1.8618786334991455, 'learning_rate': 0.0009995944990857848, 'epoch': 0.13}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.49it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.71it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.39it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.02it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.79it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.72it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.61it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.61it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.57it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.64it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.59it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.31it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.17it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.15it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.27it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.02it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.08it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 3.024113655090332, 'eval_acc': 0.3551, 'eval_acc2': 0.5061, 'eval_f1': 0.30743155895252394, 'eval_roc_auc_micro': 0.9528105622222222, 'eval_precision': 0.3551, 'eval_recall': 0.3551, 'eval_runtime': 15.2293, 'eval_samples_per_second': 656.63, 'eval_steps_per_second': 10.309, 'epoch': 0.13}


  2%|▏         | 30/1560 [01:06<34:13,  1.34s/it]  

{'loss': 2.8391, 'grad_norm': 2.8236098289489746, 'learning_rate': 0.0009990877771116587, 'epoch': 0.19}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 17.05it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.91it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.11it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.86it/s][A
  7%|▋         | 11/157 [00:01<00:17,  8.55it/s][A
  8%|▊         | 13/157 [00:01<00:15,  9.42it/s][A
 10%|▉         | 15/157 [00:01<00:14, 10.09it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.50it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.92it/s][A
 13%|█▎        | 21/157 [00:02<00:15,  8.77it/s][A
 14%|█▍        | 22/157 [00:02<00:16,  8.19it/s][A
 15%|█▍        | 23/157 [00:02<00:17,  7.88it/s][A
 15%|█▌        | 24/157 [00:02<00:16,  7.84it/s][A
 16%|█▌        | 25/157 [00:02<00:16,  7.82it/s][A
 17%|█▋        | 26/157 [00:02<00:16,  7.94it/s][A
 17%|█▋        | 27/157 [00:02<00:15,  8.17it/s][A
 18%|█▊        | 28/157 [00:02<00:15,  8.29it/s][A
 18%|█▊        | 29/157 [00:03<00:14,  8.56it/s][A
 19%|█▉        | 30/157 

{'eval_loss': 2.2331361770629883, 'eval_acc': 0.4995, 'eval_acc2': 0.6538, 'eval_f1': 0.458698991421317, 'eval_roc_auc_micro': 0.9773474327272728, 'eval_precision': 0.4995, 'eval_recall': 0.4995, 'eval_runtime': 19.1847, 'eval_samples_per_second': 521.25, 'eval_steps_per_second': 8.184, 'epoch': 0.19}


  3%|▎         | 40/1560 [01:45<1:00:44,  2.40s/it]

{'loss': 2.3324, 'grad_norm': 3.9403672218322754, 'learning_rate': 0.000998378654067105, 'epoch': 0.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:38,  4.01it/s][A
  2%|▏         | 3/157 [00:00<00:32,  4.74it/s][A
  3%|▎         | 4/157 [00:00<00:28,  5.28it/s][A
  3%|▎         | 5/157 [00:00<00:27,  5.60it/s][A
  4%|▍         | 6/157 [00:01<00:25,  5.87it/s][A
  4%|▍         | 7/157 [00:01<00:24,  6.01it/s][A
  5%|▌         | 8/157 [00:01<00:24,  6.19it/s][A
  6%|▌         | 9/157 [00:01<00:23,  6.25it/s][A
  6%|▋         | 10/157 [00:01<00:23,  6.32it/s][A
  7%|▋         | 11/157 [00:01<00:23,  6.21it/s][A
  8%|▊         | 12/157 [00:02<00:26,  5.50it/s][A
  8%|▊         | 13/157 [00:02<00:28,  5.14it/s][A
  9%|▉         | 14/157 [00:02<00:29,  4.90it/s][A
 10%|▉         | 15/157 [00:02<00:30,  4.72it/s][A
 10%|█         | 16/157 [00:03<00:30,  4.63it/s][A
 11%|█         | 17/157 [00:03<00:30,  4.55it/s][A
 11%|█▏        | 18/157 [00:03<00:31,  4.48it/s][A
 12%|█▏        | 19/157 [00:03<00:30,  4.46it/s][A
 13%|█▎        | 20/157 [00:

{'eval_loss': 1.7292789220809937, 'eval_acc': 0.5881, 'eval_acc2': 0.7387, 'eval_f1': 0.5540935470474176, 'eval_roc_auc_micro': 0.9863099011111112, 'eval_precision': 0.5881, 'eval_recall': 0.5881, 'eval_runtime': 33.1704, 'eval_samples_per_second': 301.474, 'eval_steps_per_second': 4.733, 'epoch': 0.26}


  3%|▎         | 50/1560 [02:31<39:34,  1.57s/it]  

{'loss': 1.977, 'grad_norm': 3.984400510787964, 'learning_rate': 0.0009974674175313228, 'epoch': 0.32}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 17.07it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.91it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.26it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.96it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.51it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.19it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.16it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.12it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.11it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.12it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.16it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.06it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.11it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.99it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.97it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.84it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.66it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.67it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.4255542755126953, 'eval_acc': 0.6463, 'eval_acc2': 0.7966, 'eval_f1': 0.6234751180257938, 'eval_roc_auc_micro': 0.9901875076262625, 'eval_precision': 0.6463, 'eval_recall': 0.6463, 'eval_runtime': 16.1631, 'eval_samples_per_second': 618.692, 'eval_steps_per_second': 9.713, 'epoch': 0.32}


  4%|▍         | 60/1560 [02:57<28:47,  1.15s/it]  

{'loss': 1.7409, 'grad_norm': 6.387554168701172, 'learning_rate': 0.000996354437049027, 'epoch': 0.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.30it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.59it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.60it/s][A
  6%|▌         | 9/157 [00:00<00:17,  8.60it/s][A
  7%|▋         | 11/157 [00:01<00:15,  9.59it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.29it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.85it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.32it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.65it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.78it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.84it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.01it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.18it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.21it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.12it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.16it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.92it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.90it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.2488033771514893, 'eval_acc': 0.6743, 'eval_acc2': 0.8169, 'eval_f1': 0.6612073628233753, 'eval_roc_auc_micro': 0.9917589689898989, 'eval_precision': 0.6743, 'eval_recall': 0.6743, 'eval_runtime': 15.5679, 'eval_samples_per_second': 642.348, 'eval_steps_per_second': 10.085, 'epoch': 0.38}


  4%|▍         | 70/1560 [03:23<27:22,  1.10s/it]  

{'loss': 1.6353, 'grad_norm': 3.6382622718811035, 'learning_rate': 0.0009950401639805821, 'epoch': 0.45}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.23it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.05it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.31it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.22it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.23it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.29it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.32it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.50it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.52it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.45it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.39it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.09it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.19it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.25it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.23it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.14it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.01it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.92it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 1.068459153175354, 'eval_acc': 0.7182, 'eval_acc2': 0.8434, 'eval_f1': 0.7094876134754727, 'eval_roc_auc_micro': 0.9936614864646465, 'eval_precision': 0.7182, 'eval_recall': 0.7182, 'eval_runtime': 14.8397, 'eval_samples_per_second': 673.87, 'eval_steps_per_second': 10.58, 'epoch': 0.45}


  5%|▌         | 80/1560 [03:47<26:35,  1.08s/it]  

{'loss': 1.4796, 'grad_norm': 3.1023545265197754, 'learning_rate': 0.0009935251313189565, 'epoch': 0.51}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:11, 13.23it/s][A
  3%|▎         | 4/157 [00:00<00:13, 10.96it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.60it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.96it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.89it/s][A
  8%|▊         | 12/157 [00:01<00:11, 12.15it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.35it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.46it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.19it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.27it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.42it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.44it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.47it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.27it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.24it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.32it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.34it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.43it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.9875775575637817, 'eval_acc': 0.7213, 'eval_acc2': 0.8546, 'eval_f1': 0.7136377949251196, 'eval_roc_auc_micro': 0.9944855067676767, 'eval_precision': 0.7213, 'eval_recall': 0.7213, 'eval_runtime': 14.8944, 'eval_samples_per_second': 671.393, 'eval_steps_per_second': 10.541, 'epoch': 0.51}


  6%|▌         | 90/1560 [04:12<26:20,  1.08s/it]  

{'loss': 1.4857, 'grad_norm': 3.1386730670928955, 'learning_rate': 0.0009918099534735718, 'epoch': 0.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.01it/s][A
  3%|▎         | 5/157 [00:00<00:18,  8.03it/s][A
  4%|▍         | 7/157 [00:00<00:16,  9.37it/s][A
  6%|▌         | 9/157 [00:00<00:14, 10.36it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.04it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.52it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.88it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.12it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.33it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.34it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.41it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.46it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.51it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.61it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.43it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.45it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.37it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.40it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.9785988330841064, 'eval_acc': 0.7269, 'eval_acc2': 0.8542, 'eval_f1': 0.7185760582666466, 'eval_roc_auc_micro': 0.9944544946969697, 'eval_precision': 0.7269, 'eval_recall': 0.7269, 'eval_runtime': 15.175, 'eval_samples_per_second': 658.978, 'eval_steps_per_second': 10.346, 'epoch': 0.58}


  6%|▋         | 100/1560 [04:36<26:03,  1.07s/it] 

{'loss': 1.4258, 'grad_norm': 4.667105674743652, 'learning_rate': 0.0009898953260211339, 'epoch': 0.64}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.64it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.20it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.36it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.05it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.91it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.78it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.77it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.69it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.71it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.65it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.69it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.63it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.56it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.59it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.50it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.46it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.23it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.23it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.9508662223815918, 'eval_acc': 0.7283, 'eval_acc2': 0.8523, 'eval_f1': 0.7236313081460245, 'eval_roc_auc_micro': 0.994489908989899, 'eval_precision': 0.7283, 'eval_recall': 0.7283, 'eval_runtime': 15.1831, 'eval_samples_per_second': 658.629, 'eval_steps_per_second': 10.34, 'epoch': 0.64}


  7%|▋         | 110/1560 [05:01<26:05,  1.08s/it]  

{'loss': 1.4155, 'grad_norm': 3.7776384353637695, 'learning_rate': 0.000987782025423547, 'epoch': 0.7}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:16,  9.52it/s][A
  2%|▏         | 3/157 [00:00<00:24,  6.19it/s][A
  3%|▎         | 4/157 [00:00<00:21,  7.10it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.15it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.38it/s][A
  6%|▋         | 10/157 [00:01<00:13, 11.05it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.46it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.86it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.94it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.10it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.24it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.40it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.55it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.61it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.64it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.38it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.44it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.48it/s][A
 23%|██▎       | 36/157 [

{'eval_loss': 0.9124379754066467, 'eval_acc': 0.7367, 'eval_acc2': 0.8616, 'eval_f1': 0.7286148422372697, 'eval_roc_auc_micro': 0.9949887621212121, 'eval_precision': 0.7367, 'eval_recall': 0.7367, 'eval_runtime': 14.742, 'eval_samples_per_second': 678.336, 'eval_steps_per_second': 10.65, 'epoch': 0.7}


  8%|▊         | 120/1560 [05:25<25:45,  1.07s/it]  

{'loss': 1.3744, 'grad_norm': 3.487565517425537, 'learning_rate': 0.000985470908713026, 'epoch': 0.77}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.80it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.58it/s][A
  4%|▍         | 7/157 [00:00<00:18,  8.05it/s][A
  6%|▌         | 9/157 [00:00<00:15,  9.30it/s][A
  7%|▋         | 11/157 [00:01<00:14, 10.15it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.87it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.41it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.76it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.04it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.21it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.34it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.47it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.37it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.49it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.53it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.55it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.52it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.57it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8686138391494751, 'eval_acc': 0.7508, 'eval_acc2': 0.8636, 'eval_f1': 0.7462749505954642, 'eval_roc_auc_micro': 0.9956287942424243, 'eval_precision': 0.7508, 'eval_recall': 0.7508, 'eval_runtime': 15.2921, 'eval_samples_per_second': 653.934, 'eval_steps_per_second': 10.267, 'epoch': 0.77}


  8%|▊         | 130/1560 [05:50<25:24,  1.07s/it]  

{'loss': 1.3252, 'grad_norm': 3.9499244689941406, 'learning_rate': 0.0009829629131445341, 'epoch': 0.83}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.34it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.91it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.71it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.22it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.98it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.82it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.66it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.56it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.58it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.61it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.54it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.54it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.51it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.30it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.12it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.29it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.17it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.9076951146125793, 'eval_acc': 0.7383, 'eval_acc2': 0.8573, 'eval_f1': 0.7335889959537344, 'eval_roc_auc_micro': 0.9949838735858586, 'eval_precision': 0.7383, 'eval_recall': 0.7383, 'eval_runtime': 15.0721, 'eval_samples_per_second': 663.478, 'eval_steps_per_second': 10.417, 'epoch': 0.83}


  9%|▉         | 140/1560 [06:14<25:32,  1.08s/it]  

{'loss': 1.3467, 'grad_norm': 3.2487339973449707, 'learning_rate': 0.000980259055815686, 'epoch': 0.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.40it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.10it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.99it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.37it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.15it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.90it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.83it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.84it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.86it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.87it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.88it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.89it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.88it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.62it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.65it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.62it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.60it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.66it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8821486234664917, 'eval_acc': 0.7434, 'eval_acc2': 0.865, 'eval_f1': 0.7365217813724098, 'eval_roc_auc_micro': 0.9953926366666667, 'eval_precision': 0.7434, 'eval_recall': 0.7434, 'eval_runtime': 15.0376, 'eval_samples_per_second': 665.002, 'eval_steps_per_second': 10.441, 'epoch': 0.9}


 10%|▉         | 150/1560 [06:39<24:59,  1.06s/it]  

{'loss': 1.3028, 'grad_norm': 5.9649434089660645, 'learning_rate': 0.0009773604332542728, 'epoch': 0.96}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.83it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.12it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.98it/s][A
  6%|▌         | 9/157 [00:00<00:18,  7.85it/s][A
  7%|▋         | 11/157 [00:01<00:16,  9.02it/s][A
  8%|▊         | 13/157 [00:01<00:14,  9.92it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.61it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.12it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.52it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.82it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.05it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.25it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.86it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.89it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.74it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.18it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.87it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.49it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8765919208526611, 'eval_acc': 0.7459, 'eval_acc2': 0.8657, 'eval_f1': 0.7427277856284387, 'eval_roc_auc_micro': 0.9950803591919192, 'eval_precision': 0.7459, 'eval_recall': 0.7459, 'eval_runtime': 18.2143, 'eval_samples_per_second': 549.018, 'eval_steps_per_second': 8.62, 'epoch': 0.96}


 10%|█         | 160/1560 [07:09<32:25,  1.39s/it]  

{'loss': 1.1866, 'grad_norm': 8.126376152038574, 'learning_rate': 0.0009742682209735727, 'epoch': 1.02}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.27it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.89it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.62it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.87it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.19it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.19it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.25it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.36it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.33it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.48it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.47it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.53it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.46it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.56it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.57it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.48it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8486950993537903, 'eval_acc': 0.76, 'eval_acc2': 0.873, 'eval_f1': 0.7568541041785772, 'eval_roc_auc_micro': 0.9954791746969698, 'eval_precision': 0.76, 'eval_recall': 0.76, 'eval_runtime': 15.29, 'eval_samples_per_second': 654.021, 'eval_steps_per_second': 10.268, 'epoch': 1.02}


 11%|█         | 170/1560 [07:34<25:14,  1.09s/it]  

{'loss': 1.1037, 'grad_norm': 4.1659722328186035, 'learning_rate': 0.0009709836729956326, 'epoch': 1.09}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.56it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.23it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.91it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.75it/s][A
  7%|▋         | 11/157 [00:00<00:13, 11.18it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.88it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.73it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.64it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.58it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.71it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.79it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.78it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 10.97it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.04it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.97it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.83it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.85it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.85it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.8307854533195496, 'eval_acc': 0.7569, 'eval_acc2': 0.8701, 'eval_f1': 0.7512208391574007, 'eval_roc_auc_micro': 0.9955318677272728, 'eval_precision': 0.7569, 'eval_recall': 0.7569, 'eval_runtime': 17.7086, 'eval_samples_per_second': 564.699, 'eval_steps_per_second': 8.866, 'epoch': 1.09}


 12%|█▏        | 180/1560 [08:03<26:00,  1.13s/it]  

{'loss': 1.0821, 'grad_norm': 3.0783464908599854, 'learning_rate': 0.0009675081213427075, 'epoch': 1.15}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.91it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.09it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.57it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.10it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.92it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.78it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.59it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.58it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.56it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.59it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.39it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.43it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.53it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.54it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.90it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.14it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.33it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.36it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7237176299095154, 'eval_acc': 0.7826, 'eval_acc2': 0.8895, 'eval_f1': 0.7802563178921148, 'eval_roc_auc_micro': 0.9967770445454547, 'eval_precision': 0.7826, 'eval_recall': 0.7826, 'eval_runtime': 15.6193, 'eval_samples_per_second': 640.236, 'eval_steps_per_second': 10.052, 'epoch': 1.15}


 12%|█▏        | 190/1560 [08:28<24:59,  1.09s/it]  

{'loss': 1.1259, 'grad_norm': 3.5821731090545654, 'learning_rate': 0.0009638429754970715, 'epoch': 1.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.50it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.62it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.85it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.67it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.17it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.53it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.64it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.73it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.52it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.72it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.93it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.11it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.22it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.33it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.43it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.36it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.41it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.41it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7828617691993713, 'eval_acc': 0.775, 'eval_acc2': 0.8802, 'eval_f1': 0.7722844956983648, 'eval_roc_auc_micro': 0.9961493522727272, 'eval_precision': 0.775, 'eval_recall': 0.775, 'eval_runtime': 15.2017, 'eval_samples_per_second': 657.82, 'eval_steps_per_second': 10.328, 'epoch': 1.22}


 13%|█▎        | 200/1560 [08:53<24:41,  1.09s/it]  

{'loss': 1.0656, 'grad_norm': 3.1141092777252197, 'learning_rate': 0.0009599897218294122, 'epoch': 1.28}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.83it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.93it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.10it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.89it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.50it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.49it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.48it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.48it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.40it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.34it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.45it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.36it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.84it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.03it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7710549235343933, 'eval_acc': 0.7698, 'eval_acc2': 0.8831, 'eval_f1': 0.7670603633181698, 'eval_roc_auc_micro': 0.9961315682828282, 'eval_precision': 0.7698, 'eval_recall': 0.7698, 'eval_runtime': 15.3936, 'eval_samples_per_second': 649.62, 'eval_steps_per_second': 10.199, 'epoch': 1.28}


 13%|█▎        | 210/1560 [09:18<25:26,  1.13s/it]  

{'loss': 1.1313, 'grad_norm': 3.719470500946045, 'learning_rate': 0.0009559499229960451, 'epoch': 1.34}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:26,  5.93it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.59it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.63it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.64it/s][A
  6%|▋         | 10/157 [00:00<00:13, 11.24it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.58it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.75it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.01it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.20it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.34it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.39it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.19it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.04it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.13it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.23it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.35it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.45it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.47it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.7216058373451233, 'eval_acc': 0.7888, 'eval_acc2': 0.8907, 'eval_f1': 0.7858391357143364, 'eval_roc_auc_micro': 0.9967459995454545, 'eval_precision': 0.7888, 'eval_recall': 0.7888, 'eval_runtime': 19.5122, 'eval_samples_per_second': 512.5, 'eval_steps_per_second': 8.046, 'epoch': 1.34}


 14%|█▍        | 220/1560 [09:47<25:15,  1.13s/it]  

{'loss': 1.0602, 'grad_norm': 2.9853084087371826, 'learning_rate': 0.0009517252173051911, 'epoch': 1.41}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.26it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.68it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.68it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.13it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.74it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.73it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.64it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.70it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.55it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.51it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.56it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.57it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.60it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.53it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.49it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.48it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.55it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.59it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7343201041221619, 'eval_acc': 0.7841, 'eval_acc2': 0.8888, 'eval_f1': 0.7804200997456896, 'eval_roc_auc_micro': 0.9965649573737374, 'eval_precision': 0.7841, 'eval_recall': 0.7841, 'eval_runtime': 15.3451, 'eval_samples_per_second': 651.673, 'eval_steps_per_second': 10.231, 'epoch': 1.41}


 15%|█▍        | 230/1560 [10:12<24:07,  1.09s/it]  

{'loss': 1.0513, 'grad_norm': 3.627594232559204, 'learning_rate': 0.0009473173180525737, 'epoch': 1.47}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.46it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.91it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.81it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.23it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.81it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.76it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.76it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.53it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.58it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.34it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.43it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.54it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.58it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.60it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.41it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.29it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.15it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.20it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7268187999725342, 'eval_acc': 0.7846, 'eval_acc2': 0.8899, 'eval_f1': 0.7820613402111974, 'eval_roc_auc_micro': 0.9966192935858585, 'eval_precision': 0.7846, 'eval_recall': 0.7846, 'eval_runtime': 19.8096, 'eval_samples_per_second': 504.806, 'eval_steps_per_second': 7.925, 'epoch': 1.47}


 15%|█▌        | 240/1560 [10:42<26:26,  1.20s/it]  

{'loss': 1.0445, 'grad_norm': 3.3133533000946045, 'learning_rate': 0.0009427280128266049, 'epoch': 1.54}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.32it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.04it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.28it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.21it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.28it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.34it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.05it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.14it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.28it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.27it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.34it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.34it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.43it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.49it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.47it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.50it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.48it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.735007107257843, 'eval_acc': 0.7814, 'eval_acc2': 0.8915, 'eval_f1': 0.780842332094023, 'eval_roc_auc_micro': 0.996556831111111, 'eval_precision': 0.7814, 'eval_recall': 0.7814, 'eval_runtime': 15.45, 'eval_samples_per_second': 647.25, 'eval_steps_per_second': 10.162, 'epoch': 1.54}


 16%|█▌        | 250/1560 [11:07<23:41,  1.08s/it]  

{'loss': 1.0497, 'grad_norm': 3.2592613697052, 'learning_rate': 0.000937959162783444, 'epoch': 1.6}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.46it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.89it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.83it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.33it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.97it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.66it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.60it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.46it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.52it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.42it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.53it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.57it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.52it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.49it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.52it/s][A
 22%|██▏       | 35/157 [00:02<00:14,  8.43it/s][A
 24%|██▎       | 37/157 [00:03<00:12,  9.40it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6879010796546936, 'eval_acc': 0.79, 'eval_acc2': 0.8978, 'eval_f1': 0.7898155073456619, 'eval_roc_auc_micro': 0.9971513134343435, 'eval_precision': 0.79, 'eval_recall': 0.79, 'eval_runtime': 39.0118, 'eval_samples_per_second': 256.333, 'eval_steps_per_second': 4.024, 'epoch': 1.6}


 17%|█▋        | 260/1560 [11:55<29:44,  1.37s/it]  

{'loss': 1.0629, 'grad_norm': 3.7890329360961914, 'learning_rate': 0.0009330127018922195, 'epoch': 1.66}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.01it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.26it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.86it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.78it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.53it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.51it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.33it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.25it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.27it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.35it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.24it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.39it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.47it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.52it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.63it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.12it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.82it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.89it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.716563880443573, 'eval_acc': 0.7859, 'eval_acc2': 0.892, 'eval_f1': 0.7847258125362029, 'eval_roc_auc_micro': 0.9967588441414142, 'eval_precision': 0.7859, 'eval_recall': 0.7859, 'eval_runtime': 35.4125, 'eval_samples_per_second': 282.386, 'eval_steps_per_second': 4.433, 'epoch': 1.66}


 17%|█▋        | 270/1560 [12:41<29:40,  1.38s/it]  

{'loss': 1.0484, 'grad_norm': 3.172822952270508, 'learning_rate': 0.0009278906361507238, 'epoch': 1.73}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:12, 12.31it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.08it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.20it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.20it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.67it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.60it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.83it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.80it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.75it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.67it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.73it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.79it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.61it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.42it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.47it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.49it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.56it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.70it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7136221528053284, 'eval_acc': 0.7901, 'eval_acc2': 0.8931, 'eval_f1': 0.7883372766078828, 'eval_roc_auc_micro': 0.9967196831818183, 'eval_precision': 0.7901, 'eval_recall': 0.7901, 'eval_runtime': 21.2983, 'eval_samples_per_second': 469.521, 'eval_steps_per_second': 7.371, 'epoch': 1.73}


 18%|█▊        | 280/1560 [13:18<43:25,  2.04s/it]  

{'loss': 1.05, 'grad_norm': 2.876460075378418, 'learning_rate': 0.0009225950427718975, 'epoch': 1.79}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 15.99it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.95it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.17it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.05it/s][A
  7%|▋         | 11/157 [00:01<00:15,  9.38it/s][A
  8%|▊         | 13/157 [00:01<00:25,  5.71it/s][A
  9%|▉         | 14/157 [00:01<00:28,  4.96it/s][A
 10%|▉         | 15/157 [00:02<00:32,  4.43it/s][A
 10%|█         | 16/157 [00:02<00:34,  4.05it/s][A
 11%|█         | 17/157 [00:02<00:36,  3.79it/s][A
 11%|█▏        | 18/157 [00:03<00:38,  3.60it/s][A
 12%|█▏        | 19/157 [00:03<00:40,  3.44it/s][A
 13%|█▎        | 20/157 [00:03<00:40,  3.35it/s][A
 13%|█▎        | 21/157 [00:04<00:41,  3.30it/s][A
 14%|█▍        | 22/157 [00:04<00:40,  3.30it/s][A
 15%|█▍        | 23/157 [00:04<00:40,  3.27it/s][A
 15%|█▌        | 24/157 [00:05<00:39,  3.36it/s][A
 16%|█▌        | 25/157 [00:05<00:38,  3.44it/s][A
 17%|█▋        | 26/157 

{'eval_loss': 0.6820855140686035, 'eval_acc': 0.7994, 'eval_acc2': 0.8979, 'eval_f1': 0.7987790248268868, 'eval_roc_auc_micro': 0.9969793822222222, 'eval_precision': 0.7994, 'eval_recall': 0.7994, 'eval_runtime': 30.6032, 'eval_samples_per_second': 326.763, 'eval_steps_per_second': 5.13, 'epoch': 1.79}


 19%|█▊        | 290/1560 [14:02<33:32,  1.58s/it]  

{'loss': 1.0128, 'grad_norm': 2.6092376708984375, 'learning_rate': 0.0009171280693414306, 'epoch': 1.86}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:26,  5.94it/s][A
  3%|▎         | 4/157 [00:00<00:18,  8.37it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.55it/s][A
  5%|▌         | 8/157 [00:00<00:14,  9.98it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.55it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.81it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.99it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.15it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.27it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.48it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.56it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.66it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.65it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.68it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.65it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.67it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.54it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.62it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.686424195766449, 'eval_acc': 0.7924, 'eval_acc2': 0.8948, 'eval_f1': 0.7917213746555464, 'eval_roc_auc_micro': 0.9970167931313132, 'eval_precision': 0.7924, 'eval_recall': 0.7924, 'eval_runtime': 17.164, 'eval_samples_per_second': 582.614, 'eval_steps_per_second': 9.147, 'epoch': 1.86}


 19%|█▉        | 300/1560 [14:29<25:50,  1.23s/it]  

{'loss': 0.9731, 'grad_norm': 4.968384742736816, 'learning_rate': 0.0009114919329468282, 'epoch': 1.92}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.24it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.22it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.63it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.31it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.41it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.30it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.36it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.30it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.22it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.27it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.37it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.31it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.35it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.28it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.37it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.42it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.25it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.83it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6822786331176758, 'eval_acc': 0.7937, 'eval_acc2': 0.8982, 'eval_f1': 0.791579331467461, 'eval_roc_auc_micro': 0.9970409625757576, 'eval_precision': 0.7937, 'eval_recall': 0.7937, 'eval_runtime': 15.667, 'eval_samples_per_second': 638.286, 'eval_steps_per_second': 10.021, 'epoch': 1.92}


 20%|█▉        | 310/1560 [14:55<23:16,  1.12s/it]  

{'loss': 0.967, 'grad_norm': 5.126833438873291, 'learning_rate': 0.0009056889192782866, 'epoch': 1.98}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.41it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.22it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.34it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.38it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.10it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.13it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.23it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.15it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.17it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.06it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.11it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.15it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.12it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.24it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.20it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.21it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.14it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.93it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.7081497311592102, 'eval_acc': 0.7895, 'eval_acc2': 0.8918, 'eval_f1': 0.7903395408160312, 'eval_roc_auc_micro': 0.9966911096969697, 'eval_precision': 0.7895, 'eval_recall': 0.7895, 'eval_runtime': 19.0638, 'eval_samples_per_second': 524.554, 'eval_steps_per_second': 8.235, 'epoch': 1.98}


 21%|██        | 320/1560 [15:26<24:50,  1.20s/it]  

{'loss': 0.8814, 'grad_norm': 5.088314533233643, 'learning_rate': 0.0008997213817017506, 'epoch': 2.05}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.13it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.14it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.98it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.07it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.89it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.80it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.68it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.60it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.42it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.16it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.13it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.22it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.34it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.41it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.49it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.28it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.26it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6801974773406982, 'eval_acc': 0.7963, 'eval_acc2': 0.8977, 'eval_f1': 0.794928375054182, 'eval_roc_auc_micro': 0.9969288828787879, 'eval_precision': 0.7963, 'eval_recall': 0.7963, 'eval_runtime': 15.1938, 'eval_samples_per_second': 658.162, 'eval_steps_per_second': 10.333, 'epoch': 2.05}


 21%|██        | 330/1560 [15:51<22:43,  1.11s/it]  

{'loss': 0.8629, 'grad_norm': 3.103084087371826, 'learning_rate': 0.000893591740304525, 'epoch': 2.11}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.50it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.06it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.90it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.09it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.89it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.50it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.78it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.75it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.72it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.67it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.66it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.50it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.57it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.71it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.90it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.97it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.09it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.651325523853302, 'eval_acc': 0.8046, 'eval_acc2': 0.9009, 'eval_f1': 0.8024592319843327, 'eval_roc_auc_micro': 0.9971770225757576, 'eval_precision': 0.8046, 'eval_recall': 0.8046, 'eval_runtime': 15.3056, 'eval_samples_per_second': 653.354, 'eval_steps_per_second': 10.258, 'epoch': 2.11}


 22%|██▏       | 340/1560 [16:16<21:55,  1.08s/it]  

{'loss': 0.8439, 'grad_norm': 3.4912526607513428, 'learning_rate': 0.0008873024809138273, 'epoch': 2.18}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.86it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.60it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.36it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.17it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.86it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.81it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.77it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.85it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.81it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.88it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.94it/s][A
 16%|█▌        | 25/157 [00:02<00:17,  7.69it/s][A
 17%|█▋        | 27/157 [00:02<00:14,  8.72it/s][A
 18%|█▊        | 29/157 [00:02<00:13,  9.56it/s][A
 20%|█▉        | 31/157 [00:02<00:12, 10.20it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.77it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.16it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.51it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6749293804168701, 'eval_acc': 0.8004, 'eval_acc2': 0.8972, 'eval_f1': 0.7981921234435617, 'eval_roc_auc_micro': 0.9969150042929292, 'eval_precision': 0.8004, 'eval_recall': 0.8004, 'eval_runtime': 15.6337, 'eval_samples_per_second': 639.645, 'eval_steps_per_second': 10.042, 'epoch': 2.18}


 22%|██▏       | 350/1560 [16:41<22:09,  1.10s/it]  

{'loss': 0.8424, 'grad_norm': 3.9883408546447754, 'learning_rate': 0.0008808561540886796, 'epoch': 2.24}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.42it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.51it/s][A
  4%|▍         | 7/157 [00:00<00:13, 11.48it/s][A
  6%|▌         | 9/157 [00:00<00:13, 11.15it/s][A
  7%|▋         | 11/157 [00:00<00:13, 10.90it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.81it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.82it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.65it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.62it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.52it/s][A
 15%|█▍        | 23/157 [00:02<00:13, 10.27it/s][A
 16%|█▌        | 25/157 [00:02<00:13, 10.09it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.14it/s][A
 18%|█▊        | 29/157 [00:02<00:12,  9.93it/s][A
 19%|█▉        | 30/157 [00:02<00:13,  9.74it/s][A
 20%|█▉        | 31/157 [00:02<00:13,  9.64it/s][A
 20%|██        | 32/157 [00:03<00:13,  9.50it/s][A
 21%|██        | 33/157 [00:03<00:13,  9.40it/s][A
 22%|██▏       | 34/157 

{'eval_loss': 0.6542670130729675, 'eval_acc': 0.8039, 'eval_acc2': 0.8984, 'eval_f1': 0.8030337728122278, 'eval_roc_auc_micro': 0.9972805493434344, 'eval_precision': 0.8039, 'eval_recall': 0.8039, 'eval_runtime': 19.0138, 'eval_samples_per_second': 525.934, 'eval_steps_per_second': 8.257, 'epoch': 2.24}


 23%|██▎       | 360/1560 [17:10<23:37,  1.18s/it]  

{'loss': 0.8044, 'grad_norm': 3.3323142528533936, 'learning_rate': 0.0008742553740855505, 'epoch': 2.3}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.59it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.29it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.25it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.49it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.44it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.47it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.40it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.41it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.37it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.49it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.26it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.37it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.45it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.50it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.53it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.49it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.52it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6553704142570496, 'eval_acc': 0.8042, 'eval_acc2': 0.9013, 'eval_f1': 0.8033107715394162, 'eval_roc_auc_micro': 0.9970345321212121, 'eval_precision': 0.8042, 'eval_recall': 0.8042, 'eval_runtime': 15.0276, 'eval_samples_per_second': 665.444, 'eval_steps_per_second': 10.447, 'epoch': 2.3}


 24%|██▎       | 370/1560 [17:35<21:23,  1.08s/it]  

{'loss': 0.8261, 'grad_norm': 4.971045970916748, 'learning_rate': 0.0008675028177981643, 'epoch': 2.37}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:26,  5.85it/s][A
  3%|▎         | 4/157 [00:00<00:18,  8.27it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.59it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.39it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.68it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.16it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.38it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.70it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.88it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.04it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 12.13it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.18it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.30it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.33it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.32it/s][A
 20%|██        | 32/157 [00:03<00:16,  7.70it/s][A
 22%|██▏       | 34/157 [00:03<00:14,  8.72it/s][A
 23%|██▎       | 36/157 [00:03<00:12,  9.46it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6460996866226196, 'eval_acc': 0.8043, 'eval_acc2': 0.9036, 'eval_f1': 0.8044279374408292, 'eval_roc_auc_micro': 0.9972022312121211, 'eval_precision': 0.8043, 'eval_recall': 0.8043, 'eval_runtime': 15.3852, 'eval_samples_per_second': 649.975, 'eval_steps_per_second': 10.205, 'epoch': 2.37}


 24%|██▍       | 380/1560 [18:00<21:08,  1.07s/it]  

{'loss': 0.8588, 'grad_norm': 5.663798809051514, 'learning_rate': 0.0008606012236719073, 'epoch': 2.43}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.07it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.25it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.07it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.45it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.04it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.85it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.77it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.70it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.72it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.69it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.64it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.68it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.66it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.64it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.48it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.51it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.59it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.59it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6816591024398804, 'eval_acc': 0.7966, 'eval_acc2': 0.898, 'eval_f1': 0.7939878087743699, 'eval_roc_auc_micro': 0.9969235417171717, 'eval_precision': 0.7966, 'eval_recall': 0.7966, 'eval_runtime': 15.268, 'eval_samples_per_second': 654.966, 'eval_steps_per_second': 10.283, 'epoch': 2.43}


 25%|██▌       | 390/1560 [18:25<21:57,  1.13s/it]  

{'loss': 0.8585, 'grad_norm': 3.721961736679077, 'learning_rate': 0.0008535533905932737, 'epoch': 2.5}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:10, 14.31it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.03it/s][A
  4%|▍         | 6/157 [00:00<00:11, 12.73it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.31it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.15it/s][A
  8%|▊         | 12/157 [00:00<00:12, 12.01it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.14it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.30it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.38it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.42it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.48it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.51it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.56it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.60it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.58it/s][A
 20%|██        | 32/157 [00:02<00:09, 12.62it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.34it/s][A
 23%|██▎       | 36/157 [00:02<00:10, 12.08it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6594119071960449, 'eval_acc': 0.8049, 'eval_acc2': 0.9007, 'eval_f1': 0.8033691981656487, 'eval_roc_auc_micro': 0.997060093939394, 'eval_precision': 0.8049, 'eval_recall': 0.8049, 'eval_runtime': 15.2343, 'eval_samples_per_second': 656.415, 'eval_steps_per_second': 10.306, 'epoch': 2.5}


 26%|██▌       | 400/1560 [18:49<20:50,  1.08s/it]  

{'loss': 0.806, 'grad_norm': 3.6920363903045654, 'learning_rate': 0.0008463621767547997, 'epoch': 2.56}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:12, 12.09it/s][A
  3%|▎         | 5/157 [00:00<00:12, 12.30it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.35it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.28it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.16it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.05it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.99it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.95it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.00it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.90it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.92it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.00it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.78it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.86it/s][A
 20%|█▉        | 31/157 [00:03<00:19,  6.33it/s][A
 21%|██        | 33/157 [00:03<00:16,  7.45it/s][A
 22%|██▏       | 35/157 [00:03<00:14,  8.43it/s][A
 24%|██▎       | 37/157 [00:03<00:12,  9.26it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6292334794998169, 'eval_acc': 0.8134, 'eval_acc2': 0.9062, 'eval_f1': 0.8105439325959652, 'eval_roc_auc_micro': 0.9973265812121213, 'eval_precision': 0.8134, 'eval_recall': 0.8134, 'eval_runtime': 15.5508, 'eval_samples_per_second': 643.054, 'eval_steps_per_second': 10.096, 'epoch': 2.56}


 26%|██▋       | 410/1560 [19:14<20:37,  1.08s/it]  

{'loss': 0.8825, 'grad_norm': 3.9077792167663574, 'learning_rate': 0.0008390304984959455, 'epoch': 2.62}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.53it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.33it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.95it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.45it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.59it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.66it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.75it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.79it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.93it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.87it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.05it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.21it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.07it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.95it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.06it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.12it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6657398343086243, 'eval_acc': 0.8011, 'eval_acc2': 0.9036, 'eval_f1': 0.7996984525805078, 'eval_roc_auc_micro': 0.997046131060606, 'eval_precision': 0.8011, 'eval_recall': 0.8011, 'eval_runtime': 15.5281, 'eval_samples_per_second': 643.993, 'eval_steps_per_second': 10.111, 'epoch': 2.62}


 27%|██▋       | 420/1560 [19:40<21:40,  1.14s/it]  

{'loss': 0.8212, 'grad_norm': 3.0098958015441895, 'learning_rate': 0.0008315613291203976, 'epoch': 2.69}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 15.66it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.94it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.59it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.40it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.28it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.08it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.12it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.20it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.13it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.12it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.20it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.18it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.15it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 10.94it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.93it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.87it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.89it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 10.93it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6000028252601624, 'eval_acc': 0.8184, 'eval_acc2': 0.9134, 'eval_f1': 0.8171640170037971, 'eval_roc_auc_micro': 0.9976905645454545, 'eval_precision': 0.8184, 'eval_recall': 0.8184, 'eval_runtime': 18.1957, 'eval_samples_per_second': 549.58, 'eval_steps_per_second': 8.628, 'epoch': 2.69}


 28%|██▊       | 430/1560 [20:09<23:00,  1.22s/it]  

{'loss': 0.8895, 'grad_norm': 5.364750862121582, 'learning_rate': 0.0008239576976902694, 'epoch': 2.75}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.30it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.68it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.99it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.62it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.25it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.23it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.15it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.04it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.12it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.18it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 11.05it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.08it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.02it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.65it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.72it/s][A
 21%|██        | 33/157 [00:02<00:11, 10.70it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.72it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.82it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6310471892356873, 'eval_acc': 0.8117, 'eval_acc2': 0.9032, 'eval_f1': 0.810702928604397, 'eval_roc_auc_micro': 0.9973554556565657, 'eval_precision': 0.8117, 'eval_recall': 0.8117, 'eval_runtime': 18.2792, 'eval_samples_per_second': 547.071, 'eval_steps_per_second': 8.589, 'epoch': 2.75}


 28%|██▊       | 440/1560 [20:37<20:58,  1.12s/it]  

{'loss': 0.8117, 'grad_norm': 2.9480810165405273, 'learning_rate': 0.0008162226877976886, 'epoch': 2.82}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.79it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.10it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.79it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.25it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.98it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.82it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.52it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.28it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.31it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.29it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.17it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.24it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.26it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.32it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.44it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.46it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.53it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.23it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6281595826148987, 'eval_acc': 0.8089, 'eval_acc2': 0.9071, 'eval_f1': 0.8082001195609655, 'eval_roc_auc_micro': 0.9974581722727273, 'eval_precision': 0.8089, 'eval_recall': 0.8089, 'eval_runtime': 15.402, 'eval_samples_per_second': 649.266, 'eval_steps_per_second': 10.193, 'epoch': 2.82}


 29%|██▉       | 450/1560 [21:02<21:39,  1.17s/it]  

{'loss': 0.8675, 'grad_norm': 3.0825541019439697, 'learning_rate': 0.0008083594363142716, 'epoch': 2.88}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:10, 14.15it/s][A
  3%|▎         | 4/157 [00:00<00:15,  9.99it/s][A
  4%|▍         | 6/157 [00:00<00:13, 10.94it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.50it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.83it/s][A
  8%|▊         | 12/157 [00:01<00:12, 12.06it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.16it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.95it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.12it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.20it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.32it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.38it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.24it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.29it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.29it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.21it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.30it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.39it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6037947535514832, 'eval_acc': 0.8162, 'eval_acc2': 0.9139, 'eval_f1': 0.8168246860610583, 'eval_roc_auc_micro': 0.9976633267171716, 'eval_precision': 0.8162, 'eval_recall': 0.8162, 'eval_runtime': 15.129, 'eval_samples_per_second': 660.982, 'eval_steps_per_second': 10.377, 'epoch': 2.88}


 29%|██▉       | 460/1560 [21:27<19:46,  1.08s/it]  

{'loss': 0.8088, 'grad_norm': 3.5895073413848877, 'learning_rate': 0.0008003711321189895, 'epoch': 2.94}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:15, 10.09it/s][A
  3%|▎         | 4/157 [00:00<00:13, 10.95it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.60it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.90it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.12it/s][A
  8%|▊         | 12/157 [00:01<00:11, 12.19it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.27it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.37it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.41it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.40it/s][A
 14%|█▍        | 22/157 [00:01<00:10, 12.43it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.38it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.46it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.50it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.42it/s][A
 20%|██        | 32/157 [00:02<00:15,  8.12it/s][A
 22%|██▏       | 34/157 [00:03<00:13,  8.86it/s][A
 23%|██▎       | 36/157 [00:03<00:12,  9.57it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.6084777116775513, 'eval_acc': 0.8192, 'eval_acc2': 0.915, 'eval_f1': 0.8177309079794697, 'eval_roc_auc_micro': 0.9975164688888889, 'eval_precision': 0.8192, 'eval_recall': 0.8192, 'eval_runtime': 15.4681, 'eval_samples_per_second': 646.493, 'eval_steps_per_second': 10.15, 'epoch': 2.94}


 30%|███       | 470/1560 [21:55<30:59,  1.71s/it]  

{'loss': 0.8011, 'grad_norm': 3.043917655944824, 'learning_rate': 0.0007922610148049445, 'epoch': 3.01}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 19.33it/s][A
  3%|▎         | 4/157 [00:00<00:10, 14.26it/s][A
  4%|▍         | 6/157 [00:00<00:12, 12.33it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.18it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.99it/s][A
  8%|▊         | 12/157 [00:00<00:12, 11.99it/s][A
  9%|▉         | 14/157 [00:01<00:11, 11.99it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.00it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.07it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.16it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.25it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.27it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.30it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.30it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.23it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.11it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.18it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.31it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.604233980178833, 'eval_acc': 0.816, 'eval_acc2': 0.9108, 'eval_f1': 0.8159374025353447, 'eval_roc_auc_micro': 0.9976794695454545, 'eval_precision': 0.816, 'eval_recall': 0.816, 'eval_runtime': 47.8081, 'eval_samples_per_second': 209.17, 'eval_steps_per_second': 3.284, 'epoch': 3.01}


 31%|███       | 480/1560 [22:52<27:12,  1.51s/it]  

{'loss': 0.6761, 'grad_norm': 2.8761684894561768, 'learning_rate': 0.0007840323733655779, 'epoch': 3.07}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.77it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.04it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.87it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.32it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.80it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.75it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.39it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.46it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.51it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.63it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.67it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.69it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.69it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.60it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.58it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.54it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.25it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6085166335105896, 'eval_acc': 0.8171, 'eval_acc2': 0.9103, 'eval_f1': 0.8169996786939013, 'eval_roc_auc_micro': 0.9975575735353536, 'eval_precision': 0.8171, 'eval_recall': 0.8171, 'eval_runtime': 34.8001, 'eval_samples_per_second': 287.355, 'eval_steps_per_second': 4.511, 'epoch': 3.07}


 31%|███▏      | 490/1560 [23:37<24:27,  1.37s/it]  

{'loss': 0.6667, 'grad_norm': 2.847320795059204, 'learning_rate': 0.000775688544860846, 'epoch': 3.14}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:32,  4.71it/s][A
  3%|▎         | 4/157 [00:00<00:20,  7.41it/s][A
  4%|▍         | 6/157 [00:00<00:17,  8.81it/s][A
  5%|▌         | 8/157 [00:00<00:16,  9.21it/s][A
  6%|▋         | 10/157 [00:01<00:14,  9.97it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.59it/s][A
  9%|▉         | 14/157 [00:01<00:13, 11.00it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.28it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.55it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.76it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.95it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 12.07it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.00it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.01it/s][A
 19%|█▉        | 30/157 [00:03<00:15,  8.32it/s][A
 20%|██        | 32/157 [00:03<00:13,  9.17it/s][A
 22%|██▏       | 34/157 [00:03<00:12,  9.85it/s][A
 23%|██▎       | 36/157 [00:03<00:11, 10.37it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5971376299858093, 'eval_acc': 0.8187, 'eval_acc2': 0.9136, 'eval_f1': 0.8176714156337421, 'eval_roc_auc_micro': 0.9975786661616162, 'eval_precision': 0.8187, 'eval_recall': 0.8187, 'eval_runtime': 20.0336, 'eval_samples_per_second': 499.162, 'eval_steps_per_second': 7.837, 'epoch': 3.14}


 32%|███▏      | 500/1560 [24:13<38:12,  2.16s/it]  

{'loss': 0.6954, 'grad_norm': 3.319309949874878, 'learning_rate': 0.0007672329130639005, 'epoch': 3.2}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.88it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.82it/s][A
  4%|▍         | 7/157 [00:00<00:16,  9.19it/s][A
  6%|▌         | 9/157 [00:01<00:27,  5.47it/s][A
  6%|▋         | 10/157 [00:01<00:30,  4.83it/s][A
  7%|▋         | 11/157 [00:01<00:32,  4.45it/s][A
  8%|▊         | 12/157 [00:02<00:34,  4.22it/s][A
  8%|▊         | 13/157 [00:02<00:33,  4.31it/s][A
  9%|▉         | 14/157 [00:02<00:33,  4.33it/s][A
 10%|▉         | 15/157 [00:02<00:32,  4.38it/s][A
 10%|█         | 16/157 [00:03<00:31,  4.44it/s][A
 11%|█         | 17/157 [00:03<00:31,  4.47it/s][A
 11%|█▏        | 18/157 [00:03<00:31,  4.46it/s][A
 12%|█▏        | 19/157 [00:03<00:30,  4.49it/s][A
 13%|█▎        | 20/157 [00:03<00:29,  4.68it/s][A
 13%|█▎        | 21/157 [00:04<00:28,  4.84it/s][A
 14%|█▍        | 22/157 [00:04<00:27,  4.96it/s][A
 15%|█▍        | 23/157 [00:04<00:26,  5.07it/s][A
 15%|█▌        | 24/157 

{'eval_loss': 0.6224907040596008, 'eval_acc': 0.8147, 'eval_acc2': 0.9081, 'eval_f1': 0.8143910436448268, 'eval_roc_auc_micro': 0.9973925704545454, 'eval_precision': 0.8147, 'eval_recall': 0.8147, 'eval_runtime': 55.0355, 'eval_samples_per_second': 181.701, 'eval_steps_per_second': 2.853, 'epoch': 3.2}


 33%|███▎      | 510/1560 [25:19<30:41,  1.75s/it]  

{'loss': 0.6203, 'grad_norm': 2.869976043701172, 'learning_rate': 0.0007586689070888284, 'epoch': 3.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.85it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.40it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.18it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.91it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.71it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.12it/s][A
 10%|▉         | 15/157 [00:01<00:15,  9.16it/s][A
 10%|█         | 16/157 [00:01<00:15,  8.85it/s][A
 11%|█         | 17/157 [00:01<00:16,  8.67it/s][A
 11%|█▏        | 18/157 [00:01<00:16,  8.62it/s][A
 12%|█▏        | 19/157 [00:01<00:15,  8.75it/s][A
 13%|█▎        | 20/157 [00:01<00:15,  8.77it/s][A
 13%|█▎        | 21/157 [00:02<00:15,  8.95it/s][A
 14%|█▍        | 22/157 [00:02<00:14,  9.08it/s][A
 15%|█▌        | 24/157 [00:02<00:14,  9.37it/s][A
 17%|█▋        | 26/157 [00:02<00:13,  9.48it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.59it/s][A
 18%|█▊        | 28/157 [00:02<00:13,  9.64it/s][A
 18%|█▊        | 29/157 

{'eval_loss': 0.6090586185455322, 'eval_acc': 0.8193, 'eval_acc2': 0.9101, 'eval_f1': 0.8180952744489595, 'eval_roc_auc_micro': 0.9975101344949495, 'eval_precision': 0.8193, 'eval_recall': 0.8193, 'eval_runtime': 20.5278, 'eval_samples_per_second': 487.144, 'eval_steps_per_second': 7.648, 'epoch': 3.26}


 33%|███▎      | 520/1560 [25:51<23:24,  1.35s/it]  

{'loss': 0.692, 'grad_norm': 2.781346082687378, 'learning_rate': 0.00075, 'epoch': 3.33}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.10it/s][A
  3%|▎         | 5/157 [00:00<00:13, 11.43it/s][A
  4%|▍         | 7/157 [00:00<00:13, 11.54it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.60it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.66it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.76it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.57it/s][A
 11%|█         | 17/157 [00:01<00:18,  7.60it/s][A
 12%|█▏        | 19/157 [00:01<00:16,  8.58it/s][A
 13%|█▎        | 21/157 [00:02<00:14,  9.42it/s][A
 15%|█▍        | 23/157 [00:02<00:13, 10.17it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.66it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 10.98it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.16it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 11.28it/s][A
 21%|██        | 33/157 [00:03<00:10, 11.57it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.71it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.81it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5938795208930969, 'eval_acc': 0.8188, 'eval_acc2': 0.912, 'eval_f1': 0.8190697257980349, 'eval_roc_auc_micro': 0.9977320721717171, 'eval_precision': 0.8188, 'eval_recall': 0.8188, 'eval_runtime': 18.2945, 'eval_samples_per_second': 546.612, 'eval_steps_per_second': 8.582, 'epoch': 3.33}


 34%|███▍      | 530/1560 [26:19<20:02,  1.17s/it]  

{'loss': 0.6861, 'grad_norm': 3.1914544105529785, 'learning_rate': 0.0007412297074035968, 'epoch': 3.39}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.60it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.85it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.60it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.94it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.63it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.57it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.58it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.36it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.22it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.19it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.27it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.36it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.09it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.11it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.07it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.24it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.30it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.93it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6278762817382812, 'eval_acc': 0.8122, 'eval_acc2': 0.9049, 'eval_f1': 0.8105715818979041, 'eval_roc_auc_micro': 0.9973673710606061, 'eval_precision': 0.8122, 'eval_recall': 0.8122, 'eval_runtime': 15.9148, 'eval_samples_per_second': 628.344, 'eval_steps_per_second': 9.865, 'epoch': 3.39}


 35%|███▍      | 540/1560 [26:45<19:18,  1.14s/it]  

{'loss': 0.6976, 'grad_norm': 2.678858757019043, 'learning_rate': 0.0007323615860218843, 'epoch': 3.46}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.12it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.11it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.33it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.94it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.73it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.59it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.13it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.23it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.30it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.28it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.29it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.31it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.44it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.50it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.49it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.50it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.617277204990387, 'eval_acc': 0.8161, 'eval_acc2': 0.9114, 'eval_f1': 0.8168836224606155, 'eval_roc_auc_micro': 0.997350964949495, 'eval_precision': 0.8161, 'eval_recall': 0.8161, 'eval_runtime': 34.6981, 'eval_samples_per_second': 288.2, 'eval_steps_per_second': 4.525, 'epoch': 3.46}


 35%|███▌      | 550/1560 [27:29<21:59,  1.31s/it]  

{'loss': 0.7143, 'grad_norm': 3.369746685028076, 'learning_rate': 0.000723399232250813, 'epoch': 3.52}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.87it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.11it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.76it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.32it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.80it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.67it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.62it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.45it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.50it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.58it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.63it/s][A
 16%|█▌        | 25/157 [00:02<00:16,  8.09it/s][A
 17%|█▋        | 27/157 [00:02<00:14,  9.13it/s][A
 18%|█▊        | 29/157 [00:02<00:13,  9.82it/s][A
 20%|█▉        | 31/157 [00:02<00:12, 10.44it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.05it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.48it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.76it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6083472967147827, 'eval_acc': 0.8172, 'eval_acc2': 0.9104, 'eval_f1': 0.8158329023370636, 'eval_roc_auc_micro': 0.9975619574747475, 'eval_precision': 0.8172, 'eval_recall': 0.8172, 'eval_runtime': 27.0325, 'eval_samples_per_second': 369.926, 'eval_steps_per_second': 5.808, 'epoch': 3.52}


 36%|███▌      | 560/1560 [28:06<20:23,  1.22s/it]  

{'loss': 0.6629, 'grad_norm': 2.9222989082336426, 'learning_rate': 0.000714346280701527, 'epoch': 3.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.92it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.07it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.44it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.10it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.84it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.60it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.61it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.40it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.50it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.38it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.47it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.55it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.68it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.65it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.76it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.77it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.75it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.71it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.6112889647483826, 'eval_acc': 0.8173, 'eval_acc2': 0.9107, 'eval_f1': 0.8168147144072423, 'eval_roc_auc_micro': 0.9974121754545455, 'eval_precision': 0.8173, 'eval_recall': 0.8173, 'eval_runtime': 15.9736, 'eval_samples_per_second': 626.034, 'eval_steps_per_second': 9.829, 'epoch': 3.58}


 37%|███▋      | 570/1560 [28:32<18:19,  1.11s/it]  

{'loss': 0.6459, 'grad_norm': 2.6741180419921875, 'learning_rate': 0.0007052064027263785, 'epoch': 3.65}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.16it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.42it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.58it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.17it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.82it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.52it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.57it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.44it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.49it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.44it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.50it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.56it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.56it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.51it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.54it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.50it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.49it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.30it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5757094621658325, 'eval_acc': 0.8272, 'eval_acc2': 0.9213, 'eval_f1': 0.8268870778101538, 'eval_roc_auc_micro': 0.9976776467676767, 'eval_precision': 0.8272, 'eval_recall': 0.8272, 'eval_runtime': 19.1687, 'eval_samples_per_second': 521.685, 'eval_steps_per_second': 8.19, 'epoch': 3.65}


 37%|███▋      | 580/1560 [29:00<18:23,  1.13s/it]  

{'loss': 0.7036, 'grad_norm': 2.7826247215270996, 'learning_rate': 0.0006959833049300376, 'epoch': 3.71}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 16.17it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.78it/s][A
  4%|▍         | 6/157 [00:00<00:11, 12.94it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.72it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.48it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.36it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.35it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.31it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.11it/s][A
 13%|█▎        | 20/157 [00:01<00:16,  8.27it/s][A
 14%|█▍        | 22/157 [00:02<00:14,  9.22it/s][A
 15%|█▌        | 24/157 [00:02<00:13,  9.92it/s][A
 17%|█▋        | 26/157 [00:02<00:12, 10.51it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.05it/s][A
 19%|█▉        | 30/157 [00:02<00:11, 11.31it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.58it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 11.79it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.96it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5888393521308899, 'eval_acc': 0.8254, 'eval_acc2': 0.9146, 'eval_f1': 0.8241794898179613, 'eval_roc_auc_micro': 0.9976583443434343, 'eval_precision': 0.8254, 'eval_recall': 0.8254, 'eval_runtime': 16.1152, 'eval_samples_per_second': 620.533, 'eval_steps_per_second': 9.742, 'epoch': 3.71}


 38%|███▊      | 590/1560 [29:26<17:46,  1.10s/it]  

{'loss': 0.6986, 'grad_norm': 3.075843334197998, 'learning_rate': 0.0006866807276663105, 'epoch': 3.78}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.87it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.96it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.78it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.39it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.92it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.02it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.72it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.51it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.60it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.87it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.03it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.13it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.24it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.30it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.99it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.17it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.26it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.36it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5886579155921936, 'eval_acc': 0.8222, 'eval_acc2': 0.92, 'eval_f1': 0.8218548813875859, 'eval_roc_auc_micro': 0.997635143888889, 'eval_precision': 0.8222, 'eval_recall': 0.8222, 'eval_runtime': 16.0002, 'eval_samples_per_second': 624.992, 'eval_steps_per_second': 9.812, 'epoch': 3.78}


 38%|███▊      | 600/1560 [29:52<18:24,  1.15s/it]  

{'loss': 0.6618, 'grad_norm': 1.9806801080703735, 'learning_rate': 0.0006773024435212678, 'epoch': 3.84}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.26it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.77it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.59it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.11it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.74it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.41it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.35it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.29it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.31it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.30it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.29it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.26it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.26it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.14it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.24it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.28it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.29it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.22it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5676490068435669, 'eval_acc': 0.8235, 'eval_acc2': 0.9223, 'eval_f1': 0.8228947110480888, 'eval_roc_auc_micro': 0.9978076423737373, 'eval_precision': 0.8235, 'eval_recall': 0.8235, 'eval_runtime': 20.1255, 'eval_samples_per_second': 496.882, 'eval_steps_per_second': 7.801, 'epoch': 3.84}


 39%|███▉      | 610/1560 [30:22<18:33,  1.17s/it]  

{'loss': 0.6456, 'grad_norm': 2.6229145526885986, 'learning_rate': 0.0006678522557833024, 'epoch': 3.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.83it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.38it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.53it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.05it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.76it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.48it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.34it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.20it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.90it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.74it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.72it/s][A
 16%|█▌        | 25/157 [00:02<00:15,  8.46it/s][A
 17%|█▋        | 27/157 [00:02<00:13,  9.38it/s][A
 18%|█▊        | 29/157 [00:02<00:12, 10.15it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 10.64it/s][A
 21%|██        | 33/157 [00:02<00:11, 11.06it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.29it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.43it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5840252637863159, 'eval_acc': 0.8261, 'eval_acc2': 0.9164, 'eval_f1': 0.8249468639029522, 'eval_roc_auc_micro': 0.9977170244444445, 'eval_precision': 0.8261, 'eval_recall': 0.8261, 'eval_runtime': 39.9258, 'eval_samples_per_second': 250.465, 'eval_steps_per_second': 3.932, 'epoch': 3.9}


 40%|███▉      | 620/1560 [31:11<21:29,  1.37s/it]  

{'loss': 0.6629, 'grad_norm': 2.6277694702148438, 'learning_rate': 0.0006583339969007363, 'epoch': 3.97}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.97it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.39it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.84it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.32it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.80it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.67it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.68it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.72it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.63it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.62it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.65it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.60it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.60it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.68it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.65it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.59it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.62it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.43it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5836173892021179, 'eval_acc': 0.8234, 'eval_acc2': 0.9158, 'eval_f1': 0.8222087927731785, 'eval_roc_auc_micro': 0.9976629381313131, 'eval_precision': 0.8234, 'eval_recall': 0.8234, 'eval_runtime': 17.1806, 'eval_samples_per_second': 582.05, 'eval_steps_per_second': 9.138, 'epoch': 3.97}


 40%|████      | 630/1560 [31:41<20:22,  1.32s/it]  

{'loss': 0.6114, 'grad_norm': 3.1111180782318115, 'learning_rate': 0.0006487515269276015, 'epoch': 4.03}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.72it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.27it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.79it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.58it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.42it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.26it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.24it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.39it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.47it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.48it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.54it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.57it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.60it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.64it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.63it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.63it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.58it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.36it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5697917342185974, 'eval_acc': 0.8289, 'eval_acc2': 0.9201, 'eval_f1': 0.8294876416735762, 'eval_roc_auc_micro': 0.9979265560101012, 'eval_precision': 0.8289, 'eval_recall': 0.8289, 'eval_runtime': 17.3558, 'eval_samples_per_second': 576.175, 'eval_steps_per_second': 9.046, 'epoch': 4.03}


 41%|████      | 640/1560 [32:07<17:01,  1.11s/it]  

{'loss': 0.5183, 'grad_norm': 3.093439817428589, 'learning_rate': 0.0006391087319582263, 'epoch': 4.1}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.60it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.76it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.48it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.86it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.43it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.56it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.59it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.64it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.55it/s][A
 13%|█▎        | 21/157 [00:01<00:15,  8.70it/s][A
 15%|█▍        | 23/157 [00:02<00:13,  9.67it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.43it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 10.97it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.45it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.74it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.94it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.11it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.27it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5726419687271118, 'eval_acc': 0.8308, 'eval_acc2': 0.9186, 'eval_f1': 0.8301360333924508, 'eval_roc_auc_micro': 0.9977650869191919, 'eval_precision': 0.8308, 'eval_recall': 0.8308, 'eval_runtime': 20.7848, 'eval_samples_per_second': 481.12, 'eval_steps_per_second': 7.554, 'epoch': 4.1}


 42%|████▏     | 650/1560 [32:38<17:24,  1.15s/it]  

{'loss': 0.5588, 'grad_norm': 3.9712319374084473, 'learning_rate': 0.0006294095225512603, 'epoch': 4.16}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.75it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.94it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.92it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.47it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.11it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.99it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.86it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.80it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.69it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.72it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.70it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.73it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.67it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.67it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.69it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.64it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.59it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.63it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5479196310043335, 'eval_acc': 0.8345, 'eval_acc2': 0.9219, 'eval_f1': 0.833748305224282, 'eval_roc_auc_micro': 0.9979730077777778, 'eval_precision': 0.8345, 'eval_recall': 0.8345, 'eval_runtime': 15.1925, 'eval_samples_per_second': 658.218, 'eval_steps_per_second': 10.334, 'epoch': 4.16}


 42%|████▏     | 660/1560 [33:03<16:24,  1.09s/it]  

{'loss': 0.5463, 'grad_norm': 2.7572197914123535, 'learning_rate': 0.0006196578321437789, 'epoch': 4.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.97it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.78it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.77it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.35it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.69it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.22it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.41it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.49it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.58it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.53it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.53it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.55it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.61it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.64it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.67it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.60it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.35it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.41it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5441598892211914, 'eval_acc': 0.8371, 'eval_acc2': 0.9239, 'eval_f1': 0.836865593682261, 'eval_roc_auc_micro': 0.9979988345959597, 'eval_precision': 0.8371, 'eval_recall': 0.8371, 'eval_runtime': 21.5646, 'eval_samples_per_second': 463.722, 'eval_steps_per_second': 7.28, 'epoch': 4.22}


 43%|████▎     | 670/1560 [33:34<17:04,  1.15s/it]  

{'loss': 0.5369, 'grad_norm': 2.9744207859039307, 'learning_rate': 0.0006098576154561086, 'epoch': 4.29}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.91it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.61it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.78it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.38it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.02it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.84it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.75it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.68it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.97it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.15it/s][A
 15%|█▍        | 23/157 [00:01<00:15,  8.86it/s][A
 16%|█▌        | 25/157 [00:02<00:13,  9.76it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.47it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.06it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 11.45it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.78it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.96it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.99it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5713198184967041, 'eval_acc': 0.8306, 'eval_acc2': 0.9199, 'eval_f1': 0.8292516765909348, 'eval_roc_auc_micro': 0.9977925915656567, 'eval_precision': 0.8306, 'eval_recall': 0.8306, 'eval_runtime': 15.2387, 'eval_samples_per_second': 656.222, 'eval_steps_per_second': 10.303, 'epoch': 4.29}


 44%|████▎     | 680/1560 [33:58<15:59,  1.09s/it]  

{'loss': 0.5449, 'grad_norm': 2.0158884525299072, 'learning_rate': 0.0006000128468880223, 'epoch': 4.35}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.93it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.08it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.96it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.79it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.72it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.65it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.65it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.66it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.63it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.30it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.31it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.41it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.52it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.52it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.45it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.35it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.36it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5697001218795776, 'eval_acc': 0.8316, 'eval_acc2': 0.9214, 'eval_f1': 0.8305165216562987, 'eval_roc_auc_micro': 0.9978009706565656, 'eval_precision': 0.8316, 'eval_recall': 0.8316, 'eval_runtime': 17.6333, 'eval_samples_per_second': 567.11, 'eval_steps_per_second': 8.904, 'epoch': 4.35}


 44%|████▍     | 690/1560 [34:26<16:28,  1.14s/it]  

{'loss': 0.5639, 'grad_norm': 2.870403528213501, 'learning_rate': 0.000590127518906953, 'epoch': 4.42}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.92it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.27it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.99it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.38it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.13it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.97it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.81it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.68it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.24it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.24it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.38it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.45it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.54it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.57it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.31it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.42it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.38it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5695797204971313, 'eval_acc': 0.8264, 'eval_acc2': 0.9177, 'eval_f1': 0.8256478248377179, 'eval_roc_auc_micro': 0.9978861642424244, 'eval_precision': 0.8264, 'eval_recall': 0.8264, 'eval_runtime': 36.6351, 'eval_samples_per_second': 272.963, 'eval_steps_per_second': 4.286, 'epoch': 4.42}


 45%|████▍     | 700/1560 [35:12<19:02,  1.33s/it]  

{'loss': 0.5693, 'grad_norm': 2.1328136920928955, 'learning_rate': 0.0005802056404288802, 'epoch': 4.48}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:25,  6.02it/s][A
  2%|▏         | 3/157 [00:00<00:26,  5.77it/s][A
  3%|▎         | 5/157 [00:00<00:18,  8.23it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.72it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.63it/s][A
  7%|▋         | 11/157 [00:01<00:12, 11.28it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.65it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.96it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.95it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.16it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.95it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.02it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.26it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 29/157 [00:02<00:14,  8.89it/s][A
 20%|█▉        | 31/157 [00:02<00:12,  9.83it/s][A
 21%|██        | 33/157 [00:03<00:11, 10.45it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 11.01it/s][A
 24%|██▎       | 37/157 [

{'eval_loss': 0.5677563548088074, 'eval_acc': 0.8302, 'eval_acc2': 0.9176, 'eval_f1': 0.8293250691571784, 'eval_roc_auc_micro': 0.9979716631313131, 'eval_precision': 0.8302, 'eval_recall': 0.8302, 'eval_runtime': 15.6163, 'eval_samples_per_second': 640.355, 'eval_steps_per_second': 10.054, 'epoch': 4.48}


 46%|████▌     | 710/1560 [35:37<15:41,  1.11s/it]  

{'loss': 0.5209, 'grad_norm': 2.63433837890625, 'learning_rate': 0.0005702512351925465, 'epoch': 4.54}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.47it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.06it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.92it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.60it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.91it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.94it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.00it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.70it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.85it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.07it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.92it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.06it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.11it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.21it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.34it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.28it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.22it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5450806021690369, 'eval_acc': 0.8372, 'eval_acc2': 0.9201, 'eval_f1': 0.8373167499485222, 'eval_roc_auc_micro': 0.9981149496464647, 'eval_precision': 0.8372, 'eval_recall': 0.8372, 'eval_runtime': 18.4592, 'eval_samples_per_second': 541.737, 'eval_steps_per_second': 8.505, 'epoch': 4.54}


 46%|████▌     | 720/1560 [36:06<16:13,  1.16s/it]  

{'loss': 0.512, 'grad_norm': 2.7445614337921143, 'learning_rate': 0.0005602683401276614, 'epoch': 4.61}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.64it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.48it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.71it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.82it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.65it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.54it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.44it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.52it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.73it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.99it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.96it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.99it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.87it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.83it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.99it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.05it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.00it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.567557156085968, 'eval_acc': 0.8319, 'eval_acc2': 0.9194, 'eval_f1': 0.831619524598768, 'eval_roc_auc_micro': 0.9978305029797979, 'eval_precision': 0.8319, 'eval_recall': 0.8319, 'eval_runtime': 18.7219, 'eval_samples_per_second': 534.134, 'eval_steps_per_second': 8.386, 'epoch': 4.61}


 47%|████▋     | 730/1560 [36:34<16:01,  1.16s/it]  

{'loss': 0.5286, 'grad_norm': 2.875196933746338, 'learning_rate': 0.0005502610037177585, 'epoch': 4.67}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.28it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.75it/s][A
  4%|▍         | 7/157 [00:00<00:12, 11.82it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.65it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.68it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.59it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.50it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.68it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.70it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.86it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.77it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.87it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.00it/s][A
 18%|█▊        | 29/157 [00:02<00:14,  8.58it/s][A
 20%|█▉        | 31/157 [00:02<00:13,  9.49it/s][A
 21%|██        | 33/157 [00:02<00:12, 10.13it/s][A
 22%|██▏       | 35/157 [00:03<00:11, 10.53it/s][A
 24%|██▎       | 37/157 [00:03<00:11, 10.84it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5507157444953918, 'eval_acc': 0.8355, 'eval_acc2': 0.924, 'eval_f1': 0.8352628293165805, 'eval_roc_auc_micro': 0.9979903366666667, 'eval_precision': 0.8355, 'eval_recall': 0.8355, 'eval_runtime': 15.8403, 'eval_samples_per_second': 631.3, 'eval_steps_per_second': 9.911, 'epoch': 4.67}


 47%|████▋     | 740/1560 [37:00<15:21,  1.12s/it]  

{'loss': 0.4968, 'grad_norm': 2.5445730686187744, 'learning_rate': 0.000540233284358363, 'epoch': 4.74}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.38it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.32it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.50it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.84it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.76it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.85it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.64it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.76it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.87it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.95it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.97it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.91it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.92it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.99it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.96it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.11it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.04it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.07it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5476306676864624, 'eval_acc': 0.8354, 'eval_acc2': 0.9222, 'eval_f1': 0.8352024425287137, 'eval_roc_auc_micro': 0.9979793039393939, 'eval_precision': 0.8354, 'eval_recall': 0.8354, 'eval_runtime': 38.3954, 'eval_samples_per_second': 260.448, 'eval_steps_per_second': 4.089, 'epoch': 4.74}


 48%|████▊     | 750/1560 [37:48<18:37,  1.38s/it]  

{'loss': 0.5207, 'grad_norm': 2.511228084564209, 'learning_rate': 0.0005301892487111431, 'epoch': 4.8}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.71it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.08it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.07it/s][A
  6%|▌         | 9/157 [00:00<00:10, 13.53it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.15it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.92it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.89it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.82it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.74it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.69it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.72it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.71it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.75it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.65it/s][A
 20%|█▉        | 31/157 [00:02<00:09, 12.70it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.60it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.65it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.37it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.563530445098877, 'eval_acc': 0.8316, 'eval_acc2': 0.9203, 'eval_f1': 0.8309200015073329, 'eval_roc_auc_micro': 0.9979549292929293, 'eval_precision': 0.8316, 'eval_recall': 0.8316, 'eval_runtime': 15.7957, 'eval_samples_per_second': 633.083, 'eval_steps_per_second': 9.939, 'epoch': 4.8}


 49%|████▊     | 760/1560 [38:13<14:32,  1.09s/it]  

{'loss': 0.5076, 'grad_norm': 2.2485456466674805, 'learning_rate': 0.0005201329700547076, 'epoch': 4.86}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.58it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.01it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.95it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.21it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.08it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.09it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.19it/s][A
 11%|█         | 17/157 [00:01<00:16,  8.49it/s][A
 12%|█▏        | 19/157 [00:01<00:14,  9.40it/s][A
 13%|█▎        | 21/157 [00:01<00:13,  9.90it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.47it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.09it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.06it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.57it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.48it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.78it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.99it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5345399975776672, 'eval_acc': 0.8404, 'eval_acc2': 0.9268, 'eval_f1': 0.8398445097677956, 'eval_roc_auc_micro': 0.9980751304545453, 'eval_precision': 0.8404, 'eval_recall': 0.8404, 'eval_runtime': 37.8803, 'eval_samples_per_second': 263.989, 'eval_steps_per_second': 4.145, 'epoch': 4.86}


 49%|████▉     | 770/1560 [39:01<17:43,  1.35s/it]  

{'loss': 0.5253, 'grad_norm': 4.211185932159424, 'learning_rate': 0.0005100685266327202, 'epoch': 4.93}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.84it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.93it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.92it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.36it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.06it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.94it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.44it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.43it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.39it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.47it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.55it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.57it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.63it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.64it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.58it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.48it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.59it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.54it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5498534440994263, 'eval_acc': 0.8425, 'eval_acc2': 0.9202, 'eval_f1': 0.8425459166264411, 'eval_roc_auc_micro': 0.9978548995959595, 'eval_precision': 0.8425, 'eval_recall': 0.8425, 'eval_runtime': 15.4555, 'eval_samples_per_second': 647.018, 'eval_steps_per_second': 10.158, 'epoch': 4.93}


 50%|█████     | 780/1560 [39:25<14:01,  1.08s/it]  

{'loss': 0.5152, 'grad_norm': 3.064265012741089, 'learning_rate': 0.0005, 'epoch': 4.99}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:11, 13.40it/s][A
  3%|▎         | 4/157 [00:00<00:12, 12.06it/s][A
  4%|▍         | 6/157 [00:00<00:12, 11.75it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.80it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.09it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.32it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.46it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.97it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.81it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.07it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.25it/s][A
 15%|█▌        | 24/157 [00:01<00:10, 12.33it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.42it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.42it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.44it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.49it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.55it/s][A
 23%|██▎       | 36/157 [00:02<00:09, 12.66it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5314702391624451, 'eval_acc': 0.8426, 'eval_acc2': 0.9263, 'eval_f1': 0.8425306690451718, 'eval_roc_auc_micro': 0.9980645604040403, 'eval_precision': 0.8426, 'eval_recall': 0.8426, 'eval_runtime': 17.716, 'eval_samples_per_second': 564.463, 'eval_steps_per_second': 8.862, 'epoch': 4.99}


 51%|█████     | 790/1560 [39:55<14:52,  1.16s/it]  

{'loss': 0.3996, 'grad_norm': 2.7048275470733643, 'learning_rate': 0.0004899314733672799, 'epoch': 5.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.39it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.05it/s][A
  4%|▍         | 7/157 [00:00<00:19,  7.81it/s][A
  6%|▌         | 9/157 [00:00<00:16,  9.04it/s][A
  7%|▋         | 11/157 [00:01<00:15,  9.73it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.55it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.12it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.43it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.76it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.03it/s][A
 15%|█▍        | 23/157 [00:02<00:10, 12.22it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.19it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.15it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.29it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.89it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.54it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.54it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5287039279937744, 'eval_acc': 0.8453, 'eval_acc2': 0.9291, 'eval_f1': 0.8448403948312895, 'eval_roc_auc_micro': 0.9981228747474747, 'eval_precision': 0.8453, 'eval_recall': 0.8453, 'eval_runtime': 34.7764, 'eval_samples_per_second': 287.551, 'eval_steps_per_second': 4.515, 'epoch': 5.06}


 51%|█████▏    | 800/1560 [40:40<16:38,  1.31s/it]  

{'loss': 0.3879, 'grad_norm': 3.0897889137268066, 'learning_rate': 0.0004798670299452926, 'epoch': 5.12}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.85it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.82it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.09it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.66it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.26it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.15it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.15it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.09it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.95it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.03it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.06it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.02it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.23it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.26it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.03it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.07it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.11it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.15it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5213403105735779, 'eval_acc': 0.8482, 'eval_acc2': 0.9258, 'eval_f1': 0.8486284913270737, 'eval_roc_auc_micro': 0.9981573134343434, 'eval_precision': 0.8482, 'eval_recall': 0.8482, 'eval_runtime': 16.003, 'eval_samples_per_second': 624.882, 'eval_steps_per_second': 9.811, 'epoch': 5.12}


 52%|█████▏    | 810/1560 [41:05<14:04,  1.13s/it]  

{'loss': 0.3925, 'grad_norm': 2.9148662090301514, 'learning_rate': 0.0004698107512888569, 'epoch': 5.18}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:28,  5.41it/s][A
  3%|▎         | 4/157 [00:00<00:19,  8.01it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.42it/s][A
  5%|▌         | 8/157 [00:00<00:14,  9.97it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.53it/s][A
  8%|▊         | 12/157 [00:01<00:14, 10.33it/s][A
  9%|▉         | 14/157 [00:01<00:13, 10.51it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.77it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.02it/s][A
 13%|█▎        | 20/157 [00:01<00:12, 11.03it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.28it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.49it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.50it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.57it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.67it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.69it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.71it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.82it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5262879133224487, 'eval_acc': 0.8433, 'eval_acc2': 0.9284, 'eval_f1': 0.8428482226145793, 'eval_roc_auc_micro': 0.9980218867676768, 'eval_precision': 0.8433, 'eval_recall': 0.8433, 'eval_runtime': 51.8391, 'eval_samples_per_second': 192.905, 'eval_steps_per_second': 3.029, 'epoch': 5.18}


 53%|█████▎    | 820/1560 [42:07<20:12,  1.64s/it]  

{'loss': 0.4257, 'grad_norm': 2.0269131660461426, 'learning_rate': 0.00045976671564163706, 'epoch': 5.25}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 19.17it/s][A
  3%|▎         | 4/157 [00:00<00:10, 14.08it/s][A
  4%|▍         | 6/157 [00:00<00:11, 12.63it/s][A
  5%|▌         | 8/157 [00:00<00:12, 12.30it/s][A
  6%|▋         | 10/157 [00:00<00:12, 12.03it/s][A
  8%|▊         | 12/157 [00:00<00:12, 11.95it/s][A
  9%|▉         | 14/157 [00:01<00:11, 11.98it/s][A
 10%|█         | 16/157 [00:01<00:11, 11.79it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.82it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.62it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.49it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.53it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.57it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 11.79it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.84it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.77it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 11.74it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.31it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5171045660972595, 'eval_acc': 0.8483, 'eval_acc2': 0.93, 'eval_f1': 0.8489367081133113, 'eval_roc_auc_micro': 0.9981393255050505, 'eval_precision': 0.8483, 'eval_recall': 0.8483, 'eval_runtime': 18.3292, 'eval_samples_per_second': 545.578, 'eval_steps_per_second': 8.566, 'epoch': 5.25}


 53%|█████▎    | 830/1560 [42:47<29:59,  2.47s/it]  

{'loss': 0.3887, 'grad_norm': 2.429656744003296, 'learning_rate': 0.00044973899628224153, 'epoch': 5.31}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.72it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.16it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.04it/s][A
  6%|▌         | 9/157 [00:00<00:14, 10.28it/s][A
  7%|▋         | 11/157 [00:01<00:20,  7.30it/s][A
  8%|▊         | 12/157 [00:01<00:20,  6.98it/s][A
  8%|▊         | 13/157 [00:01<00:21,  6.80it/s][A
  9%|▉         | 14/157 [00:01<00:21,  6.60it/s][A
 10%|▉         | 15/157 [00:01<00:22,  6.44it/s][A
 10%|█         | 16/157 [00:02<00:22,  6.29it/s][A
 11%|█         | 17/157 [00:02<00:22,  6.23it/s][A
 11%|█▏        | 18/157 [00:02<00:22,  6.16it/s][A
 12%|█▏        | 19/157 [00:02<00:22,  6.12it/s][A
 13%|█▎        | 20/157 [00:02<00:22,  6.14it/s][A
 13%|█▎        | 21/157 [00:02<00:22,  6.14it/s][A
 14%|█▍        | 22/157 [00:03<00:22,  6.13it/s][A
 15%|█▍        | 23/157 [00:03<00:21,  6.25it/s][A
 15%|█▌        | 24/157 [00:03<00:20,  6.44it/s][A
 16%|█▌        | 25/157 

{'eval_loss': 0.49777886271476746, 'eval_acc': 0.8511, 'eval_acc2': 0.9321, 'eval_f1': 0.8511141869388672, 'eval_roc_auc_micro': 0.9982640913131313, 'eval_precision': 0.8511, 'eval_recall': 0.8511, 'eval_runtime': 19.9623, 'eval_samples_per_second': 500.943, 'eval_steps_per_second': 7.865, 'epoch': 5.31}


 54%|█████▍    | 840/1560 [43:19<17:25,  1.45s/it]  

{'loss': 0.36, 'grad_norm': 2.4493887424468994, 'learning_rate': 0.00043973165987233853, 'epoch': 5.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.55it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.06it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.04it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.47it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.30it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.13it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.74it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.88it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.92it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.61it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.71it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.79it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.79it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.75it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.97it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.01it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.90it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.83it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5270001888275146, 'eval_acc': 0.8419, 'eval_acc2': 0.9276, 'eval_f1': 0.8420683226201914, 'eval_roc_auc_micro': 0.998112457020202, 'eval_precision': 0.8419, 'eval_recall': 0.8419, 'eval_runtime': 16.619, 'eval_samples_per_second': 601.72, 'eval_steps_per_second': 9.447, 'epoch': 5.38}


 54%|█████▍    | 850/1560 [43:45<13:35,  1.15s/it]  

{'loss': 0.4204, 'grad_norm': 2.46540904045105, 'learning_rate': 0.0004297487648074538, 'epoch': 5.44}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 18.53it/s][A
  3%|▎         | 4/157 [00:00<00:23,  6.49it/s][A
  4%|▍         | 6/157 [00:00<00:18,  8.31it/s][A
  5%|▌         | 8/157 [00:00<00:16,  9.15it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.08it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.60it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.01it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.36it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.55it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.77it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.80it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.99it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.07it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.03it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.09it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.15it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 12.24it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.17it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5197774767875671, 'eval_acc': 0.8486, 'eval_acc2': 0.9263, 'eval_f1': 0.8487365480719126, 'eval_roc_auc_micro': 0.9981909083838383, 'eval_precision': 0.8486, 'eval_recall': 0.8486, 'eval_runtime': 16.2347, 'eval_samples_per_second': 615.963, 'eval_steps_per_second': 9.671, 'epoch': 5.44}


 55%|█████▌    | 860/1560 [44:11<13:05,  1.12s/it]  

{'loss': 0.396, 'grad_norm': 2.116103172302246, 'learning_rate': 0.0004197943595711198, 'epoch': 5.5}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.49it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.93it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.33it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.05it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.86it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.76it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.66it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.59it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.49it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.53it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.57it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.55it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.59it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.55it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.57it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.56it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.58it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5005635023117065, 'eval_acc': 0.8521, 'eval_acc2': 0.9315, 'eval_f1': 0.8513532278729606, 'eval_roc_auc_micro': 0.9983022472727272, 'eval_precision': 0.8521, 'eval_recall': 0.8521, 'eval_runtime': 15.5557, 'eval_samples_per_second': 642.851, 'eval_steps_per_second': 10.093, 'epoch': 5.5}


 56%|█████▌    | 870/1560 [44:36<12:30,  1.09s/it]  

{'loss': 0.381, 'grad_norm': 2.981677532196045, 'learning_rate': 0.00040987248109304716, 'epoch': 5.57}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:13, 11.13it/s][A
  3%|▎         | 4/157 [00:00<00:14, 10.52it/s][A
  4%|▍         | 6/157 [00:00<00:14, 10.40it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.11it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.55it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.72it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.26it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.24it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.46it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.72it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.44it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.70it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.88it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.00it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.05it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.94it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.06it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.18it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.5090546011924744, 'eval_acc': 0.8467, 'eval_acc2': 0.9319, 'eval_f1': 0.846846600786204, 'eval_roc_auc_micro': 0.9981752469191919, 'eval_precision': 0.8467, 'eval_recall': 0.8467, 'eval_runtime': 84.749, 'eval_samples_per_second': 117.995, 'eval_steps_per_second': 1.853, 'epoch': 5.57}


 56%|█████▋    | 880/1560 [46:10<21:33,  1.90s/it]  

{'loss': 0.4011, 'grad_norm': 2.490016222000122, 'learning_rate': 0.0003999871531119779, 'epoch': 5.63}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.56it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.75it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.81it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.27it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.86it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.54it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.46it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.51it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.05it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.20it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.14it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.16it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.32it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.30it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.42it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.39it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.29it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.23it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5157449841499329, 'eval_acc': 0.8464, 'eval_acc2': 0.9321, 'eval_f1': 0.8459429843361302, 'eval_roc_auc_micro': 0.9981379982323233, 'eval_precision': 0.8464, 'eval_recall': 0.8464, 'eval_runtime': 20.1798, 'eval_samples_per_second': 495.546, 'eval_steps_per_second': 7.78, 'epoch': 5.63}


 57%|█████▋    | 890/1560 [46:40<12:50,  1.15s/it]  

{'loss': 0.3329, 'grad_norm': 3.129396677017212, 'learning_rate': 0.0003901423845438916, 'epoch': 5.7}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.49it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.92it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.77it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.32it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.99it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.90it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.83it/s][A
 11%|█         | 17/157 [00:01<00:10, 12.83it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.64it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.45it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.39it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.48it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.48it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.48it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.49it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.52it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.49it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5255072712898254, 'eval_acc': 0.8424, 'eval_acc2': 0.9326, 'eval_f1': 0.8422046365582168, 'eval_roc_auc_micro': 0.9981175335353536, 'eval_precision': 0.8424, 'eval_recall': 0.8424, 'eval_runtime': 15.9548, 'eval_samples_per_second': 626.77, 'eval_steps_per_second': 9.84, 'epoch': 5.7}


 58%|█████▊    | 900/1560 [47:05<11:58,  1.09s/it]  

{'loss': 0.4084, 'grad_norm': 3.770444869995117, 'learning_rate': 0.00038034216785622126, 'epoch': 5.76}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.55it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.41it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.80it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.17it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.74it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.00it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.18it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.32it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.42it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.46it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.35it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.21it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.22it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.33it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.00it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.00it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.14it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.04it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5428040623664856, 'eval_acc': 0.8429, 'eval_acc2': 0.9277, 'eval_f1': 0.8422487567462642, 'eval_roc_auc_micro': 0.998004671111111, 'eval_precision': 0.8429, 'eval_recall': 0.8429, 'eval_runtime': 33.0297, 'eval_samples_per_second': 302.758, 'eval_steps_per_second': 4.753, 'epoch': 5.76}


 58%|█████▊    | 910/1560 [47:48<14:01,  1.29s/it]  

{'loss': 0.3923, 'grad_norm': 2.874756097793579, 'learning_rate': 0.0003705904774487396, 'epoch': 5.82}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:23,  6.56it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.83it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.03it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.10it/s][A
  5%|▌         | 8/157 [00:00<00:15,  9.79it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.27it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.87it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.32it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.55it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.61it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.76it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.85it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 12.05it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.11it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.19it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.27it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.20it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 12.19it/s][A
 23%|██▎       | 36/157 [

{'eval_loss': 0.5275641083717346, 'eval_acc': 0.8467, 'eval_acc2': 0.9257, 'eval_f1': 0.8460340890067629, 'eval_roc_auc_micro': 0.9981100474747474, 'eval_precision': 0.8467, 'eval_recall': 0.8467, 'eval_runtime': 16.2649, 'eval_samples_per_second': 614.82, 'eval_steps_per_second': 9.653, 'epoch': 5.82}


 59%|█████▉    | 920/1560 [48:15<13:36,  1.28s/it]  

{'loss': 0.4197, 'grad_norm': 2.221710681915283, 'learning_rate': 0.0003608912680417737, 'epoch': 5.89}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:13, 11.68it/s][A
  3%|▎         | 4/157 [00:00<00:13, 11.53it/s][A
  4%|▍         | 6/157 [00:00<00:13, 11.61it/s][A
  5%|▌         | 8/157 [00:00<00:12, 11.55it/s][A
  6%|▋         | 10/157 [00:01<00:21,  6.69it/s][A
  7%|▋         | 11/157 [00:01<00:24,  5.94it/s][A
  8%|▊         | 12/157 [00:01<00:25,  5.72it/s][A
  8%|▊         | 13/157 [00:01<00:25,  5.74it/s][A
  9%|▉         | 14/157 [00:01<00:24,  5.87it/s][A
 10%|▉         | 15/157 [00:02<00:23,  6.13it/s][A
 10%|█         | 16/157 [00:02<00:21,  6.45it/s][A
 11%|█         | 17/157 [00:02<00:20,  6.76it/s][A
 11%|█▏        | 18/157 [00:02<00:19,  7.14it/s][A
 12%|█▏        | 19/157 [00:02<00:18,  7.54it/s][A
 13%|█▎        | 20/157 [00:02<00:17,  7.94it/s][A
 13%|█▎        | 21/157 [00:02<00:16,  8.27it/s][A
 14%|█▍        | 22/157 [00:02<00:15,  8.64it/s][A
 15%|█▍        | 23/157 [00:03<00:15,  8.80it/s][A
 15%|█▌        | 24/157 

{'eval_loss': 0.5174486041069031, 'eval_acc': 0.847, 'eval_acc2': 0.9296, 'eval_f1': 0.8465039684087836, 'eval_roc_auc_micro': 0.9981495474242423, 'eval_precision': 0.847, 'eval_recall': 0.847, 'eval_runtime': 29.0844, 'eval_samples_per_second': 343.827, 'eval_steps_per_second': 5.398, 'epoch': 5.89}


 60%|█████▉    | 930/1560 [48:56<16:36,  1.58s/it]  

{'loss': 0.3765, 'grad_norm': 2.134141206741333, 'learning_rate': 0.0003512484730723986, 'epoch': 5.95}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.28it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.13it/s][A
  4%|▍         | 7/157 [00:00<00:22,  6.53it/s][A
  5%|▌         | 8/157 [00:01<00:26,  5.68it/s][A
  6%|▌         | 9/157 [00:01<00:27,  5.39it/s][A
  6%|▋         | 10/157 [00:01<00:26,  5.45it/s][A
  7%|▋         | 11/157 [00:01<00:26,  5.59it/s][A
  8%|▊         | 12/157 [00:01<00:24,  5.81it/s][A
  8%|▊         | 13/157 [00:01<00:23,  6.17it/s][A
  9%|▉         | 14/157 [00:02<00:21,  6.56it/s][A
 10%|▉         | 15/157 [00:02<00:20,  6.90it/s][A
 10%|█         | 16/157 [00:02<00:19,  7.18it/s][A
 11%|█         | 17/157 [00:02<00:19,  7.34it/s][A
 11%|█▏        | 18/157 [00:02<00:18,  7.50it/s][A
 12%|█▏        | 19/157 [00:02<00:18,  7.63it/s][A
 13%|█▎        | 20/157 [00:02<00:17,  7.65it/s][A
 13%|█▎        | 21/157 [00:03<00:17,  7.75it/s][A
 14%|█▍        | 22/157 [00:03<00:17,  7.86it/s][A
 15%|█▍        | 23/157 [

{'eval_loss': 0.512933611869812, 'eval_acc': 0.8474, 'eval_acc2': 0.9282, 'eval_f1': 0.8472916780921561, 'eval_roc_auc_micro': 0.9982190254545453, 'eval_precision': 0.8474, 'eval_recall': 0.8474, 'eval_runtime': 32.6361, 'eval_samples_per_second': 306.409, 'eval_steps_per_second': 4.811, 'epoch': 5.95}


 60%|██████    | 940/1560 [49:59<36:55,  3.57s/it]  

{'loss': 0.3909, 'grad_norm': 1.6842321157455444, 'learning_rate': 0.00034166600309926387, 'epoch': 6.02}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.57it/s][A
  3%|▎         | 5/157 [00:00<00:23,  6.36it/s][A
  4%|▍         | 6/157 [00:01<00:37,  4.06it/s][A
  4%|▍         | 7/157 [00:01<00:39,  3.83it/s][A
  5%|▌         | 8/157 [00:01<00:40,  3.72it/s][A
  6%|▌         | 9/157 [00:02<00:40,  3.64it/s][A
  6%|▋         | 10/157 [00:02<00:40,  3.62it/s][A
  7%|▋         | 11/157 [00:02<00:40,  3.58it/s][A
  8%|▊         | 12/157 [00:02<00:39,  3.65it/s][A
  8%|▊         | 13/157 [00:03<00:37,  3.86it/s][A
  9%|▉         | 14/157 [00:03<00:35,  4.00it/s][A
 10%|▉         | 15/157 [00:03<00:34,  4.09it/s][A
 10%|█         | 16/157 [00:03<00:33,  4.17it/s][A
 11%|█         | 17/157 [00:04<00:33,  4.22it/s][A
 11%|█▏        | 18/157 [00:04<00:31,  4.45it/s][A
 12%|█▏        | 19/157 [00:04<00:28,  4.83it/s][A
 13%|█▎        | 20/157 [00:04<00:26,  5.13it/s][A
 13%|█▎        | 21/157 [00:04<00:25,  5.35it/s][A
 14%|█▍        | 22/157 [0

{'eval_loss': 0.5126056671142578, 'eval_acc': 0.8506, 'eval_acc2': 0.9326, 'eval_f1': 0.8509769661997565, 'eval_roc_auc_micro': 0.9982507793939395, 'eval_precision': 0.8506, 'eval_recall': 0.8506, 'eval_runtime': 33.2009, 'eval_samples_per_second': 301.197, 'eval_steps_per_second': 4.729, 'epoch': 6.02}


 61%|██████    | 950/1560 [50:42<14:23,  1.42s/it]  

{'loss': 0.3078, 'grad_norm': 2.563314437866211, 'learning_rate': 0.00033214774421669774, 'epoch': 6.08}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.88it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.36it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.85it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.64it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.37it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.03it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.82it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.81it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.86it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.84it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.78it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.94it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.94it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.94it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.08it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.95it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.01it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.99it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5126093029975891, 'eval_acc': 0.8521, 'eval_acc2': 0.932, 'eval_f1': 0.8519603223879, 'eval_roc_auc_micro': 0.9981815007070707, 'eval_precision': 0.8521, 'eval_recall': 0.8521, 'eval_runtime': 16.109, 'eval_samples_per_second': 620.772, 'eval_steps_per_second': 9.746, 'epoch': 6.08}


 62%|██████▏   | 960/1560 [51:08<11:25,  1.14s/it]  

{'loss': 0.3298, 'grad_norm': 2.279709815979004, 'learning_rate': 0.00032269755647873217, 'epoch': 6.14}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:22,  6.96it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.88it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.98it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.45it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.97it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.24it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.55it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.64it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.56it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.63it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.67it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.83it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 11.97it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 11.95it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.00it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.05it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 11.96it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 12.08it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.502120316028595, 'eval_acc': 0.8554, 'eval_acc2': 0.9336, 'eval_f1': 0.8554032178973435, 'eval_roc_auc_micro': 0.998287362070707, 'eval_precision': 0.8554, 'eval_recall': 0.8554, 'eval_runtime': 15.6252, 'eval_samples_per_second': 639.994, 'eval_steps_per_second': 10.048, 'epoch': 6.14}


 62%|██████▏   | 970/1560 [51:34<11:04,  1.13s/it]

{'loss': 0.3234, 'grad_norm': 2.228736639022827, 'learning_rate': 0.0003133192723336895, 'epoch': 6.21}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:21,  7.28it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.17it/s][A
  4%|▍         | 7/157 [00:00<00:14, 10.12it/s][A
  6%|▌         | 9/157 [00:00<00:13, 10.75it/s][A
  7%|▋         | 11/157 [00:01<00:13, 11.23it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.50it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.69it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.81it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.90it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.01it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.07it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.19it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.19it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.23it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.87it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.02it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.06it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.99it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5024470686912537, 'eval_acc': 0.8522, 'eval_acc2': 0.9332, 'eval_f1': 0.8525859479532217, 'eval_roc_auc_micro': 0.9982311155050505, 'eval_precision': 0.8522, 'eval_recall': 0.8522, 'eval_runtime': 15.9091, 'eval_samples_per_second': 628.573, 'eval_steps_per_second': 9.869, 'epoch': 6.21}


 63%|██████▎   | 980/1560 [51:59<10:42,  1.11s/it]

{'loss': 0.2851, 'grad_norm': 3.0332067012786865, 'learning_rate': 0.0003040166950699625, 'epoch': 6.27}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.81it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.97it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.93it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.34it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.07it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.60it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.48it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.49it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.39it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.38it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.07it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.11it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.31it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.26it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.37it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.33it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.23it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.19it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5096402168273926, 'eval_acc': 0.8517, 'eval_acc2': 0.9319, 'eval_f1': 0.8514731728432007, 'eval_roc_auc_micro': 0.998223384040404, 'eval_precision': 0.8517, 'eval_recall': 0.8517, 'eval_runtime': 15.4619, 'eval_samples_per_second': 646.751, 'eval_steps_per_second': 10.154, 'epoch': 6.27}


 63%|██████▎   | 990/1560 [52:25<10:23,  1.09s/it]

{'loss': 0.3087, 'grad_norm': 2.3347787857055664, 'learning_rate': 0.0002947935972736217, 'epoch': 6.34}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.79it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.86it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.16it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.84it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.60it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.51it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.43it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.43it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.43it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.36it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.43it/s][A
 16%|█▌        | 25/157 [00:01<00:11, 11.86it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.88it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.03it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.16it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.21it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.22it/s][A
 24%|██▎       | 37/157 [00:02<00:10, 11.73it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5100955367088318, 'eval_acc': 0.8514, 'eval_acc2': 0.933, 'eval_f1': 0.8508144784926098, 'eval_roc_auc_micro': 0.998189334949495, 'eval_precision': 0.8514, 'eval_recall': 0.8514, 'eval_runtime': 15.5539, 'eval_samples_per_second': 642.926, 'eval_steps_per_second': 10.094, 'epoch': 6.34}


 64%|██████▍   | 1000/1560 [52:50<10:39,  1.14s/it]

{'loss': 0.2816, 'grad_norm': 2.1906867027282715, 'learning_rate': 0.00028565371929847286, 'epoch': 6.4}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.68it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.18it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.38it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.92it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.49it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.18it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.49it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.40it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.58it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.68it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.86it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.59it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.85it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.04it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.19it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.27it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.28it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.23it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5173983573913574, 'eval_acc': 0.8486, 'eval_acc2': 0.9317, 'eval_f1': 0.8489144999555533, 'eval_roc_auc_micro': 0.9982341593939393, 'eval_precision': 0.8486, 'eval_recall': 0.8486, 'eval_runtime': 15.4521, 'eval_samples_per_second': 647.162, 'eval_steps_per_second': 10.16, 'epoch': 6.4}


 65%|██████▍   | 1010/1560 [53:15<09:57,  1.09s/it]

{'loss': 0.295, 'grad_norm': 2.568268299102783, 'learning_rate': 0.0002766007677491871, 'epoch': 6.46}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.61it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.47it/s][A
  4%|▍         | 7/157 [00:00<00:12, 12.41it/s][A
  6%|▌         | 9/157 [00:00<00:12, 12.09it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.20it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.23it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.06it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.87it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.11it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.21it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.20it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.33it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.47it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.55it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.54it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.39it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.36it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.44it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5040785074234009, 'eval_acc': 0.8535, 'eval_acc2': 0.934, 'eval_f1': 0.8528487727259301, 'eval_roc_auc_micro': 0.9983346373737374, 'eval_precision': 0.8535, 'eval_recall': 0.8535, 'eval_runtime': 60.9063, 'eval_samples_per_second': 164.187, 'eval_steps_per_second': 2.578, 'epoch': 6.46}


 65%|██████▌   | 1020/1560 [54:25<14:36,  1.62s/it]  

{'loss': 0.2972, 'grad_norm': 2.3687896728515625, 'learning_rate': 0.00026763841397811573, 'epoch': 6.53}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:07, 19.26it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.70it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.62it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.36it/s][A
  7%|▋         | 11/157 [00:00<00:13, 11.09it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.80it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.19it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.52it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.83it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.06it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.15it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.32it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.41it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.47it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.55it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.44it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.27it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.09it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4967484474182129, 'eval_acc': 0.857, 'eval_acc2': 0.9357, 'eval_f1': 0.8568058028957354, 'eval_roc_auc_micro': 0.9983150874747475, 'eval_precision': 0.857, 'eval_recall': 0.857, 'eval_runtime': 20.2964, 'eval_samples_per_second': 492.698, 'eval_steps_per_second': 7.735, 'epoch': 6.53}


 66%|██████▌   | 1030/1560 [54:55<10:11,  1.15s/it]  

{'loss': 0.2735, 'grad_norm': 2.7859232425689697, 'learning_rate': 0.0002587702925964034, 'epoch': 6.59}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.79it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.88it/s][A
  4%|▍         | 7/157 [00:00<00:20,  7.33it/s][A
  6%|▌         | 9/157 [00:00<00:17,  8.46it/s][A
  7%|▋         | 11/157 [00:01<00:15,  9.43it/s][A
  8%|▊         | 13/157 [00:01<00:13, 10.29it/s][A
 10%|▉         | 15/157 [00:01<00:12, 10.94it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.42it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.72it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.86it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 12.07it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.20it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.34it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.44it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.49it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.32it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.44it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.44it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.5042385458946228, 'eval_acc': 0.8535, 'eval_acc2': 0.9348, 'eval_f1': 0.8536387144023029, 'eval_roc_auc_micro': 0.998218172979798, 'eval_precision': 0.8535, 'eval_recall': 0.8535, 'eval_runtime': 24.1585, 'eval_samples_per_second': 413.933, 'eval_steps_per_second': 6.499, 'epoch': 6.59}


 67%|██████▋   | 1040/1560 [55:29<10:21,  1.20s/it]  

{'loss': 0.2867, 'grad_norm': 1.9333592653274536, 'learning_rate': 0.0002500000000000001, 'epoch': 6.66}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.74it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.77it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.66it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.18it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.93it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.73it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.48it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.85it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.58it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.81it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.80it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.92it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.16it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.29it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.29it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.39it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.43it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.50it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.49199366569519043, 'eval_acc': 0.8557, 'eval_acc2': 0.9316, 'eval_f1': 0.8556245116631462, 'eval_roc_auc_micro': 0.9982990176262625, 'eval_precision': 0.8557, 'eval_recall': 0.8557, 'eval_runtime': 16.5697, 'eval_samples_per_second': 603.51, 'eval_steps_per_second': 9.475, 'epoch': 6.66}


 67%|██████▋   | 1050/1560 [55:56<10:04,  1.19s/it]

{'loss': 0.2807, 'grad_norm': 2.6196815967559814, 'learning_rate': 0.00024133109291117155, 'epoch': 6.72}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:21,  7.18it/s][A
  3%|▎         | 5/157 [00:00<00:17,  8.86it/s][A
  4%|▍         | 7/157 [00:00<00:15,  9.98it/s][A
  6%|▌         | 9/157 [00:01<00:17,  8.36it/s][A
  6%|▋         | 10/157 [00:01<00:18,  8.09it/s][A
  7%|▋         | 11/157 [00:01<00:18,  8.02it/s][A
  8%|▊         | 12/157 [00:01<00:17,  8.07it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.24it/s][A
  9%|▉         | 14/157 [00:01<00:16,  8.48it/s][A
 10%|▉         | 15/157 [00:01<00:16,  8.64it/s][A
 10%|█         | 16/157 [00:01<00:15,  8.91it/s][A
 11%|█         | 17/157 [00:01<00:15,  9.14it/s][A
 12%|█▏        | 19/157 [00:02<00:14,  9.61it/s][A
 13%|█▎        | 20/157 [00:02<00:14,  9.69it/s][A
 14%|█▍        | 22/157 [00:02<00:13, 10.04it/s][A
 15%|█▌        | 24/157 [00:02<00:12, 10.38it/s][A
 17%|█▋        | 26/157 [00:02<00:12, 10.34it/s][A
 18%|█▊        | 28/157 [00:03<00:12, 10.67it/s][A
 19%|█▉        | 30/157 

{'eval_loss': 0.48183515667915344, 'eval_acc': 0.8599, 'eval_acc2': 0.9359, 'eval_f1': 0.8598274845385994, 'eval_roc_auc_micro': 0.9983889222222222, 'eval_precision': 0.8599, 'eval_recall': 0.8599, 'eval_runtime': 19.7762, 'eval_samples_per_second': 505.659, 'eval_steps_per_second': 7.939, 'epoch': 6.72}


 68%|██████▊   | 1060/1560 [56:35<18:36,  2.23s/it]  

{'loss': 0.2757, 'grad_norm': 2.6799674034118652, 'learning_rate': 0.00023276708693609945, 'epoch': 6.78}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.24it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.78it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.54it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.54it/s][A
  7%|▋         | 11/157 [00:01<00:18,  7.91it/s][A
  8%|▊         | 12/157 [00:01<00:20,  7.02it/s][A
  8%|▊         | 13/157 [00:01<00:28,  5.00it/s][A
  9%|▉         | 14/157 [00:01<00:26,  5.31it/s][A
 10%|▉         | 15/157 [00:02<00:25,  5.59it/s][A
 10%|█         | 16/157 [00:02<00:24,  5.83it/s][A
 11%|█         | 17/157 [00:02<00:23,  6.03it/s][A
 11%|█▏        | 18/157 [00:02<00:22,  6.16it/s][A
 12%|█▏        | 19/157 [00:02<00:22,  6.26it/s][A
 13%|█▎        | 20/157 [00:02<00:21,  6.34it/s][A
 13%|█▎        | 21/157 [00:02<00:21,  6.40it/s][A
 14%|█▍        | 22/157 [00:03<00:20,  6.46it/s][A
 15%|█▍        | 23/157 [00:03<00:20,  6.48it/s][A
 15%|█▌        | 24/157 [00:03<00:20,  6.48it/s][A
 16%|█▌        | 25/157 

{'eval_loss': 0.49035531282424927, 'eval_acc': 0.8578, 'eval_acc2': 0.9356, 'eval_f1': 0.8570237623687763, 'eval_roc_auc_micro': 0.9983781063131314, 'eval_precision': 0.8578, 'eval_recall': 0.8578, 'eval_runtime': 30.0881, 'eval_samples_per_second': 332.357, 'eval_steps_per_second': 5.218, 'epoch': 6.78}


 69%|██████▊   | 1070/1560 [57:15<11:00,  1.35s/it]  

{'loss': 0.3174, 'grad_norm': 2.8787286281585693, 'learning_rate': 0.0002243114551391542, 'epoch': 6.85}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.24it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.37it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.79it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.43it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.16it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.81it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.58it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.70it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.77it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.56it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.27it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.40it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.64it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.71it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.76it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.92it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.92it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.79it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48722466826438904, 'eval_acc': 0.8581, 'eval_acc2': 0.9355, 'eval_f1': 0.8577150713753326, 'eval_roc_auc_micro': 0.9983782901010101, 'eval_precision': 0.8581, 'eval_recall': 0.8581, 'eval_runtime': 18.5236, 'eval_samples_per_second': 539.852, 'eval_steps_per_second': 8.476, 'epoch': 6.85}


 69%|██████▉   | 1080/1560 [57:44<09:27,  1.18s/it]

{'loss': 0.2873, 'grad_norm': 2.3752269744873047, 'learning_rate': 0.00021596762663442215, 'epoch': 6.91}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.56it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.69it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.99it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.80it/s][A
  7%|▋         | 11/157 [00:00<00:12, 12.10it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.76it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.48it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.69it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.56it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.61it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.62it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.53it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.55it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.63it/s][A
 20%|█▉        | 31/157 [00:02<00:11, 11.39it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.56it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.80it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.96it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48006942868232727, 'eval_acc': 0.8618, 'eval_acc2': 0.9364, 'eval_f1': 0.8612162749681844, 'eval_roc_auc_micro': 0.9984044192424242, 'eval_precision': 0.8618, 'eval_recall': 0.8618, 'eval_runtime': 16.2753, 'eval_samples_per_second': 614.429, 'eval_steps_per_second': 9.647, 'epoch': 6.91}


 70%|██████▉   | 1090/1560 [58:10<08:43,  1.11s/it]

{'loss': 0.282, 'grad_norm': 5.396607398986816, 'learning_rate': 0.00020773898519505567, 'epoch': 6.98}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.21it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.72it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.16it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.85it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.65it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.31it/s][A
 10%|▉         | 15/157 [00:01<00:15,  9.29it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.99it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.43it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.94it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 11.04it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.33it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.66it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.83it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.88it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.02it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.04it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.99it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.47941356897354126, 'eval_acc': 0.8613, 'eval_acc2': 0.9359, 'eval_f1': 0.8608542766687107, 'eval_roc_auc_micro': 0.9984278024747475, 'eval_precision': 0.8613, 'eval_recall': 0.8613, 'eval_runtime': 15.8273, 'eval_samples_per_second': 631.818, 'eval_steps_per_second': 9.92, 'epoch': 6.98}


 71%|███████   | 1100/1560 [58:38<09:05,  1.19s/it]

{'loss': 0.2416, 'grad_norm': 1.756449580192566, 'learning_rate': 0.00019962886788101047, 'epoch': 7.04}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:10, 15.01it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.44it/s][A
  4%|▍         | 6/157 [00:00<00:11, 12.73it/s][A
  5%|▌         | 8/157 [00:00<00:11, 12.54it/s][A
  6%|▋         | 10/157 [00:00<00:11, 12.37it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.41it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.31it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.27it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.29it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.20it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.25it/s][A
 15%|█▌        | 24/157 [00:01<00:11, 11.83it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.04it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.06it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.10it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.17it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.11it/s][A
 23%|██▎       | 36/157 [00:02<00:10, 12.03it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.47834718227386475, 'eval_acc': 0.8611, 'eval_acc2': 0.9359, 'eval_f1': 0.8609827647321734, 'eval_roc_auc_micro': 0.9984482863636364, 'eval_precision': 0.8611, 'eval_recall': 0.8611, 'eval_runtime': 15.6742, 'eval_samples_per_second': 637.991, 'eval_steps_per_second': 10.016, 'epoch': 7.04}


 71%|███████   | 1110/1560 [59:03<08:09,  1.09s/it]

{'loss': 0.2233, 'grad_norm': 1.979851484298706, 'learning_rate': 0.00019164056368572847, 'epoch': 7.1}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.26it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.76it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.01it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.58it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.44it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.47it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.38it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.39it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.29it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.23it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.21it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.25it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.29it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.31it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.34it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.43it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.46it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.49it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4969235360622406, 'eval_acc': 0.8578, 'eval_acc2': 0.9349, 'eval_f1': 0.857212876929885, 'eval_roc_auc_micro': 0.9983124316666667, 'eval_precision': 0.8578, 'eval_recall': 0.8578, 'eval_runtime': 16.6555, 'eval_samples_per_second': 600.401, 'eval_steps_per_second': 9.426, 'epoch': 7.1}


 72%|███████▏  | 1120/1560 [59:29<08:01,  1.09s/it]

{'loss': 0.2299, 'grad_norm': 2.3015236854553223, 'learning_rate': 0.0001837773122023114, 'epoch': 7.17}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:26,  5.90it/s][A
  3%|▎         | 4/157 [00:00<00:17,  8.57it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.96it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.45it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.72it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.96it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.42it/s][A
 10%|█         | 16/157 [00:01<00:18,  7.65it/s][A
 11%|█▏        | 18/157 [00:01<00:15,  8.74it/s][A
 13%|█▎        | 20/157 [00:02<00:14,  9.62it/s][A
 14%|█▍        | 22/157 [00:02<00:13, 10.33it/s][A
 15%|█▌        | 24/157 [00:02<00:12, 10.86it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.36it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.69it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.96it/s][A
 20%|██        | 32/157 [00:03<00:10, 12.14it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 12.28it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.32it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4919947385787964, 'eval_acc': 0.8617, 'eval_acc2': 0.9365, 'eval_f1': 0.8611779468683629, 'eval_roc_auc_micro': 0.9982894473232323, 'eval_precision': 0.8617, 'eval_recall': 0.8617, 'eval_runtime': 15.9056, 'eval_samples_per_second': 628.709, 'eval_steps_per_second': 9.871, 'epoch': 7.17}


 72%|███████▏  | 1130/1560 [59:54<07:44,  1.08s/it]

{'loss': 0.237, 'grad_norm': 2.1962380409240723, 'learning_rate': 0.00017604230230973067, 'epoch': 7.23}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 19.10it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.22it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.06it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.44it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.07it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.71it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.63it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.61it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.49it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.27it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.18it/s][A
 16%|█▌        | 25/157 [00:01<00:11, 11.91it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.94it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.08it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.06it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.25it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.25it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.20it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4872792065143585, 'eval_acc': 0.8613, 'eval_acc2': 0.9359, 'eval_f1': 0.8606323067641839, 'eval_roc_auc_micro': 0.9983383095454545, 'eval_precision': 0.8613, 'eval_recall': 0.8613, 'eval_runtime': 15.5114, 'eval_samples_per_second': 644.689, 'eval_steps_per_second': 10.122, 'epoch': 7.23}


 73%|███████▎  | 1140/1560 [1:00:19<07:37,  1.09s/it]

{'loss': 0.2487, 'grad_norm': 2.5465118885040283, 'learning_rate': 0.00016843867087960252, 'epoch': 7.3}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:07, 19.71it/s][A
  3%|▎         | 4/157 [00:00<00:10, 14.96it/s][A
  4%|▍         | 6/157 [00:00<00:10, 13.79it/s][A
  5%|▌         | 8/157 [00:00<00:11, 13.35it/s][A
  6%|▋         | 10/157 [00:00<00:11, 13.01it/s][A
  8%|▊         | 12/157 [00:00<00:11, 12.87it/s][A
  9%|▉         | 14/157 [00:01<00:11, 12.66it/s][A
 10%|█         | 16/157 [00:01<00:11, 12.54it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 12.41it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.37it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.21it/s][A
 15%|█▌        | 24/157 [00:01<00:11, 12.07it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.15it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.13it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.11it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.25it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.33it/s][A
 23%|██▎       | 36/157 [00:02<00:10, 12.07it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.48269131779670715, 'eval_acc': 0.8622, 'eval_acc2': 0.9361, 'eval_f1': 0.8615295432645921, 'eval_roc_auc_micro': 0.9983983017676767, 'eval_precision': 0.8622, 'eval_recall': 0.8622, 'eval_runtime': 15.5212, 'eval_samples_per_second': 644.278, 'eval_steps_per_second': 10.115, 'epoch': 7.3}


 74%|███████▎  | 1150/1560 [1:00:44<07:25,  1.09s/it]

{'loss': 0.2276, 'grad_norm': 1.806491732597351, 'learning_rate': 0.00016096950150405455, 'epoch': 7.36}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.42it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.03it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.97it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.42it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.14it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.87it/s][A
 10%|▉         | 15/157 [00:01<00:16,  8.85it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.82it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.45it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.02it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.51it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.36it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.74it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.06it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.27it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.32it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.27it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.44it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.48460137844085693, 'eval_acc': 0.8574, 'eval_acc2': 0.9354, 'eval_f1': 0.8571336182014188, 'eval_roc_auc_micro': 0.9984466923232325, 'eval_precision': 0.8574, 'eval_recall': 0.8574, 'eval_runtime': 15.4173, 'eval_samples_per_second': 648.62, 'eval_steps_per_second': 10.183, 'epoch': 7.36}


 74%|███████▍  | 1160/1560 [1:01:09<07:10,  1.08s/it]

{'loss': 0.2044, 'grad_norm': 3.0475003719329834, 'learning_rate': 0.00015363782324520031, 'epoch': 7.42}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:08, 19.02it/s][A
  3%|▎         | 4/157 [00:00<00:16,  9.55it/s][A
  4%|▍         | 6/157 [00:00<00:14, 10.52it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.60it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.78it/s][A
  8%|▊         | 12/157 [00:01<00:13, 11.08it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.33it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.64it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.97it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.16it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 12.15it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.31it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.37it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.49it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.52it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.44it/s][A
 22%|██▏       | 34/157 [00:02<00:09, 12.50it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.49it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.48569706082344055, 'eval_acc': 0.8588, 'eval_acc2': 0.9349, 'eval_f1': 0.8583801670343517, 'eval_roc_auc_micro': 0.998440072929293, 'eval_precision': 0.8588, 'eval_recall': 0.8588, 'eval_runtime': 15.1373, 'eval_samples_per_second': 660.619, 'eval_steps_per_second': 10.372, 'epoch': 7.42}


 75%|███████▌  | 1170/1560 [1:01:34<06:58,  1.07s/it]

{'loss': 0.2183, 'grad_norm': 1.9281306266784668, 'learning_rate': 0.00014644660940672628, 'epoch': 7.49}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:07, 19.49it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.38it/s][A
  4%|▍         | 7/157 [00:00<00:10, 14.07it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.40it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.68it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.40it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.27it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.21it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.98it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.81it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.81it/s][A
 16%|█▌        | 25/157 [00:01<00:11, 11.83it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.90it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.05it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.02it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.18it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.29it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.31it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.47560229897499084, 'eval_acc': 0.8611, 'eval_acc2': 0.9381, 'eval_f1': 0.8609331453827028, 'eval_roc_auc_micro': 0.9984643431818182, 'eval_precision': 0.8611, 'eval_recall': 0.8611, 'eval_runtime': 15.3914, 'eval_samples_per_second': 649.714, 'eval_steps_per_second': 10.201, 'epoch': 7.49}


 76%|███████▌  | 1180/1560 [1:01:59<06:51,  1.08s/it]

{'loss': 0.2183, 'grad_norm': 1.7206382751464844, 'learning_rate': 0.0001393987763280928, 'epoch': 7.55}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.93it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.33it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.95it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.39it/s][A
  7%|▋         | 11/157 [00:01<00:16,  8.68it/s][A
  8%|▊         | 13/157 [00:01<00:15,  9.58it/s][A
 10%|▉         | 15/157 [00:01<00:13, 10.29it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.63it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.10it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.36it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.58it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.74it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.69it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.89it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.02it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.99it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.16it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.28it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4765159785747528, 'eval_acc': 0.8645, 'eval_acc2': 0.9385, 'eval_f1': 0.8645793391812134, 'eval_roc_auc_micro': 0.9984645277777778, 'eval_precision': 0.8645, 'eval_recall': 0.8645, 'eval_runtime': 15.3882, 'eval_samples_per_second': 649.849, 'eval_steps_per_second': 10.203, 'epoch': 7.55}


 76%|███████▋  | 1190/1560 [1:02:24<06:43,  1.09s/it]

{'loss': 0.2485, 'grad_norm': 1.6764622926712036, 'learning_rate': 0.00013249718220183582, 'epoch': 7.62}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.64it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.59it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.07it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.70it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.32it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.84it/s][A
 10%|▉         | 15/157 [00:01<00:11, 11.87it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.84it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.84it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.89it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.68it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.82it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.65it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.90it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.18it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.31it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.34it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.43it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4814381003379822, 'eval_acc': 0.8628, 'eval_acc2': 0.9359, 'eval_f1': 0.8629467591836425, 'eval_roc_auc_micro': 0.9984508352020203, 'eval_precision': 0.8628, 'eval_recall': 0.8628, 'eval_runtime': 15.7471, 'eval_samples_per_second': 635.039, 'eval_steps_per_second': 9.97, 'epoch': 7.62}


 77%|███████▋  | 1200/1560 [1:02:49<06:35,  1.10s/it]

{'loss': 0.2202, 'grad_norm': 2.138059377670288, 'learning_rate': 0.0001257446259144494, 'epoch': 7.68}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.71it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.27it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.43it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.89it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.77it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.33it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.30it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.27it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.18it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.28it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.40it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.23it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.03it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.95it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.04it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.11it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.16it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.25it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4769190847873688, 'eval_acc': 0.8641, 'eval_acc2': 0.9373, 'eval_f1': 0.8641851610008882, 'eval_roc_auc_micro': 0.9984803316161617, 'eval_precision': 0.8641, 'eval_recall': 0.8641, 'eval_runtime': 32.1629, 'eval_samples_per_second': 310.918, 'eval_steps_per_second': 4.881, 'epoch': 7.68}


 78%|███████▊  | 1210/1560 [1:03:31<07:32,  1.29s/it]  

{'loss': 0.2236, 'grad_norm': 1.7752282619476318, 'learning_rate': 0.00011914384591132044, 'epoch': 7.74}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.78it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.09it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.55it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.19it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.87it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.49it/s][A
 10%|▉         | 15/157 [00:01<00:18,  7.49it/s][A
 11%|█         | 17/157 [00:01<00:16,  8.46it/s][A
 12%|█▏        | 19/157 [00:01<00:14,  9.28it/s][A
 13%|█▎        | 21/157 [00:01<00:13, 10.00it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.59it/s][A
 16%|█▌        | 25/157 [00:02<00:12, 10.93it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.24it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.60it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.81it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.95it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.04it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.10it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.47225674986839294, 'eval_acc': 0.8642, 'eval_acc2': 0.9386, 'eval_f1': 0.8640179192979622, 'eval_roc_auc_micro': 0.9985102609090909, 'eval_precision': 0.8642, 'eval_recall': 0.8642, 'eval_runtime': 16.0371, 'eval_samples_per_second': 623.553, 'eval_steps_per_second': 9.79, 'epoch': 7.74}


 78%|███████▊  | 1220/1560 [1:03:56<06:10,  1.09s/it]

{'loss': 0.1845, 'grad_norm': 1.9662883281707764, 'learning_rate': 0.00011269751908617276, 'epoch': 7.81}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.75it/s][A
  3%|▎         | 5/157 [00:00<00:09, 15.21it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.67it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.03it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.62it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.17it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.12it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.65it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.62it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.78it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.86it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.85it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.00it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.22it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.30it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.24it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.32it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.39it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4687228500843048, 'eval_acc': 0.8648, 'eval_acc2': 0.9392, 'eval_f1': 0.8646563528076846, 'eval_roc_auc_micro': 0.9984920374747475, 'eval_precision': 0.8648, 'eval_recall': 0.8648, 'eval_runtime': 38.6038, 'eval_samples_per_second': 259.042, 'eval_steps_per_second': 4.067, 'epoch': 7.81}


 79%|███████▉  | 1230/1560 [1:04:45<07:32,  1.37s/it]  

{'loss': 0.1864, 'grad_norm': 2.406740188598633, 'learning_rate': 0.00010640825969547497, 'epoch': 7.87}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:09, 15.50it/s][A
  3%|▎         | 4/157 [00:00<00:11, 13.26it/s][A
  4%|▍         | 6/157 [00:00<00:14, 10.50it/s][A
  5%|▌         | 8/157 [00:00<00:13, 11.09it/s][A
  6%|▋         | 10/157 [00:00<00:12, 11.53it/s][A
  8%|▊         | 12/157 [00:01<00:12, 11.86it/s][A
  9%|▉         | 14/157 [00:01<00:11, 11.97it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.71it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.86it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.90it/s][A
 14%|█▍        | 22/157 [00:01<00:11, 11.28it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.37it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.61it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.66it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.82it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.97it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.06it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.17it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4655468165874481, 'eval_acc': 0.866, 'eval_acc2': 0.9388, 'eval_f1': 0.8660191774044043, 'eval_roc_auc_micro': 0.9985083589898989, 'eval_precision': 0.866, 'eval_recall': 0.866, 'eval_runtime': 15.2526, 'eval_samples_per_second': 655.627, 'eval_steps_per_second': 10.293, 'epoch': 7.87}


 79%|███████▉  | 1240/1560 [1:05:10<06:02,  1.13s/it]

{'loss': 0.1972, 'grad_norm': 1.8027782440185547, 'learning_rate': 0.00010027861829824952, 'epoch': 7.94}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.41it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.53it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.17it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.60it/s][A
  7%|▋         | 11/157 [00:01<00:20,  7.16it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.14it/s][A
 10%|▉         | 15/157 [00:01<00:15,  8.89it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.54it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.14it/s][A
 13%|█▎        | 21/157 [00:02<00:13, 10.41it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.87it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.14it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.34it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.49it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.59it/s][A
 21%|██        | 33/157 [00:03<00:10, 11.66it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.62it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.67it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.46911269426345825, 'eval_acc': 0.8663, 'eval_acc2': 0.939, 'eval_f1': 0.8660622657539842, 'eval_roc_auc_micro': 0.9985108425252526, 'eval_precision': 0.8663, 'eval_recall': 0.8663, 'eval_runtime': 23.6768, 'eval_samples_per_second': 422.354, 'eval_steps_per_second': 6.631, 'epoch': 7.94}


 80%|████████  | 1250/1560 [1:05:55<12:24,  2.40s/it]

{'loss': 0.2229, 'grad_norm': 1.545821189880371, 'learning_rate': 9.431108072171346e-05, 'epoch': 8.0}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:27,  5.63it/s][A
  3%|▎         | 4/157 [00:00<00:19,  7.77it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.21it/s][A
  5%|▌         | 8/157 [00:01<00:21,  7.02it/s][A
  6%|▌         | 9/157 [00:01<00:26,  5.54it/s][A
  6%|▋         | 10/157 [00:01<00:31,  4.71it/s][A
  7%|▋         | 11/157 [00:02<00:34,  4.26it/s][A
  8%|▊         | 12/157 [00:02<00:36,  4.02it/s][A
  8%|▊         | 13/157 [00:02<00:34,  4.14it/s][A
  9%|▉         | 14/157 [00:02<00:32,  4.37it/s][A
 10%|▉         | 15/157 [00:02<00:31,  4.55it/s][A
 10%|█         | 16/157 [00:03<00:30,  4.69it/s][A
 11%|█         | 17/157 [00:03<00:29,  4.79it/s][A
 11%|█▏        | 18/157 [00:03<00:28,  4.89it/s][A
 12%|█▏        | 19/157 [00:03<00:27,  4.98it/s][A
 13%|█▎        | 20/157 [00:03<00:27,  5.00it/s][A
 13%|█▎        | 21/157 [00:04<00:26,  5.05it/s][A
 14%|█▍        | 22/157 [00:04<00:26,  5.06it/s][A
 15%|█▍        | 23/157 [

{'eval_loss': 0.4691942036151886, 'eval_acc': 0.8668, 'eval_acc2': 0.9377, 'eval_f1': 0.866574155168718, 'eval_roc_auc_micro': 0.9985051505050504, 'eval_precision': 0.8668, 'eval_recall': 0.8668, 'eval_runtime': 32.3491, 'eval_samples_per_second': 309.127, 'eval_steps_per_second': 4.853, 'epoch': 8.0}


 81%|████████  | 1260/1560 [1:06:41<07:08,  1.43s/it]  

{'loss': 0.2007, 'grad_norm': 2.1735756397247314, 'learning_rate': 8.850806705317183e-05, 'epoch': 8.06}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.62it/s][A
  3%|▎         | 5/157 [00:00<00:10, 13.88it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.25it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.77it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.40it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.32it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.19it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.25it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.21it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.11it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.09it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 12.00it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.09it/s][A
 18%|█▊        | 29/157 [00:02<00:14,  8.70it/s][A
 20%|█▉        | 31/157 [00:02<00:16,  7.72it/s][A
 20%|██        | 32/157 [00:03<00:16,  7.55it/s][A
 21%|██        | 33/157 [00:03<00:16,  7.55it/s][A
 22%|██▏       | 34/157 [00:03<00:16,  7.60it/s][A
 22%|██▏       | 35/157 

{'eval_loss': 0.4706018567085266, 'eval_acc': 0.865, 'eval_acc2': 0.9369, 'eval_f1': 0.8649260648027337, 'eval_roc_auc_micro': 0.9984799717171717, 'eval_precision': 0.865, 'eval_recall': 0.865, 'eval_runtime': 25.0525, 'eval_samples_per_second': 399.162, 'eval_steps_per_second': 6.267, 'epoch': 8.06}


 81%|████████▏ | 1270/1560 [1:07:16<06:03,  1.25s/it]

{'loss': 0.1931, 'grad_norm': 1.4404230117797852, 'learning_rate': 8.287193065856935e-05, 'epoch': 8.13}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.86it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.21it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.79it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.43it/s][A
  7%|▋         | 11/157 [00:01<00:19,  7.54it/s][A
  8%|▊         | 13/157 [00:01<00:16,  8.61it/s][A
 10%|▉         | 15/157 [00:01<00:14,  9.52it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.26it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 10.76it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 11.19it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.48it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.61it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.71it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.73it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.80it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.91it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.04it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.07it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4663201570510864, 'eval_acc': 0.8655, 'eval_acc2': 0.9393, 'eval_f1': 0.8656577333336625, 'eval_roc_auc_micro': 0.9985106584343435, 'eval_precision': 0.8655, 'eval_recall': 0.8655, 'eval_runtime': 29.2376, 'eval_samples_per_second': 342.025, 'eval_steps_per_second': 5.37, 'epoch': 8.13}


 82%|████████▏ | 1280/1560 [1:07:55<05:53,  1.26s/it]

{'loss': 0.1956, 'grad_norm': 3.618619680404663, 'learning_rate': 7.74049572281027e-05, 'epoch': 8.19}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.71it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.39it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.31it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.85it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.51it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.03it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.03it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.01it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.96it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.96it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.82it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.83it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.79it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.00it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.14it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.14it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.12it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.24it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4630345106124878, 'eval_acc': 0.8675, 'eval_acc2': 0.9405, 'eval_f1': 0.8677557086494649, 'eval_roc_auc_micro': 0.9985508622727273, 'eval_precision': 0.8675, 'eval_recall': 0.8675, 'eval_runtime': 31.3092, 'eval_samples_per_second': 319.395, 'eval_steps_per_second': 5.015, 'epoch': 8.19}


 83%|████████▎ | 1290/1560 [1:08:36<05:49,  1.30s/it]

{'loss': 0.1814, 'grad_norm': 1.8652656078338623, 'learning_rate': 7.21093638492763e-05, 'epoch': 8.26}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.75it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.74it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.45it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.54it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.40it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.17it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.06it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.15it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.13it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.06it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.03it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.01it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.97it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.99it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.16it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.26it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.31it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.31it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.46136099100112915, 'eval_acc': 0.8682, 'eval_acc2': 0.9407, 'eval_f1': 0.8683398275953387, 'eval_roc_auc_micro': 0.9985645954545455, 'eval_precision': 0.8682, 'eval_recall': 0.8682, 'eval_runtime': 15.8047, 'eval_samples_per_second': 632.724, 'eval_steps_per_second': 9.934, 'epoch': 8.26}


 83%|████████▎ | 1300/1560 [1:09:01<04:42,  1.09s/it]

{'loss': 0.186, 'grad_norm': 1.2078152894973755, 'learning_rate': 6.698729810778065e-05, 'epoch': 8.32}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.84it/s][A
  3%|▎         | 5/157 [00:00<00:11, 13.65it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.81it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.73it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.67it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.58it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.67it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.44it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.14it/s][A
 13%|█▎        | 21/157 [00:01<00:17,  7.88it/s][A
 15%|█▍        | 23/157 [00:02<00:15,  8.82it/s][A
 16%|█▌        | 25/157 [00:02<00:13,  9.73it/s][A
 17%|█▋        | 27/157 [00:02<00:12, 10.45it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.05it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.46it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.77it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.89it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.02it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4617263078689575, 'eval_acc': 0.8681, 'eval_acc2': 0.9417, 'eval_f1': 0.8678523291518239, 'eval_roc_auc_micro': 0.9985631947474749, 'eval_precision': 0.8681, 'eval_recall': 0.8681, 'eval_runtime': 18.8342, 'eval_samples_per_second': 530.948, 'eval_steps_per_second': 8.336, 'epoch': 8.32}


 84%|████████▍ | 1310/1560 [1:09:29<04:43,  1.13s/it]

{'loss': 0.1844, 'grad_norm': 1.153135061264038, 'learning_rate': 6.204083721655607e-05, 'epoch': 8.38}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:20,  7.58it/s][A
  2%|▏         | 3/157 [00:00<00:21,  7.29it/s][A
  3%|▎         | 4/157 [00:00<00:22,  6.74it/s][A
  4%|▍         | 6/157 [00:00<00:17,  8.85it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.04it/s][A
  6%|▋         | 10/157 [00:01<00:14, 10.28it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.80it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.15it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.37it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.40it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 11.54it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.61it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.48it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.77it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 11.83it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.95it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.17it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 12.22it/s][A
 23%|██▎       | 36/157 [

{'eval_loss': 0.46239039301872253, 'eval_acc': 0.8695, 'eval_acc2': 0.9415, 'eval_f1': 0.8692007895976332, 'eval_roc_auc_micro': 0.9985544553030303, 'eval_precision': 0.8695, 'eval_recall': 0.8695, 'eval_runtime': 20.8774, 'eval_samples_per_second': 478.987, 'eval_steps_per_second': 7.52, 'epoch': 8.38}


 85%|████████▍ | 1320/1560 [1:10:00<04:43,  1.18s/it]

{'loss': 0.183, 'grad_norm': 1.9538649320602417, 'learning_rate': 5.72719871733951e-05, 'epoch': 8.45}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.36it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.89it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.78it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.22it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.67it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.39it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.40it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.25it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.23it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.25it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 12.16it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.00it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.95it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.03it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.09it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.24it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.32it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.35it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4612191915512085, 'eval_acc': 0.8708, 'eval_acc2': 0.9408, 'eval_f1': 0.8705676851599393, 'eval_roc_auc_micro': 0.9985576443939393, 'eval_precision': 0.8708, 'eval_recall': 0.8708, 'eval_runtime': 23.3711, 'eval_samples_per_second': 427.878, 'eval_steps_per_second': 6.718, 'epoch': 8.45}


 85%|████████▌ | 1330/1560 [1:10:33<04:32,  1.19s/it]

{'loss': 0.1916, 'grad_norm': 2.1817626953125, 'learning_rate': 5.268268194742637e-05, 'epoch': 8.51}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:21,  7.22it/s][A
  3%|▎         | 5/157 [00:00<00:16,  9.07it/s][A
  4%|▍         | 7/157 [00:01<00:24,  6.06it/s][A
  6%|▌         | 9/157 [00:01<00:19,  7.41it/s][A
  7%|▋         | 11/157 [00:01<00:17,  8.55it/s][A
  8%|▊         | 13/157 [00:01<00:15,  9.40it/s][A
 10%|▉         | 15/157 [00:01<00:14, 10.08it/s][A
 11%|█         | 17/157 [00:01<00:13, 10.56it/s][A
 12%|█▏        | 19/157 [00:02<00:12, 10.93it/s][A
 13%|█▎        | 21/157 [00:02<00:12, 11.33it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.63it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.84it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.04it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.98it/s][A
 20%|█▉        | 31/157 [00:03<00:10, 11.87it/s][A
 21%|██        | 33/157 [00:03<00:10, 11.84it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 11.84it/s][A
 24%|██▎       | 37/157 [00:03<00:10, 11.02it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.45879364013671875, 'eval_acc': 0.8714, 'eval_acc2': 0.9416, 'eval_f1': 0.8712613010199569, 'eval_roc_auc_micro': 0.9985705534848485, 'eval_precision': 0.8714, 'eval_recall': 0.8714, 'eval_runtime': 28.3272, 'eval_samples_per_second': 353.017, 'eval_steps_per_second': 5.542, 'epoch': 8.51}


 86%|████████▌ | 1340/1560 [1:11:11<04:36,  1.26s/it]

{'loss': 0.1801, 'grad_norm': 2.6128101348876953, 'learning_rate': 4.827478269480895e-05, 'epoch': 8.58}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.35it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.99it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.75it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.22it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.68it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.29it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.12it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.00it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.93it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.59it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.64it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.75it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.70it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.73it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.81it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.77it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 11.88it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.01it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.46084946393966675, 'eval_acc': 0.8686, 'eval_acc2': 0.9418, 'eval_f1': 0.868572370587093, 'eval_roc_auc_micro': 0.9985558135353536, 'eval_precision': 0.8686, 'eval_recall': 0.8686, 'eval_runtime': 16.0986, 'eval_samples_per_second': 621.172, 'eval_steps_per_second': 9.752, 'epoch': 8.58}


 87%|████████▋ | 1350/1560 [1:11:37<03:59,  1.14s/it]

{'loss': 0.1829, 'grad_norm': 1.896581768989563, 'learning_rate': 4.405007700395497e-05, 'epoch': 8.64}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:18,  8.57it/s][A
  3%|▎         | 4/157 [00:00<00:15,  9.94it/s][A
  4%|▍         | 6/157 [00:00<00:15, 10.04it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.48it/s][A
  6%|▋         | 10/157 [00:00<00:13, 10.84it/s][A
  8%|▊         | 12/157 [00:01<00:13, 11.13it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.16it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.25it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.20it/s][A
 13%|█▎        | 20/157 [00:01<00:12, 11.34it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.40it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 11.50it/s][A
 17%|█▋        | 26/157 [00:02<00:11, 11.63it/s][A
 18%|█▊        | 28/157 [00:02<00:11, 11.71it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 11.72it/s][A
 20%|██        | 32/157 [00:02<00:10, 11.64it/s][A
 22%|██▏       | 34/157 [00:03<00:10, 11.70it/s][A
 23%|██▎       | 36/157 [00:03<00:10, 11.36it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.4618237316608429, 'eval_acc': 0.8693, 'eval_acc2': 0.9407, 'eval_f1': 0.8693146120925401, 'eval_roc_auc_micro': 0.9985619856565655, 'eval_precision': 0.8693, 'eval_recall': 0.8693, 'eval_runtime': 22.2583, 'eval_samples_per_second': 449.27, 'eval_steps_per_second': 7.054, 'epoch': 8.64}


 87%|████████▋ | 1360/1560 [1:12:14<05:57,  1.79s/it]

{'loss': 0.1619, 'grad_norm': 1.697275996208191, 'learning_rate': 4.001027817058789e-05, 'epoch': 8.7}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.66it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.10it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.95it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.64it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.38it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.27it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.30it/s][A
 11%|█         | 17/157 [00:01<00:24,  5.83it/s][A
 11%|█▏        | 18/157 [00:02<00:24,  5.79it/s][A
 12%|█▏        | 19/157 [00:02<00:23,  5.80it/s][A
 13%|█▎        | 20/157 [00:02<00:22,  6.05it/s][A
 13%|█▎        | 21/157 [00:02<00:21,  6.32it/s][A
 14%|█▍        | 22/157 [00:02<00:20,  6.65it/s][A
 15%|█▍        | 23/157 [00:02<00:19,  7.03it/s][A
 15%|█▌        | 24/157 [00:02<00:18,  7.37it/s][A
 16%|█▌        | 25/157 [00:03<00:17,  7.72it/s][A
 17%|█▋        | 26/157 [00:03<00:16,  8.11it/s][A
 17%|█▋        | 27/157 [00:03<00:15,  8.45it/s][A
 18%|█▊        | 29/157 

{'eval_loss': 0.46117866039276123, 'eval_acc': 0.8711, 'eval_acc2': 0.9403, 'eval_f1': 0.8711999395713419, 'eval_roc_auc_micro': 0.9985654317171718, 'eval_precision': 0.8711, 'eval_recall': 0.8711, 'eval_runtime': 23.2128, 'eval_samples_per_second': 430.796, 'eval_steps_per_second': 6.763, 'epoch': 8.7}


 88%|████████▊ | 1370/1560 [1:12:48<04:24,  1.39s/it]

{'loss': 0.182, 'grad_norm': 2.095938205718994, 'learning_rate': 3.615702450292857e-05, 'epoch': 8.77}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.77it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.32it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.80it/s][A
  6%|▌         | 9/157 [00:00<00:12, 11.87it/s][A
  7%|▋         | 11/157 [00:00<00:12, 11.68it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.62it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.81it/s][A
 11%|█         | 17/157 [00:01<00:11, 11.96it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.89it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.01it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.96it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.02it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.06it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.02it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.12it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.03it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.04it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.06it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4606863260269165, 'eval_acc': 0.8717, 'eval_acc2': 0.9406, 'eval_f1': 0.8716977569427625, 'eval_roc_auc_micro': 0.998563145050505, 'eval_precision': 0.8717, 'eval_recall': 0.8717, 'eval_runtime': 16.262, 'eval_samples_per_second': 614.929, 'eval_steps_per_second': 9.654, 'epoch': 8.77}


 88%|████████▊ | 1380/1560 [1:13:14<03:26,  1.15s/it]

{'loss': 0.1617, 'grad_norm': 1.9041399955749512, 'learning_rate': 3.249187865729264e-05, 'epoch': 8.83}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.10it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.66it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.61it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.85it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.38it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.30it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.36it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.35it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.33it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.22it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.25it/s][A
 16%|█▌        | 25/157 [00:01<00:11, 11.85it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.92it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.96it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.93it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.02it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.09it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.16it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4620736241340637, 'eval_acc': 0.8726, 'eval_acc2': 0.9394, 'eval_f1': 0.8725737872885444, 'eval_roc_auc_micro': 0.9985538086363637, 'eval_precision': 0.8726, 'eval_recall': 0.8726, 'eval_runtime': 34.8068, 'eval_samples_per_second': 287.3, 'eval_steps_per_second': 4.511, 'epoch': 8.83}


 89%|████████▉ | 1390/1560 [1:13:59<03:42,  1.31s/it]

{'loss': 0.1837, 'grad_norm': 1.4127581119537354, 'learning_rate': 2.9016327004367572e-05, 'epoch': 8.9}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.64it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.51it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.59it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.17it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.95it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.87it/s][A
 10%|▉         | 15/157 [00:01<00:16,  8.61it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.61it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.34it/s][A
 13%|█▎        | 21/157 [00:01<00:12, 10.95it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.42it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.76it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 11.99it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.18it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.28it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.25it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.01it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.14it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4610714316368103, 'eval_acc': 0.8718, 'eval_acc2': 0.9399, 'eval_f1': 0.871746716217649, 'eval_roc_auc_micro': 0.9985587340909091, 'eval_precision': 0.8718, 'eval_recall': 0.8718, 'eval_runtime': 16.8196, 'eval_samples_per_second': 594.544, 'eval_steps_per_second': 9.334, 'epoch': 8.9}


 90%|████████▉ | 1400/1560 [1:14:25<02:56,  1.10s/it]

{'loss': 0.1615, 'grad_norm': 1.2311116456985474, 'learning_rate': 2.573177902642726e-05, 'epoch': 8.96}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:09, 16.58it/s][A
  3%|▎         | 5/157 [00:00<00:11, 12.99it/s][A
  4%|▍         | 7/157 [00:00<00:11, 12.69it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.35it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.48it/s][A
  8%|▊         | 13/157 [00:01<00:11, 12.61it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.27it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.48it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 11.77it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 11.99it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.18it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.33it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.37it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.42it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.48it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.45it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.33it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.42it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.46018174290657043, 'eval_acc': 0.8702, 'eval_acc2': 0.9401, 'eval_f1': 0.8701420270779652, 'eval_roc_auc_micro': 0.9985641577272728, 'eval_precision': 0.8702, 'eval_recall': 0.8702, 'eval_runtime': 16.0325, 'eval_samples_per_second': 623.732, 'eval_steps_per_second': 9.793, 'epoch': 8.96}


 90%|█████████ | 1410/1560 [1:14:53<03:22,  1.35s/it]

{'loss': 0.1573, 'grad_norm': 1.8620846271514893, 'learning_rate': 2.2639566745727203e-05, 'epoch': 9.02}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.43it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.88it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.81it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.39it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.13it/s][A
  8%|▊         | 13/157 [00:00<00:11, 13.02it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.88it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.64it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.53it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.64it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.69it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.68it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.58it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.61it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.59it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.63it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.36it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.40it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4606951177120209, 'eval_acc': 0.8698, 'eval_acc2': 0.9401, 'eval_f1': 0.8696790762804034, 'eval_roc_auc_micro': 0.9985596125252525, 'eval_precision': 0.8698, 'eval_recall': 0.8698, 'eval_runtime': 19.2664, 'eval_samples_per_second': 519.039, 'eval_steps_per_second': 8.149, 'epoch': 9.02}


 91%|█████████ | 1420/1560 [1:15:22<02:37,  1.13s/it]

{'loss': 0.1721, 'grad_norm': 2.2232425212860107, 'learning_rate': 1.974094418431388e-05, 'epoch': 9.09}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:27,  5.58it/s][A
  3%|▎         | 4/157 [00:00<00:18,  8.28it/s][A
  4%|▍         | 6/157 [00:00<00:15,  9.79it/s][A
  5%|▌         | 8/157 [00:01<00:22,  6.69it/s][A
  6%|▋         | 10/157 [00:01<00:18,  8.05it/s][A
  8%|▊         | 12/157 [00:01<00:15,  9.11it/s][A
  9%|▉         | 14/157 [00:01<00:14, 10.02it/s][A
 10%|█         | 16/157 [00:01<00:13, 10.72it/s][A
 11%|█▏        | 18/157 [00:01<00:12, 11.22it/s][A
 13%|█▎        | 20/157 [00:02<00:12, 11.40it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 11.74it/s][A
 15%|█▌        | 24/157 [00:02<00:11, 12.03it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.19it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.25it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.34it/s][A
 20%|██        | 32/157 [00:03<00:10, 12.41it/s][A
 22%|██▏       | 34/157 [00:03<00:09, 12.51it/s][A
 23%|██▎       | 36/157 [00:03<00:09, 12.54it/s][A
 24%|██▍       | 38/157 

{'eval_loss': 0.46084287762641907, 'eval_acc': 0.8692, 'eval_acc2': 0.9397, 'eval_f1': 0.8690547236322904, 'eval_roc_auc_micro': 0.9985572407575758, 'eval_precision': 0.8692, 'eval_recall': 0.8692, 'eval_runtime': 15.5119, 'eval_samples_per_second': 644.665, 'eval_steps_per_second': 10.121, 'epoch': 9.09}


 92%|█████████▏| 1430/1560 [1:15:47<02:21,  1.09s/it]

{'loss': 0.1836, 'grad_norm': 1.888134479522705, 'learning_rate': 1.70370868554659e-05, 'epoch': 9.15}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.85it/s][A
  3%|▎         | 5/157 [00:00<00:10, 15.07it/s][A
  4%|▍         | 7/157 [00:00<00:10, 13.88it/s][A
  6%|▌         | 9/157 [00:00<00:11, 13.33it/s][A
  7%|▋         | 11/157 [00:00<00:11, 13.03it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.83it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.74it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.61it/s][A
 12%|█▏        | 19/157 [00:01<00:10, 12.59it/s][A
 13%|█▎        | 21/157 [00:01<00:10, 12.51it/s][A
 15%|█▍        | 23/157 [00:01<00:10, 12.57it/s][A
 16%|█▌        | 25/157 [00:01<00:10, 12.51it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.44it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.54it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.54it/s][A
 21%|██        | 33/157 [00:02<00:09, 12.46it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.50it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.54it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.45965853333473206, 'eval_acc': 0.8702, 'eval_acc2': 0.94, 'eval_f1': 0.8699616000520524, 'eval_roc_auc_micro': 0.9985652269191918, 'eval_precision': 0.8702, 'eval_recall': 0.8702, 'eval_runtime': 15.4323, 'eval_samples_per_second': 647.991, 'eval_steps_per_second': 10.173, 'epoch': 9.15}


 92%|█████████▏| 1440/1560 [1:16:12<02:10,  1.09s/it]

{'loss': 0.1572, 'grad_norm': 1.615565538406372, 'learning_rate': 1.4529091286973995e-05, 'epoch': 9.22}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:29,  5.22it/s][A
  3%|▎         | 5/157 [00:00<00:20,  7.27it/s][A
  4%|▍         | 7/157 [00:00<00:17,  8.80it/s][A
  6%|▌         | 9/157 [00:01<00:15,  9.73it/s][A
  7%|▋         | 11/157 [00:01<00:13, 10.52it/s][A
  8%|▊         | 13/157 [00:01<00:12, 11.09it/s][A
 10%|▉         | 15/157 [00:01<00:12, 11.51it/s][A
 11%|█         | 17/157 [00:01<00:12, 11.49it/s][A
 12%|█▏        | 19/157 [00:01<00:12, 11.42it/s][A
 13%|█▎        | 21/157 [00:02<00:11, 11.66it/s][A
 15%|█▍        | 23/157 [00:02<00:11, 11.94it/s][A
 16%|█▌        | 25/157 [00:02<00:10, 12.15it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.17it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 12.22it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.01it/s][A
 21%|██        | 33/157 [00:03<00:10, 12.06it/s][A
 22%|██▏       | 35/157 [00:03<00:09, 12.28it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.39it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.45891836285591125, 'eval_acc': 0.8698, 'eval_acc2': 0.9403, 'eval_f1': 0.8695462550528044, 'eval_roc_auc_micro': 0.9985695470707071, 'eval_precision': 0.8698, 'eval_recall': 0.8698, 'eval_runtime': 19.2568, 'eval_samples_per_second': 519.298, 'eval_steps_per_second': 8.153, 'epoch': 9.22}


 93%|█████████▎| 1450/1560 [1:16:41<02:03,  1.12s/it]

{'loss': 0.1553, 'grad_norm': 2.174081802368164, 'learning_rate': 1.2217974576453073e-05, 'epoch': 9.28}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.81it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.72it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.15it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.69it/s][A
  7%|▋         | 11/157 [00:01<00:20,  7.26it/s][A
  8%|▊         | 13/157 [00:01<00:17,  8.35it/s][A
 10%|▉         | 15/157 [00:01<00:15,  9.27it/s][A
 11%|█         | 17/157 [00:01<00:14,  9.79it/s][A
 12%|█▏        | 19/157 [00:01<00:13, 10.30it/s][A
 13%|█▎        | 21/157 [00:02<00:12, 10.66it/s][A
 15%|█▍        | 23/157 [00:02<00:12, 10.90it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.23it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.54it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.75it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.85it/s][A
 21%|██        | 33/157 [00:03<00:10, 11.96it/s][A
 22%|██▏       | 35/157 [00:03<00:10, 12.02it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.11it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.4580051302909851, 'eval_acc': 0.87, 'eval_acc2': 0.9405, 'eval_f1': 0.8698169339240043, 'eval_roc_auc_micro': 0.9985805925252526, 'eval_precision': 0.87, 'eval_recall': 0.87, 'eval_runtime': 16.162, 'eval_samples_per_second': 618.736, 'eval_steps_per_second': 9.714, 'epoch': 9.28}


 94%|█████████▎| 1460/1560 [1:17:07<01:49,  1.10s/it]

{'loss': 0.1545, 'grad_norm': 1.8030941486358643, 'learning_rate': 1.0104673978866164e-05, 'epoch': 9.34}



  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|▏         | 2/157 [00:00<00:16,  9.18it/s][A
  2%|▏         | 3/157 [00:00<00:21,  7.15it/s][A
  3%|▎         | 4/157 [00:00<00:21,  7.23it/s][A
  4%|▍         | 6/157 [00:00<00:16,  9.30it/s][A
  5%|▌         | 8/157 [00:00<00:14, 10.47it/s][A
  6%|▋         | 10/157 [00:01<00:13, 10.72it/s][A
  8%|▊         | 12/157 [00:01<00:13, 10.79it/s][A
  9%|▉         | 14/157 [00:01<00:12, 11.31it/s][A
 10%|█         | 16/157 [00:01<00:12, 11.66it/s][A
 11%|█▏        | 18/157 [00:01<00:11, 11.93it/s][A
 13%|█▎        | 20/157 [00:01<00:11, 12.13it/s][A
 14%|█▍        | 22/157 [00:02<00:11, 12.26it/s][A
 15%|█▌        | 24/157 [00:02<00:10, 12.24it/s][A
 17%|█▋        | 26/157 [00:02<00:10, 12.39it/s][A
 18%|█▊        | 28/157 [00:02<00:10, 12.37it/s][A
 19%|█▉        | 30/157 [00:02<00:10, 12.28it/s][A
 20%|██        | 32/157 [00:02<00:10, 12.17it/s][A
 22%|██▏       | 34/157 [00:02<00:10, 12.22it/s][A
 23%|██▎       | 36/157 [

{'eval_loss': 0.45707157254219055, 'eval_acc': 0.8702, 'eval_acc2': 0.9409, 'eval_f1': 0.8700745164872976, 'eval_roc_auc_micro': 0.9985875428787878, 'eval_precision': 0.8702, 'eval_recall': 0.8702, 'eval_runtime': 18.2166, 'eval_samples_per_second': 548.951, 'eval_steps_per_second': 8.619, 'epoch': 9.34}


 94%|█████████▍| 1470/1560 [1:17:35<01:41,  1.12s/it]

{'loss': 0.1502, 'grad_norm': 1.5919179916381836, 'learning_rate': 8.190046526428241e-06, 'epoch': 9.41}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 17.61it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.54it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.52it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.98it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.55it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.44it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.26it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.20it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.16it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.11it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.98it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.89it/s][A
 17%|█▋        | 27/157 [00:02<00:11, 11.52it/s][A
 18%|█▊        | 29/157 [00:02<00:11, 11.59it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 11.78it/s][A
 21%|██        | 33/157 [00:02<00:10, 11.96it/s][A
 22%|██▏       | 35/157 [00:02<00:10, 12.11it/s][A
 24%|██▎       | 37/157 [00:03<00:09, 12.15it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.45646932721138, 'eval_acc': 0.8713, 'eval_acc2': 0.9409, 'eval_f1': 0.8711913452360978, 'eval_roc_auc_micro': 0.9985916252525253, 'eval_precision': 0.8713, 'eval_recall': 0.8713, 'eval_runtime': 36.9154, 'eval_samples_per_second': 270.89, 'eval_steps_per_second': 4.253, 'epoch': 9.41}


 95%|█████████▍| 1480/1560 [1:18:21<01:53,  1.41s/it]

{'loss': 0.1597, 'grad_norm': 1.9018003940582275, 'learning_rate': 6.474868681043577e-06, 'epoch': 9.47}



  0%|          | 0/157 [00:00<?, ?it/s][A
  2%|▏         | 3/157 [00:00<00:08, 18.60it/s][A
  3%|▎         | 5/157 [00:00<00:10, 14.31it/s][A
  4%|▍         | 7/157 [00:00<00:11, 13.51it/s][A
  6%|▌         | 9/157 [00:00<00:11, 12.92it/s][A
  7%|▋         | 11/157 [00:00<00:11, 12.67it/s][A
  8%|▊         | 13/157 [00:00<00:11, 12.40it/s][A
 10%|▉         | 15/157 [00:01<00:11, 12.24it/s][A
 11%|█         | 17/157 [00:01<00:11, 12.06it/s][A
 12%|█▏        | 19/157 [00:01<00:11, 12.07it/s][A
 13%|█▎        | 21/157 [00:01<00:11, 12.01it/s][A
 15%|█▍        | 23/157 [00:01<00:11, 11.89it/s][A
 16%|█▌        | 25/157 [00:02<00:11, 11.99it/s][A
 17%|█▋        | 27/157 [00:02<00:10, 12.03it/s][A
 18%|█▊        | 29/157 [00:02<00:10, 11.98it/s][A
 20%|█▉        | 31/157 [00:02<00:10, 12.08it/s][A
 21%|██        | 33/157 [00:02<00:10, 12.15it/s][A
 22%|██▏       | 35/157 [00:02<00:09, 12.25it/s][A
 24%|██▎       | 37/157 [00:02<00:09, 12.37it/s][A
 25%|██▍       | 39/157 

{'eval_loss': 0.45602455735206604, 'eval_acc': 0.8714, 'eval_acc2': 0.9412, 'eval_f1': 0.8712873718964061, 'eval_roc_auc_micro': 0.99859393489899, 'eval_precision': 0.8714, 'eval_recall': 0.8714, 'eval_runtime': 33.8001, 'eval_samples_per_second': 295.857, 'eval_steps_per_second': 4.645, 'epoch': 9.47}


 95%|█████████▍| 1480/1560 [1:18:56<04:16,  3.20s/it]


{'train_runtime': 4736.3766, 'train_samples_per_second': 84.453, 'train_steps_per_second': 0.329, 'train_loss': 0.6574109908696767, 'epoch': 9.47}


100%|██████████| 157/157 [00:26<00:00,  5.94it/s]
wandb:                                                                                
wandb: 
wandb: Run history:
wandb:                eval/acc ▁▃▄▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████████
wandb:               eval/acc2 ▁▂▁▂▄▄▄▄▅▆▆▅▆▆▅▅▆▆▆▇▆▆▇▇▇▇▇█▇███████████
wandb:                 eval/f1 ▁▄▅▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████████
wandb:               eval/loss █▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
wandb:          eval/precision ▁▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████████
wandb:             eval/recall ▁▁▂▂▂▄▃▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇█▇████████
wandb:      eval/roc_auc_micro ▁▃▄▄▄▆▆▆▆��▇▇▇▇▇▇█▇▇▇████████████████████
wandb:            eval/runtime ▁▃▁▁▁▁▁▁▁▃▁▁▄▂▃▁▁▃▃▁▅▁█▂▃▃▂▂▁▁▁▁▂▂▂▂▁▂▁▁
wandb: eval/samples_per_second █▇██▇▅▂▅█▆▇▁▅▆▆▆▅▇▆▁▇▇▇▇▇▅▇▇▇▇█▁▃▅▇▅▇▇▅▆
wandb:   eval/steps_per_second ▆▃█▇▆█████▄▆█▆▇▆▃█▃█▇█▁███▆▅▇█▅▃▄▄▅█▅▅▇▃
wandb:                test/acc ▁
wandb:               test/acc2 ▁
wandb:                 test/f1 ▁
wandb:         