## Step1 Import Required Packages

In [2]:
# !git clone https://huggingface.co/hfl/rbt3

Cloning into 'rbt3'...
Filtering content:  66% (2/3)
Filtering content: 100% (3/3)
Filtering content: 100% (3/3), 442.86 MiB | 8.88 MiB/s, done.


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step2 Load the Dataset

In [2]:
dataset = load_dataset("csv", data_files="./data_2.csv", split="train")
dataset = dataset.filter(lambda x: x["content"] is not None and x["label"] is not None and x['label'] in [0,1,2,3,4,5])
#dataset = dataset.filter(lambda x: x["content"] is not None and len(x["content"]) >= 10)
dataset

Dataset({
    features: ['content', 'label'],
    num_rows: 2073
})

## Step3 Split the Dataset

In [3]:
datasets = dataset.train_test_split(test_size=0.1)
datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'label'],
        num_rows: 1865
    })
    test: Dataset({
        features: ['content', 'label'],
        num_rows: 208
    })
})

## Step4 Dataset Preprocessing

In [4]:
import torch

tokenizer = AutoTokenizer.from_pretrained("rbt3")

def process_function(examples):
    tokenized_examples = tokenizer(examples["content"], max_length=128, truncation=True,padding=True)
    #tokenized_examples["labels"] = examples["label"]
    tokenized_examples["labels"] = [int(label) for label in examples["label"]]
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map: 100%|██████████| 1865/1865 [00:00<00:00, 19463.93 examples/s]
Map: 100%|██████████| 208/208 [00:00<00:00, 10941.98 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1865
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 208
    })
})

## Step5 Create the model

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("rbt3",num_labels=6)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model.config

BertConfig {
  "_name_or_path": "rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version":

## Step6 Create an Evaluation Function

In [7]:
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

In [8]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    acc.update(f1)
    return acc

## Step7 Create TrainingArguments

In [9]:
train_args = TrainingArguments(output_dir="./checkpoints",      # 输出文件夹
                               per_device_train_batch_size=64,  # 训练时的batch_size
                               per_device_eval_batch_size=128,  # 验证时的batch_size
                               logging_steps=10,                # log 打印的频率
                               evaluation_strategy="epoch",     # 评估策略
                               save_strategy="epoch",           # 保存策略
                               save_total_limit=3,              # 最大保存数
                               learning_rate=2e-5,              # 学习率
                               weight_decay=0.1,               # weight_decay
                               metric_for_best_model="f1",      # 设定评估指标
                               load_best_model_at_end=True,
                               num_train_epochs=100)     # 训练完成后加载最优模型
train_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=epoch,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsd

## Step8 Create Trainer

In [10]:
from transformers import DataCollatorWithPadding
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

## Step9 Model Training

In [11]:
trainer.train()

  0%|          | 12/3000 [00:00<02:56, 16.91it/s]

{'loss': 1.9249, 'grad_norm': 8.071365356445312, 'learning_rate': 1.9933333333333334e-05, 'epoch': 0.33}


  1%|          | 22/3000 [00:01<02:41, 18.48it/s]

{'loss': 1.4553, 'grad_norm': 4.214043140411377, 'learning_rate': 1.9866666666666667e-05, 'epoch': 0.67}


  1%|          | 30/3000 [00:01<02:38, 18.73it/s]

{'loss': 1.2392, 'grad_norm': 8.134020805358887, 'learning_rate': 1.98e-05, 'epoch': 1.0}


                                                 
  1%|          | 30/3000 [00:01<02:38, 18.73it/s]

{'eval_loss': 1.1452360153198242, 'eval_accuracy': 0.6009615384615384, 'eval_f1': 0.2801397849462366, 'eval_runtime': 0.0941, 'eval_samples_per_second': 2209.664, 'eval_steps_per_second': 21.247, 'epoch': 1.0}


  1%|▏         | 44/3000 [00:02<02:52, 17.16it/s]

{'loss': 1.1058, 'grad_norm': 3.515657424926758, 'learning_rate': 1.9733333333333336e-05, 'epoch': 1.33}


  2%|▏         | 52/3000 [00:03<02:35, 18.96it/s]

{'loss': 0.9858, 'grad_norm': 3.2431788444519043, 'learning_rate': 1.9666666666666666e-05, 'epoch': 1.67}


  2%|▏         | 60/3000 [00:03<02:30, 19.56it/s]

{'loss': 1.0898, 'grad_norm': 7.797556400299072, 'learning_rate': 1.9600000000000002e-05, 'epoch': 2.0}


                                                 
  2%|▏         | 60/3000 [00:03<02:30, 19.56it/s]

{'eval_loss': 1.0294773578643799, 'eval_accuracy': 0.6298076923076923, 'eval_f1': 0.31983987591031154, 'eval_runtime': 0.092, 'eval_samples_per_second': 2259.792, 'eval_steps_per_second': 21.729, 'epoch': 2.0}


  2%|▏         | 72/3000 [00:04<02:58, 16.40it/s]

{'loss': 0.9197, 'grad_norm': 4.5487494468688965, 'learning_rate': 1.9533333333333335e-05, 'epoch': 2.33}


  3%|▎         | 81/3000 [00:05<02:36, 18.70it/s]

{'loss': 0.9811, 'grad_norm': 4.1804633140563965, 'learning_rate': 1.9466666666666668e-05, 'epoch': 2.67}


  3%|▎         | 90/3000 [00:05<02:18, 21.08it/s]

{'loss': 0.8615, 'grad_norm': 10.3552885055542, 'learning_rate': 1.94e-05, 'epoch': 3.0}


                                                 
  3%|▎         | 90/3000 [00:05<02:18, 21.08it/s]

{'eval_loss': 0.9441810250282288, 'eval_accuracy': 0.6634615384615384, 'eval_f1': 0.3379523868009498, 'eval_runtime': 0.094, 'eval_samples_per_second': 2212.068, 'eval_steps_per_second': 21.27, 'epoch': 3.0}


  3%|▎         | 104/3000 [00:06<02:56, 16.39it/s]

{'loss': 0.8599, 'grad_norm': 4.856118202209473, 'learning_rate': 1.9333333333333333e-05, 'epoch': 3.33}


  4%|▍         | 114/3000 [00:07<02:32, 18.97it/s]

{'loss': 0.747, 'grad_norm': 3.374086856842041, 'learning_rate': 1.926666666666667e-05, 'epoch': 3.67}


  4%|▍         | 120/3000 [00:07<02:16, 21.16it/s]

{'loss': 0.8645, 'grad_norm': 12.84814739227295, 'learning_rate': 1.9200000000000003e-05, 'epoch': 4.0}


                                                  
  4%|▍         | 120/3000 [00:07<02:16, 21.16it/s]

{'eval_loss': 0.9269416332244873, 'eval_accuracy': 0.6634615384615384, 'eval_f1': 0.37565813164302186, 'eval_runtime': 0.094, 'eval_samples_per_second': 2213.909, 'eval_steps_per_second': 21.288, 'epoch': 4.0}


  4%|▍         | 132/3000 [00:08<03:05, 15.45it/s]

{'loss': 0.6985, 'grad_norm': 8.310361862182617, 'learning_rate': 1.9133333333333335e-05, 'epoch': 4.33}


  5%|▍         | 144/3000 [00:09<02:31, 18.82it/s]

{'loss': 0.7214, 'grad_norm': 4.230612277984619, 'learning_rate': 1.9066666666666668e-05, 'epoch': 4.67}


  5%|▌         | 150/3000 [00:09<02:15, 21.00it/s]

{'loss': 0.6489, 'grad_norm': 8.115900993347168, 'learning_rate': 1.9e-05, 'epoch': 5.0}


                                                  
  5%|▌         | 150/3000 [00:09<02:15, 21.00it/s]

{'eval_loss': 0.8844397664070129, 'eval_accuracy': 0.6875, 'eval_f1': 0.43219205373406194, 'eval_runtime': 0.0935, 'eval_samples_per_second': 2223.785, 'eval_steps_per_second': 21.383, 'epoch': 5.0}


  5%|▌         | 162/3000 [00:10<03:01, 15.66it/s]

{'loss': 0.6004, 'grad_norm': 5.155263900756836, 'learning_rate': 1.8933333333333334e-05, 'epoch': 5.33}


  6%|▌         | 174/3000 [00:11<02:29, 18.95it/s]

{'loss': 0.6223, 'grad_norm': 5.239542007446289, 'learning_rate': 1.886666666666667e-05, 'epoch': 5.67}


  6%|▌         | 180/3000 [00:11<02:14, 20.94it/s]

{'loss': 0.616, 'grad_norm': 15.650217056274414, 'learning_rate': 1.88e-05, 'epoch': 6.0}


                                                  
  6%|▌         | 180/3000 [00:11<02:14, 20.94it/s]

{'eval_loss': 0.86812824010849, 'eval_accuracy': 0.7115384615384616, 'eval_f1': 0.4405715287185655, 'eval_runtime': 0.0923, 'eval_samples_per_second': 2254.542, 'eval_steps_per_second': 21.678, 'epoch': 6.0}


  6%|▋         | 194/3000 [00:12<02:48, 16.63it/s]

{'loss': 0.5609, 'grad_norm': 4.45497465133667, 'learning_rate': 1.8733333333333336e-05, 'epoch': 6.33}


  7%|▋         | 204/3000 [00:12<02:26, 19.10it/s]

{'loss': 0.4993, 'grad_norm': 5.851970672607422, 'learning_rate': 1.866666666666667e-05, 'epoch': 6.67}


  7%|▋         | 210/3000 [00:13<02:11, 21.24it/s]

{'loss': 0.5069, 'grad_norm': 19.69986343383789, 'learning_rate': 1.86e-05, 'epoch': 7.0}


                                                  
  7%|▋         | 210/3000 [00:13<02:11, 21.24it/s]

{'eval_loss': 0.8870638608932495, 'eval_accuracy': 0.6730769230769231, 'eval_f1': 0.4272055067086123, 'eval_runtime': 0.0924, 'eval_samples_per_second': 2250.79, 'eval_steps_per_second': 21.642, 'epoch': 7.0}


  7%|▋         | 224/3000 [00:14<02:48, 16.47it/s]

{'loss': 0.3957, 'grad_norm': 5.267388343811035, 'learning_rate': 1.8533333333333334e-05, 'epoch': 7.33}


  8%|▊         | 232/3000 [00:14<02:28, 18.66it/s]

{'loss': 0.4266, 'grad_norm': 6.890261650085449, 'learning_rate': 1.8466666666666667e-05, 'epoch': 7.67}


  8%|▊         | 240/3000 [00:15<02:20, 19.63it/s]

{'loss': 0.4913, 'grad_norm': 19.146263122558594, 'learning_rate': 1.8400000000000003e-05, 'epoch': 8.0}


                                                  
  8%|▊         | 240/3000 [00:15<02:20, 19.63it/s]

{'eval_loss': 0.935671865940094, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.46107481905244635, 'eval_runtime': 0.0927, 'eval_samples_per_second': 2243.497, 'eval_steps_per_second': 21.572, 'epoch': 8.0}


  8%|▊         | 252/3000 [00:16<02:43, 16.78it/s]

{'loss': 0.3467, 'grad_norm': 5.582660675048828, 'learning_rate': 1.8333333333333333e-05, 'epoch': 8.33}


  9%|▉         | 264/3000 [00:16<02:20, 19.42it/s]

{'loss': 0.3359, 'grad_norm': 4.622363567352295, 'learning_rate': 1.826666666666667e-05, 'epoch': 8.67}


  9%|▉         | 270/3000 [00:16<02:10, 20.89it/s]

{'loss': 0.3205, 'grad_norm': 9.492921829223633, 'learning_rate': 1.8200000000000002e-05, 'epoch': 9.0}


                                                  
  9%|▉         | 270/3000 [00:17<02:10, 20.89it/s]

{'eval_loss': 0.9500120878219604, 'eval_accuracy': 0.6778846153846154, 'eval_f1': 0.45125891265597146, 'eval_runtime': 0.0906, 'eval_samples_per_second': 2294.857, 'eval_steps_per_second': 22.066, 'epoch': 9.0}


  9%|▉         | 281/3000 [00:17<02:55, 15.49it/s]

{'loss': 0.3032, 'grad_norm': 7.183337211608887, 'learning_rate': 1.8133333333333335e-05, 'epoch': 9.33}


 10%|▉         | 292/3000 [00:18<02:23, 18.83it/s]

{'loss': 0.25, 'grad_norm': 3.4912543296813965, 'learning_rate': 1.8066666666666668e-05, 'epoch': 9.67}


 10%|█         | 300/3000 [00:18<02:05, 21.58it/s]

{'loss': 0.2738, 'grad_norm': 15.675248146057129, 'learning_rate': 1.8e-05, 'epoch': 10.0}


                                                  
 10%|█         | 300/3000 [00:18<02:05, 21.58it/s]

{'eval_loss': 0.9680238962173462, 'eval_accuracy': 0.6875, 'eval_f1': 0.5108657599895357, 'eval_runtime': 0.0926, 'eval_samples_per_second': 2245.848, 'eval_steps_per_second': 21.595, 'epoch': 10.0}


 10%|█         | 312/3000 [00:19<02:49, 15.83it/s]

{'loss': 0.2466, 'grad_norm': 4.236247539520264, 'learning_rate': 1.7933333333333333e-05, 'epoch': 10.33}


 11%|█         | 323/3000 [00:20<02:21, 18.95it/s]

{'loss': 0.2124, 'grad_norm': 4.825068950653076, 'learning_rate': 1.7866666666666666e-05, 'epoch': 10.67}


 11%|█         | 330/3000 [00:20<02:05, 21.25it/s]

{'loss': 0.232, 'grad_norm': 6.0142903327941895, 'learning_rate': 1.7800000000000002e-05, 'epoch': 11.0}


                                                  
 11%|█         | 330/3000 [00:20<02:05, 21.25it/s]

{'eval_loss': 1.0554428100585938, 'eval_accuracy': 0.6826923076923077, 'eval_f1': 0.5402584333922061, 'eval_runtime': 0.0923, 'eval_samples_per_second': 2254.047, 'eval_steps_per_second': 21.674, 'epoch': 11.0}


 11%|█▏        | 344/3000 [00:21<02:41, 16.46it/s]

{'loss': 0.188, 'grad_norm': 5.245562553405762, 'learning_rate': 1.7733333333333335e-05, 'epoch': 11.33}


 12%|█▏        | 353/3000 [00:22<02:19, 18.91it/s]

{'loss': 0.1886, 'grad_norm': 6.115188121795654, 'learning_rate': 1.7666666666666668e-05, 'epoch': 11.67}


 12%|█▏        | 360/3000 [00:22<02:12, 19.92it/s]

{'loss': 0.1978, 'grad_norm': 24.708051681518555, 'learning_rate': 1.76e-05, 'epoch': 12.0}


                                                  
 12%|█▏        | 360/3000 [00:22<02:12, 19.92it/s]

{'eval_loss': 1.04304838180542, 'eval_accuracy': 0.6778846153846154, 'eval_f1': 0.5016542600281799, 'eval_runtime': 0.1012, 'eval_samples_per_second': 2054.53, 'eval_steps_per_second': 19.755, 'epoch': 12.0}


 12%|█▏        | 374/3000 [00:23<02:36, 16.83it/s]

{'loss': 0.1514, 'grad_norm': 3.9116315841674805, 'learning_rate': 1.7533333333333337e-05, 'epoch': 12.33}


 13%|█▎        | 383/3000 [00:24<02:17, 19.01it/s]

{'loss': 0.129, 'grad_norm': 3.8974621295928955, 'learning_rate': 1.7466666666666667e-05, 'epoch': 12.67}


 13%|█▎        | 390/3000 [00:24<02:15, 19.32it/s]

{'loss': 0.1511, 'grad_norm': 17.671520233154297, 'learning_rate': 1.7400000000000003e-05, 'epoch': 13.0}


                                                  
 13%|█▎        | 390/3000 [00:24<02:15, 19.32it/s]

{'eval_loss': 1.0922646522521973, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5454238553492904, 'eval_runtime': 0.1023, 'eval_samples_per_second': 2033.645, 'eval_steps_per_second': 19.554, 'epoch': 13.0}


 13%|█▎        | 404/3000 [00:25<02:37, 16.44it/s]

{'loss': 0.1276, 'grad_norm': 6.837328910827637, 'learning_rate': 1.7333333333333336e-05, 'epoch': 13.33}


 14%|█▎        | 412/3000 [00:26<02:18, 18.70it/s]

{'loss': 0.108, 'grad_norm': 3.5525684356689453, 'learning_rate': 1.726666666666667e-05, 'epoch': 13.67}


 14%|█▍        | 420/3000 [00:26<02:18, 18.69it/s]

{'loss': 0.1228, 'grad_norm': 3.9175140857696533, 'learning_rate': 1.72e-05, 'epoch': 14.0}


                                                  
 14%|█▍        | 420/3000 [00:26<02:18, 18.69it/s]

{'eval_loss': 1.2184916734695435, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5981939021754492, 'eval_runtime': 0.0966, 'eval_samples_per_second': 2153.633, 'eval_steps_per_second': 20.708, 'epoch': 14.0}


 14%|█▍        | 433/3000 [00:27<02:40, 16.03it/s]

{'loss': 0.1004, 'grad_norm': 3.496161937713623, 'learning_rate': 1.7133333333333334e-05, 'epoch': 14.33}


 15%|█▍        | 443/3000 [00:28<02:17, 18.63it/s]

{'loss': 0.0918, 'grad_norm': 3.234027147293091, 'learning_rate': 1.706666666666667e-05, 'epoch': 14.67}


 15%|█▌        | 450/3000 [00:28<02:12, 19.18it/s]

{'loss': 0.1074, 'grad_norm': 9.1805419921875, 'learning_rate': 1.7e-05, 'epoch': 15.0}


                                                  
 15%|█▌        | 450/3000 [00:28<02:12, 19.18it/s]

{'eval_loss': 1.2324763536453247, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5964792470195838, 'eval_runtime': 0.0945, 'eval_samples_per_second': 2201.796, 'eval_steps_per_second': 21.171, 'epoch': 15.0}


 15%|█▌        | 463/3000 [00:29<02:35, 16.28it/s]

{'loss': 0.0703, 'grad_norm': 2.447260856628418, 'learning_rate': 1.6933333333333336e-05, 'epoch': 15.33}


 16%|█▌        | 473/3000 [00:30<02:20, 17.96it/s]

{'loss': 0.0828, 'grad_norm': 4.274540901184082, 'learning_rate': 1.686666666666667e-05, 'epoch': 15.67}


 16%|█▌        | 480/3000 [00:30<02:00, 20.90it/s]

{'loss': 0.1248, 'grad_norm': 14.586875915527344, 'learning_rate': 1.6800000000000002e-05, 'epoch': 16.0}


                                                  
 16%|█▌        | 480/3000 [00:30<02:00, 20.90it/s]

{'eval_loss': 1.1390196084976196, 'eval_accuracy': 0.6875, 'eval_f1': 0.5497181964573269, 'eval_runtime': 0.0948, 'eval_samples_per_second': 2193.607, 'eval_steps_per_second': 21.092, 'epoch': 16.0}


 16%|█▋        | 494/3000 [00:32<02:40, 15.66it/s]

{'loss': 0.0723, 'grad_norm': 4.708146095275879, 'learning_rate': 1.6733333333333335e-05, 'epoch': 16.33}


 17%|█▋        | 503/3000 [00:32<02:23, 17.43it/s]

{'loss': 0.0805, 'grad_norm': 2.9435043334960938, 'learning_rate': 1.6666666666666667e-05, 'epoch': 16.67}


 17%|█▋        | 510/3000 [00:32<02:14, 18.57it/s]

{'loss': 0.0517, 'grad_norm': 0.8879022598266602, 'learning_rate': 1.66e-05, 'epoch': 17.0}


                                                  
 17%|█▋        | 510/3000 [00:33<02:14, 18.57it/s]

{'eval_loss': 1.2148547172546387, 'eval_accuracy': 0.7115384615384616, 'eval_f1': 0.5861416590799285, 'eval_runtime': 0.1004, 'eval_samples_per_second': 2071.778, 'eval_steps_per_second': 19.921, 'epoch': 17.0}


 17%|█▋        | 522/3000 [00:34<02:46, 14.92it/s]

{'loss': 0.0586, 'grad_norm': 5.048853874206543, 'learning_rate': 1.6533333333333333e-05, 'epoch': 17.33}


 18%|█▊        | 533/3000 [00:34<02:16, 18.01it/s]

{'loss': 0.0563, 'grad_norm': 1.2807610034942627, 'learning_rate': 1.646666666666667e-05, 'epoch': 17.67}


 18%|█▊        | 540/3000 [00:34<02:09, 19.04it/s]

{'loss': 0.0587, 'grad_norm': 9.926998138427734, 'learning_rate': 1.64e-05, 'epoch': 18.0}


                                                  
 18%|█▊        | 540/3000 [00:35<02:09, 19.04it/s]

{'eval_loss': 1.221285343170166, 'eval_accuracy': 0.6826923076923077, 'eval_f1': 0.5819306675843179, 'eval_runtime': 0.1341, 'eval_samples_per_second': 1551.013, 'eval_steps_per_second': 14.914, 'epoch': 18.0}


 18%|█▊        | 552/3000 [00:36<02:38, 15.44it/s]

{'loss': 0.0466, 'grad_norm': 1.8479851484298706, 'learning_rate': 1.6333333333333335e-05, 'epoch': 18.33}


 19%|█▉        | 564/3000 [00:36<02:10, 18.66it/s]

{'loss': 0.0422, 'grad_norm': 1.1131725311279297, 'learning_rate': 1.6266666666666668e-05, 'epoch': 18.67}


 19%|█▉        | 570/3000 [00:37<02:10, 18.64it/s]

{'loss': 0.0337, 'grad_norm': 1.021925687789917, 'learning_rate': 1.62e-05, 'epoch': 19.0}


                                                  
 19%|█▉        | 570/3000 [00:37<02:10, 18.64it/s]

{'eval_loss': 1.2485063076019287, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5646873218085454, 'eval_runtime': 0.0974, 'eval_samples_per_second': 2134.683, 'eval_steps_per_second': 20.526, 'epoch': 19.0}


 19%|█▉        | 584/3000 [00:38<02:29, 16.18it/s]

{'loss': 0.0502, 'grad_norm': 2.4820127487182617, 'learning_rate': 1.6133333333333334e-05, 'epoch': 19.33}


 20%|█▉        | 593/3000 [00:38<02:15, 17.82it/s]

{'loss': 0.0324, 'grad_norm': 2.573054552078247, 'learning_rate': 1.606666666666667e-05, 'epoch': 19.67}


 20%|██        | 600/3000 [00:39<01:53, 21.15it/s]

{'loss': 0.0356, 'grad_norm': 4.467740535736084, 'learning_rate': 1.6000000000000003e-05, 'epoch': 20.0}


                                                  
 20%|██        | 600/3000 [00:39<01:53, 21.15it/s]

{'eval_loss': 1.3000209331512451, 'eval_accuracy': 0.7211538461538461, 'eval_f1': 0.6020661875139325, 'eval_runtime': 0.0936, 'eval_samples_per_second': 2222.81, 'eval_steps_per_second': 21.373, 'epoch': 20.0}


 20%|██        | 612/3000 [00:40<02:45, 14.42it/s]

{'loss': 0.0425, 'grad_norm': 4.249758720397949, 'learning_rate': 1.5933333333333336e-05, 'epoch': 20.33}


 21%|██        | 622/3000 [00:40<02:09, 18.36it/s]

{'loss': 0.0325, 'grad_norm': 2.272040843963623, 'learning_rate': 1.586666666666667e-05, 'epoch': 20.67}


 21%|██        | 630/3000 [00:41<01:52, 20.99it/s]

{'loss': 0.0205, 'grad_norm': 0.1527143120765686, 'learning_rate': 1.58e-05, 'epoch': 21.0}


                                                  
 21%|██        | 630/3000 [00:41<01:52, 20.99it/s]

{'eval_loss': 1.3820909261703491, 'eval_accuracy': 0.7067307692307693, 'eval_f1': 0.6076183674708393, 'eval_runtime': 0.1009, 'eval_samples_per_second': 2062.442, 'eval_steps_per_second': 19.831, 'epoch': 21.0}


 21%|██▏       | 642/3000 [00:42<02:40, 14.65it/s]

{'loss': 0.0211, 'grad_norm': 1.4309682846069336, 'learning_rate': 1.5733333333333334e-05, 'epoch': 21.33}


 22%|██▏       | 653/3000 [00:42<02:14, 17.41it/s]

{'loss': 0.0263, 'grad_norm': 0.7529876232147217, 'learning_rate': 1.5666666666666667e-05, 'epoch': 21.67}


 22%|██▏       | 660/3000 [00:43<02:08, 18.27it/s]

{'loss': 0.0279, 'grad_norm': 0.0648827776312828, 'learning_rate': 1.5600000000000003e-05, 'epoch': 22.0}


                                                  
 22%|██▏       | 660/3000 [00:43<02:08, 18.27it/s]

{'eval_loss': 1.3783539533615112, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5896140896140896, 'eval_runtime': 0.095, 'eval_samples_per_second': 2189.61, 'eval_steps_per_second': 21.054, 'epoch': 22.0}


 22%|██▏       | 673/3000 [00:44<02:42, 14.31it/s]

{'loss': 0.0143, 'grad_norm': 0.7511207461357117, 'learning_rate': 1.5533333333333333e-05, 'epoch': 22.33}


 23%|██▎       | 683/3000 [00:45<02:10, 17.79it/s]

{'loss': 0.0237, 'grad_norm': 7.688266754150391, 'learning_rate': 1.546666666666667e-05, 'epoch': 22.67}


 23%|██▎       | 690/3000 [00:45<02:10, 17.75it/s]

{'loss': 0.0364, 'grad_norm': 0.36587807536125183, 'learning_rate': 1.54e-05, 'epoch': 23.0}


                                                  
 23%|██▎       | 690/3000 [00:45<02:10, 17.75it/s]

{'eval_loss': 1.5446966886520386, 'eval_accuracy': 0.6778846153846154, 'eval_f1': 0.562151926609758, 'eval_runtime': 0.0959, 'eval_samples_per_second': 2168.558, 'eval_steps_per_second': 20.852, 'epoch': 23.0}


 23%|██▎       | 703/3000 [00:46<02:27, 15.60it/s]

{'loss': 0.0285, 'grad_norm': 0.9754598736763, 'learning_rate': 1.5333333333333334e-05, 'epoch': 23.33}


 24%|██▍       | 713/3000 [00:47<02:23, 15.95it/s]

{'loss': 0.0186, 'grad_norm': 0.8308520317077637, 'learning_rate': 1.5266666666666667e-05, 'epoch': 23.67}


 24%|██▍       | 720/3000 [00:47<02:07, 17.88it/s]

{'loss': 0.0386, 'grad_norm': 18.288005828857422, 'learning_rate': 1.5200000000000002e-05, 'epoch': 24.0}


                                                  
 24%|██▍       | 720/3000 [00:47<02:07, 17.88it/s]

{'eval_loss': 1.483331561088562, 'eval_accuracy': 0.7163461538461539, 'eval_f1': 0.5866326051209773, 'eval_runtime': 0.1158, 'eval_samples_per_second': 1796.32, 'eval_steps_per_second': 17.272, 'epoch': 24.0}


 24%|██▍       | 732/3000 [00:48<02:25, 15.58it/s]

{'loss': 0.0297, 'grad_norm': 3.484718084335327, 'learning_rate': 1.5133333333333335e-05, 'epoch': 24.33}


 25%|██▍       | 742/3000 [00:49<02:04, 18.12it/s]

{'loss': 0.0111, 'grad_norm': 0.5427626371383667, 'learning_rate': 1.5066666666666668e-05, 'epoch': 24.67}


 25%|██▌       | 750/3000 [00:49<01:59, 18.83it/s]

{'loss': 0.0124, 'grad_norm': 2.4720230102539062, 'learning_rate': 1.5000000000000002e-05, 'epoch': 25.0}


                                                  
 25%|██▌       | 750/3000 [00:49<01:59, 18.83it/s]

{'eval_loss': 1.5932546854019165, 'eval_accuracy': 0.6826923076923077, 'eval_f1': 0.5728625101094736, 'eval_runtime': 0.0991, 'eval_samples_per_second': 2099.362, 'eval_steps_per_second': 20.186, 'epoch': 25.0}


 25%|██▌       | 763/3000 [00:51<02:23, 15.57it/s]

{'loss': 0.0117, 'grad_norm': 1.890413522720337, 'learning_rate': 1.4933333333333335e-05, 'epoch': 25.33}


 26%|██▌       | 772/3000 [00:51<02:01, 18.29it/s]

{'loss': 0.0311, 'grad_norm': 0.21576327085494995, 'learning_rate': 1.4866666666666668e-05, 'epoch': 25.67}


 26%|██▌       | 780/3000 [00:51<01:58, 18.70it/s]

{'loss': 0.01, 'grad_norm': 0.9370148181915283, 'learning_rate': 1.48e-05, 'epoch': 26.0}


                                                  
 26%|██▌       | 780/3000 [00:51<01:58, 18.70it/s]

{'eval_loss': 1.6378235816955566, 'eval_accuracy': 0.6875, 'eval_f1': 0.5704757981866415, 'eval_runtime': 0.0927, 'eval_samples_per_second': 2242.724, 'eval_steps_per_second': 21.565, 'epoch': 26.0}


 26%|██▋       | 793/3000 [00:53<02:15, 16.30it/s]

{'loss': 0.0122, 'grad_norm': 1.997572660446167, 'learning_rate': 1.4733333333333335e-05, 'epoch': 26.33}


 27%|██▋       | 802/3000 [00:53<01:57, 18.77it/s]

{'loss': 0.0208, 'grad_norm': 1.4298770427703857, 'learning_rate': 1.4666666666666666e-05, 'epoch': 26.67}


 27%|██▋       | 810/3000 [00:53<01:42, 21.30it/s]

{'loss': 0.0152, 'grad_norm': 0.29869893193244934, 'learning_rate': 1.46e-05, 'epoch': 27.0}


                                                  
 27%|██▋       | 810/3000 [00:54<01:42, 21.30it/s]

{'eval_loss': 1.4868144989013672, 'eval_accuracy': 0.7163461538461539, 'eval_f1': 0.6139806096773521, 'eval_runtime': 0.1016, 'eval_samples_per_second': 2047.173, 'eval_steps_per_second': 19.684, 'epoch': 27.0}


 27%|██▋       | 823/3000 [00:55<02:27, 14.73it/s]

{'loss': 0.0169, 'grad_norm': 0.7489205002784729, 'learning_rate': 1.4533333333333335e-05, 'epoch': 27.33}


 28%|██▊       | 833/3000 [00:55<01:59, 18.16it/s]

{'loss': 0.0189, 'grad_norm': 1.0578937530517578, 'learning_rate': 1.4466666666666668e-05, 'epoch': 27.67}


 28%|██▊       | 840/3000 [00:56<01:42, 21.13it/s]

{'loss': 0.0336, 'grad_norm': 26.63726234436035, 'learning_rate': 1.4400000000000001e-05, 'epoch': 28.0}


                                                  
 28%|██▊       | 840/3000 [00:56<01:42, 21.13it/s]

{'eval_loss': 1.6575933694839478, 'eval_accuracy': 0.6826923076923077, 'eval_f1': 0.5094568589574352, 'eval_runtime': 0.0993, 'eval_samples_per_second': 2094.78, 'eval_steps_per_second': 20.142, 'epoch': 28.0}


 28%|██▊       | 854/3000 [00:57<02:16, 15.72it/s]

{'loss': 0.0144, 'grad_norm': 1.2186559438705444, 'learning_rate': 1.4333333333333334e-05, 'epoch': 28.33}


 29%|██▉       | 864/3000 [00:57<01:53, 18.81it/s]

{'loss': 0.0047, 'grad_norm': 0.3789392113685608, 'learning_rate': 1.4266666666666668e-05, 'epoch': 28.67}


 29%|██▉       | 870/3000 [00:58<01:50, 19.23it/s]

{'loss': 0.0101, 'grad_norm': 0.13103067874908447, 'learning_rate': 1.4200000000000001e-05, 'epoch': 29.0}


                                                  
 29%|██▉       | 870/3000 [00:58<01:50, 19.23it/s]

{'eval_loss': 1.6093488931655884, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5175261167547919, 'eval_runtime': 0.0962, 'eval_samples_per_second': 2161.616, 'eval_steps_per_second': 20.785, 'epoch': 29.0}


 29%|██▉       | 882/3000 [00:59<02:21, 15.00it/s]

{'loss': 0.0113, 'grad_norm': 0.6002218127250671, 'learning_rate': 1.4133333333333334e-05, 'epoch': 29.33}


 30%|██▉       | 893/3000 [00:59<01:54, 18.39it/s]

{'loss': 0.0062, 'grad_norm': 0.4680057466030121, 'learning_rate': 1.4066666666666669e-05, 'epoch': 29.67}


 30%|███       | 900/3000 [01:00<01:42, 20.58it/s]

{'loss': 0.0218, 'grad_norm': 0.08173265308141708, 'learning_rate': 1.4e-05, 'epoch': 30.0}


                                                  
 30%|███       | 900/3000 [01:00<01:42, 20.58it/s]

{'eval_loss': 1.739910364151001, 'eval_accuracy': 0.6778846153846154, 'eval_f1': 0.5249945584154107, 'eval_runtime': 0.0941, 'eval_samples_per_second': 2210.039, 'eval_steps_per_second': 21.25, 'epoch': 30.0}


 30%|███       | 914/3000 [01:01<02:14, 15.52it/s]

{'loss': 0.0109, 'grad_norm': 0.20419169962406158, 'learning_rate': 1.3933333333333334e-05, 'epoch': 30.33}


 31%|███       | 923/3000 [01:02<01:52, 18.47it/s]

{'loss': 0.0117, 'grad_norm': 1.5300140380859375, 'learning_rate': 1.3866666666666669e-05, 'epoch': 30.67}


 31%|███       | 930/3000 [01:02<01:47, 19.30it/s]

{'loss': 0.0089, 'grad_norm': 0.04612762853503227, 'learning_rate': 1.38e-05, 'epoch': 31.0}


                                                  
 31%|███       | 930/3000 [01:02<01:47, 19.30it/s]

{'eval_loss': 1.7337182760238647, 'eval_accuracy': 0.6875, 'eval_f1': 0.5626232252182847, 'eval_runtime': 0.1106, 'eval_samples_per_second': 1881.219, 'eval_steps_per_second': 18.089, 'epoch': 31.0}


 31%|███▏      | 942/3000 [01:03<02:08, 15.99it/s]

{'loss': 0.0043, 'grad_norm': 0.4080306887626648, 'learning_rate': 1.3733333333333335e-05, 'epoch': 31.33}


 32%|███▏      | 952/3000 [01:03<01:50, 18.60it/s]

{'loss': 0.014, 'grad_norm': 1.6213172674179077, 'learning_rate': 1.3666666666666667e-05, 'epoch': 31.67}


 32%|███▏      | 960/3000 [01:04<01:39, 20.54it/s]

{'loss': 0.0114, 'grad_norm': 0.06951303780078888, 'learning_rate': 1.3600000000000002e-05, 'epoch': 32.0}


                                                  
 32%|███▏      | 960/3000 [01:04<01:39, 20.54it/s]

{'eval_loss': 1.6654982566833496, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5698518693901732, 'eval_runtime': 0.0916, 'eval_samples_per_second': 2270.897, 'eval_steps_per_second': 21.836, 'epoch': 32.0}


 32%|███▏      | 973/3000 [01:05<02:14, 15.02it/s]

{'loss': 0.0052, 'grad_norm': 0.11986096203327179, 'learning_rate': 1.3533333333333333e-05, 'epoch': 32.33}


 33%|███▎      | 983/3000 [01:06<01:55, 17.53it/s]

{'loss': 0.0142, 'grad_norm': 2.4437413215637207, 'learning_rate': 1.3466666666666668e-05, 'epoch': 32.67}


 33%|███▎      | 990/3000 [01:06<01:35, 21.07it/s]

{'loss': 0.0126, 'grad_norm': 0.9466429948806763, 'learning_rate': 1.3400000000000002e-05, 'epoch': 33.0}


                                                  
 33%|███▎      | 990/3000 [01:06<01:35, 21.07it/s]

{'eval_loss': 1.5899405479431152, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5794280477942606, 'eval_runtime': 0.0989, 'eval_samples_per_second': 2102.773, 'eval_steps_per_second': 20.219, 'epoch': 33.0}


 33%|███▎      | 1004/3000 [01:07<02:06, 15.84it/s]

{'loss': 0.0078, 'grad_norm': 0.18378446996212006, 'learning_rate': 1.3333333333333333e-05, 'epoch': 33.33}


 34%|███▍      | 1013/3000 [01:08<01:47, 18.45it/s]

{'loss': 0.0112, 'grad_norm': 1.3798043727874756, 'learning_rate': 1.3266666666666668e-05, 'epoch': 33.67}


 34%|███▍      | 1020/3000 [01:08<01:43, 19.13it/s]

{'loss': 0.0115, 'grad_norm': 0.14235655963420868, 'learning_rate': 1.3200000000000002e-05, 'epoch': 34.0}


                                                   
 34%|███▍      | 1020/3000 [01:08<01:43, 19.13it/s]

{'eval_loss': 1.6995762586593628, 'eval_accuracy': 0.7067307692307693, 'eval_f1': 0.5927671972563614, 'eval_runtime': 0.1201, 'eval_samples_per_second': 1732.368, 'eval_steps_per_second': 16.657, 'epoch': 34.0}


 34%|███▍      | 1034/3000 [01:09<02:00, 16.38it/s]

{'loss': 0.0036, 'grad_norm': 0.1319739818572998, 'learning_rate': 1.3133333333333334e-05, 'epoch': 34.33}


 35%|███▍      | 1043/3000 [01:10<01:47, 18.13it/s]

{'loss': 0.0084, 'grad_norm': 1.6556532382965088, 'learning_rate': 1.3066666666666668e-05, 'epoch': 34.67}


 35%|███▌      | 1050/3000 [01:10<01:33, 20.93it/s]

{'loss': 0.0172, 'grad_norm': 1.3587597608566284, 'learning_rate': 1.3000000000000001e-05, 'epoch': 35.0}


                                                   
 35%|███▌      | 1050/3000 [01:10<01:33, 20.93it/s]

{'eval_loss': 1.7185454368591309, 'eval_accuracy': 0.6875, 'eval_f1': 0.5513412205706052, 'eval_runtime': 0.0968, 'eval_samples_per_second': 2149.77, 'eval_steps_per_second': 20.671, 'epoch': 35.0}


 35%|███▌      | 1062/3000 [01:11<02:13, 14.54it/s]

{'loss': 0.0165, 'grad_norm': 2.8434717655181885, 'learning_rate': 1.2933333333333334e-05, 'epoch': 35.33}


 36%|███▌      | 1073/3000 [01:12<01:44, 18.50it/s]

{'loss': 0.0071, 'grad_norm': 0.26159486174583435, 'learning_rate': 1.2866666666666667e-05, 'epoch': 35.67}


 36%|███▌      | 1080/3000 [01:12<01:40, 19.10it/s]

{'loss': 0.0057, 'grad_norm': 0.07628188282251358, 'learning_rate': 1.2800000000000001e-05, 'epoch': 36.0}


                                                   
 36%|███▌      | 1080/3000 [01:12<01:40, 19.10it/s]

{'eval_loss': 1.9641928672790527, 'eval_accuracy': 0.6682692307692307, 'eval_f1': 0.5747873279563421, 'eval_runtime': 0.0935, 'eval_samples_per_second': 2224.471, 'eval_steps_per_second': 21.389, 'epoch': 36.0}


 36%|███▋      | 1093/3000 [01:13<01:54, 16.63it/s]

{'loss': 0.0127, 'grad_norm': 1.2837201356887817, 'learning_rate': 1.2733333333333336e-05, 'epoch': 36.33}


 37%|███▋      | 1103/3000 [01:14<01:39, 18.98it/s]

{'loss': 0.0172, 'grad_norm': 0.44920384883880615, 'learning_rate': 1.2666666666666667e-05, 'epoch': 36.67}


 37%|███▋      | 1110/3000 [01:14<01:36, 19.50it/s]

{'loss': 0.0025, 'grad_norm': 0.1595889776945114, 'learning_rate': 1.2600000000000001e-05, 'epoch': 37.0}


                                                   
 37%|███▋      | 1110/3000 [01:14<01:36, 19.50it/s]

{'eval_loss': 1.8619803190231323, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5784176236272045, 'eval_runtime': 0.0917, 'eval_samples_per_second': 2267.032, 'eval_steps_per_second': 21.798, 'epoch': 37.0}


 37%|███▋      | 1123/3000 [01:15<01:52, 16.71it/s]

{'loss': 0.0032, 'grad_norm': 0.08026941865682602, 'learning_rate': 1.2533333333333336e-05, 'epoch': 37.33}


 38%|███▊      | 1134/3000 [01:16<01:37, 19.16it/s]

{'loss': 0.0058, 'grad_norm': 0.10047397762537003, 'learning_rate': 1.2466666666666667e-05, 'epoch': 37.67}


 38%|███▊      | 1140/3000 [01:16<01:27, 21.33it/s]

{'loss': 0.0149, 'grad_norm': 0.2991020083427429, 'learning_rate': 1.2400000000000002e-05, 'epoch': 38.0}


                                                   
 38%|███▊      | 1140/3000 [01:16<01:27, 21.33it/s]

{'eval_loss': 1.7558027505874634, 'eval_accuracy': 0.6875, 'eval_f1': 0.5533668917887993, 'eval_runtime': 0.0929, 'eval_samples_per_second': 2238.259, 'eval_steps_per_second': 21.522, 'epoch': 38.0}


 38%|███▊      | 1154/3000 [01:17<01:51, 16.59it/s]

{'loss': 0.0057, 'grad_norm': 0.10927031934261322, 'learning_rate': 1.2333333333333334e-05, 'epoch': 38.33}


 39%|███▉      | 1163/3000 [01:18<01:36, 18.99it/s]

{'loss': 0.012, 'grad_norm': 6.782764911651611, 'learning_rate': 1.2266666666666667e-05, 'epoch': 38.67}


 39%|███▉      | 1170/3000 [01:18<01:31, 19.98it/s]

{'loss': 0.0047, 'grad_norm': 0.7102252244949341, 'learning_rate': 1.22e-05, 'epoch': 39.0}


                                                   
 39%|███▉      | 1170/3000 [01:18<01:31, 19.98it/s]

{'eval_loss': 1.771148920059204, 'eval_accuracy': 0.6875, 'eval_f1': 0.5946623503202451, 'eval_runtime': 0.0918, 'eval_samples_per_second': 2265.149, 'eval_steps_per_second': 21.78, 'epoch': 39.0}


 39%|███▉      | 1184/3000 [01:19<01:47, 16.87it/s]

{'loss': 0.0111, 'grad_norm': 0.2248019576072693, 'learning_rate': 1.2133333333333335e-05, 'epoch': 39.33}


 40%|███▉      | 1193/3000 [01:19<01:35, 19.02it/s]

{'loss': 0.0184, 'grad_norm': 0.23673680424690247, 'learning_rate': 1.206666666666667e-05, 'epoch': 39.67}


 40%|████      | 1200/3000 [01:20<01:30, 19.93it/s]

{'loss': 0.0037, 'grad_norm': 0.0860794261097908, 'learning_rate': 1.2e-05, 'epoch': 40.0}


                                                   
 40%|████      | 1200/3000 [01:20<01:30, 19.93it/s]

{'eval_loss': 1.8005059957504272, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5929960633908004, 'eval_runtime': 0.0943, 'eval_samples_per_second': 2205.447, 'eval_steps_per_second': 21.206, 'epoch': 40.0}


 40%|████      | 1214/3000 [01:21<01:45, 16.87it/s]

{'loss': 0.0049, 'grad_norm': 0.06256862729787827, 'learning_rate': 1.1933333333333335e-05, 'epoch': 40.33}


 41%|████      | 1222/3000 [01:21<01:34, 18.83it/s]

{'loss': 0.0046, 'grad_norm': 0.08189694583415985, 'learning_rate': 1.186666666666667e-05, 'epoch': 40.67}


 41%|████      | 1230/3000 [01:22<01:29, 19.72it/s]

{'loss': 0.0171, 'grad_norm': 0.1474190056324005, 'learning_rate': 1.18e-05, 'epoch': 41.0}


                                                   
 41%|████      | 1230/3000 [01:22<01:29, 19.72it/s]

{'eval_loss': 1.7542372941970825, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5894908196378785, 'eval_runtime': 0.0919, 'eval_samples_per_second': 2262.828, 'eval_steps_per_second': 21.758, 'epoch': 41.0}


 41%|████▏     | 1243/3000 [01:23<01:44, 16.83it/s]

{'loss': 0.0022, 'grad_norm': 0.14239533245563507, 'learning_rate': 1.1733333333333335e-05, 'epoch': 41.33}


 42%|████▏     | 1254/3000 [01:23<01:30, 19.33it/s]

{'loss': 0.0161, 'grad_norm': 0.8910853862762451, 'learning_rate': 1.1666666666666668e-05, 'epoch': 41.67}


 42%|████▏     | 1260/3000 [01:23<01:20, 21.56it/s]

{'loss': 0.0041, 'grad_norm': 0.036299996078014374, 'learning_rate': 1.16e-05, 'epoch': 42.0}


                                                   
 42%|████▏     | 1260/3000 [01:24<01:20, 21.56it/s]

{'eval_loss': 1.7269933223724365, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5915200532767828, 'eval_runtime': 0.0922, 'eval_samples_per_second': 2255.62, 'eval_steps_per_second': 21.689, 'epoch': 42.0}


 42%|████▏     | 1274/3000 [01:25<01:44, 16.55it/s]

{'loss': 0.003, 'grad_norm': 0.5513314008712769, 'learning_rate': 1.1533333333333334e-05, 'epoch': 42.33}


 43%|████▎     | 1283/3000 [01:25<01:30, 18.95it/s]

{'loss': 0.0149, 'grad_norm': 1.0358250141143799, 'learning_rate': 1.1466666666666668e-05, 'epoch': 42.67}


 43%|████▎     | 1290/3000 [01:25<01:25, 19.94it/s]

{'loss': 0.0049, 'grad_norm': 0.05614381656050682, 'learning_rate': 1.14e-05, 'epoch': 43.0}


                                                   
 43%|████▎     | 1290/3000 [01:25<01:25, 19.94it/s]

{'eval_loss': 1.9264363050460815, 'eval_accuracy': 0.7067307692307693, 'eval_f1': 0.6102797529268118, 'eval_runtime': 0.0924, 'eval_samples_per_second': 2249.977, 'eval_steps_per_second': 21.634, 'epoch': 43.0}


 43%|████▎     | 1304/3000 [01:26<01:40, 16.89it/s]

{'loss': 0.0042, 'grad_norm': 0.05141422897577286, 'learning_rate': 1.1333333333333334e-05, 'epoch': 43.33}


 44%|████▍     | 1313/3000 [01:27<01:28, 19.06it/s]

{'loss': 0.0082, 'grad_norm': 1.5576441287994385, 'learning_rate': 1.1266666666666668e-05, 'epoch': 43.67}


 44%|████▍     | 1320/3000 [01:27<01:23, 20.03it/s]

{'loss': 0.0098, 'grad_norm': 0.4999445378780365, 'learning_rate': 1.1200000000000001e-05, 'epoch': 44.0}


                                                   
 44%|████▍     | 1320/3000 [01:27<01:23, 20.03it/s]

{'eval_loss': 1.8634508848190308, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5916196282650775, 'eval_runtime': 0.092, 'eval_samples_per_second': 2260.442, 'eval_steps_per_second': 21.735, 'epoch': 44.0}


 44%|████▍     | 1334/3000 [01:28<01:39, 16.82it/s]

{'loss': 0.0097, 'grad_norm': 0.03209790959954262, 'learning_rate': 1.1133333333333334e-05, 'epoch': 44.33}


 45%|████▍     | 1342/3000 [01:29<01:28, 18.80it/s]

{'loss': 0.0045, 'grad_norm': 0.19791896641254425, 'learning_rate': 1.1066666666666669e-05, 'epoch': 44.67}


 45%|████▌     | 1350/3000 [01:29<01:23, 19.64it/s]

{'loss': 0.0082, 'grad_norm': 0.37564554810523987, 'learning_rate': 1.1000000000000001e-05, 'epoch': 45.0}


                                                   
 45%|████▌     | 1350/3000 [01:29<01:23, 19.64it/s]

{'eval_loss': 1.7968246936798096, 'eval_accuracy': 0.6875, 'eval_f1': 0.5834872445898719, 'eval_runtime': 0.0945, 'eval_samples_per_second': 2200.912, 'eval_steps_per_second': 21.163, 'epoch': 45.0}


 45%|████▌     | 1363/3000 [01:30<01:37, 16.78it/s]

{'loss': 0.0112, 'grad_norm': 0.1787310242652893, 'learning_rate': 1.0933333333333334e-05, 'epoch': 45.33}


 46%|████▌     | 1374/3000 [01:31<01:24, 19.21it/s]

{'loss': 0.0028, 'grad_norm': 0.21007980406284332, 'learning_rate': 1.0866666666666667e-05, 'epoch': 45.67}


 46%|████▌     | 1380/3000 [01:31<01:15, 21.42it/s]

{'loss': 0.0049, 'grad_norm': 0.020140621811151505, 'learning_rate': 1.0800000000000002e-05, 'epoch': 46.0}


                                                   
 46%|████▌     | 1380/3000 [01:31<01:15, 21.42it/s]

{'eval_loss': 1.9107781648635864, 'eval_accuracy': 0.6875, 'eval_f1': 0.564221778034243, 'eval_runtime': 0.0911, 'eval_samples_per_second': 2283.893, 'eval_steps_per_second': 21.961, 'epoch': 46.0}


 46%|████▋     | 1394/3000 [01:32<01:36, 16.62it/s]

{'loss': 0.0041, 'grad_norm': 1.5996482372283936, 'learning_rate': 1.0733333333333333e-05, 'epoch': 46.33}


 47%|████▋     | 1403/3000 [01:33<01:24, 18.94it/s]

{'loss': 0.01, 'grad_norm': 0.04369673877954483, 'learning_rate': 1.0666666666666667e-05, 'epoch': 46.67}


 47%|████▋     | 1410/3000 [01:33<01:19, 19.95it/s]

{'loss': 0.0059, 'grad_norm': 0.34847962856292725, 'learning_rate': 1.0600000000000002e-05, 'epoch': 47.0}


                                                   
 47%|████▋     | 1410/3000 [01:33<01:19, 19.95it/s]

{'eval_loss': 1.805586338043213, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5861820177825651, 'eval_runtime': 0.0923, 'eval_samples_per_second': 2253.953, 'eval_steps_per_second': 21.673, 'epoch': 47.0}


 47%|████▋     | 1424/3000 [01:34<01:33, 16.87it/s]

{'loss': 0.002, 'grad_norm': 0.06727764010429382, 'learning_rate': 1.0533333333333333e-05, 'epoch': 47.33}


 48%|████▊     | 1433/3000 [01:34<01:22, 19.03it/s]

{'loss': 0.0144, 'grad_norm': 0.04041365534067154, 'learning_rate': 1.0466666666666668e-05, 'epoch': 47.67}


 48%|████▊     | 1440/3000 [01:35<01:18, 19.93it/s]

{'loss': 0.005, 'grad_norm': 0.06758008152246475, 'learning_rate': 1.04e-05, 'epoch': 48.0}


                                                   
 48%|████▊     | 1440/3000 [01:35<01:18, 19.93it/s]

{'eval_loss': 1.854748010635376, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5841489517960107, 'eval_runtime': 0.0915, 'eval_samples_per_second': 2274.39, 'eval_steps_per_second': 21.869, 'epoch': 48.0}


 48%|████▊     | 1454/3000 [01:36<01:32, 16.63it/s]

{'loss': 0.0117, 'grad_norm': 0.5864347219467163, 'learning_rate': 1.0333333333333335e-05, 'epoch': 48.33}


 49%|████▉     | 1463/3000 [01:36<01:21, 18.92it/s]

{'loss': 0.0124, 'grad_norm': 0.07539136707782745, 'learning_rate': 1.0266666666666668e-05, 'epoch': 48.67}


 49%|████▉     | 1470/3000 [01:37<01:16, 19.90it/s]

{'loss': 0.002, 'grad_norm': 0.9607924222946167, 'learning_rate': 1.02e-05, 'epoch': 49.0}


                                                   
 49%|████▉     | 1470/3000 [01:37<01:16, 19.90it/s]

{'eval_loss': 1.892915964126587, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.583800348021905, 'eval_runtime': 0.0923, 'eval_samples_per_second': 2253.528, 'eval_steps_per_second': 21.669, 'epoch': 49.0}


 49%|████▉     | 1484/3000 [01:38<01:29, 16.85it/s]

{'loss': 0.0054, 'grad_norm': 0.05848065763711929, 'learning_rate': 1.0133333333333335e-05, 'epoch': 49.33}


 50%|████▉     | 1493/3000 [01:38<01:19, 19.04it/s]

{'loss': 0.0089, 'grad_norm': 0.03796153515577316, 'learning_rate': 1.0066666666666666e-05, 'epoch': 49.67}


 50%|█████     | 1500/3000 [01:39<01:15, 19.96it/s]

{'loss': 0.0057, 'grad_norm': 0.12277447432279587, 'learning_rate': 1e-05, 'epoch': 50.0}


                                                   
 50%|█████     | 1500/3000 [01:39<01:15, 19.96it/s]

{'eval_loss': 1.8286983966827393, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.587507944681011, 'eval_runtime': 0.0929, 'eval_samples_per_second': 2239.822, 'eval_steps_per_second': 21.537, 'epoch': 50.0}


 50%|█████     | 1514/3000 [01:40<01:27, 16.91it/s]

{'loss': 0.0108, 'grad_norm': 0.5260933041572571, 'learning_rate': 9.933333333333334e-06, 'epoch': 50.33}


 51%|█████     | 1523/3000 [01:40<01:17, 19.05it/s]

{'loss': 0.0034, 'grad_norm': 0.14990365505218506, 'learning_rate': 9.866666666666668e-06, 'epoch': 50.67}


 51%|█████     | 1530/3000 [01:40<01:13, 19.95it/s]

{'loss': 0.0058, 'grad_norm': 0.051733821630477905, 'learning_rate': 9.800000000000001e-06, 'epoch': 51.0}


                                                   
 51%|█████     | 1530/3000 [01:41<01:13, 19.95it/s]

{'eval_loss': 1.8051021099090576, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5902893569981721, 'eval_runtime': 0.0935, 'eval_samples_per_second': 2224.817, 'eval_steps_per_second': 21.392, 'epoch': 51.0}


 51%|█████▏    | 1544/3000 [01:42<01:26, 16.87it/s]

{'loss': 0.0026, 'grad_norm': 0.1613815873861313, 'learning_rate': 9.733333333333334e-06, 'epoch': 51.33}


 52%|█████▏    | 1553/3000 [01:42<01:16, 19.02it/s]

{'loss': 0.0107, 'grad_norm': 0.06709332019090652, 'learning_rate': 9.666666666666667e-06, 'epoch': 51.67}


 52%|█████▏    | 1560/3000 [01:42<01:12, 19.88it/s]

{'loss': 0.0052, 'grad_norm': 0.1479969173669815, 'learning_rate': 9.600000000000001e-06, 'epoch': 52.0}


                                                   
 52%|█████▏    | 1560/3000 [01:42<01:12, 19.88it/s]

{'eval_loss': 1.9228991270065308, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5919904721413902, 'eval_runtime': 0.0913, 'eval_samples_per_second': 2278.679, 'eval_steps_per_second': 21.91, 'epoch': 52.0}


 52%|█████▏    | 1571/3000 [01:43<01:31, 15.61it/s]

{'loss': 0.0022, 'grad_norm': 0.173593208193779, 'learning_rate': 9.533333333333334e-06, 'epoch': 52.33}


 53%|█████▎    | 1583/3000 [01:44<01:14, 18.97it/s]

{'loss': 0.0019, 'grad_norm': 0.9614808559417725, 'learning_rate': 9.466666666666667e-06, 'epoch': 52.67}


 53%|█████▎    | 1590/3000 [01:44<01:10, 19.93it/s]

{'loss': 0.0167, 'grad_norm': 0.03442004323005676, 'learning_rate': 9.4e-06, 'epoch': 53.0}


                                                   
 53%|█████▎    | 1590/3000 [01:44<01:10, 19.93it/s]

{'eval_loss': 1.8780875205993652, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5883024054188266, 'eval_runtime': 0.0925, 'eval_samples_per_second': 2247.589, 'eval_steps_per_second': 21.611, 'epoch': 53.0}


 53%|█████▎    | 1604/3000 [01:45<01:23, 16.71it/s]

{'loss': 0.0025, 'grad_norm': 0.6416522860527039, 'learning_rate': 9.333333333333334e-06, 'epoch': 53.33}


 54%|█████▍    | 1613/3000 [01:46<01:13, 18.90it/s]

{'loss': 0.0122, 'grad_norm': 0.06727659702301025, 'learning_rate': 9.266666666666667e-06, 'epoch': 53.67}


 54%|█████▍    | 1620/3000 [01:46<01:09, 19.84it/s]

{'loss': 0.0075, 'grad_norm': 0.12149997800588608, 'learning_rate': 9.200000000000002e-06, 'epoch': 54.0}


                                                   
 54%|█████▍    | 1620/3000 [01:46<01:09, 19.84it/s]

{'eval_loss': 1.9581665992736816, 'eval_accuracy': 0.6875, 'eval_f1': 0.5888171812629808, 'eval_runtime': 0.0946, 'eval_samples_per_second': 2198.633, 'eval_steps_per_second': 21.141, 'epoch': 54.0}


 54%|█████▍    | 1634/3000 [01:47<01:21, 16.77it/s]

{'loss': 0.0051, 'grad_norm': 0.027186747640371323, 'learning_rate': 9.133333333333335e-06, 'epoch': 54.33}


 55%|█████▍    | 1643/3000 [01:48<01:11, 18.86it/s]

{'loss': 0.0012, 'grad_norm': 0.10905736684799194, 'learning_rate': 9.066666666666667e-06, 'epoch': 54.67}


 55%|█████▌    | 1650/3000 [01:48<01:08, 19.73it/s]

{'loss': 0.0112, 'grad_norm': 0.00922964047640562, 'learning_rate': 9e-06, 'epoch': 55.0}


                                                   
 55%|█████▌    | 1650/3000 [01:48<01:08, 19.73it/s]

{'eval_loss': 1.869537353515625, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5909844999961279, 'eval_runtime': 0.0948, 'eval_samples_per_second': 2193.541, 'eval_steps_per_second': 21.092, 'epoch': 55.0}


 55%|█████▌    | 1664/3000 [01:49<01:19, 16.78it/s]

{'loss': 0.0029, 'grad_norm': 0.024678988382220268, 'learning_rate': 8.933333333333333e-06, 'epoch': 55.33}


 56%|█████▌    | 1673/3000 [01:50<01:09, 19.01it/s]

{'loss': 0.0115, 'grad_norm': 0.05772366002202034, 'learning_rate': 8.866666666666668e-06, 'epoch': 55.67}


 56%|█████▌    | 1680/3000 [01:50<01:06, 19.95it/s]

{'loss': 0.0023, 'grad_norm': 0.02342071197926998, 'learning_rate': 8.8e-06, 'epoch': 56.0}


                                                   
 56%|█████▌    | 1680/3000 [01:50<01:06, 19.95it/s]

{'eval_loss': 1.8757811784744263, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5898258165018666, 'eval_runtime': 0.0911, 'eval_samples_per_second': 2283.821, 'eval_steps_per_second': 21.96, 'epoch': 56.0}


 56%|█████▋    | 1694/3000 [01:51<01:17, 16.95it/s]

{'loss': 0.0053, 'grad_norm': 0.34455758333206177, 'learning_rate': 8.733333333333333e-06, 'epoch': 56.33}


 57%|█████▋    | 1703/3000 [01:52<01:07, 19.09it/s]

{'loss': 0.0094, 'grad_norm': 0.11878693103790283, 'learning_rate': 8.666666666666668e-06, 'epoch': 56.67}


 57%|█████▋    | 1710/3000 [01:52<01:04, 20.05it/s]

{'loss': 0.004, 'grad_norm': 0.11384493857622147, 'learning_rate': 8.6e-06, 'epoch': 57.0}


                                                   
 57%|█████▋    | 1710/3000 [01:52<01:04, 20.05it/s]

{'eval_loss': 2.1583991050720215, 'eval_accuracy': 0.6778846153846154, 'eval_f1': 0.5789557577730294, 'eval_runtime': 0.092, 'eval_samples_per_second': 2260.916, 'eval_steps_per_second': 21.74, 'epoch': 57.0}


 57%|█████▋    | 1724/3000 [01:53<01:15, 16.92it/s]

{'loss': 0.0055, 'grad_norm': 0.344380646944046, 'learning_rate': 8.533333333333335e-06, 'epoch': 57.33}


 58%|█████▊    | 1733/3000 [01:53<01:06, 19.04it/s]

{'loss': 0.004, 'grad_norm': 0.6619100570678711, 'learning_rate': 8.466666666666668e-06, 'epoch': 57.67}


 58%|█████▊    | 1740/3000 [01:54<01:03, 19.99it/s]

{'loss': 0.0144, 'grad_norm': 0.0334264375269413, 'learning_rate': 8.400000000000001e-06, 'epoch': 58.0}


                                                   
 58%|█████▊    | 1740/3000 [01:54<01:03, 19.99it/s]

{'eval_loss': 1.8179962635040283, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5811897860461472, 'eval_runtime': 0.0921, 'eval_samples_per_second': 2259.534, 'eval_steps_per_second': 21.726, 'epoch': 58.0}


 58%|█████▊    | 1754/3000 [01:55<01:13, 16.91it/s]

{'loss': 0.0022, 'grad_norm': 0.08830050379037857, 'learning_rate': 8.333333333333334e-06, 'epoch': 58.33}


 59%|█████▉    | 1763/3000 [01:55<01:04, 19.06it/s]

{'loss': 0.0135, 'grad_norm': 0.7969476580619812, 'learning_rate': 8.266666666666667e-06, 'epoch': 58.67}


 59%|█████▉    | 1770/3000 [01:56<01:01, 20.02it/s]

{'loss': 0.003, 'grad_norm': 0.3987663984298706, 'learning_rate': 8.2e-06, 'epoch': 59.0}


                                                   
 59%|█████▉    | 1770/3000 [01:56<01:01, 20.02it/s]

{'eval_loss': 2.0035340785980225, 'eval_accuracy': 0.6778846153846154, 'eval_f1': 0.5759951289693861, 'eval_runtime': 0.0912, 'eval_samples_per_second': 2280.896, 'eval_steps_per_second': 21.932, 'epoch': 59.0}


 59%|█████▉    | 1784/3000 [01:57<01:12, 16.84it/s]

{'loss': 0.0109, 'grad_norm': 0.07810644060373306, 'learning_rate': 8.133333333333334e-06, 'epoch': 59.33}


 60%|█████▉    | 1793/3000 [01:57<01:03, 18.98it/s]

{'loss': 0.0043, 'grad_norm': 0.6129716038703918, 'learning_rate': 8.066666666666667e-06, 'epoch': 59.67}


 60%|██████    | 1800/3000 [01:57<01:00, 19.94it/s]

{'loss': 0.0066, 'grad_norm': 0.019596343860030174, 'learning_rate': 8.000000000000001e-06, 'epoch': 60.0}


                                                   
 60%|██████    | 1800/3000 [01:58<01:00, 19.94it/s]

{'eval_loss': 1.952121615409851, 'eval_accuracy': 0.6826923076923077, 'eval_f1': 0.5769049231853035, 'eval_runtime': 0.0922, 'eval_samples_per_second': 2256.198, 'eval_steps_per_second': 21.694, 'epoch': 60.0}


 60%|██████    | 1814/3000 [01:59<01:10, 16.86it/s]

{'loss': 0.0069, 'grad_norm': 0.07913335412740707, 'learning_rate': 7.933333333333334e-06, 'epoch': 60.33}


 61%|██████    | 1822/3000 [01:59<01:02, 18.83it/s]

{'loss': 0.0053, 'grad_norm': 0.018999511376023293, 'learning_rate': 7.866666666666667e-06, 'epoch': 60.67}


 61%|██████    | 1830/3000 [01:59<00:59, 19.70it/s]

{'loss': 0.0087, 'grad_norm': 0.04481867700815201, 'learning_rate': 7.800000000000002e-06, 'epoch': 61.0}


                                                   
 61%|██████    | 1830/3000 [01:59<00:59, 19.70it/s]

{'eval_loss': 1.9459176063537598, 'eval_accuracy': 0.6875, 'eval_f1': 0.58002508002508, 'eval_runtime': 0.0946, 'eval_samples_per_second': 2198.461, 'eval_steps_per_second': 21.139, 'epoch': 61.0}


 61%|██████▏   | 1843/3000 [02:00<01:08, 16.89it/s]

{'loss': 0.0102, 'grad_norm': 0.049934618175029755, 'learning_rate': 7.733333333333334e-06, 'epoch': 61.33}


 62%|██████▏   | 1854/3000 [02:01<00:59, 19.35it/s]

{'loss': 0.0043, 'grad_norm': 2.5773675441741943, 'learning_rate': 7.666666666666667e-06, 'epoch': 61.67}


 62%|██████▏   | 1860/3000 [02:01<00:52, 21.53it/s]

{'loss': 0.0061, 'grad_norm': 0.029980499297380447, 'learning_rate': 7.600000000000001e-06, 'epoch': 62.0}


                                                   
 62%|██████▏   | 1860/3000 [02:01<00:52, 21.53it/s]

{'eval_loss': 1.9689050912857056, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.563196553964988, 'eval_runtime': 0.0917, 'eval_samples_per_second': 2267.162, 'eval_steps_per_second': 21.8, 'epoch': 62.0}


 62%|██████▏   | 1874/3000 [02:02<01:07, 16.59it/s]

{'loss': 0.004, 'grad_norm': 0.03965414687991142, 'learning_rate': 7.533333333333334e-06, 'epoch': 62.33}


 63%|██████▎   | 1883/3000 [02:03<00:58, 18.97it/s]

{'loss': 0.0067, 'grad_norm': 0.08075366914272308, 'learning_rate': 7.4666666666666675e-06, 'epoch': 62.67}


 63%|██████▎   | 1890/3000 [02:03<00:55, 19.93it/s]

{'loss': 0.0049, 'grad_norm': 0.03313649445772171, 'learning_rate': 7.4e-06, 'epoch': 63.0}


                                                   
 63%|██████▎   | 1890/3000 [02:03<00:55, 19.93it/s]

{'eval_loss': 1.972814679145813, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5868781949753954, 'eval_runtime': 0.0926, 'eval_samples_per_second': 2247.219, 'eval_steps_per_second': 21.608, 'epoch': 63.0}


 63%|██████▎   | 1904/3000 [02:04<01:04, 16.95it/s]

{'loss': 0.0101, 'grad_norm': 0.18748259544372559, 'learning_rate': 7.333333333333333e-06, 'epoch': 63.33}


 64%|██████▍   | 1913/3000 [02:05<00:56, 19.10it/s]

{'loss': 0.002, 'grad_norm': 0.21436487138271332, 'learning_rate': 7.266666666666668e-06, 'epoch': 63.67}


 64%|██████▍   | 1920/3000 [02:05<00:53, 20.04it/s]

{'loss': 0.0063, 'grad_norm': 0.3880437910556793, 'learning_rate': 7.2000000000000005e-06, 'epoch': 64.0}


                                                   
 64%|██████▍   | 1920/3000 [02:05<00:53, 20.04it/s]

{'eval_loss': 1.9255790710449219, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5829859843172538, 'eval_runtime': 0.1225, 'eval_samples_per_second': 1697.355, 'eval_steps_per_second': 16.321, 'epoch': 64.0}


 64%|██████▍   | 1934/3000 [02:06<01:03, 16.70it/s]

{'loss': 0.0082, 'grad_norm': 0.6015915870666504, 'learning_rate': 7.133333333333334e-06, 'epoch': 64.33}


 65%|██████▍   | 1942/3000 [02:07<00:56, 18.65it/s]

{'loss': 0.0065, 'grad_norm': 2.070690870285034, 'learning_rate': 7.066666666666667e-06, 'epoch': 64.67}


 65%|██████▌   | 1950/3000 [02:07<00:53, 19.59it/s]

{'loss': 0.0019, 'grad_norm': 0.7235963344573975, 'learning_rate': 7e-06, 'epoch': 65.0}


                                                   
 65%|██████▌   | 1950/3000 [02:07<00:53, 19.59it/s]

{'eval_loss': 2.0237972736358643, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5837768508610743, 'eval_runtime': 0.0945, 'eval_samples_per_second': 2201.523, 'eval_steps_per_second': 21.168, 'epoch': 65.0}


 65%|██████▌   | 1963/3000 [02:08<01:01, 16.89it/s]

{'loss': 0.0091, 'grad_norm': 0.05976345017552376, 'learning_rate': 6.9333333333333344e-06, 'epoch': 65.33}


 66%|██████▌   | 1972/3000 [02:08<00:54, 19.03it/s]

{'loss': 0.0034, 'grad_norm': 0.042709946632385254, 'learning_rate': 6.866666666666667e-06, 'epoch': 65.67}


 66%|██████▌   | 1980/3000 [02:09<00:51, 19.72it/s]

{'loss': 0.0045, 'grad_norm': 0.03017040714621544, 'learning_rate': 6.800000000000001e-06, 'epoch': 66.0}


                                                   
 66%|██████▌   | 1980/3000 [02:09<00:51, 19.72it/s]

{'eval_loss': 1.9930593967437744, 'eval_accuracy': 0.6826923076923077, 'eval_f1': 0.5758646965481926, 'eval_runtime': 0.0908, 'eval_samples_per_second': 2291.127, 'eval_steps_per_second': 22.03, 'epoch': 66.0}


 66%|██████▋   | 1993/3000 [02:10<00:59, 16.96it/s]

{'loss': 0.0117, 'grad_norm': 0.013305772095918655, 'learning_rate': 6.733333333333334e-06, 'epoch': 66.33}


 67%|██████▋   | 2002/3000 [02:10<00:52, 19.10it/s]

{'loss': 0.003, 'grad_norm': 0.04874725639820099, 'learning_rate': 6.666666666666667e-06, 'epoch': 66.67}


 67%|██████▋   | 2010/3000 [02:11<00:50, 19.75it/s]

{'loss': 0.0028, 'grad_norm': 0.19395655393600464, 'learning_rate': 6.600000000000001e-06, 'epoch': 67.0}


                                                   
 67%|██████▋   | 2010/3000 [02:11<00:50, 19.75it/s]

{'eval_loss': 1.8812575340270996, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5806751651291747, 'eval_runtime': 0.0912, 'eval_samples_per_second': 2281.629, 'eval_steps_per_second': 21.939, 'epoch': 67.0}


 67%|██████▋   | 2023/3000 [02:12<00:57, 16.95it/s]

{'loss': 0.0032, 'grad_norm': 0.04094114899635315, 'learning_rate': 6.533333333333334e-06, 'epoch': 67.33}


 68%|██████▊   | 2032/3000 [02:12<00:50, 19.11it/s]

{'loss': 0.0108, 'grad_norm': 0.12140759825706482, 'learning_rate': 6.466666666666667e-06, 'epoch': 67.67}


 68%|██████▊   | 2040/3000 [02:13<00:48, 19.83it/s]

{'loss': 0.0039, 'grad_norm': 0.0390770323574543, 'learning_rate': 6.4000000000000006e-06, 'epoch': 68.0}


                                                   
 68%|██████▊   | 2040/3000 [02:13<00:48, 19.83it/s]

{'eval_loss': 2.0491294860839844, 'eval_accuracy': 0.6875, 'eval_f1': 0.5806375470943547, 'eval_runtime': 0.0895, 'eval_samples_per_second': 2324.315, 'eval_steps_per_second': 22.349, 'epoch': 68.0}


 68%|██████▊   | 2053/3000 [02:14<00:55, 17.02it/s]

{'loss': 0.0035, 'grad_norm': 0.23735950887203217, 'learning_rate': 6.333333333333333e-06, 'epoch': 68.33}


 69%|██████▊   | 2062/3000 [02:14<00:48, 19.15it/s]

{'loss': 0.0026, 'grad_norm': 0.0917477086186409, 'learning_rate': 6.266666666666668e-06, 'epoch': 68.67}


 69%|██████▉   | 2070/3000 [02:14<00:46, 19.85it/s]

{'loss': 0.0084, 'grad_norm': 0.1923765391111374, 'learning_rate': 6.200000000000001e-06, 'epoch': 69.0}


                                                   
 69%|██████▉   | 2070/3000 [02:14<00:46, 19.85it/s]

{'eval_loss': 1.987962245941162, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5740579113405977, 'eval_runtime': 0.0927, 'eval_samples_per_second': 2243.589, 'eval_steps_per_second': 21.573, 'epoch': 69.0}


 69%|██████▉   | 2083/3000 [02:15<00:54, 16.96it/s]

{'loss': 0.0034, 'grad_norm': 0.27342987060546875, 'learning_rate': 6.133333333333334e-06, 'epoch': 69.33}


 70%|██████▉   | 2092/3000 [02:16<00:47, 19.11it/s]

{'loss': 0.0103, 'grad_norm': 0.15138040482997894, 'learning_rate': 6.066666666666667e-06, 'epoch': 69.67}


 70%|███████   | 2100/3000 [02:16<00:45, 19.82it/s]

{'loss': 0.0011, 'grad_norm': 0.017834236845374107, 'learning_rate': 6e-06, 'epoch': 70.0}


                                                   
 70%|███████   | 2100/3000 [02:16<00:45, 19.82it/s]

{'eval_loss': 1.9866271018981934, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5971119147589736, 'eval_runtime': 0.0903, 'eval_samples_per_second': 2303.267, 'eval_steps_per_second': 22.147, 'epoch': 70.0}


 70%|███████   | 2113/3000 [02:17<00:52, 16.96it/s]

{'loss': 0.0051, 'grad_norm': 0.5603430271148682, 'learning_rate': 5.933333333333335e-06, 'epoch': 70.33}


 71%|███████   | 2122/3000 [02:18<00:45, 19.12it/s]

{'loss': 0.0061, 'grad_norm': 0.23794516921043396, 'learning_rate': 5.8666666666666675e-06, 'epoch': 70.67}


 71%|███████   | 2130/3000 [02:18<00:43, 19.87it/s]

{'loss': 0.004, 'grad_norm': 0.01934727281332016, 'learning_rate': 5.8e-06, 'epoch': 71.0}


                                                   
 71%|███████   | 2130/3000 [02:18<00:43, 19.87it/s]

{'eval_loss': 1.9782007932662964, 'eval_accuracy': 0.6875, 'eval_f1': 0.5789156232293488, 'eval_runtime': 0.0901, 'eval_samples_per_second': 2308.117, 'eval_steps_per_second': 22.193, 'epoch': 71.0}


 71%|███████▏  | 2143/3000 [02:19<00:50, 16.91it/s]

{'loss': 0.0019, 'grad_norm': 0.07812001556158066, 'learning_rate': 5.733333333333334e-06, 'epoch': 71.33}


 72%|███████▏  | 2152/3000 [02:20<00:44, 19.09it/s]

{'loss': 0.0061, 'grad_norm': 0.08207391202449799, 'learning_rate': 5.666666666666667e-06, 'epoch': 71.67}


 72%|███████▏  | 2160/3000 [02:20<00:42, 19.88it/s]

{'loss': 0.0073, 'grad_norm': 4.657955169677734, 'learning_rate': 5.600000000000001e-06, 'epoch': 72.0}


                                                   
 72%|███████▏  | 2160/3000 [02:20<00:42, 19.88it/s]

{'eval_loss': 1.9773484468460083, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5951114893422587, 'eval_runtime': 0.0899, 'eval_samples_per_second': 2313.853, 'eval_steps_per_second': 22.249, 'epoch': 72.0}


 72%|███████▏  | 2173/3000 [02:21<00:48, 17.05it/s]

{'loss': 0.0073, 'grad_norm': 0.041831281036138535, 'learning_rate': 5.533333333333334e-06, 'epoch': 72.33}


 73%|███████▎  | 2182/3000 [02:22<00:42, 19.24it/s]

{'loss': 0.004, 'grad_norm': 0.3686393201351166, 'learning_rate': 5.466666666666667e-06, 'epoch': 72.67}


 73%|███████▎  | 2190/3000 [02:22<00:40, 19.88it/s]

{'loss': 0.003, 'grad_norm': 0.23831826448440552, 'learning_rate': 5.400000000000001e-06, 'epoch': 73.0}


                                                   
 73%|███████▎  | 2190/3000 [02:22<00:40, 19.88it/s]

{'eval_loss': 1.9068113565444946, 'eval_accuracy': 0.7115384615384616, 'eval_f1': 0.6040339407428015, 'eval_runtime': 0.09, 'eval_samples_per_second': 2311.885, 'eval_steps_per_second': 22.23, 'epoch': 73.0}


 73%|███████▎  | 2202/3000 [02:23<00:47, 16.74it/s]

{'loss': 0.0077, 'grad_norm': 0.16137078404426575, 'learning_rate': 5.333333333333334e-06, 'epoch': 73.33}


 74%|███████▍  | 2214/3000 [02:23<00:40, 19.59it/s]

{'loss': 0.0029, 'grad_norm': 0.1100294291973114, 'learning_rate': 5.2666666666666665e-06, 'epoch': 73.67}


 74%|███████▍  | 2220/3000 [02:24<00:36, 21.63it/s]

{'loss': 0.0044, 'grad_norm': 0.1093137115240097, 'learning_rate': 5.2e-06, 'epoch': 74.0}


                                                   
 74%|███████▍  | 2220/3000 [02:24<00:36, 21.63it/s]

{'eval_loss': 1.9545296430587769, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5852880779351368, 'eval_runtime': 0.0961, 'eval_samples_per_second': 2165.291, 'eval_steps_per_second': 20.82, 'epoch': 74.0}


 74%|███████▍  | 2232/3000 [02:25<00:48, 15.79it/s]

{'loss': 0.0078, 'grad_norm': 0.015566235408186913, 'learning_rate': 5.133333333333334e-06, 'epoch': 74.33}


 75%|███████▍  | 2244/3000 [02:25<00:39, 19.21it/s]

{'loss': 0.0049, 'grad_norm': 0.6778572797775269, 'learning_rate': 5.0666666666666676e-06, 'epoch': 74.67}


 75%|███████▌  | 2250/3000 [02:26<00:34, 21.50it/s]

{'loss': 0.0033, 'grad_norm': 0.12770432233810425, 'learning_rate': 5e-06, 'epoch': 75.0}


                                                   
 75%|███████▌  | 2250/3000 [02:26<00:34, 21.50it/s]

{'eval_loss': 1.9341800212860107, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5836278001298207, 'eval_runtime': 0.0894, 'eval_samples_per_second': 2326.664, 'eval_steps_per_second': 22.372, 'epoch': 75.0}


 75%|███████▌  | 2262/3000 [02:27<00:47, 15.67it/s]

{'loss': 0.0029, 'grad_norm': 0.028651176020503044, 'learning_rate': 4.933333333333334e-06, 'epoch': 75.33}


 76%|███████▌  | 2274/3000 [02:27<00:37, 19.14it/s]

{'loss': 0.0061, 'grad_norm': 0.09592362493276596, 'learning_rate': 4.866666666666667e-06, 'epoch': 75.67}


 76%|███████▌  | 2280/3000 [02:27<00:33, 21.40it/s]

{'loss': 0.0057, 'grad_norm': 0.021428199484944344, 'learning_rate': 4.800000000000001e-06, 'epoch': 76.0}


                                                   
 76%|███████▌  | 2280/3000 [02:28<00:33, 21.40it/s]

{'eval_loss': 1.9523212909698486, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5836278001298207, 'eval_runtime': 0.0866, 'eval_samples_per_second': 2400.537, 'eval_steps_per_second': 23.082, 'epoch': 76.0}


 76%|███████▋  | 2292/3000 [02:29<00:45, 15.72it/s]

{'loss': 0.0021, 'grad_norm': 0.13104411959648132, 'learning_rate': 4.7333333333333335e-06, 'epoch': 76.33}


 77%|███████▋  | 2304/3000 [02:29<00:36, 19.14it/s]

{'loss': 0.0083, 'grad_norm': 0.03065437637269497, 'learning_rate': 4.666666666666667e-06, 'epoch': 76.67}


 77%|███████▋  | 2310/3000 [02:29<00:32, 21.43it/s]

{'loss': 0.004, 'grad_norm': 0.025675704702734947, 'learning_rate': 4.600000000000001e-06, 'epoch': 77.0}


                                                   
 77%|███████▋  | 2310/3000 [02:29<00:32, 21.43it/s]

{'eval_loss': 1.949190378189087, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5842720876162135, 'eval_runtime': 0.0899, 'eval_samples_per_second': 2313.227, 'eval_steps_per_second': 22.243, 'epoch': 77.0}


 77%|███████▋  | 2324/3000 [02:30<00:40, 16.56it/s]

{'loss': 0.0016, 'grad_norm': 0.016572436317801476, 'learning_rate': 4.533333333333334e-06, 'epoch': 77.33}


 78%|███████▊  | 2333/3000 [02:31<00:34, 19.09it/s]

{'loss': 0.0104, 'grad_norm': 0.014489134773612022, 'learning_rate': 4.4666666666666665e-06, 'epoch': 77.67}


 78%|███████▊  | 2340/3000 [02:31<00:32, 20.03it/s]

{'loss': 0.0037, 'grad_norm': 0.013870107010006905, 'learning_rate': 4.4e-06, 'epoch': 78.0}


                                                   
 78%|███████▊  | 2340/3000 [02:31<00:32, 20.03it/s]

{'eval_loss': 1.9261574745178223, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5967021526778576, 'eval_runtime': 0.0896, 'eval_samples_per_second': 2322.545, 'eval_steps_per_second': 22.332, 'epoch': 78.0}


 78%|███████▊  | 2354/3000 [02:32<00:37, 17.10it/s]

{'loss': 0.0034, 'grad_norm': 0.015688998624682426, 'learning_rate': 4.333333333333334e-06, 'epoch': 78.33}


 79%|███████▉  | 2363/3000 [02:33<00:33, 19.24it/s]

{'loss': 0.0028, 'grad_norm': 0.13500086963176727, 'learning_rate': 4.266666666666668e-06, 'epoch': 78.67}


 79%|███████▉  | 2370/3000 [02:33<00:31, 20.20it/s]

{'loss': 0.0084, 'grad_norm': 0.22702601552009583, 'learning_rate': 4.2000000000000004e-06, 'epoch': 79.0}


                                                   
 79%|███████▉  | 2370/3000 [02:33<00:31, 20.20it/s]

{'eval_loss': 1.9390790462493896, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.6051983348582503, 'eval_runtime': 0.0913, 'eval_samples_per_second': 2278.947, 'eval_steps_per_second': 21.913, 'epoch': 79.0}


 79%|███████▉  | 2384/3000 [02:34<00:36, 16.95it/s]

{'loss': 0.0062, 'grad_norm': 0.06717626750469208, 'learning_rate': 4.133333333333333e-06, 'epoch': 79.33}


 80%|███████▉  | 2393/3000 [02:35<00:31, 19.16it/s]

{'loss': 0.0062, 'grad_norm': 0.490791916847229, 'learning_rate': 4.066666666666667e-06, 'epoch': 79.67}


 80%|████████  | 2400/3000 [02:35<00:29, 20.16it/s]

{'loss': 0.002, 'grad_norm': 0.05299678072333336, 'learning_rate': 4.000000000000001e-06, 'epoch': 80.0}


                                                   
 80%|████████  | 2400/3000 [02:35<00:29, 20.16it/s]

{'eval_loss': 1.9858622550964355, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.6056511740358513, 'eval_runtime': 0.0914, 'eval_samples_per_second': 2276.2, 'eval_steps_per_second': 21.887, 'epoch': 80.0}


 80%|████████  | 2414/3000 [02:36<00:34, 17.04it/s]

{'loss': 0.0029, 'grad_norm': 0.013063921593129635, 'learning_rate': 3.9333333333333335e-06, 'epoch': 80.33}


 81%|████████  | 2423/3000 [02:37<00:29, 19.24it/s]

{'loss': 0.0058, 'grad_norm': 0.08913090080022812, 'learning_rate': 3.866666666666667e-06, 'epoch': 80.67}


 81%|████████  | 2430/3000 [02:37<00:28, 20.22it/s]

{'loss': 0.0073, 'grad_norm': 0.21118037402629852, 'learning_rate': 3.8000000000000005e-06, 'epoch': 81.0}


                                                   
 81%|████████  | 2430/3000 [02:37<00:28, 20.22it/s]

{'eval_loss': 1.8643025159835815, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5977334357862065, 'eval_runtime': 0.0906, 'eval_samples_per_second': 2295.666, 'eval_steps_per_second': 22.074, 'epoch': 81.0}


 81%|████████▏ | 2444/3000 [02:38<00:32, 17.10it/s]

{'loss': 0.004, 'grad_norm': 1.3236230611801147, 'learning_rate': 3.7333333333333337e-06, 'epoch': 81.33}


 82%|████████▏ | 2453/3000 [02:38<00:28, 19.11it/s]

{'loss': 0.0047, 'grad_norm': 0.07514643669128418, 'learning_rate': 3.6666666666666666e-06, 'epoch': 81.67}


 82%|████████▏ | 2460/3000 [02:39<00:26, 20.08it/s]

{'loss': 0.0084, 'grad_norm': 0.009029104374349117, 'learning_rate': 3.6000000000000003e-06, 'epoch': 82.0}


                                                   
 82%|████████▏ | 2460/3000 [02:39<00:26, 20.08it/s]

{'eval_loss': 1.9745973348617554, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5835920038706417, 'eval_runtime': 0.0902, 'eval_samples_per_second': 2305.897, 'eval_steps_per_second': 22.172, 'epoch': 82.0}


 82%|████████▏ | 2474/3000 [02:40<00:30, 17.10it/s]

{'loss': 0.0009, 'grad_norm': 0.023509789258241653, 'learning_rate': 3.5333333333333335e-06, 'epoch': 82.33}


 83%|████████▎ | 2483/3000 [02:40<00:26, 19.27it/s]

{'loss': 0.0106, 'grad_norm': 0.0208403579890728, 'learning_rate': 3.4666666666666672e-06, 'epoch': 82.67}


 83%|████████▎ | 2490/3000 [02:41<00:25, 20.18it/s]

{'loss': 0.0011, 'grad_norm': 0.005796823650598526, 'learning_rate': 3.4000000000000005e-06, 'epoch': 83.0}


                                                   
 83%|████████▎ | 2490/3000 [02:41<00:25, 20.18it/s]

{'eval_loss': 1.9749127626419067, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5835920038706417, 'eval_runtime': 0.0849, 'eval_samples_per_second': 2450.192, 'eval_steps_per_second': 23.56, 'epoch': 83.0}


 83%|████████▎ | 2504/3000 [02:42<00:29, 17.08it/s]

{'loss': 0.0025, 'grad_norm': 0.6939948797225952, 'learning_rate': 3.3333333333333333e-06, 'epoch': 83.33}


 84%|████████▍ | 2513/3000 [02:42<00:25, 19.29it/s]

{'loss': 0.0032, 'grad_norm': 0.08132544904947281, 'learning_rate': 3.266666666666667e-06, 'epoch': 83.67}


 84%|████████▍ | 2520/3000 [02:42<00:23, 20.04it/s]

{'loss': 0.0106, 'grad_norm': 13.413542747497559, 'learning_rate': 3.2000000000000003e-06, 'epoch': 84.0}


                                                   
 84%|████████▍ | 2520/3000 [02:42<00:23, 20.04it/s]

{'eval_loss': 1.9616141319274902, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.5879568044588251, 'eval_runtime': 0.0899, 'eval_samples_per_second': 2314.651, 'eval_steps_per_second': 22.256, 'epoch': 84.0}


 84%|████████▍ | 2531/3000 [02:43<00:29, 15.86it/s]

{'loss': 0.0029, 'grad_norm': 0.013010604307055473, 'learning_rate': 3.133333333333334e-06, 'epoch': 84.33}


 85%|████████▍ | 2543/3000 [02:44<00:23, 19.26it/s]

{'loss': 0.0039, 'grad_norm': 0.8449860215187073, 'learning_rate': 3.066666666666667e-06, 'epoch': 84.67}


 85%|████████▌ | 2550/3000 [02:44<00:22, 20.19it/s]

{'loss': 0.0104, 'grad_norm': 0.0062835123389959335, 'learning_rate': 3e-06, 'epoch': 85.0}


                                                   
 85%|████████▌ | 2550/3000 [02:44<00:22, 20.19it/s]

{'eval_loss': 1.9389913082122803, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5818506945884178, 'eval_runtime': 0.093, 'eval_samples_per_second': 2235.363, 'eval_steps_per_second': 21.494, 'epoch': 85.0}


 85%|████████▌ | 2564/3000 [02:45<00:25, 16.94it/s]

{'loss': 0.0019, 'grad_norm': 0.038805700838565826, 'learning_rate': 2.9333333333333338e-06, 'epoch': 85.33}


 86%|████████▌ | 2573/3000 [02:46<00:22, 19.20it/s]

{'loss': 0.0007, 'grad_norm': 0.01649167574942112, 'learning_rate': 2.866666666666667e-06, 'epoch': 85.67}


 86%|████████▌ | 2580/3000 [02:46<00:20, 20.21it/s]

{'loss': 0.0108, 'grad_norm': 0.05829378962516785, 'learning_rate': 2.8000000000000003e-06, 'epoch': 86.0}


                                                   
 86%|████████▌ | 2580/3000 [02:46<00:20, 20.21it/s]

{'eval_loss': 1.9470757246017456, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.6058805960668272, 'eval_runtime': 0.0901, 'eval_samples_per_second': 2309.798, 'eval_steps_per_second': 22.21, 'epoch': 86.0}


 86%|████████▋ | 2594/3000 [02:47<00:23, 17.05it/s]

{'loss': 0.0027, 'grad_norm': 0.12510602176189423, 'learning_rate': 2.7333333333333336e-06, 'epoch': 86.33}


 87%|████████▋ | 2603/3000 [02:48<00:20, 19.27it/s]

{'loss': 0.0043, 'grad_norm': 0.020158177241683006, 'learning_rate': 2.666666666666667e-06, 'epoch': 86.67}


 87%|████████▋ | 2610/3000 [02:48<00:19, 20.24it/s]

{'loss': 0.0067, 'grad_norm': 0.007424354087561369, 'learning_rate': 2.6e-06, 'epoch': 87.0}


                                                   
 87%|████████▋ | 2610/3000 [02:48<00:19, 20.24it/s]

{'eval_loss': 2.0341877937316895, 'eval_accuracy': 0.7067307692307693, 'eval_f1': 0.6111809141327214, 'eval_runtime': 0.0908, 'eval_samples_per_second': 2290.868, 'eval_steps_per_second': 22.028, 'epoch': 87.0}


 87%|████████▋ | 2624/3000 [02:49<00:22, 17.04it/s]

{'loss': 0.0054, 'grad_norm': 0.13718447089195251, 'learning_rate': 2.5333333333333338e-06, 'epoch': 87.33}


 88%|████████▊ | 2633/3000 [02:50<00:19, 19.18it/s]

{'loss': 0.0024, 'grad_norm': 0.5776841044425964, 'learning_rate': 2.466666666666667e-06, 'epoch': 87.67}


 88%|████████▊ | 2640/3000 [02:50<00:17, 20.19it/s]

{'loss': 0.0079, 'grad_norm': 0.008943002671003342, 'learning_rate': 2.4000000000000003e-06, 'epoch': 88.0}


                                                   
 88%|████████▊ | 2640/3000 [02:50<00:17, 20.19it/s]

{'eval_loss': 2.007291078567505, 'eval_accuracy': 0.7067307692307693, 'eval_f1': 0.6094613521084109, 'eval_runtime': 0.0923, 'eval_samples_per_second': 2254.082, 'eval_steps_per_second': 21.674, 'epoch': 88.0}


 88%|████████▊ | 2654/3000 [02:51<00:20, 16.96it/s]

{'loss': 0.0012, 'grad_norm': 0.041784483939409256, 'learning_rate': 2.3333333333333336e-06, 'epoch': 88.33}


 89%|████████▉ | 2663/3000 [02:51<00:17, 19.14it/s]

{'loss': 0.0039, 'grad_norm': 0.06596691906452179, 'learning_rate': 2.266666666666667e-06, 'epoch': 88.67}


 89%|████████▉ | 2670/3000 [02:52<00:16, 20.11it/s]

{'loss': 0.009, 'grad_norm': 0.03258652240037918, 'learning_rate': 2.2e-06, 'epoch': 89.0}


                                                   
 89%|████████▉ | 2670/3000 [02:52<00:16, 20.11it/s]

{'eval_loss': 2.0182924270629883, 'eval_accuracy': 0.7019230769230769, 'eval_f1': 0.6056511740358513, 'eval_runtime': 0.09, 'eval_samples_per_second': 2311.689, 'eval_steps_per_second': 22.228, 'epoch': 89.0}


 89%|████████▉ | 2683/3000 [02:53<00:18, 16.72it/s]

{'loss': 0.0028, 'grad_norm': 0.17204134166240692, 'learning_rate': 2.133333333333334e-06, 'epoch': 89.33}


 90%|████████▉ | 2692/3000 [02:53<00:16, 19.09it/s]

{'loss': 0.0092, 'grad_norm': 0.034794822335243225, 'learning_rate': 2.0666666666666666e-06, 'epoch': 89.67}


 90%|█████████ | 2700/3000 [02:54<00:15, 19.86it/s]

{'loss': 0.0026, 'grad_norm': 0.011572781950235367, 'learning_rate': 2.0000000000000003e-06, 'epoch': 90.0}


                                                   
 90%|█████████ | 2700/3000 [02:54<00:15, 19.86it/s]

{'eval_loss': 1.9903634786605835, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.6013869790340379, 'eval_runtime': 0.0942, 'eval_samples_per_second': 2206.914, 'eval_steps_per_second': 21.22, 'epoch': 90.0}


 90%|█████████ | 2713/3000 [02:55<00:17, 16.78it/s]

{'loss': 0.0046, 'grad_norm': 0.04652196168899536, 'learning_rate': 1.9333333333333336e-06, 'epoch': 90.33}


 91%|█████████ | 2722/3000 [02:55<00:14, 19.07it/s]

{'loss': 0.0023, 'grad_norm': 0.016509132459759712, 'learning_rate': 1.8666666666666669e-06, 'epoch': 90.67}


 91%|█████████ | 2730/3000 [02:55<00:13, 19.88it/s]

{'loss': 0.0092, 'grad_norm': 0.04037324711680412, 'learning_rate': 1.8000000000000001e-06, 'epoch': 91.0}


                                                   
 91%|█████████ | 2730/3000 [02:56<00:13, 19.88it/s]

{'eval_loss': 2.0110511779785156, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.6025485951956541, 'eval_runtime': 0.0903, 'eval_samples_per_second': 2304.459, 'eval_steps_per_second': 22.158, 'epoch': 91.0}


 91%|█████████▏| 2743/3000 [02:57<00:15, 17.00it/s]

{'loss': 0.0031, 'grad_norm': 0.5490327477455139, 'learning_rate': 1.7333333333333336e-06, 'epoch': 91.33}


 92%|█████████▏| 2752/3000 [02:57<00:12, 19.18it/s]

{'loss': 0.0023, 'grad_norm': 0.028028016909956932, 'learning_rate': 1.6666666666666667e-06, 'epoch': 91.67}


 92%|█████████▏| 2760/3000 [02:57<00:12, 19.88it/s]

{'loss': 0.0068, 'grad_norm': 0.011473036371171474, 'learning_rate': 1.6000000000000001e-06, 'epoch': 92.0}


                                                   
 92%|█████████▏| 2760/3000 [02:57<00:12, 19.88it/s]

{'eval_loss': 1.9905949831008911, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5798369501155879, 'eval_runtime': 0.0903, 'eval_samples_per_second': 2302.926, 'eval_steps_per_second': 22.144, 'epoch': 92.0}


 92%|█████████▏| 2773/3000 [02:58<00:13, 16.94it/s]

{'loss': 0.0018, 'grad_norm': 0.052961014211177826, 'learning_rate': 1.5333333333333334e-06, 'epoch': 92.33}


 93%|█████████▎| 2782/3000 [02:59<00:11, 19.15it/s]

{'loss': 0.0081, 'grad_norm': 0.13312967121601105, 'learning_rate': 1.4666666666666669e-06, 'epoch': 92.67}


 93%|█████████▎| 2790/3000 [02:59<00:10, 19.86it/s]

{'loss': 0.0017, 'grad_norm': 0.01173117570579052, 'learning_rate': 1.4000000000000001e-06, 'epoch': 93.0}


                                                   
 93%|█████████▎| 2790/3000 [02:59<00:10, 19.86it/s]

{'eval_loss': 1.9990310668945312, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5829302895247167, 'eval_runtime': 0.0926, 'eval_samples_per_second': 2245.316, 'eval_steps_per_second': 21.59, 'epoch': 93.0}


 93%|█████████▎| 2803/3000 [03:00<00:11, 16.98it/s]

{'loss': 0.0064, 'grad_norm': 0.03976726159453392, 'learning_rate': 1.3333333333333334e-06, 'epoch': 93.33}


 94%|█████████▎| 2812/3000 [03:01<00:09, 19.16it/s]

{'loss': 0.004, 'grad_norm': 0.09851784259080887, 'learning_rate': 1.2666666666666669e-06, 'epoch': 93.67}


 94%|█████████▍| 2820/3000 [03:01<00:09, 19.86it/s]

{'loss': 0.0024, 'grad_norm': 0.0493832528591156, 'learning_rate': 1.2000000000000002e-06, 'epoch': 94.0}


                                                   
 94%|█████████▍| 2820/3000 [03:01<00:09, 19.86it/s]

{'eval_loss': 1.9983805418014526, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5798369501155879, 'eval_runtime': 0.0897, 'eval_samples_per_second': 2318.36, 'eval_steps_per_second': 22.292, 'epoch': 94.0}


 94%|█████████▍| 2833/3000 [03:02<00:09, 16.96it/s]

{'loss': 0.0017, 'grad_norm': 0.01984296925365925, 'learning_rate': 1.1333333333333334e-06, 'epoch': 94.33}


 95%|█████████▍| 2842/3000 [03:03<00:08, 19.15it/s]

{'loss': 0.0096, 'grad_norm': 0.012981165200471878, 'learning_rate': 1.066666666666667e-06, 'epoch': 94.67}


 95%|█████████▌| 2850/3000 [03:03<00:07, 19.86it/s]

{'loss': 0.0028, 'grad_norm': 0.014235852286219597, 'learning_rate': 1.0000000000000002e-06, 'epoch': 95.0}


                                                   
 95%|█████████▌| 2850/3000 [03:03<00:07, 19.86it/s]

{'eval_loss': 1.9924594163894653, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5798369501155879, 'eval_runtime': 0.0857, 'eval_samples_per_second': 2427.583, 'eval_steps_per_second': 23.342, 'epoch': 95.0}


 95%|█████████▌| 2863/3000 [03:04<00:08, 16.92it/s]

{'loss': 0.0012, 'grad_norm': 0.012460772879421711, 'learning_rate': 9.333333333333334e-07, 'epoch': 95.33}


 96%|█████████▌| 2874/3000 [03:05<00:06, 19.49it/s]

{'loss': 0.0069, 'grad_norm': 0.7064154744148254, 'learning_rate': 8.666666666666668e-07, 'epoch': 95.67}


 96%|█████████▌| 2880/3000 [03:05<00:05, 21.61it/s]

{'loss': 0.0052, 'grad_norm': 0.37839066982269287, 'learning_rate': 8.000000000000001e-07, 'epoch': 96.0}


                                                   
 96%|█████████▌| 2880/3000 [03:05<00:05, 21.61it/s]

{'eval_loss': 1.9782201051712036, 'eval_accuracy': 0.6875, 'eval_f1': 0.5773415934418463, 'eval_runtime': 0.0975, 'eval_samples_per_second': 2133.399, 'eval_steps_per_second': 20.513, 'epoch': 96.0}


 96%|█████████▋| 2892/3000 [03:06<00:06, 15.66it/s]

{'loss': 0.0023, 'grad_norm': 0.08654039353132248, 'learning_rate': 7.333333333333334e-07, 'epoch': 96.33}


 97%|█████████▋| 2904/3000 [03:06<00:05, 19.09it/s]

{'loss': 0.0021, 'grad_norm': 0.25193917751312256, 'learning_rate': 6.666666666666667e-07, 'epoch': 96.67}


 97%|█████████▋| 2910/3000 [03:07<00:04, 21.41it/s]

{'loss': 0.0089, 'grad_norm': 0.04202881455421448, 'learning_rate': 6.000000000000001e-07, 'epoch': 97.0}


                                                   
 97%|█████████▋| 2910/3000 [03:07<00:04, 21.41it/s]

{'eval_loss': 2.0025949478149414, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.5829302895247167, 'eval_runtime': 0.0868, 'eval_samples_per_second': 2396.969, 'eval_steps_per_second': 23.048, 'epoch': 97.0}


 97%|█████████▋| 2922/3000 [03:08<00:05, 14.37it/s]

{'loss': 0.008, 'grad_norm': 0.05810040608048439, 'learning_rate': 5.333333333333335e-07, 'epoch': 97.33}


 98%|█████████▊| 2931/3000 [03:08<00:03, 17.87it/s]

{'loss': 0.0038, 'grad_norm': 0.7345990538597107, 'learning_rate': 4.666666666666667e-07, 'epoch': 97.67}


 98%|█████████▊| 2940/3000 [03:09<00:02, 20.90it/s]

{'loss': 0.0015, 'grad_norm': 0.1142486035823822, 'learning_rate': 4.0000000000000003e-07, 'epoch': 98.0}


                                                   
 98%|█████████▊| 2940/3000 [03:09<00:02, 20.90it/s]

{'eval_loss': 2.0216917991638184, 'eval_accuracy': 0.6971153846153846, 'eval_f1': 0.584023720901549, 'eval_runtime': 0.0915, 'eval_samples_per_second': 2274.414, 'eval_steps_per_second': 21.869, 'epoch': 98.0}


 98%|█████████▊| 2952/3000 [03:10<00:03, 15.68it/s]

{'loss': 0.0079, 'grad_norm': 0.3985411822795868, 'learning_rate': 3.3333333333333335e-07, 'epoch': 98.33}


 99%|█████████▉| 2964/3000 [03:10<00:01, 19.08it/s]

{'loss': 0.0017, 'grad_norm': 0.11121676862239838, 'learning_rate': 2.666666666666667e-07, 'epoch': 98.67}


 99%|█████████▉| 2970/3000 [03:11<00:01, 21.35it/s]

{'loss': 0.0037, 'grad_norm': 0.028445353731513023, 'learning_rate': 2.0000000000000002e-07, 'epoch': 99.0}


                                                   
 99%|█████████▉| 2970/3000 [03:11<00:01, 21.35it/s]

{'eval_loss': 2.0119740962982178, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5809239046674044, 'eval_runtime': 0.0911, 'eval_samples_per_second': 2283.654, 'eval_steps_per_second': 21.958, 'epoch': 99.0}


 99%|█████████▉| 2982/3000 [03:12<00:01, 15.73it/s]

{'loss': 0.0068, 'grad_norm': 0.13214316964149475, 'learning_rate': 1.3333333333333336e-07, 'epoch': 99.33}


100%|█████████▉| 2994/3000 [03:12<00:00, 19.10it/s]

{'loss': 0.0058, 'grad_norm': 0.5463905334472656, 'learning_rate': 6.666666666666668e-08, 'epoch': 99.67}


100%|██████████| 3000/3000 [03:13<00:00, 21.30it/s]

{'loss': 0.0044, 'grad_norm': 0.012258490547537804, 'learning_rate': 0.0, 'epoch': 100.0}


                                                   
100%|██████████| 3000/3000 [03:13<00:00, 21.30it/s]

{'eval_loss': 2.00961971282959, 'eval_accuracy': 0.6923076923076923, 'eval_f1': 0.5809239046674044, 'eval_runtime': 0.0853, 'eval_samples_per_second': 2439.033, 'eval_steps_per_second': 23.452, 'epoch': 100.0}


100%|██████████| 3000/3000 [03:13<00:00, 15.50it/s]

{'train_runtime': 193.5287, 'train_samples_per_second': 963.681, 'train_steps_per_second': 15.502, 'train_loss': 0.08951363994088024, 'epoch': 100.0}





TrainOutput(global_step=3000, training_loss=0.08951363994088024, metrics={'train_runtime': 193.5287, 'train_samples_per_second': 963.681, 'train_steps_per_second': 15.502, 'total_flos': 3131102797056000.0, 'train_loss': 0.08951363994088024, 'epoch': 100.0})

## Step10 Model Evaluation

In [12]:
trainer.evaluate(tokenized_datasets["test"])

100%|██████████| 2/2 [00:00<00:00, 65.49it/s]


{'eval_loss': 1.4868144989013672,
 'eval_accuracy': 0.7163461538461539,
 'eval_f1': 0.6139806096773521,
 'eval_runtime': 0.2559,
 'eval_samples_per_second': 812.677,
 'eval_steps_per_second': 7.814,
 'epoch': 100.0}

## Step11 Save Model

In [13]:
# 保存模型、tokenizer、配置文件
trainer.save_model("my_model_label2")
tokenizer.save_pretrained("my_model_label2")

('my_model_label2\\tokenizer_config.json',
 'my_model_label2\\special_tokens_map.json',
 'my_model_label2\\vocab.txt',
 'my_model_label2\\added_tokens.json',
 'my_model_label2\\tokenizer.json')

In [23]:
import pandas as pd
import numpy as np
# 读取 Excel 文件
file_path = 'predict.xls'  # 替换为你的文件路径
df = pd.read_excel(file_path)

# 提取需要预测的列
texts_to_predict = df['内容'].tolist()  # 将内容列转换为列表


In [24]:
model_name = 'my_model_label2'  # 替换为你选择的模型名称，例如 'bert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 设置模型为评估模式
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [25]:
predictions = []

# 对每个文本进行预测
for text in texts_to_predict:
    # 确保输入是字符串
    if isinstance(text, str):
        # 对文本进行编码
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        with torch.no_grad():  # 不需要计算梯度
            outputs = model(**inputs)
        
        # 获取预测的类别 (假设使用 softmax)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        
        predictions.append(predicted_class)
    else:
        predictions.append(0)
        print(f"跳过非字符串输入: {text}")

跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串输入: nan
跳过非字符串

In [26]:
# 将预测结果添加到 DataFrame
#df['1=支持中医黑 2=反对中医黑 0=无关'] = predictions
df['主题框架 1=科学论证 2=政治倾向 3=阴谋论 4=文化认同 5=个人经历 0=无关'] = predictions
# 保存到新的 Excel 文件
output_file_path = 'label2.xlsx'  # 输出文件路径
df.to_excel(output_file_path, index=False)