In [1]:
'''
fine-tune GPT2
Author: Liu Jin Cheng
'''
from transformers import GPT2ForSequenceClassification,GPT2Tokenizer,get_linear_schedule_with_warmup
from datasets import load_dataset,Dataset
import os
import random
import numpy as np
import evaluate
import torch
import argparse
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.functional import F
from torch.cuda.amp import autocast as autocast,GradScaler
import pandas as pd
from sklearn.metrics import f1_score
from torch.optim import AdamW
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = os.path.join("..","dataset")
train_batch_size = 2
eval_batch_size = 4
lr = 5e-5
num_epochs = 5
model_name = "../../gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name,bos_token = "<|startoftext|>",eos_token = "<|endoftext|>",pad_token = "<|pad|>",cls_token = "<|cls|>",sep_token = "<|sep|>" ,model_max_length = 1024)
model = GPT2ForSequenceClassification.from_pretrained(model_name,num_labels = 2)
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ../../gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(50261, 768)

In [3]:

def compute_metrics(eval_pred):
        metirc = evaluate.load("accuracy")
        logits , labels = eval_pred
        predictions = np.argmax(logits,axis=-1)
        return metirc.compute(predictions=predictions,references=labels)


def tokenize_function(examples):
        return tokenizer(examples["text"],truncation = True,padding=True) 

def collate_fn(examples):
    return tokenizer.pad(examples, padding="max_length", return_tensors="pt")

In [4]:
dftrain = pd.read_pickle(os.path.join(data_path,"train.pkl"))
dfvalid = pd.read_pickle(os.path.join(data_path,"valid.pkl"))
traindatasets = Dataset.from_pandas(dftrain)
validdatasets = Dataset.from_pandas(dfvalid)
train_tokenized_dataset = traindatasets.map(tokenize_function,batched=True,remove_columns=["text","__index_level_0__"]).rename_column("label","labels")
valid_tokenized_dataset = validdatasets.map(tokenize_function,batched=True,remove_columns=["text","__index_level_0__"]).rename_column("label","labels")
train_dataloader = DataLoader(train_tokenized_dataset,shuffle=True,collate_fn=collate_fn,batch_size = train_batch_size)
eval_dataloader = DataLoader(valid_tokenized_dataset , collate_fn=collate_fn,batch_size = eval_batch_size)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

                                                                  

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=66, bias=False)
)

In [5]:
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)


In [None]:
model.to(device)
max_eval_acc = 0
iter_to_accumlate = 4
epochloss = []
trainlogdf = pd.DataFrame(columns=["step","trainloss","validloss","acc","f1-score"])
rowindex = 0
eval_no_progress_count = 0

In [6]:
for epoch in range(num_epochs):
    model.train()
    allloss = 0
    for step,batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss/iter_to_accumlate
        loss.backward()
        allloss += loss.item()
        trainlogdf.loc[rowindex] = [rowindex,loss.item(),None,None,None]
        rowindex += 1
        epochloss.append(loss.item())
        if (step+1)%iter_to_accumlate==0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        if (step+1)%(50*iter_to_accumlate) == 0:
            print("epoch",epoch,"step",step,"loss",loss,sep=" ")
        
        if (step+1)%800 == 0:
            count = 0
            model.eval()
            validloss = 0
            preds = []
            labels = []
            for evalstep,batch in enumerate(tqdm(eval_dataloader)):
                labels += batch['labels'].cpu()
                batch.to(device)
                with torch.no_grad():
                    output = model(**batch)
                validloss += output.loss.item()
                pred = torch.argmax(F.softmax(output.logits.cpu(),dim=1),dim=1)
                preds += pred
                count += int(sum(batch['labels'].cpu() == pred))
            model.train()
            eval_acc = count/4000
            trainlogdf.loc[rowindex-1,"validloss"] = validloss/4000
            trainlogdf.loc[rowindex-1,"acc"] = eval_acc
            trainlogdf.loc[rowindex-1,"f1-score"] = f1_score(np.array(labels),np.array(preds),average="macro")
            print("epoch ",epoch,"step",step,"acc ",eval_acc)
            if eval_acc < max_eval_acc:
                eval_no_progress_count += 1
                if eval_no_progress_count >=6:
                    print("Early Stopping:Epoch",epoch," Step",step,"Eval_acc",eval_acc,sep=" ")
                    break
                else:
                    print("Early Stopping record count",eval_no_progress_count,"Max eval acc",max_eval_acc,sep=" ")
            if eval_acc > max_eval_acc:
                max_eval_acc = eval_acc
                print("Update Max eval acc",max_eval_acc)
                eval_no_progress_count = 0
                model.save_pretrained("GPT2saved_models")
                torch.save(model.state_dict(),os.path.join("checkpoint","model.bin"))
                torch.save(optimizer.state_dict(),os.path.join("checkpoint","optimizer.bin"))
                torch.save(lr_scheduler.state_dict(),os.path.join("checkpoint","lr_scheduler.bin"))
    print("epoch",epoch,"step",step,"trainLoss:",allloss/(len(train_dataloader)*train_batch_size))
trainlogdf.to_csv("trainlog.csv")
tokenizer.save_pretrained("GPT2saved_models")

  0%|          | 201/45051 [00:32<2:01:11,  6.17it/s]

epoch 0 step 199 loss tensor(1.5228, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 401/45051 [01:05<2:00:59,  6.15it/s]

epoch 0 step 399 loss tensor(1.6008, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 601/45051 [01:37<2:00:50,  6.13it/s]

epoch 0 step 599 loss tensor(1.1093, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 799/45051 [02:09<1:58:39,  6.22it/s]

epoch 0 step 799 loss tensor(1.5098, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]


epoch  0 step 799 acc  0.00025
Update Max eval acc 0.00025


  2%|▏         | 1001/45051 [04:39<1:59:41,  6.13it/s] 

epoch 0 step 999 loss tensor(0.9426, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 1201/45051 [05:11<1:59:06,  6.14it/s]

epoch 0 step 1199 loss tensor(0.5275, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 1401/45051 [05:44<1:58:47,  6.12it/s]

epoch 0 step 1399 loss tensor(0.2685, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 1599/45051 [06:16<1:57:04,  6.19it/s]

epoch 0 step 1599 loss tensor(0.3052, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.54it/s]


epoch  0 step 1599 acc  0.5445
Update Max eval acc 0.5445


  4%|▍         | 1801/45051 [08:46<1:56:50,  6.17it/s]  

epoch 0 step 1799 loss tensor(0.2135, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 2001/45051 [09:18<1:56:59,  6.13it/s]

epoch 0 step 1999 loss tensor(0.0918, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 2201/45051 [09:51<1:56:29,  6.13it/s]

epoch 0 step 2199 loss tensor(0.1757, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 2399/45051 [10:23<1:54:13,  6.22it/s]

epoch 0 step 2399 loss tensor(0.1047, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 2399 acc  0.55225
Update Max eval acc 0.55225


  6%|▌         | 2601/45051 [12:53<1:54:55,  6.16it/s]  

epoch 0 step 2599 loss tensor(0.2151, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 2801/45051 [13:26<1:54:43,  6.14it/s]

epoch 0 step 2799 loss tensor(0.1702, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 3001/45051 [13:58<1:54:01,  6.15it/s]

epoch 0 step 2999 loss tensor(0.1843, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 3199/45051 [14:30<2:01:27,  5.74it/s]

epoch 0 step 3199 loss tensor(0.0573, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 3199 acc  0.6705
Update Max eval acc 0.6705


  8%|▊         | 3401/45051 [17:00<1:52:32,  6.17it/s]  

epoch 0 step 3399 loss tensor(0.2718, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 3601/45051 [17:33<1:52:20,  6.15it/s]

epoch 0 step 3599 loss tensor(0.1286, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 3801/45051 [18:05<1:52:09,  6.13it/s]

epoch 0 step 3799 loss tensor(0.1927, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 3999/45051 [18:37<1:50:17,  6.20it/s]

epoch 0 step 3999 loss tensor(0.2663, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]


epoch  0 step 3999 acc  0.74175
Update Max eval acc 0.74175


  9%|▉         | 4201/45051 [21:08<1:50:36,  6.16it/s]  

epoch 0 step 4199 loss tensor(0.0721, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 4401/45051 [21:40<1:50:05,  6.15it/s]

epoch 0 step 4399 loss tensor(0.0857, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 4601/45051 [22:13<1:50:19,  6.11it/s]

epoch 0 step 4599 loss tensor(0.0522, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 4799/45051 [22:45<1:48:12,  6.20it/s]

epoch 0 step 4799 loss tensor(0.1270, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]


epoch  0 step 4799 acc  0.762
Update Max eval acc 0.762


 11%|█         | 5001/45051 [25:17<1:48:21,  6.16it/s]  

epoch 0 step 4999 loss tensor(0.2896, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 5201/45051 [25:49<1:48:13,  6.14it/s]

epoch 0 step 5199 loss tensor(0.0526, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 5401/45051 [26:22<1:48:06,  6.11it/s]

epoch 0 step 5399 loss tensor(0.1872, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 5599/45051 [26:54<1:47:30,  6.12it/s]

epoch 0 step 5599 loss tensor(0.0778, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.55it/s]


epoch  0 step 5599 acc  0.77825
Update Max eval acc 0.77825


 13%|█▎        | 5801/45051 [29:25<1:46:13,  6.16it/s]  

epoch 0 step 5799 loss tensor(0.0297, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 6001/45051 [29:57<1:46:08,  6.13it/s]

epoch 0 step 5999 loss tensor(0.0379, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 6201/45051 [30:29<1:45:36,  6.13it/s]

epoch 0 step 6199 loss tensor(0.0320, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 6399/45051 [31:01<1:43:33,  6.22it/s]

epoch 0 step 6399 loss tensor(0.2431, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 6399 acc  0.8
Update Max eval acc 0.8


 15%|█▍        | 6601/45051 [33:34<1:44:07,  6.15it/s]  

epoch 0 step 6599 loss tensor(0.0863, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 6801/45051 [34:06<1:44:06,  6.12it/s]

epoch 0 step 6799 loss tensor(0.0685, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 7001/45051 [34:39<1:43:56,  6.10it/s]

epoch 0 step 6999 loss tensor(0.0676, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 7199/45051 [35:11<1:41:48,  6.20it/s]

epoch 0 step 7199 loss tensor(0.0185, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [02:18<00:00,  7.21it/s]


epoch  0 step 7199 acc  0.811
Update Max eval acc 0.811


 16%|█▋        | 7400/45051 [38:38<3:02:02,  3.45it/s]  

epoch 0 step 7399 loss tensor(0.0105, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 7600/45051 [39:34<3:01:18,  3.44it/s]

epoch 0 step 7599 loss tensor(0.0192, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 7800/45051 [40:30<3:00:58,  3.43it/s]

epoch 0 step 7799 loss tensor(0.3303, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 7999/45051 [41:26<2:51:40,  3.60it/s]

epoch 0 step 7999 loss tensor(0.0916, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [02:36<00:00,  6.37it/s]


epoch  0 step 7999 acc  0.83025
Update Max eval acc 0.83025


 18%|█▊        | 8201/45051 [44:50<1:39:59,  6.14it/s]  

epoch 0 step 8199 loss tensor(0.1604, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 8401/45051 [45:22<1:39:53,  6.11it/s]

epoch 0 step 8399 loss tensor(0.0376, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 8601/45051 [45:55<1:38:58,  6.14it/s]

epoch 0 step 8599 loss tensor(0.0090, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 8799/45051 [46:27<1:37:34,  6.19it/s]

epoch 0 step 8799 loss tensor(0.0006, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 8799 acc  0.847
Update Max eval acc 0.847


 20%|█▉        | 9001/45051 [48:59<1:37:32,  6.16it/s]  

epoch 0 step 8999 loss tensor(0.0384, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 9201/45051 [49:31<1:37:45,  6.11it/s]

epoch 0 step 9199 loss tensor(0.0118, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 9401/45051 [50:04<1:37:04,  6.12it/s]

epoch 0 step 9399 loss tensor(0.0041, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 9599/45051 [50:36<1:35:17,  6.20it/s]

epoch 0 step 9599 loss tensor(0.6357, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 9599 acc  0.86775
Update Max eval acc 0.86775


 22%|██▏       | 9801/45051 [53:07<1:35:14,  6.17it/s]  

epoch 0 step 9799 loss tensor(0.0086, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 10001/45051 [53:39<1:35:11,  6.14it/s]

epoch 0 step 9999 loss tensor(0.0037, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 10201/45051 [54:12<1:34:53,  6.12it/s]

epoch 0 step 10199 loss tensor(0.3510, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 10399/45051 [54:44<1:32:52,  6.22it/s]

epoch 0 step 10399 loss tensor(0.1703, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 10399 acc  0.8685
Update Max eval acc 0.8685


 24%|██▎       | 10601/45051 [57:15<1:33:30,  6.14it/s]  

epoch 0 step 10599 loss tensor(0.0709, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 10801/45051 [57:47<1:32:56,  6.14it/s]

epoch 0 step 10799 loss tensor(0.0023, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 11001/45051 [58:20<1:32:26,  6.14it/s]

epoch 0 step 10999 loss tensor(0.0594, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 11199/45051 [58:52<1:30:52,  6.21it/s]

epoch 0 step 11199 loss tensor(0.0128, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]


epoch  0 step 11199 acc  0.88175
Update Max eval acc 0.88175


 25%|██▌       | 11401/45051 [1:01:23<1:31:05,  6.16it/s]  

epoch 0 step 11399 loss tensor(0.0748, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 11601/45051 [1:01:55<1:30:52,  6.13it/s]

epoch 0 step 11599 loss tensor(0.0618, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 11801/45051 [1:02:28<1:30:30,  6.12it/s]

epoch 0 step 11799 loss tensor(0.0971, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 11999/45051 [1:03:00<1:28:55,  6.20it/s]

epoch 0 step 11999 loss tensor(0.1129, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.58it/s]


epoch  0 step 11999 acc  0.88875
Update Max eval acc 0.88875


 27%|██▋       | 12201/45051 [1:05:31<1:28:51,  6.16it/s]  

epoch 0 step 12199 loss tensor(0.0063, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 12401/45051 [1:06:03<1:28:33,  6.15it/s]

epoch 0 step 12399 loss tensor(0.0149, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 12601/45051 [1:06:36<1:28:07,  6.14it/s]

epoch 0 step 12599 loss tensor(0.0199, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 12799/45051 [1:07:08<1:26:44,  6.20it/s]

epoch 0 step 12799 loss tensor(0.0177, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.58it/s]
 28%|██▊       | 12801/45051 [1:08:53<197:56:15, 22.10s/it]

epoch  0 step 12799 acc  0.8885
Early Stopping record count 1 Max eval acc 0.88875


 29%|██▉       | 13001/45051 [1:09:25<1:26:52,  6.15it/s]  

epoch 0 step 12999 loss tensor(0.0034, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 13201/45051 [1:09:57<1:26:37,  6.13it/s]

epoch 0 step 13199 loss tensor(0.0309, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 13401/45051 [1:10:30<1:26:06,  6.13it/s]

epoch 0 step 13399 loss tensor(0.0033, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 13599/45051 [1:11:02<1:24:42,  6.19it/s]

epoch 0 step 13599 loss tensor(0.0488, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 13599 acc  0.892
Update Max eval acc 0.892


 31%|███       | 13801/45051 [1:13:33<1:24:32,  6.16it/s]  

epoch 0 step 13799 loss tensor(0.0029, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 14001/45051 [1:14:05<1:24:19,  6.14it/s]

epoch 0 step 13999 loss tensor(0.0416, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 14201/45051 [1:14:38<1:23:54,  6.13it/s]

epoch 0 step 14199 loss tensor(0.3836, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 14399/45051 [1:15:10<1:22:15,  6.21it/s]

epoch 0 step 14399 loss tensor(0.0257, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]


epoch  0 step 14399 acc  0.89925
Update Max eval acc 0.89925


 32%|███▏      | 14601/45051 [1:17:43<1:22:17,  6.17it/s]  

epoch 0 step 14599 loss tensor(0.0118, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 14801/45051 [1:18:15<1:22:09,  6.14it/s]

epoch 0 step 14799 loss tensor(0.0167, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 15001/45051 [1:18:47<1:21:42,  6.13it/s]

epoch 0 step 14999 loss tensor(0.0019, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 15199/45051 [1:19:20<1:18:39,  6.33it/s]

epoch 0 step 15199 loss tensor(0.0053, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 34%|███▎      | 15201/45051 [1:21:04<183:22:55, 22.12s/it]

epoch  0 step 15199 acc  0.8825
Early Stopping record count 1 Max eval acc 0.89925


 34%|███▍      | 15401/45051 [1:21:37<1:20:26,  6.14it/s]  

epoch 0 step 15399 loss tensor(0.0009, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 15601/45051 [1:22:09<1:20:06,  6.13it/s]

epoch 0 step 15599 loss tensor(0.0053, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 15801/45051 [1:22:42<1:19:26,  6.14it/s]

epoch 0 step 15799 loss tensor(0.0375, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 15999/45051 [1:23:14<1:18:01,  6.21it/s]

epoch 0 step 15999 loss tensor(0.0105, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 36%|███▌      | 16001/45051 [1:24:59<178:29:08, 22.12s/it]

epoch  0 step 15999 acc  0.8785
Early Stopping record count 2 Max eval acc 0.89925


 36%|███▌      | 16201/45051 [1:25:31<1:18:14,  6.15it/s]  

epoch 0 step 16199 loss tensor(0.0021, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 16401/45051 [1:26:03<1:17:40,  6.15it/s]

epoch 0 step 16399 loss tensor(0.0996, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 16601/45051 [1:26:36<1:17:17,  6.14it/s]

epoch 0 step 16599 loss tensor(0.0224, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 16799/45051 [1:27:08<1:15:42,  6.22it/s]

epoch 0 step 16799 loss tensor(0.1063, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 16799 acc  0.9015
Update Max eval acc 0.9015


 38%|███▊      | 17001/45051 [1:29:43<1:16:02,  6.15it/s]  

epoch 0 step 16999 loss tensor(0.0417, device='cuda:0', grad_fn=<DivBackward0>)


 38%|███▊      | 17201/45051 [1:30:15<1:15:26,  6.15it/s]

epoch 0 step 17199 loss tensor(0.0014, device='cuda:0', grad_fn=<DivBackward0>)


 39%|███▊      | 17401/45051 [1:30:48<1:14:59,  6.14it/s]

epoch 0 step 17399 loss tensor(0.1209, device='cuda:0', grad_fn=<DivBackward0>)


 39%|███▉      | 17599/45051 [1:31:20<1:13:45,  6.20it/s]

epoch 0 step 17599 loss tensor(0.4008, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 39%|███▉      | 17601/45051 [1:33:05<168:47:35, 22.14s/it]

epoch  0 step 17599 acc  0.8725
Early Stopping record count 1 Max eval acc 0.9015


 40%|███▉      | 17801/45051 [1:33:37<1:14:01,  6.14it/s]  

epoch 0 step 17799 loss tensor(0.0008, device='cuda:0', grad_fn=<DivBackward0>)


 40%|███▉      | 18001/45051 [1:34:09<1:13:22,  6.14it/s]

epoch 0 step 17999 loss tensor(0.1264, device='cuda:0', grad_fn=<DivBackward0>)


 40%|████      | 18201/45051 [1:34:42<1:13:01,  6.13it/s]

epoch 0 step 18199 loss tensor(0.0037, device='cuda:0', grad_fn=<DivBackward0>)


 41%|████      | 18399/45051 [1:35:14<1:11:42,  6.19it/s]

epoch 0 step 18399 loss tensor(0.0019, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 41%|████      | 18401/45051 [1:36:59<163:46:21, 22.12s/it]

epoch  0 step 18399 acc  0.8985
Early Stopping record count 2 Max eval acc 0.9015


 41%|████▏     | 18601/45051 [1:37:31<1:11:40,  6.15it/s]  

epoch 0 step 18599 loss tensor(0.0420, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 18801/45051 [1:38:04<1:11:09,  6.15it/s]

epoch 0 step 18799 loss tensor(0.0600, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 19001/45051 [1:38:36<1:10:46,  6.13it/s]

epoch 0 step 18999 loss tensor(0.0109, device='cuda:0', grad_fn=<DivBackward0>)


 43%|████▎     | 19199/45051 [1:39:08<1:09:33,  6.19it/s]

epoch 0 step 19199 loss tensor(0.0778, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 43%|████▎     | 19201/45051 [1:40:53<159:00:32, 22.14s/it]

epoch  0 step 19199 acc  0.89725
Early Stopping record count 3 Max eval acc 0.9015


 43%|████▎     | 19401/45051 [1:41:26<1:09:30,  6.15it/s]  

epoch 0 step 19399 loss tensor(0.0005, device='cuda:0', grad_fn=<DivBackward0>)


 44%|████▎     | 19601/45051 [1:41:58<1:08:58,  6.15it/s]

epoch 0 step 19599 loss tensor(0.0421, device='cuda:0', grad_fn=<DivBackward0>)


 44%|████▍     | 19801/45051 [1:42:31<1:08:50,  6.11it/s]

epoch 0 step 19799 loss tensor(0.0428, device='cuda:0', grad_fn=<DivBackward0>)


 44%|████▍     | 19999/45051 [1:43:03<1:06:59,  6.23it/s]

epoch 0 step 19999 loss tensor(0.0003, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 44%|████▍     | 20001/45051 [1:44:47<153:47:26, 22.10s/it]

epoch  0 step 19999 acc  0.89825
Early Stopping record count 4 Max eval acc 0.9015


 45%|████▍     | 20201/45051 [1:45:20<1:07:42,  6.12it/s]  

epoch 0 step 20199 loss tensor(0.0592, device='cuda:0', grad_fn=<DivBackward0>)


 45%|████▌     | 20400/45051 [1:45:52<1:02:50,  6.54it/s]

epoch 0 step 20399 loss tensor(0.0212, device='cuda:0', grad_fn=<DivBackward0>)


 46%|████▌     | 20601/45051 [1:46:25<1:06:21,  6.14it/s]

epoch 0 step 20599 loss tensor(0.0566, device='cuda:0', grad_fn=<DivBackward0>)


 46%|████▌     | 20799/45051 [1:46:57<1:05:04,  6.21it/s]

epoch 0 step 20799 loss tensor(0.0015, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 46%|████▌     | 20801/45051 [1:48:42<149:06:05, 22.13s/it]

epoch  0 step 20799 acc  0.895
Early Stopping record count 5 Max eval acc 0.9015


 47%|████▋     | 21001/45051 [1:49:14<1:05:21,  6.13it/s]  

epoch 0 step 20999 loss tensor(0.0047, device='cuda:0', grad_fn=<DivBackward0>)


 47%|████▋     | 21201/45051 [1:49:47<1:04:46,  6.14it/s]

epoch 0 step 21199 loss tensor(0.0004, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 21401/45051 [1:50:19<1:04:12,  6.14it/s]

epoch 0 step 21399 loss tensor(0.0036, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 21599/45051 [1:50:51<1:03:05,  6.20it/s]

epoch 0 step 21599 loss tensor(0.0302, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 21599 acc  0.9025
Update Max eval acc 0.9025


 48%|████▊     | 21801/45051 [1:53:28<1:03:03,  6.15it/s]  

epoch 0 step 21799 loss tensor(0.0027, device='cuda:0', grad_fn=<DivBackward0>)


 49%|████▉     | 22001/45051 [1:54:00<1:02:49,  6.12it/s]

epoch 0 step 21999 loss tensor(0.0012, device='cuda:0', grad_fn=<DivBackward0>)


 49%|████▉     | 22201/45051 [1:54:33<1:02:10,  6.13it/s]

epoch 0 step 22199 loss tensor(0.0854, device='cuda:0', grad_fn=<DivBackward0>)


 50%|████▉     | 22399/45051 [1:55:05<1:00:52,  6.20it/s]

epoch 0 step 22399 loss tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.58it/s]


epoch  0 step 22399 acc  0.911
Update Max eval acc 0.911


 50%|█████     | 22600/45051 [1:58:05<1:43:37,  3.61it/s]  

epoch 0 step 22599 loss tensor(0.0017, device='cuda:0', grad_fn=<DivBackward0>)


 51%|█████     | 22800/45051 [1:59:02<1:44:39,  3.54it/s]

epoch 0 step 22799 loss tensor(0.0228, device='cuda:0', grad_fn=<DivBackward0>)


 51%|█████     | 23000/45051 [1:59:58<1:48:55,  3.37it/s]

epoch 0 step 22999 loss tensor(0.1806, device='cuda:0', grad_fn=<DivBackward0>)


 51%|█████▏    | 23199/45051 [2:00:53<1:41:07,  3.60it/s]

epoch 0 step 23199 loss tensor(0.0009, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [03:09<00:00,  5.28it/s]
 51%|█████▏    | 23200/45051 [2:04:03<347:00:38, 57.17s/it]

epoch  0 step 23199 acc  0.88975
Early Stopping record count 1 Max eval acc 0.911


 52%|█████▏    | 23401/45051 [2:04:40<58:53,  6.13it/s]    

epoch 0 step 23399 loss tensor(0.0569, device='cuda:0', grad_fn=<DivBackward0>)


 52%|█████▏    | 23601/45051 [2:05:13<58:26,  6.12it/s]  

epoch 0 step 23599 loss tensor(0.0188, device='cuda:0', grad_fn=<DivBackward0>)


 53%|█████▎    | 23801/45051 [2:05:45<57:49,  6.12it/s]

epoch 0 step 23799 loss tensor(0.0687, device='cuda:0', grad_fn=<DivBackward0>)


 53%|█████▎    | 23999/45051 [2:06:17<56:28,  6.21it/s]  

epoch 0 step 23999 loss tensor(1.1652e-05, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.58it/s]


epoch  0 step 23999 acc  0.91125
Update Max eval acc 0.91125


 54%|█████▎    | 24201/45051 [2:08:54<56:36,  6.14it/s]    

epoch 0 step 24199 loss tensor(0.0004, device='cuda:0', grad_fn=<DivBackward0>)


 54%|█████▍    | 24401/45051 [2:09:27<56:08,  6.13it/s]  

epoch 0 step 24399 loss tensor(0.0139, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 24601/45051 [2:09:59<55:40,  6.12it/s]  

epoch 0 step 24599 loss tensor(0.0009, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▌    | 24799/45051 [2:10:31<54:37,  6.18it/s]  

epoch 0 step 24799 loss tensor(0.0005, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.58it/s]
 55%|█████▌    | 24801/45051 [2:12:16<124:19:08, 22.10s/it]

epoch  0 step 24799 acc  0.89075
Early Stopping record count 1 Max eval acc 0.91125


 55%|█████▌    | 25001/45051 [2:12:49<54:33,  6.12it/s]    

epoch 0 step 24999 loss tensor(0.6162, device='cuda:0', grad_fn=<DivBackward0>)


 56%|█████▌    | 25201/45051 [2:13:21<53:59,  6.13it/s]

epoch 0 step 25199 loss tensor(0.0171, device='cuda:0', grad_fn=<DivBackward0>)


 56%|█████▋    | 25401/45051 [2:13:54<53:27,  6.13it/s]

epoch 0 step 25399 loss tensor(4.4131e-05, device='cuda:0', grad_fn=<DivBackward0>)


 57%|█████▋    | 25599/45051 [2:14:26<52:18,  6.20it/s]

epoch 0 step 25599 loss tensor(0.0128, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.54it/s]
 57%|█████▋    | 25601/45051 [2:16:11<119:53:46, 22.19s/it]

epoch  0 step 25599 acc  0.9055
Early Stopping record count 2 Max eval acc 0.91125


 57%|█████▋    | 25801/45051 [2:16:43<52:24,  6.12it/s]    

epoch 0 step 25799 loss tensor(0.0154, device='cuda:0', grad_fn=<DivBackward0>)


 58%|█████▊    | 26001/45051 [2:17:16<51:47,  6.13it/s]

epoch 0 step 25999 loss tensor(0.0012, device='cuda:0', grad_fn=<DivBackward0>)


 58%|█████▊    | 26201/45051 [2:17:48<51:16,  6.13it/s]

epoch 0 step 26199 loss tensor(0.0162, device='cuda:0', grad_fn=<DivBackward0>)


 59%|█████▊    | 26399/45051 [2:18:20<50:02,  6.21it/s]

epoch 0 step 26399 loss tensor(0.0677, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 59%|█████▊    | 26401/45051 [2:20:05<114:41:20, 22.14s/it]

epoch  0 step 26399 acc  0.9055
Early Stopping record count 3 Max eval acc 0.91125


 59%|█████▉    | 26601/45051 [2:20:38<50:13,  6.12it/s]    

epoch 0 step 26599 loss tensor(0.0292, device='cuda:0', grad_fn=<DivBackward0>)


 59%|█████▉    | 26801/45051 [2:21:10<49:46,  6.11it/s]

epoch 0 step 26799 loss tensor(0.0022, device='cuda:0', grad_fn=<DivBackward0>)


 60%|█████▉    | 27001/45051 [2:21:43<49:04,  6.13it/s]

epoch 0 step 26999 loss tensor(0.0290, device='cuda:0', grad_fn=<DivBackward0>)


 60%|██████    | 27199/45051 [2:22:15<48:04,  6.19it/s]

epoch 0 step 27199 loss tensor(3.1574e-05, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 60%|██████    | 27201/45051 [2:24:00<109:37:15, 22.11s/it]

epoch  0 step 27199 acc  0.90575
Early Stopping record count 4 Max eval acc 0.91125


 61%|██████    | 27401/45051 [2:24:32<48:04,  6.12it/s]    

epoch 0 step 27399 loss tensor(0.0127, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████▏   | 27601/45051 [2:25:05<47:25,  6.13it/s]

epoch 0 step 27599 loss tensor(0.0228, device='cuda:0', grad_fn=<DivBackward0>)


 62%|██████▏   | 27801/45051 [2:25:38<47:00,  6.11it/s]

epoch 0 step 27799 loss tensor(0.0964, device='cuda:0', grad_fn=<DivBackward0>)


 62%|██████▏   | 27999/45051 [2:26:10<45:51,  6.20it/s]

epoch 0 step 27999 loss tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 62%|██████▏   | 28001/45051 [2:27:55<104:52:51, 22.14s/it]

epoch  0 step 27999 acc  0.904
Early Stopping record count 5 Max eval acc 0.91125


 63%|██████▎   | 28201/45051 [2:28:27<45:50,  6.13it/s]    

epoch 0 step 28199 loss tensor(0.0003, device='cuda:0', grad_fn=<DivBackward0>)


 63%|██████▎   | 28401/45051 [2:29:00<45:22,  6.12it/s]

epoch 0 step 28399 loss tensor(0.2572, device='cuda:0', grad_fn=<DivBackward0>)


 63%|██████▎   | 28601/45051 [2:29:32<44:48,  6.12it/s]

epoch 0 step 28599 loss tensor(0.0013, device='cuda:0', grad_fn=<DivBackward0>)


 64%|██████▍   | 28799/45051 [2:30:04<43:40,  6.20it/s]

epoch 0 step 28799 loss tensor(0.1493, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


epoch  0 step 28799 acc  0.914
Update Max eval acc 0.914


 64%|██████▍   | 29001/45051 [2:32:42<43:40,  6.13it/s]    

epoch 0 step 28999 loss tensor(0.1902, device='cuda:0', grad_fn=<DivBackward0>)


 65%|██████▍   | 29201/45051 [2:33:15<43:08,  6.12it/s]

epoch 0 step 29199 loss tensor(9.8583e-05, device='cuda:0', grad_fn=<DivBackward0>)


 65%|██████▌   | 29401/45051 [2:33:47<42:37,  6.12it/s]

epoch 0 step 29399 loss tensor(0.0043, device='cuda:0', grad_fn=<DivBackward0>)


 66%|██████▌   | 29599/45051 [2:34:20<41:39,  6.18it/s]

epoch 0 step 29599 loss tensor(0.1154, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 66%|██████▌   | 29601/45051 [2:36:05<94:57:14, 22.13s/it] 

epoch  0 step 29599 acc  0.906
Early Stopping record count 1 Max eval acc 0.914


 66%|██████▌   | 29801/45051 [2:36:37<41:25,  6.14it/s]   

epoch 0 step 29799 loss tensor(0.0340, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 30001/45051 [2:37:10<41:02,  6.11it/s]

epoch 0 step 29999 loss tensor(0.0817, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 30201/45051 [2:37:42<40:28,  6.12it/s]

epoch 0 step 30199 loss tensor(0.1835, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 30399/45051 [2:38:14<39:27,  6.19it/s]

epoch 0 step 30399 loss tensor(0.1140, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 67%|██████▋   | 30401/45051 [2:39:59<89:57:39, 22.11s/it] 

epoch  0 step 30399 acc  0.91275
Early Stopping record count 2 Max eval acc 0.914


 68%|██████▊   | 30601/45051 [2:40:32<39:19,  6.12it/s]   

epoch 0 step 30599 loss tensor(0.0191, device='cuda:0', grad_fn=<DivBackward0>)


 68%|██████▊   | 30801/45051 [2:41:04<38:49,  6.12it/s]

epoch 0 step 30799 loss tensor(0.0048, device='cuda:0', grad_fn=<DivBackward0>)


 69%|██████▉   | 31001/45051 [2:41:37<38:09,  6.14it/s]

epoch 0 step 30999 loss tensor(0.0010, device='cuda:0', grad_fn=<DivBackward0>)


 69%|██████▉   | 31199/45051 [2:42:09<37:18,  6.19it/s]

epoch 0 step 31199 loss tensor(4.1107e-05, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 69%|██████▉   | 31201/45051 [2:43:54<85:06:05, 22.12s/it] 

epoch  0 step 31199 acc  0.9125
Early Stopping record count 3 Max eval acc 0.914


 70%|██████▉   | 31401/45051 [2:44:26<37:13,  6.11it/s]   

epoch 0 step 31399 loss tensor(8.1538e-05, device='cuda:0', grad_fn=<DivBackward0>)


 70%|███████   | 31601/45051 [2:44:59<36:39,  6.11it/s]

epoch 0 step 31599 loss tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)


 71%|███████   | 31801/45051 [2:45:31<35:59,  6.14it/s]

epoch 0 step 31799 loss tensor(9.4916e-05, device='cuda:0', grad_fn=<DivBackward0>)


 71%|███████   | 31999/45051 [2:46:03<35:11,  6.18it/s]

epoch 0 step 31999 loss tensor(0.6190, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]
 71%|███████   | 32001/45051 [2:47:48<80:13:42, 22.13s/it] 

epoch  0 step 31999 acc  0.9135
Early Stopping record count 4 Max eval acc 0.914


 71%|███████▏  | 32201/45051 [2:48:21<35:45,  5.99it/s]   

epoch 0 step 32199 loss tensor(0.1603, device='cuda:0', grad_fn=<DivBackward0>)


 72%|███████▏  | 32401/45051 [2:48:53<34:22,  6.13it/s]

epoch 0 step 32399 loss tensor(0.0008, device='cuda:0', grad_fn=<DivBackward0>)


 72%|███████▏  | 32601/45051 [2:49:26<33:53,  6.12it/s]

epoch 0 step 32599 loss tensor(0.0083, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 32799/45051 [2:49:58<32:53,  6.21it/s]

epoch 0 step 32799 loss tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.58it/s]
 73%|███████▎  | 32801/45051 [2:51:43<75:11:48, 22.10s/it] 

epoch  0 step 32799 acc  0.8865
Early Stopping record count 5 Max eval acc 0.914


 73%|███████▎  | 33001/45051 [2:52:15<32:45,  6.13it/s]   

epoch 0 step 32999 loss tensor(0.0001, device='cuda:0', grad_fn=<DivBackward0>)


 74%|███████▎  | 33201/45051 [2:52:48<32:20,  6.11it/s]

epoch 0 step 33199 loss tensor(0.0602, device='cuda:0', grad_fn=<DivBackward0>)


 74%|███████▍  | 33401/45051 [2:53:20<31:40,  6.13it/s]

epoch 0 step 33399 loss tensor(0.0022, device='cuda:0', grad_fn=<DivBackward0>)


 75%|███████▍  | 33599/45051 [2:53:52<30:42,  6.21it/s]

epoch 0 step 33599 loss tensor(0.0005, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]
 75%|███████▍  | 33599/45051 [2:55:37<59:51,  3.19it/s]


epoch  0 step 33599 acc  0.90175
Early Stopping:Epoch 0  Step 33599 Eval_acc 0.90175
epoch 0 step 33599 trainLoss: 0.04863367924928612


('GPT2saved_models/tokenizer_config.json',
 'GPT2saved_models/special_tokens_map.json',
 'GPT2saved_models/vocab.json',
 'GPT2saved_models/merges.txt',
 'GPT2saved_models/added_tokens.json')

In [7]:
tokenizer.save_pretrained("GPT2saved_models")

('GPT2saved_models/tokenizer_config.json',
 'GPT2saved_models/special_tokens_map.json',
 'GPT2saved_models/vocab.json',
 'GPT2saved_models/merges.txt',
 'GPT2saved_models/added_tokens.json')