In [1]:
import os
import sys
import torch
import transformers
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
    PeftType,
    TaskType,
    PeftModelForSequenceClassification
)
from transformers import CodeLlamaTokenizer , LlamaForSequenceClassification ,get_linear_schedule_with_warmup
import evaluate
import numpy as np
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.functional import F
from torch.cuda.amp import autocast as autocast,GradScaler
import pandas as pd
from sklearn.metrics import f1_score

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0
  warn("The installed version of bitsandbytes was compiled without GPU support. "



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: Required library version not found: libsbitsandbytes_cpu.so. Maybe you need to compile it from source?
CUDA SETUP: Defaulting to libbitsandbytes_cpu.so...


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
base_model =  "codellama/CodeLlama-7b-hf"
device_map = "auto"
num_epochs = 20
lr = 5e-5
train_batch_size = 2
eval_batch_size = 1
peft_type = PeftType.LORA
config = LoraConfig(
        r=8,
        lora_alpha=16,
        inference_mode=False,
        lora_dropout=0.1,
        task_type=TaskType.SEQ_CLS,
        target_modules=[
        "q_proj",
        "v_proj",
    ],
    )


tokenizer = CodeLlamaTokenizer.from_pretrained(base_model,model_max_length = 1024 , pad_token = "<|pad|>")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:

def compute_metrics(eval_pred):
        metirc = evaluate.load("accuracy")
        logits , labels = eval_pred
        predictions = np.argmax(logits,axis=-1)
        return metirc.compute(predictions=predictions,references=labels)


def tokenize_function(examples):
        return tokenizer(examples["text"],truncation = True) 

def collate_fn(examples):
    return tokenizer.pad(examples, padding="max_length", return_tensors="pt")

datasets = load_dataset("csv", data_files={"train":"train.csv","test":"valid.csv"})
tokenized_dataset = datasets.map(tokenize_function,batched=True,remove_columns="text").rename_column("label","labels")
train_dataloader = DataLoader(tokenized_dataset["train"],shuffle=True,collate_fn=collate_fn,batch_size = train_batch_size)
eval_dataloader = DataLoader(tokenized_dataset["test"] , collate_fn=collate_fn,batch_size = eval_batch_size)

Found cached dataset csv (/home/ljc/.cache/huggingface/datasets/csv/default-f8786a133c15e49f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 2/2 [00:00<00:00, 1473.75it/s]
Loading cached processed dataset at /home/ljc/.cache/huggingface/datasets/csv/default-f8786a133c15e49f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-50f1afa45508a4ce.arrow
Loading cached processed dataset at /home/ljc/.cache/huggingface/datasets/csv/default-f8786a133c15e49f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-bb9de59c1f6b0d4a.arrow


In [4]:
model = LlamaForSequenceClassification.from_pretrained(
        base_model,
        load_in_8bit = True,
        torch_dtype = torch.float16,
        num_labels = 66,
        device_map = device_map
    )
model.config.pad_token_id = 32016
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()
print(model)

In [5]:
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)
sclaer = GradScaler()

In [9]:
model.to(device)
max_eval_acc = 0
iter_to_accumlate = 4
epochloss = []
trainlogdf = pd.DataFrame(columns=["step","trainloss","validloss","acc","f1-score"])
rowindex = 0
for epoch in range(num_epochs):
    model.train()
    allloss = 0
    for step,batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        with autocast():
            outputs = model(**batch)
        loss = outputs.loss/iter_to_accumlate
        sclaer.scale(loss).backward()
        allloss += loss.item()
        trainlogdf.loc[rowindex] = [rowindex,loss.item(),None,None,None]
        rowindex += 1
        epochloss.append(loss.item())
        if (step+1)%iter_to_accumlate==0:
            sclaer.step(optimizer)
            lr_scheduler.step()
            sclaer.update()
            optimizer.zero_grad()
        if (step+1)%(4*iter_to_accumlate) == 0:
            print("epoch",epoch,"step",step,"loss",loss,sep=" ")
            
    print("epoch",epoch,"trainLoss:",allloss/(len(train_dataloader)*train_batch_size))

    count = 0
    model.eval()
    validloss = 0
    preds = []
    labels = []
    for step,batch in enumerate(tqdm(eval_dataloader)):
        labels += batch['labels'].cpu()
        batch.to(device)
        with torch.no_grad():
            output = model(**batch)
        validloss += output.loss.item()
        pred = torch.argmax(F.softmax(output.logits,dim=1),dim=1)
        preds += pred
        count += int(batch['labels'].cpu() == pred.cpu())
    eval_acc = count/132
    trainlogdf.loc[rowindex-1,"validloss"] = validloss/132
    trainlogdf.loc[rowindex-1,"acc"] = eval_acc
    trainlogdf.loc[rowindex-1,"f1-score"] = f1_score(np.array(batch['labels'].cpu()),np.array(pred.cpu()),average="macro")
    print("epoch ",epoch,"acc ",eval_acc)
    if eval_acc > max_eval_acc:
        max_eval_acc = eval_acc
        model.save_pretrained("ljcoutputdir")
        torch.save(get_peft_model_state_dict(model),os.path.join("checkpoint","model.bin"))
        torch.save(optimizer.state_dict(),os.path.join("checkpoint","optimizer.bin"))
        torch.save(sclaer.state_dict(),os.path.join("checkpoint","sclaer.bin"))
        torch.save(lr_scheduler.state_dict(),os.path.join("checkpoint","lr_scheduler.bin"))

  0%|          | 0/264 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  6%|▌         | 16/264 [00:43<11:19,  2.74s/it]

epoch 0 step 15 loss tensor(1.3916, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [01:28<10:46,  2.79s/it]

epoch 0 step 31 loss tensor(1.0911, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [02:12<09:54,  2.75s/it]

epoch 0 step 47 loss tensor(1.0659, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [02:56<09:10,  2.75s/it]

epoch 0 step 63 loss tensor(1.3096, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [03:40<08:25,  2.75s/it]

epoch 0 step 79 loss tensor(0.9524, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [04:24<07:45,  2.77s/it]

epoch 0 step 95 loss tensor(1.1860, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [05:09<07:02,  2.78s/it]

epoch 0 step 111 loss tensor(1.3398, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [05:53<06:14,  2.75s/it]

epoch 0 step 127 loss tensor(1.0713, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [06:37<05:32,  2.77s/it]

epoch 0 step 143 loss tensor(1.1406, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [07:21<04:46,  2.76s/it]

epoch 0 step 159 loss tensor(1.1362, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [08:06<04:05,  2.78s/it]

epoch 0 step 175 loss tensor(1.3818, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [08:50<03:19,  2.77s/it]

epoch 0 step 191 loss tensor(1.2612, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [09:34<02:35,  2.78s/it]

epoch 0 step 207 loss tensor(1.0464, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [10:18<01:50,  2.77s/it]

epoch 0 step 223 loss tensor(0.9690, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [11:02<01:06,  2.77s/it]

epoch 0 step 239 loss tensor(1.0684, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [11:47<00:22,  2.76s/it]

epoch 0 step 255 loss tensor(1.2383, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [12:09<00:00,  2.76s/it]


epoch 0 trainLoss: 0.584577618223248


100%|██████████| 132/132 [00:43<00:00,  3.04it/s]


epoch  0 acc  0.007575757575757576


  6%|▌         | 16/264 [00:44<11:25,  2.77s/it]

epoch 1 step 15 loss tensor(1.1787, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [01:28<10:38,  2.75s/it]

epoch 1 step 31 loss tensor(1.2266, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [02:12<10:00,  2.78s/it]

epoch 1 step 47 loss tensor(1.4512, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [02:56<09:06,  2.73s/it]

epoch 1 step 63 loss tensor(1.2019, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [03:41<08:26,  2.75s/it]

epoch 1 step 79 loss tensor(1.0898, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [04:25<07:46,  2.78s/it]

epoch 1 step 95 loss tensor(1.2124, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [05:09<07:00,  2.76s/it]

epoch 1 step 111 loss tensor(1.0090, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [05:53<06:15,  2.76s/it]

epoch 1 step 127 loss tensor(0.9141, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [06:38<05:33,  2.78s/it]

epoch 1 step 143 loss tensor(0.9404, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [07:22<04:47,  2.76s/it]

epoch 1 step 159 loss tensor(1.0747, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [08:06<04:03,  2.76s/it]

epoch 1 step 175 loss tensor(1.0076, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [08:50<03:18,  2.76s/it]

epoch 1 step 191 loss tensor(0.8965, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [09:34<02:33,  2.75s/it]

epoch 1 step 207 loss tensor(1.0667, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [10:19<01:50,  2.76s/it]

epoch 1 step 223 loss tensor(1.1111, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [11:03<01:06,  2.77s/it]

epoch 1 step 239 loss tensor(1.2158, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [11:47<00:22,  2.78s/it]

epoch 1 step 255 loss tensor(0.8306, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [12:09<00:00,  2.76s/it]


epoch 1 trainLoss: 0.5481918797348485


100%|██████████| 132/132 [00:43<00:00,  3.04it/s]


epoch  1 acc  0.08333333333333333


  6%|▌         | 16/264 [00:44<11:27,  2.77s/it]

epoch 2 step 15 loss tensor(0.9937, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [01:28<10:45,  2.78s/it]

epoch 2 step 31 loss tensor(0.7754, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [02:12<09:58,  2.77s/it]

epoch 2 step 47 loss tensor(1.1533, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [02:57<09:09,  2.75s/it]

epoch 2 step 63 loss tensor(0.9836, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [03:41<08:27,  2.76s/it]

epoch 2 step 79 loss tensor(1.1318, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [04:25<07:45,  2.77s/it]

epoch 2 step 95 loss tensor(0.9543, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [05:09<07:01,  2.77s/it]

epoch 2 step 111 loss tensor(1.0002, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [05:54<06:18,  2.78s/it]

epoch 2 step 127 loss tensor(0.6801, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [06:38<05:31,  2.77s/it]

epoch 2 step 143 loss tensor(0.9526, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [07:22<04:48,  2.77s/it]

epoch 2 step 159 loss tensor(1.0037, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [08:07<04:04,  2.77s/it]

epoch 2 step 175 loss tensor(1.0713, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [08:51<03:19,  2.77s/it]

epoch 2 step 191 loss tensor(0.9485, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [09:35<02:34,  2.76s/it]

epoch 2 step 207 loss tensor(0.9136, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [10:19<01:50,  2.76s/it]

epoch 2 step 223 loss tensor(0.7473, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [11:03<01:05,  2.74s/it]

epoch 2 step 239 loss tensor(0.5871, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [11:47<00:21,  2.75s/it]

epoch 2 step 255 loss tensor(0.7140, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [12:09<00:00,  2.76s/it]


epoch 2 trainLoss: 0.4687505779844342


100%|██████████| 132/132 [00:43<00:00,  3.05it/s]


epoch  2 acc  0.29545454545454547


  6%|▌         | 16/264 [00:44<11:21,  2.75s/it]

epoch 3 step 15 loss tensor(0.6650, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [01:28<10:46,  2.79s/it]

epoch 3 step 31 loss tensor(0.6416, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [02:12<10:00,  2.78s/it]

epoch 3 step 47 loss tensor(0.7055, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [02:57<09:10,  2.75s/it]

epoch 3 step 63 loss tensor(0.9280, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [03:41<08:25,  2.75s/it]

epoch 3 step 79 loss tensor(0.8665, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [04:25<07:45,  2.77s/it]

epoch 3 step 95 loss tensor(0.3815, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [05:09<07:01,  2.77s/it]

epoch 3 step 111 loss tensor(0.4338, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [05:53<06:14,  2.75s/it]

epoch 3 step 127 loss tensor(0.7566, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [06:38<05:30,  2.76s/it]

epoch 3 step 143 loss tensor(0.1868, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [07:22<04:47,  2.76s/it]

epoch 3 step 159 loss tensor(0.7832, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [08:06<04:01,  2.75s/it]

epoch 3 step 175 loss tensor(0.5820, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [08:50<03:19,  2.78s/it]

epoch 3 step 191 loss tensor(0.8801, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [09:34<02:34,  2.75s/it]

epoch 3 step 207 loss tensor(0.5756, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [10:18<01:50,  2.75s/it]

epoch 3 step 223 loss tensor(0.3070, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [11:03<01:06,  2.77s/it]

epoch 3 step 239 loss tensor(0.9260, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [11:47<00:22,  2.76s/it]

epoch 3 step 255 loss tensor(0.0672, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [12:09<00:00,  2.76s/it]


epoch 3 trainLoss: 0.2870521545410156


100%|██████████| 132/132 [00:43<00:00,  3.05it/s]


epoch  3 acc  0.5606060606060606


  6%|▌         | 16/264 [00:44<11:28,  2.78s/it]

epoch 4 step 15 loss tensor(0.2477, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [01:28<10:44,  2.78s/it]

epoch 4 step 31 loss tensor(0.2757, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [02:12<09:51,  2.74s/it]

epoch 4 step 47 loss tensor(0.4189, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [03:01<10:51,  3.26s/it]

epoch 4 step 63 loss tensor(0.1026, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [03:53<10:04,  3.29s/it]

epoch 4 step 79 loss tensor(0.3830, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [04:46<09:14,  3.30s/it]

epoch 4 step 95 loss tensor(0.4315, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [05:39<08:21,  3.30s/it]

epoch 4 step 111 loss tensor(0.0859, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [06:31<07:25,  3.28s/it]

epoch 4 step 127 loss tensor(0.0870, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [07:24<06:34,  3.29s/it]

epoch 4 step 143 loss tensor(0.9763, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [08:16<05:43,  3.31s/it]

epoch 4 step 159 loss tensor(0.4888, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [09:09<04:47,  3.27s/it]

epoch 4 step 175 loss tensor(0.0315, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [10:01<03:56,  3.28s/it]

epoch 4 step 191 loss tensor(0.0596, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [10:54<03:02,  3.27s/it]

epoch 4 step 207 loss tensor(0.0215, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [11:46<02:12,  3.30s/it]

epoch 4 step 223 loss tensor(0.0321, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [12:39<01:18,  3.28s/it]

epoch 4 step 239 loss tensor(0.1099, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [13:31<00:26,  3.26s/it]

epoch 4 step 255 loss tensor(0.1591, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [13:58<00:00,  3.18s/it]


epoch 4 trainLoss: 0.12154654842434508


100%|██████████| 132/132 [00:57<00:00,  2.29it/s]


epoch  4 acc  0.7121212121212122


  6%|▌         | 16/264 [00:52<13:34,  3.29s/it]

epoch 5 step 15 loss tensor(0.0329, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [01:45<12:43,  3.29s/it]

epoch 5 step 31 loss tensor(0.0145, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [02:37<11:53,  3.30s/it]

epoch 5 step 47 loss tensor(0.0140, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [03:30<10:58,  3.29s/it]

epoch 5 step 63 loss tensor(0.0337, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [04:23<10:06,  3.30s/it]

epoch 5 step 79 loss tensor(0.0247, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [05:18<09:59,  3.57s/it]

epoch 5 step 95 loss tensor(0.6246, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [06:19<09:36,  3.79s/it]

epoch 5 step 111 loss tensor(0.0264, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [07:23<09:06,  4.02s/it]

epoch 5 step 127 loss tensor(0.0866, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [08:28<08:04,  4.04s/it]

epoch 5 step 143 loss tensor(0.3326, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [09:32<07:00,  4.05s/it]

epoch 5 step 159 loss tensor(0.0214, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [10:36<05:51,  3.99s/it]

epoch 5 step 175 loss tensor(0.5214, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [11:41<04:51,  4.05s/it]

epoch 5 step 191 loss tensor(0.2413, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [12:45<03:43,  3.99s/it]

epoch 5 step 207 loss tensor(0.0156, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [13:49<02:41,  4.03s/it]

epoch 5 step 223 loss tensor(0.0474, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [14:54<01:36,  4.04s/it]

epoch 5 step 239 loss tensor(0.1359, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [15:58<00:31,  3.99s/it]

epoch 5 step 255 loss tensor(0.0252, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [16:30<00:00,  3.75s/it]


epoch 5 trainLoss: 0.04842887141487815


100%|██████████| 132/132 [01:26<00:00,  1.53it/s]


epoch  5 acc  0.7727272727272727


  6%|▌         | 16/264 [01:03<16:33,  4.01s/it]

epoch 6 step 15 loss tensor(0.0084, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [02:07<15:27,  4.00s/it]

epoch 6 step 31 loss tensor(0.0111, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [03:12<14:30,  4.03s/it]

epoch 6 step 47 loss tensor(0.1586, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [04:16<13:26,  4.03s/it]

epoch 6 step 63 loss tensor(0.0035, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [05:20<12:18,  4.01s/it]

epoch 6 step 79 loss tensor(0.0105, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [06:25<11:11,  4.00s/it]

epoch 6 step 95 loss tensor(0.0106, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [07:29<10:10,  4.02s/it]

epoch 6 step 111 loss tensor(0.0205, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [08:33<09:07,  4.03s/it]

epoch 6 step 127 loss tensor(0.0185, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [09:37<08:01,  4.01s/it]

epoch 6 step 143 loss tensor(0.0051, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [10:41<06:56,  4.01s/it]

epoch 6 step 159 loss tensor(0.0113, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [11:45<05:52,  4.00s/it]

epoch 6 step 175 loss tensor(0.0114, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [12:50<04:49,  4.01s/it]

epoch 6 step 191 loss tensor(0.0067, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [13:54<03:45,  4.03s/it]

epoch 6 step 207 loss tensor(0.0353, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [14:58<02:40,  4.02s/it]

epoch 6 step 223 loss tensor(0.0034, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [16:02<01:36,  4.01s/it]

epoch 6 step 239 loss tensor(0.0096, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [17:06<00:32,  4.01s/it]

epoch 6 step 255 loss tensor(0.0115, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [17:38<00:00,  4.01s/it]


epoch 6 trainLoss: 0.015951687413634674


100%|██████████| 132/132 [01:26<00:00,  1.53it/s]


epoch  6 acc  0.7954545454545454


  6%|▌         | 16/264 [01:03<16:26,  3.98s/it]

epoch 7 step 15 loss tensor(0.0246, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [02:07<15:31,  4.02s/it]

epoch 7 step 31 loss tensor(0.0046, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [03:12<14:24,  4.00s/it]

epoch 7 step 47 loss tensor(0.0062, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 64/264 [04:16<13:23,  4.02s/it]

epoch 7 step 63 loss tensor(0.0061, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 80/264 [05:20<12:22,  4.04s/it]

epoch 7 step 79 loss tensor(0.0144, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 96/264 [06:25<11:18,  4.04s/it]

epoch 7 step 95 loss tensor(0.0039, device='cuda:0', grad_fn=<DivBackward0>)


 42%|████▏     | 112/264 [07:29<10:07,  4.00s/it]

epoch 7 step 111 loss tensor(0.0068, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 128/264 [08:33<09:11,  4.05s/it]

epoch 7 step 127 loss tensor(0.0039, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▍    | 144/264 [09:37<08:01,  4.01s/it]

epoch 7 step 143 loss tensor(0.0104, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 160/264 [10:42<06:54,  3.99s/it]

epoch 7 step 159 loss tensor(0.0031, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 176/264 [11:46<05:54,  4.03s/it]

epoch 7 step 175 loss tensor(0.0076, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 192/264 [12:50<04:46,  3.98s/it]

epoch 7 step 191 loss tensor(0.0085, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 208/264 [13:55<03:45,  4.03s/it]

epoch 7 step 207 loss tensor(0.0033, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 224/264 [14:59<02:42,  4.05s/it]

epoch 7 step 223 loss tensor(0.0517, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 240/264 [16:03<01:36,  4.01s/it]

epoch 7 step 239 loss tensor(0.0042, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 256/264 [17:08<00:32,  4.01s/it]

epoch 7 step 255 loss tensor(0.0015, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 264/264 [17:40<00:00,  4.02s/it]


epoch 7 trainLoss: 0.005792876084645589


100%|██████████| 132/132 [01:26<00:00,  1.53it/s]


epoch  7 acc  0.7954545454545454


  6%|▌         | 16/264 [01:04<16:43,  4.04s/it]

epoch 8 step 15 loss tensor(0.0066, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 32/264 [02:08<15:38,  4.05s/it]

epoch 8 step 31 loss tensor(0.0020, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 48/264 [03:13<14:27,  4.02s/it]

epoch 8 step 47 loss tensor(0.0038, device='cuda:0', grad_fn=<DivBackward0>)


