### 这个文档用于使用重新训练，加入tensorboard可视化

In [2]:
#!pip install tensorboard
!pip install scikit-learn seaborn

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting seaborn
  Downloading http://mirrors.aliyun.com/pypi/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
[0m

In [1]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision import models
import os
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

import datetime
datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

'2025-01-10 23:06:11'

# HYPER

In [2]:

torch.manual_seed(42)

<torch._C.Generator at 0x7f93ebda4590>

In [3]:
data_dir = "./data/Images"
#hyperparameter:
learning_rate = 5e-5
batch_size = 128
#adam paras:
betas=(0.9, 0.999)
eps=1e-8

training_steps = 1000
num_classes = 120
num_epochs = 5

In [4]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device

device(type='cuda')

In [5]:
# !pip install timm

In [6]:
import timm
timm.data.IMAGENET_DEFAULT_MEAN,timm.data.IMAGENET_DEFAULT_STD

((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))

# DATA LOADER

In [7]:
import torchvision.transforms as T

trans_ = T.Compose([
    T.Resize((224, 224)), 
    T.ToTensor(),
    T.Normalize(mean=timm.data.IMAGENET_DEFAULT_MEAN, std=timm.data.IMAGENET_DEFAULT_STD)
])

dataset = datasets.ImageFolder(root=data_dir, transform=trans_)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size


train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### 计算每个epoch的batch数为step数，每个step都会更新梯度，乘以epochs得到总的steps
- = 20580 * 80% /128 = 128.625 向上取整
- 总的steps = 129 * numepochs = 1935
- 实际训练时可能会提前终止

In [8]:
len(train_loader)

129

In [9]:
num_steps = len(train_loader) * num_epochs
images, labels = next(iter(train_loader))
images.size(),labels.size(),num_steps

(torch.Size([128, 3, 224, 224]), torch.Size([128]), 645)

# MODEL_PRETRAINED

In [10]:
from transformers import ViTForImageClassification
model_path = '/workspaces'

model = ViTForImageClassification.from_pretrained(model_path,num_labels = num_classes)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at /workspaces and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model = model.to(device)

# FINE_TUNING LOOP

In [12]:
# !pip install transformers[torch]

In [13]:
# !pip install mlflow

In [12]:
# 初始化 TensorBoard
writer = SummaryWriter(log_dir="./runs/vit_finetune")

<!-- from transformers import Adafactor, Trainer, TrainingArguments

# 定义训练参数
training_args = TrainingArguments(
    evaluation_strategy="steps",            # 每隔一定步数进行评估
    save_strategy="steps",                  # 每隔一定步数保存模型
    learning_rate=5e-5,                     # 学习率
    gradient_accumulation_steps=4,          # 梯度累积
    gradient_checkpointing=True,            # 启用梯度检查点
    optim="adafactor",                      # 使用 Adafactor 优化器
    max_steps=1000,                          # 总训练步数
    eval_delay=0,                           # 评估延迟
    logging_steps=100,                      # 每100步进行一次日志记录
    save_steps=200,                         # 每200步保存一次模型
    load_best_model_at_end=True,            # 在训练结束时加载最佳模型
    metric_for_best_model="f1",             # 评估标准
    greater_is_better=True,                 # F1 分数越高越好
    report_to="mlflow",                     # 将日志报告到 MLflow
    save_total_limit=2,                     # 最多保存2个模型
    output_dir = './output'
)

# 使用 Adafactor 优化器，指定学习率和 beta 值
optimizer = Adafactor(
    model.parameters(),        # 使用 training_args 中的学习率
    eps=1e-8,                               # 防止数值问题
    # betas=(0.9, 0.999),                     # beta 设置为（0.9， 0.999）
    weight_decay=0.01,                      # 权重衰减
    relative_step=True,                     # 使用相对步长
    warmup_init=True                        # 启用热启动
)

# 使用 Trainer API 进行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,            # 训练数据集
    eval_dataset=val_dataset,              # 验证数据集
    optimizers=(optimizer, None),           # 使用自定义优化器
)
 -->


### 传统SGD：https://pytorch.org/docs/stable/generated/torch.optim.SGD.html 
$$
\theta_t = \theta_{t-1} - \eta \nabla_\theta J(\theta_{t-1}),
$$

### Adam：https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
- **step1:**
    $$
m_t = \beta_1 m_{t-1} + (1 - \beta_1) \nabla_\theta J(\theta_{t-1}),
$$

    $$
v_t = \beta_2 v_{t-1} + (1 - \beta_2) \left(\nabla_\theta J(\theta_{t-1})\right)^2,
$$

- **step2:**

    $$
\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1 - \beta_2^t},
$$ 
- **step3:**

    $$
\theta_t = \theta_{t-1} - \frac{\eta \hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon},
$$

**优点：** Adam的算法可以简单的理解为RMSProp和动量优化的结合：
- 其中动量优化提供了动态调整学习率的思路，可以有效缓和震荡问题
- RMSProp可以理解为AdaGrad与指数加权移动平均算法的结合，其中：
    - AdaGrad对于不同更新力度的参数定制不同的学习率
    - 指数加权移动平均算法使AdaGrad梯度累加更加平滑，避免了早停

In [13]:
# train_loop
optimizer = optim.Adam(
    model.parameters(), 
    lr=5e-5, 
    betas=betas, 
    eps=eps
)

criterion = nn.CrossEntropyLoss()

# 定义学习率调度器
def lr_lambda(current_step: int):
    return max(0.0, 1.0 - current_step / num_steps)

lr_scheduler = LambdaLR(optimizer, lr_lambda)

total_steps = len(train_loader)  # 每个epoch的训练步数

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    for step, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        lr_scheduler.step()

        if (step + 1) % 10 == 0:
            avg_loss = running_loss / 10
            acc = 100 * correct / total
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{total_steps}], Loss: {avg_loss:.4f}, Accuracy: {acc:.2f}%")

            writer.add_scalar("Training/Loss", avg_loss, epoch * total_steps + step)
            writer.add_scalar("Training/Accuracy", acc, epoch * total_steps + step)

            running_loss = 0.0

    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="weighted")
    writer.add_scalar("Training/Precision", precision, epoch)
    writer.add_scalar("Training/Recall", recall, epoch)
    writer.add_scalar("Training/F1-Score", f1, epoch)

    model.eval()
    eval_loss = 0.0
    correct = 0
    total = 0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs.logits, labels)
            eval_loss += loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            val_preds.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    eval_loss /= len(val_loader)
    eval_acc = 100 * correct / total

    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average="weighted",zero_division=1)
    conf_matrix = confusion_matrix(val_labels, val_preds)

    writer.add_scalar("Validation/Loss", eval_loss, epoch)
    writer.add_scalar("Validation/Accuracy", eval_acc, epoch)
    writer.add_scalar("Validation/Precision", precision, epoch)
    writer.add_scalar("Validation/Recall", recall, epoch)
    writer.add_scalar("Validation/F1-Score", f1, epoch)

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    writer.add_figure("Validation/Confusion_Matrix", plt.gcf(), epoch)
    plt.close()

    print(f"Epoch [{epoch+1}/{num_epochs}] Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")

writer.close()


Epoch [1/5], Step [10/129], Loss: 4.7561, Accuracy: 2.50%
Epoch [1/5], Step [20/129], Loss: 4.6905, Accuracy: 6.64%
Epoch [1/5], Step [30/129], Loss: 4.6250, Accuracy: 11.02%
Epoch [1/5], Step [40/129], Loss: 4.5335, Accuracy: 16.70%
Epoch [1/5], Step [50/129], Loss: 4.4544, Accuracy: 22.48%
Epoch [1/5], Step [60/129], Loss: 4.3607, Accuracy: 28.35%
Epoch [1/5], Step [70/129], Loss: 4.2749, Accuracy: 33.02%
Epoch [1/5], Step [80/129], Loss: 4.1992, Accuracy: 36.72%
Epoch [1/5], Step [90/129], Loss: 4.1109, Accuracy: 40.03%
Epoch [1/5], Step [100/129], Loss: 4.0191, Accuracy: 42.95%
Epoch [1/5], Step [110/129], Loss: 3.9443, Accuracy: 45.45%
Epoch [1/5], Step [120/129], Loss: 3.8663, Accuracy: 47.70%
Epoch [1/5] Eval Loss: 3.7515, Eval Accuracy: 75.30%, Precision: 0.81, Recall: 0.75, F1-Score: 0.74
Epoch [2/5], Step [10/129], Loss: 3.6780, Accuracy: 79.77%
Epoch [2/5], Step [20/129], Loss: 3.6194, Accuracy: 79.10%
Epoch [2/5], Step [30/129], Loss: 3.5355, Accuracy: 79.74%
Epoch [2/5], S

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [2/5] Eval Loss: 3.0054, Eval Accuracy: 79.82%, Precision: 0.83, Recall: 0.80, F1-Score: 0.78
Epoch [3/5], Step [10/129], Loss: 2.8433, Accuracy: 87.89%
Epoch [3/5], Step [20/129], Loss: 2.8179, Accuracy: 87.93%
Epoch [3/5], Step [30/129], Loss: 2.7723, Accuracy: 87.71%
Epoch [3/5], Step [40/129], Loss: 2.7318, Accuracy: 87.56%
Epoch [3/5], Step [50/129], Loss: 2.6909, Accuracy: 87.44%
Epoch [3/5], Step [60/129], Loss: 2.6440, Accuracy: 87.57%
Epoch [3/5], Step [70/129], Loss: 2.5996, Accuracy: 87.51%
Epoch [3/5], Step [80/129], Loss: 2.5394, Accuracy: 87.71%
Epoch [3/5], Step [90/129], Loss: 2.5371, Accuracy: 87.68%
Epoch [3/5], Step [100/129], Loss: 2.5206, Accuracy: 87.66%
Epoch [3/5], Step [110/129], Loss: 2.4905, Accuracy: 87.76%
Epoch [3/5], Step [120/129], Loss: 2.4454, Accuracy: 87.70%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [3/5] Eval Loss: 2.5430, Eval Accuracy: 82.03%, Precision: 0.85, Recall: 0.82, F1-Score: 0.81
Epoch [4/5], Step [10/129], Loss: 2.3022, Accuracy: 90.86%
Epoch [4/5], Step [20/129], Loss: 2.2665, Accuracy: 91.56%
Epoch [4/5], Step [30/129], Loss: 2.2695, Accuracy: 91.09%
Epoch [4/5], Step [40/129], Loss: 2.2569, Accuracy: 91.04%
Epoch [4/5], Step [50/129], Loss: 2.2094, Accuracy: 91.00%
Epoch [4/5], Step [60/129], Loss: 2.2304, Accuracy: 90.85%
Epoch [4/5], Step [70/129], Loss: 2.1755, Accuracy: 90.99%
Epoch [4/5], Step [80/129], Loss: 2.1588, Accuracy: 90.99%
Epoch [4/5], Step [90/129], Loss: 2.1454, Accuracy: 90.96%
Epoch [4/5], Step [100/129], Loss: 2.1252, Accuracy: 91.09%
Epoch [4/5], Step [110/129], Loss: 2.1153, Accuracy: 90.94%
Epoch [4/5], Step [120/129], Loss: 2.0969, Accuracy: 90.90%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [4/5] Eval Loss: 2.2910, Eval Accuracy: 82.44%, Precision: 0.85, Recall: 0.82, F1-Score: 0.81
Epoch [5/5], Step [10/129], Loss: 2.0233, Accuracy: 92.81%
Epoch [5/5], Step [20/129], Loss: 2.0089, Accuracy: 92.89%
Epoch [5/5], Step [30/129], Loss: 2.0160, Accuracy: 92.53%
Epoch [5/5], Step [40/129], Loss: 1.9736, Accuracy: 92.60%
Epoch [5/5], Step [50/129], Loss: 1.9945, Accuracy: 92.20%
Epoch [5/5], Step [60/129], Loss: 1.9777, Accuracy: 92.16%
Epoch [5/5], Step [70/129], Loss: 1.9549, Accuracy: 92.20%
Epoch [5/5], Step [80/129], Loss: 1.9726, Accuracy: 92.21%
Epoch [5/5], Step [90/129], Loss: 1.9505, Accuracy: 92.27%
Epoch [5/5], Step [100/129], Loss: 1.9663, Accuracy: 92.21%
Epoch [5/5], Step [110/129], Loss: 1.9348, Accuracy: 92.24%
Epoch [5/5], Step [120/129], Loss: 1.9441, Accuracy: 92.24%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [5/5] Eval Loss: 2.2181, Eval Accuracy: 82.41%, Precision: 0.85, Recall: 0.82, F1-Score: 0.81


In [14]:
writer = SummaryWriter(log_dir='./runs/vit_finetune')
example_input = torch.randn(1, 3, 224, 224).to(device)

class WrappedModel(torch.nn.Module):
    def __init__(self, model):
        super(WrappedModel, self).__init__()
        self.model = model

    def forward(self, x):
        
        return self.model(x).logits


wrapped_model = WrappedModel(model)


writer.add_graph(wrapped_model, example_input)

writer.close()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


In [None]:
#!poweroff

In [15]:
model.save_pretrained("vit_finetuned_StanfordDogs_ep5")