In [1]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision import models
import os
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR

import datetime
datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

'2025-01-08 16:55:41'

# HYPER

In [2]:

torch.manual_seed(42)

<torch._C.Generator at 0x7f0630b745d0>

In [3]:
data_dir = "./data/Images"
#hyperparameter:
learning_rate = 5e-5
batch_size = 128
#adam paras:
betas=(0.9, 0.999), 
eps=1e-8

training_steps = 1000
num_classes = 120
num_epochs = 15

In [4]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device

device(type='cuda')

In [5]:
# !pip install timm

In [6]:
import timm
timm.data.IMAGENET_DEFAULT_MEAN,timm.data.IMAGENET_DEFAULT_STD

((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))

# DATA LOADER

In [7]:
import torchvision.transforms as T

trans_ = T.Compose([
    T.Resize((224, 224)), 
    T.ToTensor(),
    T.Normalize(mean=timm.data.IMAGENET_DEFAULT_MEAN, std=timm.data.IMAGENET_DEFAULT_STD)
])

dataset = datasets.ImageFolder(root=data_dir, transform=trans_)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size


train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### 计算每个epoch的batch数为step数，每个step都会更新梯度，乘以epochs得到总的steps
- = 20580 * 80% /128 = 128.625 向上取整
- 总的steps = 129 * numepochs = 1935
- 实际训练时可能会提前终止

In [8]:
len(train_loader)

129

In [9]:
num_steps = len(train_loader) * num_epochs
images, labels = next(iter(train_loader))
images.size(),labels.size(),num_steps

(torch.Size([128, 3, 224, 224]), torch.Size([128]), 1935)

# MODEL_PRETRAINED

In [10]:
from transformers import ViTForImageClassification
model_path = '/workspaces'

model = ViTForImageClassification.from_pretrained(model_path,num_labels = num_classes)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at /workspaces and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model = model.to(device)

# FINE_TUNING LOOP

In [12]:
# !pip install transformers[torch]

In [13]:
# !pip install mlflow

<!-- from transformers import Adafactor, Trainer, TrainingArguments

# 定义训练参数
training_args = TrainingArguments(
    evaluation_strategy="steps",            # 每隔一定步数进行评估
    save_strategy="steps",                  # 每隔一定步数保存模型
    learning_rate=5e-5,                     # 学习率
    gradient_accumulation_steps=4,          # 梯度累积
    gradient_checkpointing=True,            # 启用梯度检查点
    optim="adafactor",                      # 使用 Adafactor 优化器
    max_steps=1000,                          # 总训练步数
    eval_delay=0,                           # 评估延迟
    logging_steps=100,                      # 每100步进行一次日志记录
    save_steps=200,                         # 每200步保存一次模型
    load_best_model_at_end=True,            # 在训练结束时加载最佳模型
    metric_for_best_model="f1",             # 评估标准
    greater_is_better=True,                 # F1 分数越高越好
    report_to="mlflow",                     # 将日志报告到 MLflow
    save_total_limit=2,                     # 最多保存2个模型
    output_dir = './output'
)

# 使用 Adafactor 优化器，指定学习率和 beta 值
optimizer = Adafactor(
    model.parameters(),        # 使用 training_args 中的学习率
    eps=1e-8,                               # 防止数值问题
    # betas=(0.9, 0.999),                     # beta 设置为（0.9， 0.999）
    weight_decay=0.01,                      # 权重衰减
    relative_step=True,                     # 使用相对步长
    warmup_init=True                        # 启用热启动
)

# 使用 Trainer API 进行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,            # 训练数据集
    eval_dataset=val_dataset,              # 验证数据集
    optimizers=(optimizer, None),           # 使用自定义优化器
)
 -->


### 传统SGD：https://pytorch.org/docs/stable/generated/torch.optim.SGD.html 
$$
\theta_t = \theta_{t-1} - \eta \nabla_\theta J(\theta_{t-1}),
$$

### Adam：https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
- **step1:**
    $$
m_t = \beta_1 m_{t-1} + (1 - \beta_1) \nabla_\theta J(\theta_{t-1}),
$$

    $$
v_t = \beta_2 v_{t-1} + (1 - \beta_2) \left(\nabla_\theta J(\theta_{t-1})\right)^2,
$$

- **step2:**

    $$
\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1 - \beta_2^t},
$$ 
- **step3:**

    $$
\theta_t = \theta_{t-1} - \frac{\eta \hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon},
$$

**优点：** Adam的算法可以简单的理解为RMSProp和动量优化的结合：
- 其中动量优化提供了动态调整学习率的思路，可以有效缓和震荡问题
- RMSProp可以理解为AdaGrad与指数加权移动平均算法的结合，其中：
    - AdaGrad对于不同更新力度的参数定制不同的学习率
    - 指数加权移动平均算法使AdaGrad梯度累加更加平滑，避免了早停

In [None]:
# train_loop
optimizer = optim.Adam(
    model.parameters(), 
    lr=5e-5, 
    betas=betas, 
    eps=eps
)

criterion = nn.CrossEntropyLoss()

# 定义学习率调度器
def lr_lambda(current_step: int):
    return max(0.0, 1.0 - current_step / num_steps)

lr_scheduler = LambdaLR(optimizer, lr_lambda)

total_steps = len(train_loader)  # 每个epoch的训练步数

for epoch in range(num_epochs):
    model.train() 
    running_loss = 0.0
    correct = 0
    total = 0
    for step, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad() 

        outputs = model(images)
        loss = criterion(outputs.logits, labels) 

        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()

        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        lr_scheduler.step()

        if step % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{total_steps}], Loss: {running_loss/10:.4f}, Accuracy: {100 * correct/total:.2f}%, {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            running_loss = 0.0

    model.eval()  # 设置模型为评估模式
    eval_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs.logits, labels)  
            eval_loss += loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    eval_loss /= len(val_loader)
    eval_acc = correct / total * 100
    print(f"Epoch [{epoch+1}/{num_epochs}] Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc:.2f}%")


Epoch [1/15], Step [1/129], Loss: 0.4783, Accuracy: 0.00%, 2025-01-08 16:55:50
Epoch [1/15], Step [11/129], Loss: 4.7495, Accuracy: 3.55%, 2025-01-08 16:56:21
Epoch [1/15], Step [21/129], Loss: 4.6840, Accuracy: 7.25%, 2025-01-08 16:56:53
Epoch [1/15], Step [31/129], Loss: 4.6149, Accuracy: 11.59%, 2025-01-08 16:57:24
Epoch [1/15], Step [41/129], Loss: 4.5167, Accuracy: 17.66%, 2025-01-08 16:57:54
Epoch [1/15], Step [51/129], Loss: 4.4376, Accuracy: 23.58%, 2025-01-08 16:58:26
Epoch [1/15], Step [61/129], Loss: 4.3386, Accuracy: 29.43%, 2025-01-08 16:58:56
Epoch [1/15], Step [71/129], Loss: 4.2511, Accuracy: 34.00%, 2025-01-08 16:59:28
Epoch [1/15], Step [81/129], Loss: 4.1620, Accuracy: 37.65%, 2025-01-08 16:59:58
Epoch [1/15], Step [91/129], Loss: 4.0740, Accuracy: 40.79%, 2025-01-08 17:00:29
Epoch [1/15], Step [101/129], Loss: 3.9692, Accuracy: 43.73%, 2025-01-08 17:01:01
Epoch [1/15], Step [111/129], Loss: 3.8877, Accuracy: 46.14%, 2025-01-08 17:01:32
Epoch [1/15], Step [121/129], 

In [None]:
#!poweroff

In [15]:
model.save_pretrained("vit_finetuned_StanfordDogs_ep5")