## 1. Import Modules and Data

In [10]:
from datasets import load_dataset

# Load ImageNet-1k from the Hugging Face Hub, requesting a fresh download.
# `load_dataset` forwards keyword arguments it does not recognise to the
# dataset builder's config, so the previous `resume_download`, `force_download`
# and `encoding` arguments were not download options; `download_mode` is the
# documented way to force a re-download.
# NOTE(review): this dataset is presumably gated on the Hub and may require an
# authenticated token — confirm before running.
ds = load_dataset(
    "ILSVRC/imagenet-1k",
    download_mode="force_redownload",
)

HfHubHTTPError: 416 Client Error: Requested Range Not Satisfiable for url: https://hf-mirror.com/datasets/ILSVRC/imagenet-1k/resolve/main/imagenet-1k.py (Request ID: Root=1-67027bda-7ad398dc5b1033462eb6ad82;136880ee-2d0c-496d-bbe6-9e30f30f1470)

In [2]:
# Instantiate the model with no arguments and collect the key names of its
# state dict so they can be compared against the reference model.
# NOTE(review): `ViTForImageClassification` is not imported in the visible
# cells — presumably a local implementation; confirm where it is defined.
model = ViTForImageClassification()
s1 = set(model.state_dict().keys())

In [3]:
from transformers import ViTForImageClassification as HFModel
from transformers import ViTConfig
# Reference model: the Hugging Face ViT classifier constructed directly from a
# default ViTConfig (no `from_pretrained` call is made here). Its state-dict
# key names are collected for comparison with `s1`.
hf_model = HFModel(config=ViTConfig())
s2 = set(hf_model.state_dict().keys())

In [4]:
# State-dict keys present in the Hugging Face model but absent from `model`.
s2.difference(s1)

{'vit.encoder.layer.0.output.dense.bias',
 'vit.encoder.layer.0.output.dense.weight',
 'vit.encoder.layer.1.output.dense.bias',
 'vit.encoder.layer.1.output.dense.weight',
 'vit.encoder.layer.10.output.dense.bias',
 'vit.encoder.layer.10.output.dense.weight',
 'vit.encoder.layer.11.output.dense.bias',
 'vit.encoder.layer.11.output.dense.weight',
 'vit.encoder.layer.2.output.dense.bias',
 'vit.encoder.layer.2.output.dense.weight',
 'vit.encoder.layer.3.output.dense.bias',
 'vit.encoder.layer.3.output.dense.weight',
 'vit.encoder.layer.4.output.dense.bias',
 'vit.encoder.layer.4.output.dense.weight',
 'vit.encoder.layer.5.output.dense.bias',
 'vit.encoder.layer.5.output.dense.weight',
 'vit.encoder.layer.6.output.dense.bias',
 'vit.encoder.layer.6.output.dense.weight',
 'vit.encoder.layer.7.output.dense.bias',
 'vit.encoder.layer.7.output.dense.weight',
 'vit.encoder.layer.8.output.dense.bias',
 'vit.encoder.layer.8.output.dense.weight',
 'vit.encoder.layer.9.output.dense.bias',
 'vit.en

In [6]:
# Display every state-dict key name of the Hugging Face reference model.
s2

{'classifier.bias',
 'classifier.weight',
 'vit.embeddings.cls_token',
 'vit.embeddings.patch_embeddings.projection.bias',
 'vit.embeddings.patch_embeddings.projection.weight',
 'vit.embeddings.position_embeddings',
 'vit.encoder.layer.0.attention.attention.key.bias',
 'vit.encoder.layer.0.attention.attention.key.weight',
 'vit.encoder.layer.0.attention.attention.query.bias',
 'vit.encoder.layer.0.attention.attention.query.weight',
 'vit.encoder.layer.0.attention.attention.value.bias',
 'vit.encoder.layer.0.attention.attention.value.weight',
 'vit.encoder.layer.0.attention.output.dense.bias',
 'vit.encoder.layer.0.attention.output.dense.weight',
 'vit.encoder.layer.0.intermediate.dense.bias',
 'vit.encoder.layer.0.intermediate.dense.weight',
 'vit.encoder.layer.0.layernorm_after.bias',
 'vit.encoder.layer.0.layernorm_after.weight',
 'vit.encoder.layer.0.layernorm_before.bias',
 'vit.encoder.layer.0.layernorm_before.weight',
 'vit.encoder.layer.0.output.dense.bias',
 'vit.encoder.layer.

In [8]:
# Display the configuration object attached to `model`.
model.config

ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.44.2"
}

In [None]:

# 3. Define the optimizer and the loss function
# Every batch below is moved to `device`, so the model must live there too;
# if it is still on the CPU while `device` is an accelerator, the forward pass
# fails with a device mismatch. Moving it before the optimizer is built means
# the optimizer is created from the on-device parameters.
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# 4. Train the model
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of the per-batch losses for this epoch
    correct = 0  # training samples classified correctly so far
    total = 0  # training samples seen so far

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(pixel_values=inputs).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # The predicted class is the index of the largest logit along dim 1.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Loss is averaged over batches; accuracy is a percentage over samples.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}, Accuracy: {100 * correct / total}%")

# 5. Evaluate the model on the test set
model.eval()
correct, total = 0, 0

# Gradient tracking is switched off for the whole evaluation pass.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(pixel_values=inputs).logits
        # The predicted class is the index of the largest logit along dim 1.
        _, predicted = torch.max(outputs, dim=1)
        total += labels.shape[0]
        correct += predicted.eq(labels).sum().item()

print(f"Test Accuracy: {100 * correct / total}%")


## 2. Build Model

## 3. Train Model