本项目使用的数据集是MNIST数据集，MNIST数据集一共有7万张图片，其中6万张是训练集，1万张是测试集。
项目目标是利用Vision Transformer 处理MNIST数据集，其中前期以CNN进行处理，达到的准确率是98.690000%（Best），后期对Vision Transformer做了如下的变量控制：
    （1）数据预处理，标准化操作：(input[channel] - mean[channel]) / std[channel]；使用 mean=0.1307、std=0.3081 时，ToTensor 之后位于 [0,1] 的像素值会被映射到约 [-0.42, 2.82] 之间（并不是 [-1,1]）
    （2）手写Vision Transformer 与 调用 torch.nn 中的 nn.TransformerEncoder(nn.TransformerEncoderLayer())
     (3) 位置编码的使用 Position Embedding
    （4）CLS的使用
  最终手写的Vision Transformer 的准确率是 0.963800（Best）

  项目组织：dataset.py 加载处理数据 vit.py 模型的构建  train.py 训练  infer.py 推断

一、数据预处理，标准化操作：(input[channel] - mean[channel]) / std[channel]；使用 mean=0.1307、std=0.3081 时，像素值被映射到约 [-0.42, 2.82] 之间（并不是 [-1,1]）
    处理方式： dataset.py （训练集和测试集的处理方式相同）
如果 flag 为 True，只应用 transforms.ToTensor()，即将图像转换为张量。如果 flag 为 False，则应用一个组合的转换，首先将图像转换为张量，然后进行标准化处理 (transforms.Normalize((0.1307,), (0.3081,)))，将图像的像素值标准化到均值为 0.1307 和标准差为 0.3081 的分布。

In [2]:
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets


class MNIST_Train(Dataset):
    """MNIST split bundled with a shuffling ``DataLoader``.

    Args:
        root: directory handed to ``datasets.MNIST`` as its data root.
        flag: when truthy, images are only converted with ``ToTensor``; when
            falsy, ``Normalize((0.1307,), (0.3081,))`` is applied afterwards.
        train: forwarded to ``datasets.MNIST`` to select the split.
        download: forwarded to ``datasets.MNIST``.
        batch_size: batch size of the internal ``DataLoader``.
    """

    def __init__(self, root: str = '../data/', flag=True, train=True, download=True, batch_size=64):
        self.root = root
        self.train = train
        self.download = download
        self.batch_size = batch_size

        # Choose the preprocessing pipeline according to ``flag``.
        if flag:
            self._transform = transforms.ToTensor()
        else:
            steps = [
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]
            self._transform = transforms.Compose(steps)

        self.dataset = datasets.MNIST(
            root=self.root,
            train=self.train,
            download=self.download,
            transform=self._transform,
        )
        # The loader always shuffles, whichever split was requested.
        self.loader = DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the item of the wrapped dataset stored at ``index``."""
        return self.dataset[index]

    def get_loader(self):
        """Return the ``DataLoader`` built in ``__init__``."""
        return self.loader


class MNIST_Check(Dataset):
    """MNIST test split (``train=False``) bundled with a non-shuffling ``DataLoader``.

    Args:
        root: directory handed to ``datasets.MNIST`` as its data root.
        flag: when truthy, images are only converted with ``ToTensor``; when
            falsy, ``Normalize((0.1307,), (0.3081,))`` is applied afterwards.
        batch_size: batch size of the internal ``DataLoader``.
    """

    def __init__(self, root: str = '../data/', flag=True, batch_size=64):
        self.root = root
        self.batch_size = batch_size

        # Choose the preprocessing pipeline according to ``flag``.
        if flag:
            self._transform = transforms.ToTensor()
        else:
            steps = [
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]
            self._transform = transforms.Compose(steps)

        # The test split is always requested and downloaded when missing.
        self.test_dataset = datasets.MNIST(
            root=self.root,
            train=False,
            download=True,
            transform=self._transform,
        )
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    def __len__(self):
        """Return the number of samples in the wrapped test dataset."""
        return len(self.test_dataset)

    def __getitem__(self, index):
        """Return the item of the wrapped test dataset stored at ``index``."""
        return self.test_dataset[index]

    def get_loader(self):
        """Return the ``DataLoader`` built in ``__init__``."""
        return self.test_loader


二、 位置编码的使用 Position Embedding
    def  __init__
        self.pos_emb = nn.Parameter(torch.rand(1, self.patch_count ** 2 + 1, emb_size))  # 如果存在CLS 
    def forward
        x = self.pos_emb + x


三、 CLS的使用
    def  __init__
        self.cls_token = nn.Parameter(torch.rand(1, 1, emb_size))  # 分类头输入
    def forward
        cls_token = self.cls_token.expand(x.size(0), 1, x.size(2))  # (batch_size, 1, emb_size)
        x = torch.cat((cls_token, x), dim=1)  # add [cls] token
        y = self.transformer_enc(x)  # 经过编码器后，取第 0 个位置（CLS）的输出用于分类
        # x = y.mean(dim=1)  # Example: mean pooling over patches # # 如果没有CLS
        return self.cls_linear(y[:, 0, :])

四、 手写Vision Transformer 与 调用 torch.nn 中的 nn.TransformerEncoder(nn.TransformerEncoderLayer())

In [3]:
# torch
import torch
from torch import nn


class ViT(nn.Module):
    """Vision Transformer classifier built on ``nn.TransformerEncoder``.

    The patch size is fixed to 4 and the patch count is derived from a side
    length of 28, so the positional embedding only fits 28x28 single-channel
    inputs (49 patches, plus one slot when the class token is enabled).

    Args:
        emb_size: width of the token embeddings.
        CLSisUsed: prepend a learnable class token and classify from it; when
            falsy, the encoder output is mean-pooled over the tokens instead.
        PostionisUsed: add a learnable positional embedding to the tokens.
        nhead: number of attention heads per encoder layer.
        num_layer: number of stacked encoder layers.
    """

    def __init__(self, emb_size=16, CLSisUsed=True, PostionisUsed=True, nhead=2, num_layer=3):
        super().__init__()
        self.CLSisUsed = CLSisUsed
        self.PostionisUsed = PostionisUsed
        self.nhead = nhead
        self.num_layer = num_layer
        self.patch_size = 4
        self.patch_count = 28 // self.patch_size

        # A strided convolution cuts the image into patches; each patch ends up
        # as a vector of patch_size ** 2 channels that is then projected to emb_size.
        patch_dim = self.patch_size ** 2
        self.conv = nn.Conv2d(
            in_channels=1,
            out_channels=patch_dim,
            kernel_size=self.patch_size,
            padding=0,
            stride=self.patch_size,
        )
        self.patch_emb = nn.Linear(in_features=patch_dim, out_features=emb_size)

        if self.CLSisUsed:
            self.cls_token = nn.Parameter(torch.rand(1, 1, emb_size))
        if self.PostionisUsed:
            # One extra position is reserved for the class token when it is used.
            seq_len = self.patch_count ** 2 + (1 if self.CLSisUsed else 0)
            self.pos_emb = nn.Parameter(torch.rand(1, seq_len, emb_size))

        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_size, nhead=nhead, batch_first=True)
        self.transformer_enc = nn.TransformerEncoder(encoder_layer, num_layers=num_layer)
        self.cls_linear = nn.Linear(in_features=emb_size, out_features=10)

    def forward(self, x):
        """Return class logits of shape (batch_size, 10) for a batch of images."""
        patches = self.conv(x)  # (batch_size, 16, 7, 7) for 28x28 inputs
        patches = patches.view(patches.size(0), patches.size(1), -1)  # (batch_size, 16, 49)
        tokens = self.patch_emb(patches.transpose(1, 2))  # (batch_size, 49, emb_size)

        if self.CLSisUsed:
            # Broadcast the single learnable token over the batch and put it first.
            cls_token = self.cls_token.expand(tokens.size(0), 1, tokens.size(2))
            tokens = torch.cat((cls_token, tokens), dim=1)
        if self.PostionisUsed:
            tokens = tokens + self.pos_emb

        encoded = self.transformer_enc(tokens)  # same shape as ``tokens``

        # Classify from the class token when present, otherwise from the token mean.
        pooled = encoded[:, 0, :] if self.CLSisUsed else encoded.mean(dim=1)
        return self.cls_linear(pooled)


In [4]:
# 手写
from torch import nn
import torch


# PatchEmbed turns an image batch into a sequence of patch embeddings.
class PatchEmbed(nn.Module):
    """Split square images into non-overlapping patches and embed each patch.

    Args:
        img_size: side length of the (square) input images, in pixels.
        patch_size: side length of the (square) patches, in pixels.
        in_c: number of input channels.
        embed_dim: size of the embedding vector produced for every patch.
        norm_layer: optional callable building a normalisation layer from
            ``embed_dim``; ``nn.Identity`` is used when it is falsy.
    """

    def __init__(self, img_size, patch_size, in_c, embed_dim, norm_layer=None):
        super().__init__()
        self.img_size = (img_size, img_size)  # (height, width)
        self.patch_size = (patch_size, patch_size)  # (height, width)

        # Number of whole patches along each axis, and in total.
        rows = self.img_size[0] // self.patch_size[0]
        cols = self.img_size[1] // self.patch_size[1]
        self.grid_size = (rows, cols)
        self.num_patches = rows * cols

        # A convolution whose kernel and stride both equal the patch size
        # embeds every patch independently.
        self.proj = nn.Conv2d(
            in_channels=in_c,
            out_channels=embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Map ``[B, C, H, W]`` images to ``[B, num_patches, embed_dim]`` tokens."""
        H, W = x.shape[2], x.shape[3]
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        embedded = self.proj(x)  # [B, embed_dim, grid_h, grid_w]
        embedded = embedded.flatten(2)  # [B, embed_dim, num_patches]
        embedded = embedded.transpose(1, 2)  # [B, num_patches, embed_dim]
        return self.norm(embedded)


# Multi-head self-attention module.
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        dim: total embedding width of the input tokens.
        num_heads: number of attention heads the width is split into.
        qkv_bias: whether the fused q/k/v projection has a bias term.
        qk_scale: factor applied to the attention scores; when falsy,
            ``head_dim ** -0.5`` is used instead.
        attn_drop_ratio: dropout probability applied to the attention weights.
        proj_drop_ratio: dropout probability applied after the output projection.
    """

    def __init__(self, dim, num_heads, qkv_bias, qk_scale, attn_drop_ratio, proj_drop_ratio):
        super().__init__()
        self.num_heads = num_heads
        per_head_dim = dim // num_heads
        self.scale = qk_scale or per_head_dim ** -0.5
        # One linear layer produces queries, keys and values at once (3 * dim outputs).
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_ratio)
        # Projection applied to the concatenated head outputs.
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop_ratio)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``[batch, tokens, dim]``; the shape is preserved."""
        # B: batch size, N: sequence length (the patches, plus the class token
        # when the caller prepends one), C: total embedding width.
        B, N, C = x.shape
        per_head_dim = C // self.num_heads

        fused = self.qkv(x)  # [B, N, 3 * C]
        fused = fused.reshape(B, N, 3, self.num_heads, per_head_dim)
        fused = fused.permute(2, 0, 3, 1, 4)  # [3, B, heads, N, per_head_dim]
        # Plain indexing rather than tuple unpacking keeps TorchScript happy.
        q, k, v = fused[0], fused[1], fused[2]

        scores = (q @ k.transpose(-2, -1)) * self.scale  # [B, heads, N, N]
        weights = self.attn_drop(scores.softmax(dim=-1))  # softmax over the key axis

        # [B, heads, N, per_head_dim] -> [B, N, heads, per_head_dim] -> [B, N, C]
        merged = (weights @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(merged))


class Mlp(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear.

    Args:
        in_features: width of the input.
        hidden_features: width of the hidden layer.
        out_features: width of the output.
        drop: dropout probability applied after the activation.
    """

    def __init__(self, in_features, hidden_features, out_features, drop=0.0):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Run ``x`` through the layers; dropout sits between the activation and ``fc2``."""
        for layer in (self.fc1, self.act, self.drop, self.fc2):
            x = layer(x)
        return x


class Block(nn.Module):
    """Pre-norm Transformer encoder block: attention and MLP, each behind a residual connection.

    Args:
        dim: embedding width of the tokens.
        num_heads: number of attention heads.
        mlp_ratio: hidden width of the MLP expressed as a multiple of ``dim``.
        qkv_bias: forwarded to ``Attention``.
        qk_scale: forwarded to ``Attention``.
        drop_ratio: dropout probability used for the attention output
            projection and inside the MLP.
        attn_drop_ratio: dropout probability used on the attention weights.
    """

    def __init__(self, dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop_ratio, attn_drop_ratio):
        super().__init__()
        # Layer norm applied before the attention sub-layer.
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(
            dim=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_ratio=attn_drop_ratio,
            proj_drop_ratio=drop_ratio,
        )
        # Layer norm applied before the MLP sub-layer.
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            out_features=dim,
            drop=drop_ratio,
        )

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        attended = self.attn(self.norm1(x))
        x = x + attended
        transformed = self.mlp(self.norm2(x))
        return x + transformed


class VisionTransformer(nn.Module):
    """Hand-written Vision Transformer classifier.

    Args:
        img_size: side length of the input images, forwarded to ``embed_layer``.
        patch_size: side length of the patches, forwarded to ``embed_layer``.
        in_c: number of input channels, forwarded to ``embed_layer``.
        num_classes: number of output logits.
        embed_dim: width of the token embeddings.
        depth: number of stacked ``Block`` modules.
        num_heads: number of attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
        qkv_bias: forwarded to every ``Block``.
        qk_scale: forwarded to every ``Block``.
        drop_ratio: dropout probability after the embeddings and inside the blocks.
        attn_drop_ratio: dropout probability on the attention weights.
        embed_layer: class used to build the patch embedding.
        CLSisUsed: prepend a learnable class token and classify from it; when
            falsy, the normalised tokens are mean-pooled instead.
        PostionisUsed: add a learnable positional embedding to the tokens.
    """

    def __init__(self, img_size=28, patch_size=4, in_c=1, num_classes=10,
                 embed_dim=16, depth=3, num_heads=2, mlp_ratio=4.0, qkv_bias=True,
                 qk_scale=0.5, drop_ratio=0.2, attn_drop_ratio=0.2,
                 embed_layer=PatchEmbed, CLSisUsed=False, PostionisUsed=True):
        super().__init__()
        self.CLSisUsed = CLSisUsed
        self.PostionisUsed = PostionisUsed
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.num_features = embed_dim

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_c=in_c, embed_dim=embed_dim)

        if self.CLSisUsed:
            self.cls_token = nn.Parameter(torch.rand(1, 1, embed_dim))
        if self.PostionisUsed:
            # One extra position is reserved for the class token when it is used.
            seq_len = self.patch_embed.num_patches + (1 if self.CLSisUsed else 0)
            self.pos_emb = nn.Parameter(torch.rand(1, seq_len, embed_dim))

        self.pos_drop = nn.Dropout(p=drop_ratio)
        encoder_blocks = [
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                  qk_scale=qk_scale, drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio)
            for _ in range(depth)
        ]
        self.blocks = nn.Sequential(*encoder_blocks)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(self.num_features, num_classes)

        # The uniform values drawn above are replaced by truncated-normal ones
        # before the per-module initialiser runs over all sub-modules.
        if self.PostionisUsed:
            nn.init.trunc_normal_(self.pos_emb, std=0.02)
        if self.CLSisUsed:
            nn.init.trunc_normal_(self.cls_token, std=0.02)
        self.apply(self._init_vit_weights)

    def forward_features(self, x):
        """Embed the images and return the normalised encoder output for every token."""
        tokens = self.patch_embed(x)

        if self.CLSisUsed:
            # Broadcast the single learnable token over the batch and put it first.
            cls_token = self.cls_token.expand(tokens.size(0), 1, tokens.size(2))
            tokens = torch.cat((cls_token, tokens), dim=1)
        if self.PostionisUsed:
            tokens = tokens + self.pos_emb

        tokens = self.pos_drop(tokens)
        tokens = self.blocks(tokens)
        return self.norm(tokens)

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes)."""
        features = self.forward_features(x)
        # Classify from the class token when present, otherwise from the token mean.
        pooled = features[:, 0, :] if self.CLSisUsed else features.mean(dim=1)
        return self.head(pooled)

    def _init_vit_weights(self, m):
        """Initialise one sub-module; invoked for every module through ``self.apply``."""
        if isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)
            return

        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=.01)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
        else:
            # Every other module type keeps its default initialisation.
            return

        # Linear and convolution layers share the bias treatment.
        if m.bias is not None:
            nn.init.zeros_(m.bias)


In [5]:
import torch

import torch.nn.functional as F
from Look_Best.Vit_Hand import VisionTransformer

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Training hyper-parameters. They are declared before the dataset is built so
# that BATCH_SIZE really reaches the DataLoader instead of being a dead constant.
EPOCHS = 10
BATCH_SIZE = 64

# Single name for the checkpoint that is both resumed from and written to.
CHECKPOINT_PATH = 'Torch_flagTrue_CLSisUsedFalse_PostionisUsedTrue_dim16.pth'

# Training data together with its DataLoader.
dataset = MNIST_Train(batch_size=BATCH_SIZE)

# Model under training, placed on the selected device.
model = VisionTransformer().to(DEVICE)

# Resume from an earlier checkpoint when one exists; otherwise start from scratch.
try:
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
    print(f"Successfully loaded {CHECKPOINT_PATH}")
except FileNotFoundError:
    print("Pre-trained model not found, starting from scratch.")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

try:
    dataloader = dataset.get_loader()
    iter_count = 0
    # Make the training mode explicit; the evaluation further down switches to eval().
    model.train()
    for epoch in range(EPOCHS):
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            logits = model(imgs)
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            iter_count += 1
            # Report the loss of the current batch every 1000 iterations.
            if iter_count % 1000 == 0:
                print(f'Epoch [{epoch + 1}/{EPOCHS}], Iteration [{iter_count}], Loss: {loss.item()}')
        # Checkpoint after every epoch so an interrupted run can be resumed.
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        print(f'Saved model after epoch {epoch + 1}')
    print("Training finished!")

except Exception as e:
    print(f"Error during training: {e}")
    # A bare raise keeps the original traceback intact.
    raise

# Switch to the test split for the evaluation that follows.
dataset = MNIST_Check()
dataloader = dataset.get_loader()

# Put the model into evaluation mode.
model.eval()


# Evaluation helper.
def evaluate(model, dataloader, device=None):
    """Compute the mean cross-entropy loss and the top-1 accuracy of ``model``.

    The model is switched to evaluation mode and stays in it afterwards.

    Args:
        model: module mapping a batch of inputs to logits of shape
            (batch_size, num_classes).
        dataloader: iterable yielding ``(inputs, targets)`` batches, where
            ``targets`` holds one class index per sample.
        device: device the batches are moved to. When ``None``, the device of
            the model's first parameter is used (CPU for a parameter-free model).

    Returns:
        ``(test_loss, accuracy)``, both averaged over the samples that the
        dataloader actually yielded.

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not mean) per batch so the final average is per sample.
            loss_sum += F.cross_entropy(output, target, reduction='sum').item()
            correct += (output.argmax(dim=1) == target).sum().item()
            # Count what was really evaluated: with drop_last or a subset
            # sampler this differs from len(dataloader.dataset).
            seen += target.size(0)

    return loss_sum / seen, correct / seen


# Evaluate the model on the current dataloader, then report the average loss
# and the accuracy.
test_loss, accuracy = evaluate(model=model, dataloader=dataloader)
print(f'Test Loss: {test_loss:.6f}, Accuracy: {accuracy:.6f}')
# Test Loss: 0.310295, Accuracy: 0.901200

In [6]:
# infer.py
import torch
import torch.nn.functional as F


# Pick the device before any data or model is prepared.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Build the model, move it to the device and restore the trained weights.
# NOTE(review): the training cell saves to
# 'Torch_flagTrue_CLSisUsedFalse_PostionisUsedTrue_dim16.pth' - confirm that
# 'model.pth' is the checkpoint intended here.
model = VisionTransformer()
model = model.to(DEVICE)
model.load_state_dict(torch.load('model.pth', map_location=DEVICE))

# Test split and its DataLoader.
dataset = MNIST_Check()
dataloader = dataset.get_loader()

# Evaluation mode for inference.
model.eval()


# Evaluation helper.
def evaluate(model, dataloader, device=None):
    """Compute the mean cross-entropy loss and the top-1 accuracy of ``model``.

    The model is switched to evaluation mode and stays in it afterwards.

    Args:
        model: module mapping a batch of inputs to logits of shape
            (batch_size, num_classes).
        dataloader: iterable yielding ``(inputs, targets)`` batches, where
            ``targets`` holds one class index per sample.
        device: device the batches are moved to. When ``None``, the device of
            the model's first parameter is used (CPU for a parameter-free model).

    Returns:
        ``(test_loss, accuracy)``, both averaged over the samples that the
        dataloader actually yielded.

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not mean) per batch so the final average is per sample.
            loss_sum += F.cross_entropy(output, target, reduction='sum').item()
            correct += (output.argmax(dim=1) == target).sum().item()
            # Count what was really evaluated: with drop_last or a subset
            # sampler this differs from len(dataloader.dataset).
            seen += target.size(0)

    return loss_sum / seen, correct / seen


# Evaluate the model on the current dataloader, then report the average loss
# and the accuracy.
test_loss, accuracy = evaluate(model=model, dataloader=dataloader)
print(f'Test Loss: {test_loss:.6f}, Accuracy: {accuracy:.6f}')
