In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets,transforms as T
from torchsummary import summary
from tqdm import tqdm

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)
print(torch.cuda.is_available())

cuda:0
True


In [3]:
# in_channels---输入维度
# out_channels---输出维度
# kernel_size---卷积核大小
# stride---步长
# padding---边缘填充
# num_blocks---添加的层数
# max_pooling---最大池化

# 设置一个卷积层类
class Conv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, groups=1, activation=True):
        super(Conv, self).__init__()
        padding = kernel_size // 2 if padding is None else padding  # 来保持输入和输出的尺寸一致

        # 定义一个2维卷积层
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                              padding, groups=groups, bias=True)
        # 定义一个激活函数
        self.act = nn.ReLU(inplace=True) if activation else nn.Identity()   # 如果activation为Ture则使用re正则化，反之使用恒等映射

    def forward(self, x):
        return self.act(self.conv(x))   # 卷积输出的值直接给激活函数进行激活，最后返回激活后的值

# 构建 VGG19 模型
class VGG19(nn.Module):
    def __init__(self, num_classes):
        super(VGG19, self).__init__()
        
        # 定义神经网络的各个阶段
        self.stages = nn.Sequential(*[self._make_stage(784, 784, num_blocks=1, max_pooling=False, dropout=False),
            self._make_stage(784, 392, num_blocks=1, max_pooling=False, dropout=False),
            self._make_stage(392, 196, num_blocks=1, max_pooling=False, dropout=False),
            self._make_stage(196, 92, num_blocks=1, max_pooling=False, dropout=False),]
        )
        
        # 定义神经网络的头部（全连接层）
        self.head = nn.Sequential(*[nn.Flatten(start_dim=1, end_dim=-1),
            nn.Linear(92, 92),
            nn.ReLU(inplace=True),
            nn.Linear(92, 46),
            nn.ReLU(inplace=True),
            nn.Linear(46, 46),
            nn.ReLU(inplace=True),
            nn.Linear(46, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, num_classes)]
        )

    # 定义一个静态方法
    @staticmethod
    def _make_stage(in_channels, out_channels, num_blocks, max_pooling, dropout):
        # 定义一个VGG阶段，包括卷积层和（可选的）最大池化层
        layers = [Conv(in_channels, out_channels, kernel_size=3, stride=1)]     # 创建了一个列表 layers，其中包含了阶段的第一个卷积层。

        for _ in range(1, num_blocks):  # "_"符号的意思就是一个占位符，告诉阅读的人该变量在循环内没使用
            layers.append(Conv(out_channels, out_channels, kernel_size=3, stride=1))

        if max_pooling:     # 如果要使用最大池化就再模型里添加一层
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        
        # 再添加一层Dropout层
        if dropout:
            layers.append(nn.Dropout(0.5))
        return nn.Sequential(*layers)

    def forward(self, x):
        # 前向传播函数，将输入数据通过各个阶段和头部层
        return self.head(self.stages(x))


In [4]:
model = VGG19(num_classes=10).to(device)
summary(model)

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Sequential: 2-1                   --
|    |    └─Conv: 3-1                    5,532,688
|    └─Sequential: 2-2                   --
|    |    └─Conv: 3-2                    2,766,344
|    └─Sequential: 2-3                   --
|    |    └─Conv: 3-3                    691,684
|    └─Sequential: 2-4                   --
|    |    └─Conv: 3-4                    162,380
├─Sequential: 1-2                        --
|    └─Flatten: 2-5                      --
|    └─Linear: 2-6                       8,556
|    └─ReLU: 2-7                         --
|    └─Linear: 2-8                       4,278
|    └─ReLU: 2-9                         --
|    └─Linear: 2-10                      2,162
|    └─ReLU: 2-11                        --
|    └─Linear: 2-12                      1,504
|    └─ReLU: 2-13                        --
|    └─Linear: 2-14                      330
Total params: 9,169,926
Trainable 

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Sequential: 2-1                   --
|    |    └─Conv: 3-1                    5,532,688
|    └─Sequential: 2-2                   --
|    |    └─Conv: 3-2                    2,766,344
|    └─Sequential: 2-3                   --
|    |    └─Conv: 3-3                    691,684
|    └─Sequential: 2-4                   --
|    |    └─Conv: 3-4                    162,380
├─Sequential: 1-2                        --
|    └─Flatten: 2-5                      --
|    └─Linear: 2-6                       8,556
|    └─ReLU: 2-7                         --
|    └─Linear: 2-8                       4,278
|    └─ReLU: 2-9                         --
|    └─Linear: 2-10                      2,162
|    └─ReLU: 2-11                        --
|    └─Linear: 2-12                      1,504
|    └─ReLU: 2-13                        --
|    └─Linear: 2-14                      330
Total params: 9,169,926
Trainable 

In [5]:
# 设置批大小
batch_size=32

# 数据增强
train_enhance = T.Compose([
                    T.Resize((224,224)),            # 固定图像大小
                    T.RandomHorizontalFlip(0.5),    # 随机数据水平翻转
                    T.ToTensor()                    # 转换为torch接受的张量类型
                    ])

test_enhance = T.Compose([
                    T.Resize((224,224)),
                    T.ToTensor()
                    ])

# mnist数据集
train_database = datasets.MNIST(root = r"data\mnist",
                                train = True, transform = train_enhance, download = True)

test_database = datasets.MNIST(root = r"data\mnist",
                               train = False, transform=test_enhance, download = True)
train_loader = torch.utils.data.DataLoader(dataset = train_database, batch_size = batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset = test_database, batch_size = batch_size, shuffle = True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data\mnist\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:10<00:00, 919321.39it/s] 


Extracting data\mnist\MNIST\raw\train-images-idx3-ubyte.gz to data\mnist\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data\mnist\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 764929.05it/s]


Extracting data\mnist\MNIST\raw\train-labels-idx1-ubyte.gz to data\mnist\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data\mnist\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:01<00:00, 1031294.28it/s]


Extracting data\mnist\MNIST\raw\t10k-images-idx3-ubyte.gz to data\mnist\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data\mnist\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<?, ?it/s]

Extracting data\mnist\MNIST\raw\t10k-labels-idx1-ubyte.gz to data\mnist\MNIST\raw






In [6]:
for i in train_loader:
    print(i[0].size())
    break
print(len(train_loader))

torch.Size([32, 1, 224, 224])
1875


In [7]:
# 固定随机种子
torch.manual_seed(-1)
# 学习率
learning_rate = 1e-5
# 训练轮数
num_epochs = 50
# 优化算法Adam = RMSProp + Momentum (梯度、lr两方面优化下降更快更稳)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
# 交叉熵损失函数                                           
loss_fn = torch.nn.CrossEntropyLoss()  

In [8]:
def evaluate_accuracy(data_iter,model):
    '''
        模型预测精度
    '''
    total = 0
    correct = 0 
    with torch.no_grad():
        model.eval()
        for images,labels in data_iter:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _,predicts = torch.max(outputs.data, dim=1)
            total += labels.size(0)
            correct += (predicts == labels).cpu().sum()
    return 100 * correct / total

In [9]:
def train(data_loader=train_loader, optimizer=optimizer, loss_fn=loss_fn, epochs=num_epochs, device=device):
    for epoch in range(epochs):
        print('当前训练周期 = {}'.format(epoch))
        loop=tqdm((data_loader),total=len(data_loader))
        for i, (images, labels) in enumerate(loop):
            train_accuracy_total = 0
            train_correct = 0
            train_loss_sum = 0
            model.train()  # 设置模型为训练模式

            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = loss_fn(outputs, labels)   # 计算模型的损失
            optimizer.zero_grad()              # 清空梯度
            loss.backward()                   # 反向传播计算梯度
            optimizer.step()                  # 更新参数
            loop.set_description(f"epoch{epoch}")
            loop.set_postfix(loss=loss.cpu().detach().numpy())
            train_loss_sum += loss.item()
            _, predicts = torch.max(outputs.data, dim=1)  # 获取每行最大值的索引，即预测的类别
            train_accuracy_total += labels.size(0)
            train_correct += (predicts == labels).cpu().sum().item()

        test_acc = evaluate_accuracy(test_loader, model)  # 在测试集上评估准确度
        print(f'训练周期:{epoch},   损失:{train_loss_sum/len(data_loader):.4f},   训练准确度:{train_correct/train_accuracy_total:.3f},  测试准确度:{test_acc:.3f}')
        torch.save(model,f"VGG-19-{epoch}.pt")

    print('------------训练完成-------------')

In [10]:
train(train_loader, )

当前训练周期 = 0


epoch0:   0%|          | 2/1875 [00:18<4:48:39,  9.25s/it, loss=2.3042238]


KeyboardInterrupt: 

In [None]:
model = torch.load("./VGG-19-0.pt")
print(model)

In [13]:
import tkinter as tk
from PIL import Image, ImageDraw
import torch
from torchvision import transforms

class HandwritingRecognitionApp:
    def __init__(self, master):
        self.master = master
        self.master.title("手写识别应用")

        self.canvas = tk.Canvas(self.master, width=200, height=200, bg="white")
        self.canvas.pack()

        self.canvas.bind("<B1-Motion>", self.draw)

        self.label_result = tk.Label(self.master, text="预测结果: ")
        self.label_result.pack()

        self.button_predict = tk.Button(self.master, text="预测", command=self.predict)
        self.button_predict.pack()

        self.button_clear = tk.Button(self.master, text="清空", command=self.clear_canvas)
        self.button_clear.pack()

        # 加载模型
        self.model = torch.load("./VGG-19-1.pt")

        # 优先选择GPU，没有则选择CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.transform = transforms.Compose([
            transforms.Grayscale(),
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])

        self.image = Image.new("L", (200, 200), color="white")
        self.draw = ImageDraw.Draw(self.image)

    def draw(self, event):
        x1, y1 = (event.x - 1), (event.y - 1)
        x2, y2 = (event.x + 1), (event.y + 1)
        self.canvas.create_oval(x1, y1, x2, y2, fill="black", width=5)
        self.draw.line([x1, y1, x2, y2], fill="black", width=5)

    def predict(self):
        try:
            input_image = self.transform(self.image).unsqueeze(0).to(self.device, dtype=torch.float32)
            print("Input image size:", input_image.size())
            
            with torch.no_grad():
                output = self.model(input_image)
            print("Model output size:", output.size())
            print("Model output size:", output)
            
            _, predicted = torch.max(output, 1)
            prediction = str(predicted.item())
            self.label_result.config(text="Prediction: " + prediction)
        except Exception as e:
            self.label_result.config(text="Prediction failed. Error: " + str(e))

    def clear_canvas(self):
        self.canvas.delete("all")
        self.image = Image.new("L", (500, 500), color="white")
        self.draw = ImageDraw.Draw(self.image)
        self.label_result.config(text="Prediction: ")

if __name__ == "__main__":
    root = tk.Tk()
    app = HandwritingRecognitionApp(root)
    root.mainloop()


Input image size: torch.Size([1, 1, 224, 224])
Model output size: torch.Size([1, 10])
Model output size: tensor([[ 0.8151,  1.7634,  0.0844, -3.0375,  1.5822, -1.5031, -0.3473, -2.6272,
          3.2717, -1.1934]], device='cuda:0')
Input image size: torch.Size([1, 1, 224, 224])
Model output size: torch.Size([1, 10])
Model output size: tensor([[ 0.8216,  1.7584,  0.0825, -3.0422,  1.5899, -1.5076, -0.3442, -2.6251,
          3.2693, -1.1917]], device='cuda:0')
Input image size: torch.Size([1, 1, 224, 224])
Model output size: torch.Size([1, 10])
Model output size: tensor([[ 0.9087,  1.6903,  0.0501, -3.0874,  1.6221, -1.5084, -0.3379, -2.6022,
          3.2263, -1.1632]], device='cuda:0')
