# Mindcon 食物分类和西安旅游分类通用notebook

# 1.下载数据到notebook环境

以下是美食分类的数据




In [None]:
import moxing as mox
mox.file.copy('obs://mindcon00001/raw/mindcon_food_classification.zip','/home/ma-user/work/mindcon_food_classification.zip')
!unzip mindcon_food_classification.zip

## 2.打开终端，下载预训练模型
```
mkdir ckpt
cd ckpt
wget https://download.mindspore.cn/vision/classification/vit_b_16_224.ckpt
pwd
```
## 3.处理数据方便MindSpore读取。

处理成标签为文件夹，每个文件夹内含有对应图片，美食图片分类可以直接对照标签修改文件夹名

## 4.数据增强，主要参考了MindsSpore的[mindspore.dataset.vision](https://mindspore.cn/docs/zh-CN/r1.9/api_python/mindspore.dataset.vision.html) API

其中变换方式添加了随机翻转，随机亮度对比度

随机翻转从0.5提升到0.75的时候，验证集精度能提高1%

In [2]:
import os
import mindspore.dataset as ds
# import mindspore.dataset.vision.c_transforms as vision
import mindspore.dataset.vision as vision

def get_dataset():
    image_folder_dataset_dir = "./train"
    mapping = {}
    for i in range(10):
        mapping[str(i)] = int(i)

    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    transforms_list = [vision.RandomCropDecodeResize(size=224,
                                                    scale=(0.08, 1.0),
                                                    ratio=(0.75, 1.333)),
                    vision.RandomHorizontalFlip(0.75),
                    vision.RandomVerticalFlip(0.75),
                    vision.Normalize(mean=mean, std=std),
                    vision.HWC2CHW(),
                    ]

    dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
                                    shuffle=True,
                                    num_parallel_workers=8,
                                    class_indexing=mapping
                                    )

    dataset = dataset.map(operations=transforms_list)

    dataset = dataset.batch(32, drop_remainder=True, num_parallel_workers=8)

    train_dataset, val_dataset = dataset.split([0.9, 0.1])
    
    return train_dataset, val_dataset

建立网络，参考官方的ViT模型：
[Vision Transformer图像分类](https://mindspore.cn/tutorials/application/zh-CN/r1.9/cv/vit.html)

In [20]:
from mindspore import nn, ops
import mindspore as ms
from typing import Optional
from mindspore.common.initializer import Normal
from mindspore.common.initializer import initializer
from mindspore import Parameter

def init(init_type, shape, dtype, name, requires_grad):
    """Init."""
    initial = initializer(init_type, shape, dtype).init_data()
    return Parameter(initial, name=name, requires_grad=requires_grad)

class ViT(nn.Cell):
    def __init__(self,
                 image_size: int = 224,
                 input_channels: int = 3,
                 patch_size: int = 16,
                 embed_dim: int = 768,
                 num_layers: int = 12,
                 num_heads: int = 12,
                 mlp_dim: int = 3072,
                 keep_prob: float = 1.0,
                 attention_keep_prob: float = 1.0,
                 drop_path_keep_prob: float = 1.0,
                 activation: nn.Cell = nn.GELU,
                 norm: Optional[nn.Cell] = nn.LayerNorm,
                 pool: str = 'cls',
                 num_classes=1000) -> None:
        super(ViT, self).__init__()

        self.patch_embedding = PatchEmbedding(image_size=image_size,
                                              patch_size=patch_size,
                                              embed_dim=embed_dim,
                                              input_channels=input_channels)
        num_patches = self.patch_embedding.num_patches

        # 此处增加class_embedding和pos_embedding，如果不是进行分类任务
        # 可以只增加pos_embedding，通过pool参数进行控制
        self.cls_token = init(init_type=Normal(sigma=1.0),
                              shape=(1, 1, embed_dim),
                              dtype=ms.float32,
                              name='cls',
                              requires_grad=True)

        # pos_embedding也是一组可以学习的参数，会被加入到经过处理的patch矩阵中
        self.pos_embedding = init(init_type=Normal(sigma=1.0),
                                  shape=(1, num_patches + 1, embed_dim),
                                  dtype=ms.float32,
                                  name='pos_embedding',
                                  requires_grad=True)

        # axis=1定义了会在向量的开头加入class_embedding
        self.concat = ops.Concat(axis=1)

        self.pool = pool
        self.pos_dropout = nn.Dropout(keep_prob)
        self.norm = norm((embed_dim,))
        self.tile = ops.Tile()
        self.transformer = TransformerEncoder(dim=embed_dim,
                                              num_layers=num_layers,
                                              num_heads=num_heads,
                                              mlp_dim=mlp_dim,
                                              keep_prob=keep_prob,
                                              attention_keep_prob=attention_keep_prob,
                                              drop_path_keep_prob=drop_path_keep_prob,
                                              activation=activation,
                                              norm=norm)
        self.dropout = nn.Dropout(keep_prob)
        self.dense = nn.Dense(embed_dim, num_classes)

    def construct(self, x):
        """ViT construct."""
        x = self.patch_embedding(x)

        # class_embedding主要借鉴了BERT模型的用于文本分类时的思想
        # 在每一个word vector之前增加一个类别值，通常是加在向量的第一位
        cls_tokens = self.tile(self.cls_token, (x.shape[0], 1, 1))
        x = self.concat((cls_tokens, x))
        x += self.pos_embedding

        x = self.pos_dropout(x)
        x = self.transformer(x)
        x = self.norm(x)

        # 增加的class_embedding是一个可以学习的参数，经过网络的不断训练
        # 最终以输出向量的第一个维度的输出来决定最后的输出类别；
        x = x[:, 0]

        if self.training:
            x = self.dropout(x)
        x = self.dense(x)
        return x

class Attention(nn.Cell):
    def __init__(self,
                 dim: int,
                 num_heads: int = 8,
                 keep_prob: float = 1.0,
                 attention_keep_prob: float = 1.0):
        super(Attention, self).__init__()

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = ms.Tensor(head_dim ** -0.5)

        self.qkv = nn.Dense(dim, dim * 3)
        self.attn_drop = nn.Dropout(attention_keep_prob)
        self.out = nn.Dense(dim, dim)
        self.out_drop = nn.Dropout(keep_prob)

        self.mul = ops.Mul()
        self.reshape = ops.Reshape()
        self.transpose = ops.Transpose()
        self.unstack = ops.Unstack(axis=0)
        self.attn_matmul_v = ops.BatchMatMul()
        self.q_matmul_k = ops.BatchMatMul(transpose_b=True)
        self.softmax = nn.Softmax(axis=-1)

    def construct(self, x):
        """Attention construct."""
        b, n, c = x.shape

        # 最初的输入向量首先会经过Embedding层映射成Q(Query)，K(Key)，V(Value)三个向量
        # 由于是并行操作，所以代码中是映射成为dim*3的向量然后进行分割
        qkv = self.qkv(x)

        #多头注意力机制就是将原本self-Attention处理的向量分割为多个Head进行处理
        qkv = self.reshape(qkv, (b, n, 3, self.num_heads, c // self.num_heads))
        qkv = self.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = self.unstack(qkv)

        # 自注意力机制的自注意主要体现在它的Q，K，V都来源于其自身
        # 也就是该过程是在提取输入的不同顺序的向量的联系与特征
        # 最终通过不同顺序向量之间的联系紧密性（Q与K乘积经过Softmax的结果）来表现出来
        attn = self.q_matmul_k(q, k)
        attn = self.mul(attn, self.scale)
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)

        # 其最终输出则是通过V这个映射后的向量与QK经过Softmax结果进行weight sum获得
        # 这个过程可以理解为在全局上进行自注意表示
        out = self.attn_matmul_v(attn, v)
        out = self.transpose(out, (0, 2, 1, 3))
        out = self.reshape(out, (b, n, c))
        out = self.out(out)
        out = self.out_drop(out)
        

        return out

class FeedForward(nn.Cell):
    def __init__(self,
                 in_features: int,
                 hidden_features: Optional[int] = None,
                 out_features: Optional[int] = None,
                 activation: nn.Cell = nn.GELU,
                 keep_prob: float = 1.0):
        super(FeedForward, self).__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.dense1 = nn.Dense(in_features, hidden_features)
        self.activation = activation()
        self.dense2 = nn.Dense(hidden_features, out_features)
        self.dropout = nn.Dropout(keep_prob)

    def construct(self, x):
        """Feed Forward construct."""
        x = self.dense1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.dense2(x)
        x = self.dropout(x)

        return x


class ResidualCell(nn.Cell):
    def __init__(self, cell):
        super(ResidualCell, self).__init__()
        self.cell = cell

    def construct(self, x):
        """ResidualCell construct."""
        return self.cell(x) + x


class TransformerEncoder(nn.Cell):
    def __init__(self,
                 dim: int,
                 num_layers: int,
                 num_heads: int,
                 mlp_dim: int,
                 keep_prob: float = 1.,
                 attention_keep_prob: float = 1.0,
                 drop_path_keep_prob: float = 1.0,
                 activation: nn.Cell = nn.GELU,
                 norm: nn.Cell = nn.LayerNorm):
        super(TransformerEncoder, self).__init__()
        layers = []

        # 从vit_architecture图可以发现，多个子encoder的堆叠就完成了模型编码器的构建
        # 在ViT模型中，依然沿用这个思路，通过配置超参数num_layers，就可以确定堆叠层数
        for _ in range(num_layers):
            normalization1 = norm((dim,))
            normalization2 = norm((dim,))
            attention = Attention(dim=dim,
                                  num_heads=num_heads,
                                  keep_prob=keep_prob,
                                  attention_keep_prob=attention_keep_prob)

            feedforward = FeedForward(in_features=dim,
                                      hidden_features=mlp_dim,
                                      activation=activation,
                                      keep_prob=keep_prob)

            # ViT模型中的基础结构与标准Transformer有所不同
            # 主要在于Normalization的位置是放在Self-Attention和Feed Forward之前
            # 其他结构如Residual Connection，Feed Forward，Normalization都如Transformer中所设计
            layers.append(
                nn.SequentialCell([
                    # Residual Connection，Normalization的结构可以保证模型有很强的扩展性
                    # 保证信息经过深层处理不会出现退化的现象，这是Residual Connection的作用
                    # Normalization和dropout的应用可以增强模型泛化能力
                    ResidualCell(nn.SequentialCell([normalization1,
                                                    attention])),

                    ResidualCell(nn.SequentialCell([normalization2,
                                                    feedforward]))
                ])
            )
        self.layers = nn.SequentialCell(layers)

    def construct(self, x):
        """Transformer construct."""
        return self.layers(x)


class PatchEmbedding(nn.Cell):
    MIN_NUM_PATCHES = 4
    def __init__(self,
                 image_size: int = 224,
                 patch_size: int = 16,
                 embed_dim: int = 768,
                 input_channels: int = 3):
        super(PatchEmbedding, self).__init__()

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2

        # 通过将输入图像在每个channel上划分为16*16个patch
        self.conv = nn.Conv2d(input_channels, embed_dim, kernel_size=patch_size, stride=patch_size, has_bias=True)
        self.reshape = ops.Reshape()
        self.transpose = ops.Transpose()

    def construct(self, x):
        """Path Embedding construct."""
        x = self.conv(x)
        b, c, h, w = x.shape

        # 再将每一个patch的矩阵拉伸成为一个1维向量，从而获得了近似词向量堆叠的效果；
        x = self.reshape(x, (b, c, h * w))
        x = self.transpose(x, (0, 2, 1))

        return x

定义损失函数

In [21]:
from mindspore.nn import LossBase
from mindspore import nn, ops
import mindspore as ms
from mindspore.common.initializer import One

class CrossEntropySmooth(LossBase):
    """CrossEntropy."""

    def __init__(self, sparse=True, reduction='mean', smooth_factor=0., num_classes=1000):
        super(CrossEntropySmooth, self).__init__()
        self.onehot = ops.OneHot()
        self.sparse = sparse
        self.on_value = ms.Tensor(1.0 - smooth_factor, ms.float32)
        self.off_value = ms.Tensor(1.0 * smooth_factor / (num_classes - 1), ms.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction)

    def construct(self, logit, label):
        if self.sparse:
            label = self.onehot(label, ops.shape(logit)[1], self.on_value, self.off_value)
        loss = self.ce(logit, label)
        return loss

logits = ms.Tensor(shape = (32, 10), dtype=ms.float32, init=One())
label = ms.Tensor(shape = (32,), dtype=ms.int32, init=One())
network_loss = CrossEntropySmooth(sparse=True,
                                  reduction="mean",
                                  smooth_factor=0.1,
                                  num_classes=10)
loss = network_loss(logits, label)

定义网络

In [None]:
import mindspore as ms
# from vit import ViT
from mindspore.train.callback import LossMonitor, TimeMonitor, CheckpointConfig, ModelCheckpoint
from mindspore import nn, context
# from mindvision.engine.callback import ValAccMonitor


class CustomWithLossCell(nn.Cell):
    """连接前向网络和损失函数"""

    def __init__(self, backbone, loss_fn):
        """前向网络backbone和损失函数loss_fn"""
        super(CustomWithLossCell, self).__init__(auto_prefix=False)
        self._backbone = backbone
        self._loss_fn = loss_fn

    def construct(self, data, label):
        output = self._backbone(data)                 # 前向计算得到网络输出
        return self._loss_fn(output, label)  # 得到多标签损失值

class CustomWithEvalCell(nn.Cell):
    """自定义多标签评估网络"""

    def __init__(self, network):
        super(CustomWithEvalCell, self).__init__(auto_prefix=False)
        self.network = network

    def construct(self, data, label):
        output = self.network(data)
        return output, label

开始训练

In [22]:
ms.set_context(mode=ms.GRAPH_MODE,device_target="Ascend")
train_data, val_data = get_dataset()

network = ViT()
vit_path = "./ckpt/vit_b_16_224.ckpt"
param_dict = ms.load_checkpoint(vit_path)
ms.load_param_into_net(network, param_dict)
network.dense = nn.Dense(768, 10)

epoch_size = 50
momentum = 0.9
num_classes = 10
step_size = train_data.get_dataset_size()

lr = nn.cosine_decay_lr(min_lr=float(0),
                        max_lr=0.00005,
                        total_step=epoch_size * step_size,
                        step_per_epoch=step_size,
                        decay_epoch=10)
network_opt = nn.Adam(network.trainable_params(), lr, momentum)
network_loss = CrossEntropySmooth(sparse=True,
                                    reduction="mean",
                                    smooth_factor=0.1,
                                    num_classes=num_classes)

loss_net = CustomWithLossCell(network, network_loss)
eval_net = CustomWithEvalCell(network)
    
ckpt_config = CheckpointConfig(save_checkpoint_steps=3 * step_size, keep_checkpoint_max=100)
ckpt_callback = ModelCheckpoint(prefix='vit_b_16', directory='./ViT1', config=ckpt_config)
ascend_target = (ms.get_context("device_target") == "Ascend")
if ascend_target:
    model = ms.Model(loss_net, optimizer=network_opt, eval_network=eval_net, metrics={"acc"}, amp_level="O2")
else:
    model = ms.Model(loss_net, optimizer=network_opt, eval_network=eval_net, metrics={"acc"}, amp_level="O0")
model.train(epoch_size,
            train_data,
            callbacks=[ckpt_callback, LossMonitor(step_size), TimeMonitor(step_size)]
            )
result = model.eval(val_data)
print(result)



epoch: 1 step: 126, loss is 0.7470703125
epoch time: 80074.762 ms, per step time: 635.514 ms
epoch: 2 step: 126, loss is 0.63134765625
epoch time: 13280.406 ms, per step time: 105.400 ms
epoch: 3 step: 126, loss is 0.69091796875
epoch time: 17897.302 ms, per step time: 142.042 ms
epoch: 4 step: 126, loss is 0.5615234375
epoch time: 13280.550 ms, per step time: 105.401 ms
epoch: 5 step: 126, loss is 0.5830078125
epoch time: 13275.676 ms, per step time: 105.363 ms
epoch: 6 step: 126, loss is 0.5947265625
epoch time: 17813.303 ms, per step time: 141.375 ms
epoch: 7 step: 126, loss is 0.5986328125
epoch time: 13291.845 ms, per step time: 105.491 ms
epoch: 8 step: 126, loss is 0.58349609375
epoch time: 13284.360 ms, per step time: 105.431 ms
epoch: 9 step: 126, loss is 0.5888671875
epoch time: 17414.108 ms, per step time: 138.207 ms
epoch: 10 step: 126, loss is 0.55615234375
epoch time: 13276.757 ms, per step time: 105.371 ms
epoch: 11 step: 126, loss is 0.5625
epoch time: 13272.435 ms, per

In [7]:
%pwd

'/home/ma-user/work/mindcon_food_classification-main-99dad61560528d4d507df048402405d3db6bed18'

开始预测

In [24]:
from PIL import Image
import os
import numpy as np
import json

import mindspore as ms
from mindspore import Tensor

# from vit import ViT

def process_image(path):

    image_list = []

    mean = np.array([0.485 * 255, 0.456 * 255, 0.406 * 255])
    std = np.array([0.229 * 255, 0.224 * 255, 0.225 * 255])

    files = os.listdir(path)
    files = sorted(files, key=lambda x:(x.split('.')[0]))
    
    for file in files:
        image = Image.open(path + file).convert("RGB")
        image = image.resize((224, 224))
        # plt.imshow(image)
        image = (image - mean) / std
        image = image.astype(np.float32)
        image = np.transpose(image, (2, 0, 1))
        image = np.expand_dims(image, axis=0)
        image_list.append(image)
    return image_list




ms.set_context(device_target="CPU")
# ms.set_context(device_target="Ascend")
images = process_image("./test/")

network = ViT(num_classes=10)
vit_path = "./ViT1/vit_b_16-48_126.ckpt"
param_dict = ms.load_checkpoint(vit_path)
ms.load_param_into_net(network, param_dict)

model = ms.Model(network)

results = []
for image in images:
    pre = model.predict(Tensor(image,ms.float32))
    result = np.argmax(pre)
    results.append(result)
    print(result)
        
with open("food0113.txt","w") as f:
    for result in results:
        f.write(str(result))
        f.write('\n')
print(results)


7
1
8
9
5
1
7
2
2
4
2
1
3
7
7
8
4
2
0
1
1
4
0
4
6
8
1
4
7
5
2
0
2
7
3
9
6
0
5
7
6
3
4
0
1
1
4
8
9
8
1
3
8
2
5
2
2
5
8
8
0
5
2
9
6
6
8
0
4
0
5
2
7
2
2
2
7
0
3
6
6
8
3
1
9
5
3
4
0
3
3
4
9
1
2
1
7
1
3
8
1
9
7
1
9
0
7
1
6
4
5
1
0
6
8
2
6
0
2
5
9
4
5
4
1
5
9
8
2
4
0
4
7
9
5
2
6
9
2
5
9
2
1
0
6
0
2
7
0
1
2
0
2
1
0
5
6
0
1
0
9
7
2
3
3
0
4
3
4
4
7
1
8
6
0
9
9
2
4
1
5
0
9
7
8
5
4
4
5
4
1
6
6
9
9
8
5
5
6
6
3
5
3
9
0
0
8
2
3
3
8
4
3
4
6
8
8
7
5
4
1
7
9
5
5
5
4
8
5
2
0
8
2
0
4
7
7
9
0
8
8
8
7
9
2
3
0
9
3
1
9
3
5
0
3
0
9
8
7
6
3
1
3
3
4
2
3
6
0
2
8
6
5
7
7
9
5
8
4
1
9
5
4
9
8
7
3
5
6
6
4
8
4
9
5
0
0
3
2
6
5
2
9
4
0
1
3
0
8
6
7
8
9
4
1
2
5
9
5
1
6
7
8
0
6
8
6
1
6
1
6
7
6
2
6
9
8
1
2
2
8
4
5
4
0
4
7
0
3
6
1
7
9
9
0
3
0
7
5
3
2
8
6
8
8
6
8
6
6
2
3
1
9
0
9
3
5
7
1
4
1
3
7
6
2
9
7
1
2
4
3
0
8
1
0
4
3
7
2
2
2
4
0
2
4
5
0
2
6
2
5
0
0
5
3
1
8
5
3
8
3
7
0
9
6
5
2
0
5
7
7
9
2
8
3
6
9
4
0
6
9
6
9
6
4
8
4
6
7
6
9
5
3
7
9
7
4
8
3
9
3
1
5
2
9
4
8
3
3
1
5
5
6
7
4
8
7
8
4
3
1
8
5
2
5
5
7
6
4
1
9
7
8
9
3
6
5
9
7
1
