# ShuffleNetV1

首先需要了解分组卷积的概念

## 基本单元如下

![unit](shuffnetv1_unit.png)

## 结构如下

repeat为重复次数,g列对应的输入,表示group取对应值的时候,该层输出可能会有变化.而对于瓶颈层，将通道设为每个ShuffleNet单元输出通道的1/4

![unit](shuffnetv1_stru.png)

In [1]:
%matplotlib inline
import torch
from torch import nn
from torch.nn import functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
from tqdm import tqdm
import os
print(torch.__version__)

1.10.2


In [2]:

class Bottleneck(nn.Module):
    def __init__(self, in_channels, out_channels, stride, groups):
        super().__init__()
        # 瓶颈层输出为每个单元输出的1/4
        mid_channles = int(out_channels/4)
        # 作者提到不在stage2的第一个pointwise层使用组卷积,因为输入channel数量太少,只有24
        # 所以第一个stage特殊处理
        if in_channels <=24:
            self.groups = 1
        else:
            self.groups = groups
        self.stride = stride
        # 分组1x1卷积,特征图深度变为本单元输出的1/4
        self.conv1 = nn.Sequential(
            # 第一个stage的第一个1x1可能会不用组卷积,所以特殊处理下
            nn.Conv2d(in_channels, mid_channles, 1, groups=self.groups, bias=False),
            nn.BatchNorm2d(mid_channles),
            nn.ReLU(inplace=True)
        )
        # 跨距为2时,缩小特征图wh,不改变深度,后接bn无relu
        self.conv2 = nn.Sequential(
            nn.Conv2d(mid_channles, mid_channles, 3, stride=stride, padding=1, groups=mid_channles, bias=False),
            nn.BatchNorm2d(mid_channles)
        )
        # 分组1x1卷积,不改变尺寸,但是改变深度为输出深度,后接bn无relu
        self.conv3 = nn.Sequential(
            nn.Conv2d(mid_channles, out_channels, 1, groups=groups, bias=False),
            nn.BatchNorm2d(out_channels)
        )
        if(self.stride==2):
            # 特征图尺寸减为一半,和跨距为2的效果相同
            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))

    def forward(self, x):
        out = self.conv1(x)
        # shuffle单元只对第一个1x1分组卷积之后进行shuffle,作者说第二个1x1卷积之后再做shuffle效果一般
        out = Bottleneck.shuffle(out, self.groups)
        out = self.conv2(out)
        out = self.conv3(out)
        if self.stride == 2:
            # 堆叠后,输出深度为out_channel+in_channel,外层需注意
            res = self.shortcut(x)
            out = F.relu(torch.cat([out, res], 1))
        else:
            # 都是对最后的输出加上relu
            out = F.relu(out+x)
        return out
    @staticmethod
    def shuffle(x, groups):
        N, C, H, W = x.size()
        out = x.view(N, groups, C // groups, H, W).permute(0, 2, 1, 3, 4).contiguous().view(N, C, H, W)
        return out
    
class ShuffleNet(nn.Module):
    channel_num=(
        # 有关各种group对应每层的输出参数
        # group的取值为1,2,3,4,8
        (144,288,576),
        (200,400,800),
        (240,480,960),
        (272,544,1088),
        (),
        (),
        (),
        (384,768,1536),
    )
    groups_area=(1,2,3,4,8)
    def __init__(self, groups, first_channel=3,class_num=10):
        super().__init__()
        self.class_num=class_num
        if not groups in self.groups_area:
            raise ValueError("groups value %s error"%groups)
        self.groups=groups
        self.channel_stru=self.channel_num[self.groups]
        # in 224x224x3
        self.conv1 = nn.Sequential(
            nn.Conv2d(first_channel, 24, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(24),
            nn.ReLU(inplace=True)
        )
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # in 56x56xx24
        self.stage2 = self.make_layers(24, self.channel_stru[0], repeat_times=4, strides=2, groups=groups)
        self.stage3 = self.make_layers(self.channel_stru[0], self.channel_stru[1], repeat_times=8, strides=2, groups=groups)
        self.stage4 = self.make_layers(self.channel_stru[1], self.channel_stru[2], repeat_times=4, strides=2, groups=groups)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(self.channel_stru[2],self.class_num)
    def make_layers(self, input_channels, output_channels, repeat_times, strides, groups):
        layers = []
        if strides!=2:
            raise ValueError("first head conv strides must 2",strides)
        # 第一层跨距为2,肯定会进行堆叠,所以输出层输写成 实际输出-输入层数
        layers.append(Bottleneck(input_channels, output_channels - input_channels, strides, groups))
        input_channels = output_channels
        for i in range(repeat_times - 1):
            Bottleneck(input_channels, output_channels, 1, groups)
        return nn.Sequential(*layers)
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.avgpool(x)
        # 这之后还是ncwh四维的,不过是(n,1024,1,1)这种
        # flatten(1),从第1个维度开始到一直展平到最后一个维度
        x = x.flatten(1)
        x = self.fc(x)
        return x

In [3]:
net=ShuffleNet(groups=2)
data=torch.randn(4,3,224,224)
out=net(data)
out.shape

torch.Size([4, 10])

In [4]:
device='cuda' if torch.cuda.is_available() else "cpu"
net.to(device)

ShuffleNet(
  (conv1): Sequential(
    (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (stage2): Sequential(
    (0): Bottleneck(
      (conv1): Sequential(
        (0): Conv2d(24, 54, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv2d(54, 54, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=54, bias=False)
        (1): BatchNorm2d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (conv3): Sequential(
        (0): Conv2d(54, 216, kernel_size=(1, 1), stride=(1, 1), groups=2, bias=False)
        (1): BatchNorm2d(216, eps=1e-05, moment

In [5]:
# 注意torch里是宽高顺序,和PIL不同
img_size=(224,224)
# mean和std均值需要按照数据集来修改
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
batch=64
lr=0.01

In [6]:
#prepare dataset and preprocessing
transform_train = transforms.Compose([
    transforms.Resize(img_size),
    transforms.RandomCrop(img_size, padding=4),
    transforms.RandomHorizontalFlip(),
    # 至少要加上下面这句ToTensor
    transforms.ToTensor(),
    # ciaf10固有均值标准差
    transforms.Normalize(mean,std)
])

transform_val = transforms.Compose([
    transforms.Resize(img_size),
    # 至少要加上下面这句ToTensor
    transforms.ToTensor(),
    transforms.Normalize(mean,std)
])

In [7]:
# animals10数据集
# 正常来说,train_set这个类是需要自己定义的,但是在官方数据集中已经给定义好了
data_dir="D:/data/image/"
if not os.path.exists(data_dir):
    # 尝试mac的文件夹
    data_dir="~/data"
    if not os.path.exists(data_dir):
            raise FileExistsError("data source not exist!")
train_set=datasets.ImageFolder(root=data_dir+'animals10/train',
                        transform=transform_train)

val_set=datasets.ImageFolder(root=data_dir+'animals10/val',
                        transform=transform_val)
train_set_len=len(train_set)
val_set_len=len(val_set)
print('train data',train_set_len)
print('val data',val_set_len)
print('train label',train_set.class_to_idx)
print('val label',val_set.class_to_idx)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch,
                                         shuffle=True, num_workers=6)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch,
                                         shuffle=False, num_workers=6)


train data 23556
val data 2623
train label {'cane': 0, 'cavallo': 1, 'elefante': 2, 'farfalla': 3, 'gallina': 4, 'gatto': 5, 'mucca': 6, 'pecora': 7, 'ragno': 8, 'scoiattolo': 9}
val label {'cane': 0, 'cavallo': 1, 'elefante': 2, 'farfalla': 3, 'gallina': 4, 'gatto': 5, 'mucca': 6, 'pecora': 7, 'ragno': 8, 'scoiattolo': 9}


In [8]:
class CalcLoss(nn.Module):
    def __init__(self,num_classes=10):
        super().__init__()
        self.criterion=nn.CrossEntropyLoss()
    def forward(self,y_true,y_pred):
        return self.criterion(y_true,y_pred)
    
class TrainModel(object):
    _defaults={
        "eopch":2,
    }
    def __init__(self,net,loss,train_dataloder,optimizer,**kwargs):
        '''
        还支持传入字典参数
        '''
        self.__dict__.update(self._defaults)
        self.net=net
        self.loss=loss
        self.dataloder=train_dataloder
        self.optimizer=optimizer
        for name, value in kwargs.items():
            setattr(self, name, value)
        self.device='cuda' if torch.cuda.is_available() else "cpu"
    def __call__(self,train_total_len,batch_size):
        self.net.train()
        # 需要注意这里写的是train_set的长度,如果写错成train_loader,返回的是数据集一共有多少个batch
        with tqdm(total=train_total_len,desc=f'Train:') as pbar:
            for idx,data in enumerate(self.dataloder):
                data,label=data
                data,label=data.to(self.device),label.to(self.device)
                self.optimizer.zero_grad()
                # forward
                outputs=self.net(data)
                loss=self.loss(outputs,label)
                loss.backward()
                self.optimizer.step()
                # 更新进度条
                pbar.update(batch_size)

class TestModel(object):
    _defaults={
    "eopch":1,
    }
    def __init__(self,net,loss,val_dataloder,**kwargs):
        '''
        还支持传入字典参数
        '''
        self.__dict__.update(self._defaults)
        self.net=net
        self.loss=loss
        self.dataloder=val_dataloder
        for name, value in kwargs.items():
            setattr(self, name, value)
        self.device='cuda' if torch.cuda.is_available() else "cpu"
    def __call__(self,total_num,batch_size):
        # eval 模式下,dropout失效,bn层参数采用之前训练的,不更新
        self.net.eval()
        val_loss=0
        correct=0
        with tqdm(total=total_num,desc=f'Validation:') as pbar:
            # 不计算损失,这样速度更快
            with torch.no_grad():
                for idx,data in enumerate(self.dataloder):
                    x,y=data
                    x,y=x.to(self.device),y.to(self.device)
                    y_pre=self.net(x)
                    val_loss+=self.loss(y_pre,y).item()
                    # max 第一个返回的是元素值,第二个为索引值
                    # 求第一个维度的max,因此结果返回的是batch维度的max
                    # 返回是一个第一个元素为值,第二个元素为idx的tuple
                    pred=torch.max(y_pre,dim=1)[1]

                    # pred维度为batch,每个元素为索引
                    correct+=pred.eq(y).sum().item()
                    # 更新进度条
                    pbar.update(batch_size)
            # 格式化打印直接有% 带f%这种{:.2f%}是错的格式,format这种有点坑
            print("test loss {},accuracy {:.2%}".format(val_loss,correct/total_num))
        # 返回损失和准确率
        return (val_loss,correct/total_num)

In [9]:
optimizer=optim.SGD(net.parameters(),lr=lr,momentum=0.9)
loss=CalcLoss().to(device)
train=TrainModel(net,loss,train_loader,optimizer)
test=TestModel(net,loss,val_loader)

In [11]:
total_epoch=10
for i in range(total_epoch):
    train(train_set_len,batch)
    test(val_set_len,batch)

Train:: 23616it [00:26, 884.62it/s]                                                                                                
Validation:: 2624it [00:09, 285.04it/s]                                                                                            


test loss 37.586592614650726,accuracy 70.68%


Train:: 23616it [00:26, 883.54it/s]                                                                                                
Validation:: 2624it [00:09, 284.59it/s]                                                                                            


test loss 36.34435893595219,accuracy 70.45%


Train:: 23616it [00:26, 879.93it/s]                                                                                                
Validation:: 2624it [00:09, 283.61it/s]                                                                                            


test loss 34.46480464935303,accuracy 71.83%


Train:: 23616it [00:26, 878.31it/s]                                                                                                
Validation:: 2624it [00:09, 272.24it/s]                                                                                            


test loss 35.90738967806101,accuracy 71.56%


Train:: 23616it [00:27, 862.26it/s]                                                                                                
Validation:: 2624it [00:09, 283.00it/s]                                                                                            


test loss 36.119284614920616,accuracy 71.25%


Train:: 23616it [00:27, 861.39it/s]                                                                                                
Validation:: 2624it [00:09, 283.88it/s]                                                                                            


test loss 34.268888011574745,accuracy 72.93%


Train:: 23616it [00:27, 864.76it/s]                                                                                                
Validation:: 2624it [00:09, 283.28it/s]                                                                                            


test loss 32.148918479681015,accuracy 75.14%


Train:: 23616it [00:27, 859.65it/s]                                                                                                
Validation:: 2624it [00:09, 281.91it/s]                                                                                            


test loss 31.773900374770164,accuracy 75.22%


Train:: 23616it [00:27, 856.01it/s]                                                                                                
Validation:: 2624it [00:09, 285.27it/s]                                                                                            


test loss 32.455738842487335,accuracy 74.53%


Train:: 23616it [00:27, 867.82it/s]                                                                                                
Validation:: 2624it [00:09, 282.70it/s]                                                                                            

test loss 31.127081111073494,accuracy 75.68%





In [12]:
torch.save(net.state_dict(),"shufflenetv1_loss31_accuracy75.pth")

## 大小

这个模型才400KB,可以说是非常小了!