## 9.12 实战Kaggle比赛：图像分类（CIFAR-10）
该比赛的网页地址是 https://www.kaggle.com/c/cifar-10

In [1]:
import os
import d2lzh as d2l
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import os
import pandas as pd
import shutil
import time
import numpy as np

### 9.12.1 获取和整理数据集
#### 9.12.1.1 下载数据集
#### 9.12.1.2 解压数据集

下载完训练数据集train.7z和测试数据集test.7z后需要解压缩。解压缩后，将训练数据集、测试数据集以及训练数据集标签分别存放在以下3个路径：

* data/kaggle_cifar10/train/[1-50000].png；
* data/kaggle_cifar10/test/[1-300000].png；
* data/kaggle_cifar10/trainLabels.csv

#### 9.12.1.3 整理数据集
我们需要整理数据集，以方便训练和测试模型。下面用pandas读取训练数据集的标签文件trainLabels.csv和提交样例文件sampleSubmission.csv，并给每个id加上“.png”后缀，使其与图像文件名一致。验证集样本数与原始训练集样本数之比由后面ImageDataGenerator的validation_split参数指定。

In [2]:
# Read the training labels and the sample submission, then turn every numeric
# id into the matching image file name ("<id>.png").
train_data = pd.read_csv('data/kaggle_cifar10/trainLabels.csv')
test_data = pd.read_csv('data/kaggle_cifar10/sampleSubmission.csv')
train_data['id'] = train_data['id'].astype(str) + '.png'
test_data['id'] = test_data['id'].astype(str) + '.png'
# Preview the first four labelled rows.
train_data.head(4)

Unnamed: 0,id,label
0,1.png,frog
1,2.png,truck
2,3.png,truck
3,4.png,deer


我们在这里使用Kaggle比赛的完整数据集：训练数据集和测试数据集的文件夹名称分别为train和test，批量大小batch_size设为128。如果只想快速跑通流程，可以改用少量样本，并相应地减小批量大小。我们将10%的训练样本作为调参使用的验证集。

In [3]:
# Names of the image sub-directories and the mini-batch size.
train_dir = 'train'
test_dir = 'test'
batch_size = 128
# Root folder of the competition data and the name of the label file.
data_dir = 'data/kaggle_cifar10'
label_file = 'trainLabels.csv'
# NOTE: the reorg_cifar10_data(...) directory-reorganisation step is disabled
# in this notebook, so input_dir and valid_ratio are not defined here.

### 9.12.2 图像增广

In [4]:
# Training pipeline: feature-wise normalisation plus random zoom and horizontal
# flip; 10% of the rows are reserved for the 'validation' subset.
transform_train = keras.preprocessing.image.ImageDataGenerator(
    validation_split=0.1,
    horizontal_flip=True,
    zoom_range=(0.64, 1),
    featurewise_std_normalization=True,
    featurewise_center=True,
)

# Test pipeline: the same normalisation, no augmentation.
transform_test = keras.preprocessing.image.ImageDataGenerator(
    featurewise_std_normalization=True,
    featurewise_center=True,
)

# The normalisation statistics are assigned by hand instead of being computed
# with fit(); with both set to 127.5, pixels in 0..255 are expected to end up
# in roughly [-1, 1].
transform_train.mean = transform_train.std = 127.5
transform_test.mean = transform_test.std = 127.5

### 9.12.3 读取数据集

In [5]:
# Image folders are derived from the names set in the configuration cell
# instead of repeating the literals here.
train_image_dir = os.path.join(data_dir, train_dir)
test_image_dir = os.path.join(data_dir, test_dir)

# Validation images must not be randomly zoomed or flipped, otherwise the
# validation metrics are measured on distorted data. They therefore get their
# own generator: same normalisation as transform_train, no augmentation.
# validation_split has to equal the value given to transform_train so that the
# 'training' and 'validation' subsets stay disjoint.
transform_valid = keras.preprocessing.image.ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    validation_split=0.1
)
transform_valid.mean = transform_train.mean
transform_valid.std = transform_train.std

# Augmented, shuffled batches of the training subset with one-hot labels.
train_ds = transform_train.flow_from_dataframe(
    train_data,
    train_image_dir,
    x_col='id', y_col='label',
    class_mode="categorical",
    target_size=(32, 32),
    batch_size=batch_size,
    shuffle=True,
    subset='training')

# Un-augmented, ordered batches of the held-out validation subset.
val_ds = transform_valid.flow_from_dataframe(
    train_data,
    train_image_dir,
    x_col='id', y_col='label',
    class_mode="categorical",
    target_size=(32, 32),
    batch_size=batch_size,
    shuffle=False,
    subset='validation')

# class_mode=None makes the iterator yield image batches only; the 'label'
# column of the sample submission holds placeholder values, not real targets.
# shuffle=False keeps predictions aligned with the rows of test_data.
test_ds = transform_test.flow_from_dataframe(
    test_data,
    test_image_dir,
    x_col='id', y_col='label',
    class_mode=None,
    target_size=(32, 32),
    batch_size=batch_size,
    shuffle=False)

Found 45000 validated image filenames belonging to 10 classes.
Found 5000 validated image filenames belonging to 10 classes.
Found 300000 validated image filenames belonging to 1 classes.


### 9.12.4 定义模型

In [6]:
class Residual(keras.layers.Layer):
    """Residual block: two 3x3 conv + batch-norm stages with a skip connection.

    Args:
        num_channels: number of output channels of every convolution in the block.
        use_1x1conv: when True, the skip path goes through a 1x1 convolution
            (using the same ``strides``) before it is added to the main path.
        strides: stride of the first 3x3 convolution and of the 1x1 skip
            convolution.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super(Residual, self).__init__(**kwargs)
        # Remember the constructor arguments so get_config() can report them.
        self.num_channels = num_channels
        self.use_1x1conv = use_1x1conv
        self.strides = strides
        self.conv1 = keras.layers.Conv2D(num_channels, kernel_size=3, padding='same', strides=strides)
        self.conv2 = keras.layers.Conv2D(num_channels, kernel_size=3, padding='same')
        if use_1x1conv:
            self.conv3 = keras.layers.Conv2D(num_channels, kernel_size=1, strides=strides)
        else:
            self.conv3 = None
        self.bn1 = keras.layers.BatchNormalization()
        self.bn2 = keras.layers.BatchNormalization()
        self.relu1 = keras.layers.ReLU()
        self.relu2 = keras.layers.ReLU()

    def call(self, X):
        """Return relu(main_path(X) + skip(X))."""
        Y = self.relu1(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Explicit None check rather than relying on the truthiness of a layer.
        if self.conv3 is not None:
            X = self.conv3(X)
        return self.relu2(Y + X)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(Residual, self).get_config()
        config.update({
            'num_channels': self.num_channels,
            'use_1x1conv': self.use_1x1conv,
            'strides': self.strides,
        })
        return config

def resnet18(num_classes, input_shape=(32, 32, 3)):
    """Build a ResNet-18 style classifier as a functional Keras model.

    Args:
        num_classes: number of output classes (width of the final Dense layer).
        input_shape: shape of one input image, channels last. Defaults to
            (32, 32, 3), the value that used to be hard-coded.

    Returns:
        A ``keras.Model`` mapping an image batch to softmax class probabilities.
    """
    inputs = keras.Input(input_shape)
    # Stem: a single stride-1 3x3 convolution followed by batch-norm and ReLU.
    x = keras.layers.Conv2D(64, kernel_size=3, strides=1, padding='same')(inputs)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Activation('relu')(x)

    def resnet_block(x, num_channels, num_residuals, first_block=False):
        """Stack ``num_residuals`` Residual layers on top of ``x``.

        Except in the first stage, the first layer of a stage uses stride 2
        and a 1x1 convolution on the skip path.
        """
        for i in range(num_residuals):
            if i == 0 and not first_block:
                x = Residual(num_channels, use_1x1conv=True, strides=2)(x)
            else:
                x = Residual(num_channels)(x)
        return x

    x = resnet_block(x, 64, 2, first_block=True)
    x = resnet_block(x, 128, 2)
    x = resnet_block(x, 256, 2)
    x = resnet_block(x, 512, 2)
    # Head: global average pooling, then a softmax over num_classes.
    x = keras.layers.GlobalAvgPool2D()(x)
    x = keras.layers.Dense(num_classes)(x)
    x = keras.layers.Softmax()(x)
    return keras.Model(inputs, x)

In [7]:
def get_net():
    """Create the 10-class ResNet-18 and compile it for training.

    The model is compiled with the 'adam' optimizer, categorical
    cross-entropy loss and the 'acc' metric.
    """
    model = resnet18(10)
    model.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer='adam',
        metrics=['acc'],
    )
    return model

### 9.12.5 定义训练函数
### 9.12.6 训练并验证模型

In [8]:
# Build the compiled network and print its layer-by-layer summary.
net = get_net()
net.summary()

W0129 23:55:55.609424 140296452523840 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 32, 32, 64)        1792      
_________________________________________________________________
batch_normalization (BatchNo (None, 32, 32, 64)        256       
_________________________________________________________________
activation (Activation)      (None, 32, 32, 64)        0         
_________________________________________________________________
residual (Residual)          (None, 32, 32, 64)        74368     
_________________________________________________________________
residual_1 (Residual)        (None, 32, 32, 64)        74368     
_________________________________________________________________
residual_2 (Residual)        (None, 16, 16, 128)       230784

In [10]:
# Train for five epochs on the training iterator, evaluating on the validation
# iterator after every epoch.
net.fit_generator(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f988f989e80>

### 9.12.7 对测试集分类并在Kaggle提交结果

In [11]:
# Class probabilities predicted for the test iterator.
pred = net.predict_generator(test_ds)
# Index of the most probable class along the last axis.
classes = pred.argmax(axis=-1)

In [12]:
# class_indices maps label name -> class index; invert it once instead of
# rebuilding a list of names for every row.
index_to_label = {index: name for name, index in train_ds.class_indices.items()}

# Drop the '.png' suffix only when it is present, so re-running this cell does
# not keep chopping characters off the ids.
test_data['id'] = test_data['id'].apply(
    lambda name: name[:-4] if name.endswith('.png') else name)
# Replace the placeholder labels with the predicted class names.
test_data['label'] = [str(index_to_label[int(index)]) for index in classes]
test_data.iloc[0:4, :]

Unnamed: 0,id,label
0,1,deer
1,2,airplane
2,3,automobile
3,4,ship


In [13]:
# Write the id/label table to submission.csv without the DataFrame index column.
test_data.to_csv('submission.csv', index=False)