# Image Style Transferの実装

--------------------------------
Gatys, Ecker, and Bethge(2016)の画像スタイル変換を実装する。

【参考文献】  
L. A. Gatys, A. S. Ecker, and M. Bethge,  
Image style transfer using convolutional neural networks,  
In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pages, 2414-2423, 2016.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
CONTENT_FILE = '/home/ishiyama/image_style_transfer/image/input/test_input_01.JPG'
STYLE_FILE = '/home/ishiyama/image_style_transfer/image/style/test_style_01.jpg'

In [3]:
def read_images_as_jpeg(file):
    
    """
    JPEG Image Reader
    
    This function reads the content and style images as JPEG format.
    These image data must be square for now, different height and
    width will be able to supplied for future.
    
    Args:
        file : str. A path of the image file.
    
    Returns:
        A tuple. Each Elements are Tensor object of the read images.
    """
    
    filename_queue = tf.train.string_input_producer([file])
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    image = tf.image.decode_jpeg(value)

    # Convert read image data to RGB representation.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    tf.train.start_queue_runners(sess)
    image_rgb = sess.run(image)
    height, width, channels = image_rgb.shape

    return image_rgb.reshape([1, height, width, channels])

In [4]:
image = read_images_as_jpeg(file=CONTENT_FILE)

# VGGを実装する
------------------------
画像の特徴量を抽出するアルゴリズムにはSimonyan and Zisserman(2015)で提案されたCNN(VGG19)の畳込み層とプーリング層が使われている。  
ここでは、「TensorFlowで学ぶディープラーニング入門」の多層CNNの実装を参考にVGG19を構築する。  

【参考文献】  
K. Simonyan and A. Zisserman, Very Deep Convolutional Networks For Large-Scale Image Recognition, arXiv: 1409.1556v6, 2015  
中井悦司, TensorFlowで学ぶディープラーニング入門〜畳み込みニューラルネットワーク徹底解説, マイナビ出版, 2016

### 畳み込み層を実装する

In [5]:
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data


def calculate_convolutional_layer(x, filter_height, filter_width, output_channels):

    """
    Executeing a convolutional layer task.
    
    Args:
        x                     : An image data.
        filter_height   (int) : A height of each filters.
        filter_width    (int) : A width of each filters.
        output_channels (int) : A number of channels which must be outputed.

    Returns:
        An activation of an convolutional layer.
    """

    if ((not isinstance(filter_height, int))
        or (not isinstance(filter_width, int))
        or (not isinstance(output_channels, int))):
        raise TypeError('"filter_height" and "filter_width" and "output_channels" '
                        'must be integer.')

    input_channels = int(x.shape[-1])
    W = tf.Variable(
        tf.truncated_normal(
            shape=[filter_height,
                   filter_width,
                   input_channels,
                   output_channels],
            stddev=0.1
        )
    )
    h = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
    b = tf.Variable(tf.constant(0.1, shape=[output_channels]))
    convoluted_image = tf.nn.relu(h + b)

    return convoluted_image

In [6]:
x = tf.placeholder(tf.float32, [1, 1200, 1600, 3])
test = calculate_convolutional_layer(
    x=x,
    filter_height=3,
    filter_width=3,
    output_channels=1
)
sess = tf.InteractiveSession()
# tf.Session()だと、sess.runで返ってくる行列の要素がすべて0だった。
# TODO: Sessionメソッド と InteractiveSessionメソッドの違いを調べる
# sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(test, feed_dict={x: image})

array([[[[ 97.69783783],
         [ 41.22996902],
         [ 37.99833679],
         ..., 
         [ 30.52573395],
         [ 31.05334473],
         [ 38.29788589]],

        [[ 81.28762817],
         [ 13.52082539],
         [ 10.77885151],
         ..., 
         [ 14.0103426 ],
         [ 14.65690899],
         [ 23.6209507 ]],

        [[ 78.83480835],
         [  9.98932838],
         [  9.0935688 ],
         ..., 
         [ 14.03048992],
         [ 14.20788956],
         [ 22.81152534]],

        ..., 
        [[ 81.08071899],
         [ 14.54055214],
         [ 18.19096565],
         ..., 
         [ 16.33125496],
         [ 16.89733124],
         [ 30.88201332]],

        [[ 84.92486572],
         [ 18.98960686],
         [ 18.15483093],
         ..., 
         [ 16.37988472],
         [ 16.92608833],
         [ 31.40236473]],

        [[ 20.4477787 ],
         [  0.        ],
         [  0.        ],
         ..., 
         [  0.        ],
         [  0.        ],
         [ 

### Maxプーリング層を実装する

In [7]:
def calculate_max_pooling_layer(x, ksize, strides):

    """Wrapper function of tf.nn.max_pool.
    
    Args:
        x       : A Tensor produced by calculate_convolutional_layer.
        ksize   : A list of ints that has length >= 4. The size of
                  the window for each dimension of the input tensor.
        strides : A list of ints that has length >= 4. The stride
                  of the sliding window for each dimension of the
                  input tensor.
    
    Returns:
        A pooled image.
    """

    pooled_image = tf.nn.max_pool(x, ksize=ksize, strides=strides, padding='SAME')

    return pooled_image

### 畳込みとプーリング処理の途中経過を保持するクラスを実装する

In [12]:
class ConvNetProgressHolder(object):

    """Holder of convoluted images and pooled image.
    
    This class is used like the struct of C language.
    This has no methods.
    
    Attributes:
        input_data (Tensor) : An image that is applied to convolution and pooling.
        conv (list)         : The list of convoluted images, each images are Tensor objects.
        pool (Tensor)       : A image that is pooled after convolutional layer.
    """

    def __init__(self):

        self.input_data = None
        self.conv = []
        self.pool = None


# FILTER_CONF = {
#     'height': 3,
#     'width': 3,
#     'channels': 1,
#     'num': 1
# }

FILTER_CONF = {
    'height': 3,
    'width': 3
}


def apply_vgg_network_unit(x, channels, num_conv):

    """Apply VGG Network From a Convolutional Layer to Max Pooling Layer.

    Table 1 of Simonyan and Zisserman(2015) is separated by 5 parts,
    each parts is from an input data or a pooled data at previous part
    to a maxpool.
    This function provides to apply a that part.
    This will apply recursively.
    
    Args:
        x (Tensor)     : An input data or A Max pooled data returned by this function.
        channels (int) : A number of channels described at Table 1 of
                         Simonyan and Zisserman(2015).
        num_conv (int) : A number of applying covolutional layers.
                         See Simonyan and Zisserman(2015) for detail.

    Returns:
        A ConvNetProgressHolder object.
    """

    if num_conv < 2:
        raise ValueError('num_conv must be >= 2.')

    conv_holder = ConvNetProgressHolder()
    conv_holder.input_data = x

    conv = calculate_convolutional_layer(
        x=conv_holder.input_data,
        filter_height=FILTER_CONF['height'],
        filter_width=FILTER_CONF['width'],
        output_channels=channels
    )
    conv_holder.conv.append(conv)

    for i in range(1, num_conv):
        conv = calculate_convolutional_layer(
            x=conv_holder.conv[i - 1],
            filter_height=FILTER_CONF['height'],
            filter_width=FILTER_CONF['width'],
            output_channels=channels
        )
        conv_holder.conv.append(conv)

    conv_holder.pool = calculate_max_pooling_layer(
        x=conv_holder.conv[i - 1],
        ksize=[1, 1, 1, 1],
        strides=[1, 1, 1, 1]
    )

    return conv_holder


### VGGの畳込みとプーリング層を構築する

VGGの論文に従い、複数回の畳込み処理と1回のMAXプーリング処理を1セットとして、それを5回繰り返す。  
今回実装する画像変換アルゴリズムでは、この処理の途中経過を使うため、使いたい部分をリストで１つにまとめてsess.run()に投げる。

#### ※処理が重たいので実行に注意すること。
実行環境は下記の通り。  
今回はテストで実行するため、チャンネル数を少なくして実行しているが、  
それでも実行するとメモリをすべて使うため、20〜30秒ほど
フリーズしたかのように動作が重くなる。

OS： Linux Mint 17.1 Rebecca  
CPU： Intel(R) Core(TM)2 Duo CPU L9400 @ 1.86GHz  
メモリ: 4GB

In [27]:
x = tf.placeholder(tf.float32, [1, 1200, 1600, 3])
unit1 = apply_vgg_network_unit(x=x, channels=2, num_conv=2)
unit2 = apply_vgg_network_unit(x=unit1.pool, channels=4, num_conv=2)
unit3 = apply_vgg_network_unit(x=unit2.pool, channels=8, num_conv=4)
unit4 = apply_vgg_network_unit(x=unit3.pool, channels=16, num_conv=4)
unit5 = apply_vgg_network_unit(x=unit4.pool, channels=32, num_conv=4)
sess = tf.InteractiveSession()
# sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 使いたい過程をリストでまとめてsess.runに投げると特徴量抽出の
# 途中経過を取り出せる
result_unit2_conv, result_unit5_conv = sess.run(
    [unit2.conv[1], unit5.conv[2]],
    feed_dict={x: image}
)

In [28]:
result_unit2_conv

array([[[[  7.02230096e-01,   0.00000000e+00,   0.00000000e+00,
            1.33927166e-03],
         [  1.18492639e+00,   6.70453548e-01,   0.00000000e+00,
            0.00000000e+00],
         [  1.43476248e-01,   0.00000000e+00,   4.11224872e-01,
            0.00000000e+00],
         ..., 
         [  3.92771602e-01,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00],
         [  1.19159639e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00],
         [  7.82917082e-01,   0.00000000e+00,   0.00000000e+00,
            3.66333574e-01]],

        [[  0.00000000e+00,   0.00000000e+00,   2.52289981e-01,
            1.57277957e-01],
         [  7.93693423e-01,   1.52573013e+00,   6.64244413e-01,
            0.00000000e+00],
         [  5.97493172e-01,   0.00000000e+00,   7.89555132e-01,
            0.00000000e+00],
         ..., 
         [  7.58428931e-01,   2.29582638e-01,   6.53629899e-01,
            0.00000000e+00],
         [  0.00000000e+00,   0.000000

In [29]:
result_unit5_conv

array([[[[ 0.        ,  0.17371504,  0.36011642, ...,  0.28201228,
           0.00475203,  0.206494  ],
         [ 0.        ,  0.24119172,  0.20720905, ...,  0.22061062,
           0.00405187,  0.1887365 ],
         [ 0.12468454,  0.0722487 ,  0.31049809, ...,  0.25071266,
           0.        ,  0.16285834],
         ..., 
         [ 0.17162678,  0.17568502,  0.36769104, ...,  0.312451  ,
           0.        ,  0.11158792],
         [ 0.2111412 ,  0.18682373,  0.14762452, ...,  0.2968089 ,
           0.0769475 ,  0.1175055 ],
         [ 0.46944204,  0.14902915,  0.        , ...,  0.13304842,
           0.        ,  0.        ]],

        [[ 0.        ,  0.        ,  0.41027141, ...,  0.1395594 ,
           0.15535775,  0.24046576],
         [ 0.05219135,  0.        ,  0.26460406, ...,  0.06963778,
           0.1038407 ,  0.23447686],
         [ 0.21321549,  0.        ,  0.17954865, ...,  0.12687416,
           0.09673765,  0.1554819 ],
         ..., 
         [ 0.20133494,  0.      