## 概要
- Poseを使った異常検知をMNISTでテストしてみる

## モジュールのインポート

In [41]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from keras.datasets import mnist
from keras.layers import Input, Dense, Layer
from keras.layers import Lambda, Conv2D, Flatten, MaxPooling2D
from keras.layers import Reshape, Conv2DTranspose, UpSampling2D
from keras.models import Model
from keras import backend as K
#from keras import objectives
from keras.losses import binary_crossentropy
from scipy.stats import norm
from sklearn.model_selection import train_test_split

from glob import glob
from multiprocessing import Pool
import multiprocessing as mp
from tqdm import tqdm_notebook as tqdm
from time import time

## データの前処理

In [19]:
data_path = sorted(glob("../output/AbemaTV_Tournament_Final1/*"))
data_path += sorted(glob("../output/AbemaTV_Tournament_Final2/*"))
data_path += sorted(glob("../output/AbemaTV_Tournament_Final3/*"))
data_path = data_path[:1000]

In [20]:
def read_img(path):
    cv2.setNumThreads(0)
    img = cv2.imread(path, 0)
    
    # openposeのミスをできるだけ前処理で落とす
    try:
        x, w, y, h = trimming(img)
        margin = 5
        img = img[y-margin:h+margin, x-margin:w+margin]
        return cv2.resize(img, (64, 64))
    except:
        return 0

In [21]:
def trimming(img):
    mask = img < 255
    x = np.where(np.sum(mask, axis=0) > 1)[0]
    y = np.where(np.sum(mask, axis=1) > 1)[0]
    
    x_min, x_max = x[0], x[-1]
    y_min, y_max = y[0], y[-1]
    return x_min, x_max, y_min, y_max

In [22]:
imgs = []
with tqdm(total=len(data_path)) as pbar:
    with Pool(mp.cpu_count()) as p:
        for img in tqdm(p.imap_unordered(read_img, data_path)):
            imgs.append(img)
            pbar.update()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [23]:
imgs = [img for img in tqdm(imgs) if img is not 0]

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

In [24]:
batch_size = 32

In [25]:
np.array(imgs).shape

(997, 64, 64)

In [26]:
#(X_train, y_train), (X_test, y_test) = mnist.load_data()
(X_train, X_test, y_train, y_test) = train_test_split(np.array(imgs), np.array([0 for _ in range(len(imgs))]), test_size=0.2, random_state=98)

In [27]:
# X_train = X_train[np.where(y_train == 0)[0]]
# y_train = [0 for _ in range(len(np.where(y_train == 0)[0]))]
# X_test = X_test[np.where(y_test == 0)[0]]
# y_test = [0 for _ in range(len(np.where(y_test == 0)[0]))]

In [28]:
X_train.shape

(797, 64, 64)

In [29]:
X_test.shape

(200, 64, 64)

In [30]:
# 前処理
X_train = X_train[:X_train.shape[0] // batch_size * batch_size].astype("float32") / 255
X_test = X_test[:X_test.shape[0] // batch_size * batch_size].astype("float32") / 255

train_num, height, width = X_train.shape
test_num = X_test.shape[0]

X_train = X_train.reshape(train_num, height, width, 1)
X_test = X_test.reshape(test_num, height, width, 1)

print(X_train.shape, X_test.shape)

(768, 64, 64, 1) (192, 64, 64, 1)


## モデルの構築

In [31]:
K.clear_session()

In [32]:
# ネットワークパラメータの設定
kernel_size = 3
filters = 32
n_epochs = 100
n_hidden = 64
z_dim = 2

In [33]:
X_train.shape

(768, 64, 64, 1)

In [42]:
class CustomVaridationalLayer(Layer):
    def set_z_mean(self, z_mean):
        self._z_mean = z_mean
        
    def set_z_log_var(self, z_log_var):
        self._z_log_var = z_log_var
        
    def _vae_loss(self, x, y):
        x = K.flatten(x)
        y = K.flatten(y)
        
        reconstruction_loss = binary_crossentropy(x, y)
        kl_loss = - 0.5 * K.sum(1 + log_var - K.square(mu) - K.exp(log_var), axis=-1)
        return reconstruction_loss + kl_loss
    
    def call(self, inputs):
        x = inputs[0]
        y = inputs[1]
        loss = self._vae_loss(x, y)
        self.add_loss(loss, inputs=inputs)
        return x

In [None]:
class VAE(object):
    def __init__(self, image_shape, latent_dim):
        self._latent_dim = latent_dim
        
        # Encoding
        

In [34]:
# エンコーダ
inputs = Input(shape=(X_train.shape[1:]))
x = inputs

for i in range(2):
    filters *= 2
    x = Conv2D(filters=filters,
               kernel_size=kernel_size,
               activation='relu',
               strides=2,
               padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

shape_before_flattening = K.int_shape(x)
print(shape_before_flattening)

x = Flatten()(x)
x_encoded = Dense(n_hidden, activation="relu")(x)
#x_encoded = Dense(n_hidden//2, activation="relu")(x_encoded)

mu = Dense(z_dim)(x_encoded)
log_var = Dense(z_dim)(x_encoded)

(None, 4, 4, 128)


In [35]:
def sampling(args):
    mu, log_var = args
    eps = K.random_normal(shape=(K.shape(mu)[0],
                                z_dim), 
                          mean=0., 
                          stddev=1.0)
    return mu + K.exp(log_var) * eps

z = Lambda(sampling)([mu, log_var])

In [36]:
# デコーダ
# decoder_input = Input(K.int_shape(z)[1:])
z_decoded = Dense(np.prod(shape_before_flattening[1:]), 
                          activation="relu")(z)
#z_decoded = Dense(n_hidden, activation="relu")(z_decoded)

z_deconv = Reshape(shape_before_flattening[1:])(z_decoded)

for i in range(2):
    z_deconv = UpSampling2D((2, 2))(z_deconv)
    z_deconv = Conv2DTranspose(filters=filters,
                        kernel_size=kernel_size,
                        activation='relu',
                        strides=2,
                        padding='same')(z_deconv)
    filters //= 2

y = Conv2DTranspose(filters=1,
                          kernel_size=kernel_size,
                          activation='sigmoid',
                          padding='same',
                          name='decoder_output')(z_deconv)

- kl_loss
    - カルバック・ライブラー情報量
    - 確率論と情報理論を組み合わせた２つの確率分布がどの程度似ているかを表す尺度

$$ loss = \frac{1}{2} \sum^{N_z}_{j=1} (\mu^{2}_{z_j} + \sigma^{2}_{z_j} - \log{\sigma^2_{z_j} - 1}) \\ = \frac{1}{2} \cdot \sum^{N_z}_{j=1} (\mu^{2}_{z_j} + \exp(\log{\sigma}) - \log{\sigma} - 1)?$$


In [37]:
# loss
reconstruction_loss = binary_crossentropy(K.flatten(x), K.flatten(y)) * width * height
kl_loss = kl_loss = - 0.5 * K.sum(1 + log_var - K.square(mu) - K.exp(log_var), axis=-1)
vae_loss = reconstruction_loss + kl_loss

In [38]:
# build model
vae = Model(inputs, y)
vae.add_loss(vae_loss)
vae.compile(optimizer="rmsprop")
vae.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 64, 64, 1)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 32, 32, 64)   640         input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 16, 16, 64)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 8, 8, 128)    73856       max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [39]:
history = vae.fit(X_train,
        shuffle=True,
        epochs=n_epochs,
        batch_size=batch_size,
        validation_data=(X_test, None),
        verbose=1)

Train on 768 samples, validate on 192 samples
Epoch 1/100


InvalidArgumentError: Incompatible shapes: [131072] vs. [65536]
	 [[Node: logistic_loss/mul = Mul[T=DT_FLOAT, _class=["loc:@training/RMSprop/gradients/logistic_loss/mul_grad/Reshape"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](Log, Reshape)]]
	 [[Node: loss/add/_111 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1167_loss/add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
# build encoder
encoder = Model(x, mu)
encoder.summary()

In [None]:
# build decoder
decoder_input = Input(shape=(z_dim,))
_z_decoded = z_decoder1(decoder_input)
_z_decoded = z_decoder2(_z_decoded)
_y = y_decoder(_z_decoded)
generator = Model(decoder_input, _y)
generator.summary()

In [None]:
# display a 2D manifold of the digits
# 0 に対して分布を見てみる
n = 15 # figure with 15x15 digits
digit_size = 28
figure = np.zeros((digit_size * n, digit_size * n))

grid_x = norm.ppf(np.linspace(0.05, 0.95, n)) 
grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

for i, yi in enumerate(grid_x):
    for j, xi in enumerate(grid_y):
        z_sample = np.array([[xi, yi]])
        x_decoded = generator.predict(z_sample)
        digit = x_decoded[0].reshape(digit_size, digit_size)
        figure[i * digit_size: (i + 1) * digit_size,
               j * digit_size: (j + 1) * digit_size] = digit

plt.figure(figsize=(10, 10))
plt.imshow(figure, cmap='Greys_r')
plt.show()

## まとめ
VAEによる1種類の生成モデルを学習させることで1枚の画像の分布を得ることができるのでこれを用いて画像比較ができる。