<a href="https://colab.research.google.com/github/Itsuki-Hamano123/dim-compression/blob/master/experiments/DNN/example_autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# autoencoderを実装してみる

In [106]:
import os
import datetime

import pandas as pd

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [107]:
print('{lib}:{ver}'.format(lib='tf',ver=tf.__version__))
!pip show scikit-learn

tf:2.2.0
Name: scikit-learn
Version: 0.22.2.post1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.6/dist-packages
Requires: scipy, joblib, numpy
Required-by: yellowbrick, umap-learn, textgenrnn, sklearn, sklearn-pandas, mlxtend, lucid, lightgbm, librosa, imbalanced-learn, fancyimpute


# covtypeデータ読み込み
特徴量54次元で7クラス分類を行うデータ<br>
[scikit-learnのリファレンス](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html#sklearn.datasets.fetch_covtype)<br>
[データセットの詳細ページ](https://archive.ics.uci.edu/ml/datasets/Covertype)


In [108]:
SEED = 2

# Download (or load from cache) the covtype dataset, shuffled reproducibly
# with SEED; the returned object exposes `.data` and `.target`.
data_bunch = fetch_covtype(return_X_y=False, shuffle=True, random_state=SEED)

print(f'data shape:{data_bunch.data.shape}')
print(f'target shape:{data_bunch.target.shape}')
data_bunch

data shape:(581012, 54)
target shape:(581012,)


 'data': array([[3.221e+03, 3.900e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.348e+03, 9.700e+01, 1.200e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [3.114e+03, 1.770e+02, 2.000e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        ...,
        [2.960e+03, 1.200e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.247e+03, 4.600e+01, 1.900e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [2.975e+03, 9.900e+01, 8.000e+00, ..., 0.000e+00, 0.000e+00,
         0.000e+00]]),
 'target': array([1, 1, 2, ..., 2, 2, 2], dtype=int32)}

In [109]:
# Hold out 20% of all samples as the test split.
train_X, test_X, train_y, test_y = train_test_split(
    data_bunch.data, data_bunch.target, test_size=0.2, random_state=SEED)
# Split 20% of the remaining samples off as the validation split.
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=SEED)

print(f'train size:{train_y.shape[0]}')
print(f'validation size:{validation_y.shape[0]}')
print(f'test size:{test_y.shape[0]}')

train size:371847
validation size:92962
test size:116203


In [110]:
NUM_CLASS = 7

# One-hot encode the target labels of every split. Labels are shifted by -1
# so that they become zero-based class indices before encoding.
categorical_train_y, categorical_validation_y, categorical_test_y = (
    to_categorical(labels - 1, num_classes=NUM_CLASS)
    for labels in (train_y, validation_y, test_y)
)

# データの前処理

## 標準化

In [111]:
# Create the standardizer and fit it on the training split only.
STD_SCALER = StandardScaler().fit(train_X)
print('sclaer fit end')

sclaer fit end


In [112]:
# Standardize the training data with the fitted scaler
# (copy=True leaves train_X itself untouched).
std_train_X = STD_SCALER.transform(train_X, copy=True)
# Display the first standardized feature column as a quick sanity check.
std_train_X[:,0]

array([ 0.31952626, -1.6259504 ,  1.09557117, ..., -0.64605958,
       -0.18830037,  1.2529259 ])

In [113]:
# Standardize the validation and test data with the same fitted scaler.
std_validation_X, std_test_X = (
    STD_SCALER.transform(split, copy=True)
    for split in (validation_X, test_X)
)

# Autoencoder構築

In [114]:
# Number of dimensions after compression (size of the encoding layer).
ENCODING_DIM  = 40

In [115]:
def autoencoder_fn(input_dim, encoding_dim, hidden_units=None,
                   output_activation='sigmoid'):
  '''
  Build a fully-connected autoencoder.

  The network is: input -> [hidden_units...] -> encoding -> [mirrored
  hidden_units...] -> output. Every layer except the output uses ReLU.

  Parameters
  -----
  input_dim : int
    Number of input features (also the size of the output layer).
  encoding_dim : int
    Number of units in the encoding (bottleneck) layer.
  hidden_units : sequence of int, optional
    Sizes of the encoder's intermediate layers, ordered from the input side
    toward the encoding layer. The decoder uses the same sizes in reverse
    order. None or an empty sequence builds the single-encoding-layer model
    (default: None).
  output_activation : str, optional
    Activation of the output layer (default: 'sigmoid'). Sigmoid bounds the
    output to (0, 1), so inputs outside that range (for example standardized
    features) cannot be reconstructed exactly; pass 'linear' for such data.

  Returns
  -----
  autoencoder : tf.keras.Model
    Uncompiled model. Layer index 0 is the input layer and the encoding layer
    sits at index len(hidden_units) + 1.
  '''
  # Copy the sizes so the caller's sequence is neither aliased nor mutated.
  # `is None` (not truthiness) keeps array-like arguments working.
  encoder_units = [] if hidden_units is None else list(hidden_units)

  # `shape` must be a tuple; `(input_dim)` is only a parenthesised int.
  input_data = Input(shape=(input_dim,))

  # Encoder: optional intermediate layers, then the bottleneck.
  encoded = input_data
  for unit_size in encoder_units:
    encoded = Dense(unit_size, activation='relu')(encoded)
  encoded = Dense(encoding_dim, activation='relu')(encoded)

  # Decoder: mirror the encoder. `reversed` (rather than `sorted`) keeps the
  # mirror exact even when the sizes are not strictly decreasing.
  decoded = encoded
  for unit_size in reversed(encoder_units):
    decoded = Dense(unit_size, activation='relu')(decoded)
  decoded = Dense(input_dim, activation=output_activation)(decoded)

  return Model(input_data, decoded)

In [116]:
# Show the shape of the standardized training data; its second entry is
# used as the autoencoder's input_dim.
std_train_X.shape

(371847, 54)

## 中間層1つのAutoencoder

In [117]:
# No hidden_units given: a single encoding layer followed by the output layer.
simple_autoencoder = autoencoder_fn(input_dim=std_train_X.shape[1], encoding_dim=ENCODING_DIM)
simple_autoencoder.summary()

Model: "model_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        [(None, 54)]              0         
_________________________________________________________________
dense_79 (Dense)             (None, 40)                2200      
_________________________________________________________________
dense_80 (Dense)             (None, 54)                2214      
Total params: 4,414
Trainable params: 4,414
Non-trainable params: 0
_________________________________________________________________


In [118]:
# Adagrad optimizer; the loss is the mean squared error.
simple_autoencoder.compile(optimizer='Adagrad', loss='mean_squared_error')

In [119]:
# Train the autoencoder to reconstruct its own standardized input.
# Monitoring uses the validation split, so the test split is never seen
# during training.
simple_autoencoder.fit(std_train_X, std_train_X,
                epochs=2,
                batch_size=256,
                shuffle=True,
                validation_data=(std_validation_X, std_validation_X))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f5e6ee5bac8>

In [120]:
def take_encoder(autoencoder, encoder_layer_num):
    '''
    Return the encoder part of an autoencoder model.

    Parameters
    -----
    autoencoder : tf.keras.Model
        Autoencoder to take the encoder from.
    encoder_layer_num : int
        0-based index (as understood by `get_layer(index=...)`) of the layer
        whose output becomes the encoder's output.

    Returns
    -----
    encoder : tf.keras.Model
        Model mapping the autoencoder's input to the selected layer's output.
    '''
    encoding_layer = autoencoder.get_layer(index=encoder_layer_num)
    return Model(inputs=autoencoder.input, outputs=encoding_layer.output)

In [121]:
# Layer index 1 is the encoding Dense layer (index 0 is the input layer).
encoder = take_encoder(simple_autoencoder, 1)
encoder.summary()

Model: "model_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        [(None, 54)]              0         
_________________________________________________________________
dense_79 (Dense)             (None, 40)                2200      
Total params: 2,200
Trainable params: 2,200
Non-trainable params: 0
_________________________________________________________________


In [122]:
# Encode the standardized test data and show only the shape of the result.
encoder.predict(x=std_test_X).shape

(116203, 40)

## DeepなAutoencoder

In [123]:
# Display the encoding size that the deep autoencoder is built with.
ENCODING_DIM

40

In [124]:
# Sizes of the encoder's intermediate layers, from the input side toward
# the encoding layer.
hidden_units = [50, 48]

deep_autoencoder = autoencoder_fn(input_dim=std_train_X.shape[1], encoding_dim=ENCODING_DIM, hidden_units=hidden_units)
deep_autoencoder.summary()

Model: "model_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_30 (InputLayer)        [(None, 54)]              0         
_________________________________________________________________
dense_81 (Dense)             (None, 50)                2750      
_________________________________________________________________
dense_82 (Dense)             (None, 48)                2448      
_________________________________________________________________
dense_83 (Dense)             (None, 40)                1960      
_________________________________________________________________
dense_84 (Dense)             (None, 48)                1968      
_________________________________________________________________
dense_85 (Dense)             (None, 50)                2450      
_________________________________________________________________
dense_86 (Dense)             (None, 54)                275

In [125]:
# Adagrad optimizer; the loss is the mean squared error.
deep_autoencoder.compile(optimizer='Adagrad', loss='mean_squared_error')

In [126]:
# Train the deep autoencoder to reconstruct its own standardized input.
# Monitoring uses the validation split, so the test split is never seen
# during training.
deep_autoencoder.fit(std_train_X, std_train_X,
                epochs=2,
                batch_size=256,
                shuffle=True,
                validation_data=(std_validation_X, std_validation_X))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f5e930929b0>

In [127]:
# Index 0 is the input layer and indices 1..len(hidden_units) are the
# encoder's intermediate layers, so the encoding layer is the next index.
encoder_layer_num = len(hidden_units) + 1
encoder = take_encoder(deep_autoencoder, encoder_layer_num)
encoder.summary()

Model: "model_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_30 (InputLayer)        [(None, 54)]              0         
_________________________________________________________________
dense_81 (Dense)             (None, 50)                2750      
_________________________________________________________________
dense_82 (Dense)             (None, 48)                2448      
_________________________________________________________________
dense_83 (Dense)             (None, 40)                1960      
Total params: 7,158
Trainable params: 7,158
Non-trainable params: 0
_________________________________________________________________


In [128]:
# Encode the standardized test data and show only the shape of the result.
encoder.predict(std_test_X).shape

(116203, 40)