<a href="https://colab.research.google.com/github/Itsuki-Hamano123/dim-compression/blob/master/experiments/compress_dim_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PCA(主成分分析)で次元圧縮

In [20]:
import os
import datetime

import cloudpickle
import pandas as pd

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [None]:
print('{lib}:{ver}'.format(lib='tf',ver=tf.__version__))
!pip show scikit-learn

# covtypeデータ読み込み
特徴量54次元で7クラス分類を行うデータ<br>
[scikit-learnのリファレンス](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html#sklearn.datasets.fetch_covtype)<br>
[データセットの詳細ページ](https://archive.ics.uci.edu/ml/datasets/Covertype)


In [3]:
# Seed reused for the dataset shuffle here and for the train/validation/test
# splits performed afterwards.
SEED = 2

# Fetch covtype as a Bunch (features in .data, labels in .target), shuffled
# with SEED.
data_bunch = fetch_covtype(return_X_y=False, shuffle=True, random_state=SEED)

# Report the shapes of the feature matrix and the label vector.
print('data shape:{shape}'.format(shape=data_bunch.data.shape))
print('target shape:{shape}'.format(shape=data_bunch.target.shape))
# Leave the Bunch as the cell's last expression so its contents are displayed.
data_bunch

Downloading https://ndownloader.figshare.com/files/5976039


data shape:(581012, 54)
target shape:(581012,)


 'data': array([[3.221e+03, 3.900e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.348e+03, 9.700e+01, 1.200e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [3.114e+03, 1.770e+02, 2.000e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        ...,
        [2.960e+03, 1.200e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.247e+03, 4.600e+01, 1.900e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [2.975e+03, 9.900e+01, 8.000e+00, ..., 0.000e+00, 0.000e+00,
         0.000e+00]]),
 'target': array([1, 1, 2, ..., 2, 2, 2], dtype=int32)}

In [4]:
# Hold out 20% of all samples as the test set.
train_X, test_X, train_y, test_y = train_test_split(
    data_bunch.data, data_bunch.target, test_size=0.2, random_state=SEED)
# Hold out 20% of the remaining samples as the validation set.
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=SEED)

# Report the number of samples in each split.
print('train size:{size}'.format(size=len(train_y)))
print('validation size:{size}'.format(size=len(validation_y)))
print('test size:{size}'.format(size=len(test_y)))

train size:371847
validation size:92962
test size:116203


In [5]:
NUM_CLASS = 7

# One-hot encode the labels of every split. Labels are shifted by -1 first so
# that they become 0-based class indices for to_categorical.
categorical_train_y, categorical_validation_y, categorical_test_y = (
    to_categorical(labels - 1, num_classes=NUM_CLASS)
    for labels in (train_y, validation_y, test_y)
)

# データの前処理

## 標準化

In [6]:
# Build the scaler and fit it on the training features only; the validation
# and test splits are transformed later with these training statistics.
STD_SCALER = StandardScaler().fit(train_X)
print('sclaer fit end')

sclaer fit end


In [7]:
# 訓練データを標準化
std_train_X = STD_SCALER.transform(train_X, copy=True)
std_train_X[:,0]

array([ 0.31952626, -1.6259504 ,  1.09557117, ..., -0.64605958,
       -0.18830037,  1.2529259 ])

In [8]:
# 検証/テストデータも標準化
std_validation_X = STD_SCALER.transform(validation_X, copy=True)
std_test_X = STD_SCALER.transform(test_X, copy=True)

## PCAで次元圧縮

In [9]:
# 累積寄与率
ACCUMULATION_CONTIBUTION_RATE = 0.95

pca = PCA(n_components=ACCUMULATION_CONTIBUTION_RATE)
pca.fit(std_train_X)

print('圧縮前の次元サイズ:{size}'.format(size=std_train_X.shape[1]))
print('圧縮後の次元サイズ:{size}'.format(size=len(pca.explained_variance_ratio_)))
print('累積寄与率:{rate}'.format(rate=pca.explained_variance_ratio_.sum()))

圧縮前の次元サイズ:54
圧縮後の次元サイズ:43
累積寄与率:0.9504606638174287


In [10]:
# 訓練/検証/テストデータをPCAで次元圧縮
pca_train_X = pca.transform(std_train_X)
pca_validation_X = pca.transform(std_validation_X)
pca_test_X = pca.transform(std_test_X)

# DNN(classifier)でクラス分類モデル構築

In [11]:
# Mount Google Drive at /gdrive (Colab only; prompts for an authorization code).
from google.colab import drive
drive.mount('/gdrive')
# Make the mount point the working directory.
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [12]:
cd My Drive/機械学習練習/次元圧縮手法_比較/experiment

/gdrive/My Drive/機械学習練習/次元圧縮手法_比較/experiment


In [13]:
# List the working directory to confirm the notebooks, `ml` and `logs` are visible.
!ls 

compress_dim_ica.ipynb	compress_dim_tsne.ipynb  ml
compress_dim_pca.ipynb	logs


In [14]:
# Project-local model factory; importable because the working directory was
# changed to the folder that contains the `ml` package.
from ml.model import classifier_DNN_fn

# Root directory for TensorBoard logs and saved weights.
LOG_DIR = 'logs'
# Name used for this experiment's log sub-directory and weight file.
model_name = 'compress_dim_pca'

In [15]:
# Build the classifier: the input width is the number of PCA components, the
# output width is the number of classes, with hidden layers of 30 and 20 units.
model = classifier_DNN_fn(
    hidden_shapes=[30, 20],
    output_shape=NUM_CLASS,
    input_shape=pca_train_X.shape[1],
)
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                1320      
_________________________________________________________________
dropout (Dropout)            (None, 30)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                620       
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 147       
Total params: 2,087
Trainable params: 2,087
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

est = EarlyStopping(patience=2)

# Load the TensorBoard notebook extension
%load_ext tensorboard
#%reload_ext tensorboard
log_dir = os.path.join(LOG_DIR, 'fit', model_name, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

## モデルの学習

In [17]:
# Train for up to 30 epochs in mini-batches of 500, validating on the
# validation split; early stopping and TensorBoard logging are attached.
model.fit(
    x=pca_train_X,
    y=categorical_train_y,
    validation_data=(pca_validation_X, categorical_validation_y),
    epochs=30,
    batch_size=500,
    callbacks=[est, tensorboard_callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fa2d5464128>

## モデルの評価

In [18]:
# evaluate() returns the loss followed by the compiled metrics. The model was
# compiled with metrics=['accuracy'], so result[1] is accuracy (not AUC).
result = model.evaluate(x=pca_test_X, y=categorical_test_y, batch_size=500)
print('test loss:{loss}'.format(loss=result[0]))
print('test accuracy:{accuracy}'.format(accuracy=result[1]))

test loss:0.5749784111976624
test auc:0.7507981657981873


### 主な分類指標を示すレポート

In [39]:
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6;
# take the argmax over the predicted class scores instead, which is what
# predict_classes() did for a multi-class output.
predict_y = model.predict(x=pca_test_X, batch_size=500).argmax(axis=-1)

# Predictions are 0-based class indices while test_y keeps the original
# 1-based labels, hence the +1 before comparing.
report = classification_report(y_true=test_y, y_pred=predict_y+1)
print(report)

              precision    recall  f1-score   support

           1       0.74      0.76      0.75     42488
           2       0.77      0.81      0.79     56473
           3       0.64      0.90      0.75      7355
           4       0.00      0.00      0.00       524
           5       1.00      0.00      0.00      1866
           6       0.42      0.02      0.04      3432
           7       0.81      0.63      0.71      4065

    accuracy                           0.75    116203
   macro avg       0.63      0.45      0.43    116203
weighted avg       0.74      0.75      0.73    116203



  _warn_prf(average, modifier, msg_start, len(result))


### 混同行列

In [40]:
# Rows are true labels and columns are predicted labels; +1 maps the 0-based
# predictions back onto the 1-based labels used by test_y.
cm = confusion_matrix(y_pred=predict_y + 1, y_true=test_y)
print(cm)

[[32106  9818     6     0     0     0   558]
 [ 9601 45886   887     0     0    41    58]
 [    0   673  6611     0     0    71     0]
 [    0     0   524     0     0     0     0]
 [    0  1809    56     0     1     0     0]
 [    0  1170  2181     0     0    81     0]
 [ 1473    32     0     0     0     0  2560]]


### モデルの保存

In [41]:
# Write the model weights to a timestamped .h5 file under LOG_DIR:
# <LOG_DIR>/<model_name>_<timestamp>.h5
model_file = os.path.join(
    LOG_DIR,
    ''.join([model_name, '_', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"), '.h5']),
)
model.save_weights(model_file)

In [42]:
# List the experiments that have TensorBoard logs under logs/fit.
!ls logs/fit

compress_dim_ica  compress_dim_pca


In [None]:
# Launch TensorBoard inline on the logs of the PCA experiment runs.
%tensorboard --logdir logs/fit/compress_dim_pca

In [None]:
# モジュールを修正した場合のリロード用
import importlib
importlib.reload(ml.model)