<a href="https://colab.research.google.com/github/Itsuki-Hamano123/dim-compression/blob/master/experiments/compress_dim_ica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ICA(独立成分分析)で次元圧縮

In [1]:
import os
import datetime

import cloudpickle
import pandas as pd

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import FastICA
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [2]:
# Record the library versions used for this run: TensorFlow from its version
# attribute, scikit-learn from pip's package metadata.
print('{lib}:{ver}'.format(lib='tf',ver=tf.__version__))
!pip show scikit-learn

tf:2.2.0
Name: scikit-learn
Version: 0.22.2.post1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.6/dist-packages
Requires: scipy, joblib, numpy
Required-by: yellowbrick, umap-learn, textgenrnn, sklearn, sklearn-pandas, mlxtend, lucid, lightgbm, librosa, imbalanced-learn, fancyimpute


# covtypeデータ読み込み
特徴量54次元で7クラス分類を行うデータ<br>
[scikit-learnのリファレンス](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html#sklearn.datasets.fetch_covtype)<br>
[データセットの詳細ページ](https://archive.ics.uci.edu/ml/datasets/Covertype)


In [3]:
# Seed shared by the dataset shuffle and the later random steps of this notebook
SEED = 2

# Fetch the covtype dataset as a Bunch (data + target), shuffled with SEED
data_bunch = fetch_covtype(
    shuffle=True,
    random_state=SEED,
    return_X_y=False,
)

# Report the feature-matrix and label-vector shapes, then display the Bunch
print('data shape:{shape}'.format(shape=data_bunch.data.shape))
print('target shape:{shape}'.format(shape=data_bunch.target.shape))
data_bunch

Downloading https://ndownloader.figshare.com/files/5976039


data shape:(581012, 54)
target shape:(581012,)


 'data': array([[3.221e+03, 3.900e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.348e+03, 9.700e+01, 1.200e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [3.114e+03, 1.770e+02, 2.000e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        ...,
        [2.960e+03, 1.200e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.247e+03, 4.600e+01, 1.900e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [2.975e+03, 9.900e+01, 8.000e+00, ..., 0.000e+00, 0.000e+00,
         0.000e+00]]),
 'target': array([1, 1, 2, ..., 2, 2, 2], dtype=int32)}

In [4]:
# Hold out 20% of all samples as the test set
train_X, test_X, train_y, test_y = train_test_split(
    data_bunch.data,
    data_bunch.target,
    test_size=0.2,
    random_state=SEED,
)
# Split a further 20% of the remaining samples off as the validation set
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=SEED,
)

# Report how many samples ended up in each split
print('train size:{size}'.format(size=len(train_y)))
print('validation size:{size}'.format(size=len(validation_y)))
print('test size:{size}'.format(size=len(test_y)))

train size:371847
validation size:92962
test size:116203


In [5]:
NUM_CLASS = 7

# One-hot encode the labels of every split; labels are shifted by -1 so that
# they become 0-based class indices before encoding
categorical_train_y, categorical_validation_y, categorical_test_y = (
    to_categorical(labels - 1, num_classes=NUM_CLASS)
    for labels in (train_y, validation_y, test_y)
)

# データの前処理

## 標準化

In [6]:
# Create the standardizer and fit it on the training features only
STD_SCALER = StandardScaler().fit(train_X)
print('sclaer fit end')

sclaer fit end


In [7]:
# Standardize the training features with the fitted scaler (works on a copy)
std_train_X = STD_SCALER.transform(X=train_X, copy=True)
# Display the first standardized column as a quick sanity check
std_train_X[:, 0]

array([ 0.31952626, -1.6259504 ,  1.09557117, ..., -0.64605958,
       -0.18830037,  1.2529259 ])

In [8]:
# Standardize the validation and test features with the same fitted scaler
std_validation_X, std_test_X = (
    STD_SCALER.transform(split_X, copy=True)
    for split_X in (validation_X, test_X)
)

## ICAで次元圧縮

In [9]:
# Number of dimensions after compression
COMPONENTS = 40

# Fit FastICA on the standardized training features only
ica = FastICA(random_state=SEED, n_components=COMPONENTS).fit(std_train_X)
print('ica fit end')

ica fit end


In [10]:
# Compress the train / validation / test features with the fitted ICA
ica_train_X, ica_validation_X, ica_test_X = (
    ica.transform(std_X)
    for std_X in (std_train_X, std_validation_X, std_test_X)
)

# Report the feature dimension before and after compression
print('圧縮前の次元サイズ:{size}'.format(size=std_train_X.shape[1]))
print('圧縮後の次元サイズ:{size}'.format(size=ica_train_X.shape[1]))

圧縮前の次元サイズ:54
圧縮後の次元サイズ:40


# DNN(classifier)でクラス分類モデル構築

In [11]:
# Mount Google Drive at /gdrive (Colab only; the mount asks for an authorization
# code) and make the mount point the working directory.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [12]:
cd My Drive/機械学習練習/次元圧縮手法_比較/experiment

/gdrive/My Drive/機械学習練習/次元圧縮手法_比較/experiment


In [13]:
# List the contents of the current working directory
!ls 

compress_dim_ica.ipynb	compress_dim_tsne.ipynb  ml
compress_dim_pca.ipynb	logs


In [14]:
# classifier_DNN_fn comes from the project-local `ml` package; presumably this is
# the `ml` directory in the experiment folder, so the import depends on the
# current working directory — TODO confirm
from ml.model import classifier_DNN_fn

# Root directory for logs and saved weights, and the run name used in their paths
LOG_DIR = 'logs'
model_name = 'compress_dim_ica'

In [15]:
# Build the classifier: the input width is the ICA-compressed feature dimension
# and the output width is the number of classes. hidden_shapes is presumably the
# number of units per hidden layer — confirm against ml.model.
model = classifier_DNN_fn(
    hidden_shapes=[30, 20],
    input_shape=ica_train_X.shape[1],
    output_shape=NUM_CLASS,
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                1230      
_________________________________________________________________
dropout (Dropout)            (None, 30)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                620       
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 147       
Total params: 1,997
Trainable params: 1,997
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

est = EarlyStopping(patience=2)

# Load the TensorBoard notebook extension
%load_ext tensorboard
#%reload_ext tensorboard
log_dir = os.path.join(LOG_DIR, 'fit', model_name, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

## モデルの学習

In [17]:
# Train for up to 100 epochs on the ICA-compressed features; early stopping and
# TensorBoard logging run as callbacks, using the validation split
model.fit(
    x=ica_train_X,
    y=categorical_train_y,
    validation_data=(ica_validation_X, categorical_validation_y),
    epochs=100,
    batch_size=500,
    callbacks=[est, tensorboard_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


<tensorflow.python.keras.callbacks.History at 0x7fc8ead24160>

## モデルの評価

In [18]:
# Evaluate on the held-out test set. The model was compiled with
# metrics=['accuracy'], so result is [loss, accuracy].
result = model.evaluate(x=ica_test_X, y=categorical_test_y, batch_size=500)
print('test loss:{loss}'.format(loss=result[0]))
# result[1] is accuracy, not AUC, so label it accordingly
print('test accuracy:{acc}'.format(acc=result[1]))

test loss:0.6672748327255249
test auc:0.7180795669555664


### 主な分類指標を示すレポート

In [20]:
# Sequential.predict_classes is deprecated; take the argmax over the predicted
# class scores instead (model.predict returns a NumPy array).
predict_y = model.predict(x=ica_test_X, batch_size=500).argmax(axis=-1)

# Predictions are 0-based class indices while test_y holds the original labels
# (the one-hot targets were built from label-1), hence the +1
report = classification_report(y_true=test_y, y_pred=predict_y+1)
print(report)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
              precision    recall  f1-score   support

           1       0.68      0.77      0.72     42488
           2       0.76      0.77      0.77     56473
           3       0.63      0.87      0.73      7355
           4       0.00      0.00      0.00       524
           5       0.00      0.00      0.00      1866
           6       0.00      0.00      0.00      3432
           7       0.79      0.20      0.32      4065

    accuracy                           0.72    116203
   macro avg       0.41      0.37      0.36    116203
weighted avg       0.69      0.72      0.69    116203



  _warn_prf(average, modifier, msg_start, len(result))


### 混同行列

In [21]:
# Confusion matrix on the test set; predictions are shifted by +1 to match the
# original label values in test_y
cm = confusion_matrix(test_y, predict_y + 1)
print(cm)

[[32715  9550     0     0     0     0   223]
 [12050 43464   956     0     0     0     3]
 [    0   920  6435     0     0     0     0]
 [    0     0   524     0     0     0     0]
 [   20  1790    56     0     0     0     0]
 [    0  1129  2303     0     0     0     0]
 [ 3216    20     0     0     0     0   829]]


### モデルの保存

In [None]:
# Save only the model weights to a timestamped .h5 file under LOG_DIR
model_file = os.path.join(
    LOG_DIR,
    ''.join([model_name, '_', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"), '.h5']),
)
model.save_weights(model_file)

In [None]:
# List what has been written under logs/fit
!ls logs/fit

In [None]:
# Open TensorBoard inline on the log directory of the compress_dim_ica runs
%tensorboard --logdir logs/fit/compress_dim_ica

In [None]:
# Reload helper: run after editing the ml.model module to pick up the changes
# without restarting the kernel
import importlib

# `from ml.model import ...` does not bind the name `ml` in the notebook
# namespace, so import the module explicitly before reloading it (otherwise
# the reload call raises NameError)
import ml.model
importlib.reload(ml.model)
# Rebind the function so later cells use the reloaded definition
from ml.model import classifier_DNN_fn