<a href="https://colab.research.google.com/github/Itsuki-Hamano123/dim-compression/blob/master/experiments/XGBoost/compress_dim_ica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ICA(独立成分分析)で次元圧縮

In [1]:
import os
import datetime

import cloudpickle
import pandas as pd

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import FastICA
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

In [2]:
print('{lib}:{ver}'.format(lib='xgboost',ver=xgb.__version__))
!pip show scikit-learn

xgboost:0.90
Name: scikit-learn
Version: 0.22.2.post1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.6/dist-packages
Requires: joblib, numpy, scipy
Required-by: yellowbrick, umap-learn, textgenrnn, sklearn, sklearn-pandas, mlxtend, lucid, lightgbm, librosa, imbalanced-learn, fancyimpute


# covtypeデータ読み込み
特徴量54次元で7クラス分類を行うデータ<br>
[scikit-learnのリファレンス](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html#sklearn.datasets.fetch_covtype)<br>
[データセットの詳細ページ](https://archive.ics.uci.edu/ml/datasets/Covertype)


In [3]:
# Seed reused by the dataset shuffle, the data splits, FastICA and the classifier.
SEED = 2

# Fetch covtype as a Bunch (data + target), shuffled reproducibly with SEED.
data_bunch = fetch_covtype(return_X_y=False, shuffle=True, random_state=SEED)

print(f'data shape:{data_bunch.data.shape}')
print(f'target shape:{data_bunch.target.shape}')
data_bunch

Downloading https://ndownloader.figshare.com/files/5976039


data shape:(581012, 54)
target shape:(581012,)


 'data': array([[3.221e+03, 3.900e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.348e+03, 9.700e+01, 1.200e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [3.114e+03, 1.770e+02, 2.000e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        ...,
        [2.960e+03, 1.200e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [3.247e+03, 4.600e+01, 1.900e+01, ..., 1.000e+00, 0.000e+00,
         0.000e+00],
        [2.975e+03, 9.900e+01, 8.000e+00, ..., 0.000e+00, 0.000e+00,
         0.000e+00]]),
 'target': array([1, 1, 2, ..., 2, 2, 2], dtype=int32)}

In [4]:
# Hold out 20% of all samples as the test set.
train_X, test_X, train_y, test_y = train_test_split(
    data_bunch.data, data_bunch.target, test_size=0.2, random_state=SEED)
# Split a further 20% off the remaining samples as the validation set.
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=SEED)

print(f'train size:{train_y.shape[0]}')
print(f'validation size:{validation_y.shape[0]}')
print(f'test size:{test_y.shape[0]}')

train size:371847
validation size:92962
test size:116203


In [23]:
# Number of target classes in the covtype dataset.
# NOTE(review): no other cell visible in this notebook reads this constant — confirm it is still needed.
NUM_CLASS = 7

# データの前処理

## 標準化

In [6]:
# Build the standardizer and fit it on the training split only,
# so no statistics from the validation/test data leak into preprocessing.
STD_SCALER = StandardScaler().fit(train_X)
print('sclaer fit end')

sclaer fit end


In [7]:
# Standardize the training data with the fitted scaler.
# copy=True asks for a scaled copy, leaving train_X itself untouched.
std_train_X = STD_SCALER.transform(train_X, copy=True)
# Show the first feature column of the standardized data as a quick check.
std_train_X[:,0]

array([ 0.31952626, -1.6259504 ,  1.09557117, ..., -0.64605958,
       -0.18830037,  1.2529259 ])

In [8]:
# Standardize the validation and test data as well, reusing the scaler
# that was fitted on the training data.
std_validation_X = STD_SCALER.transform(validation_X, copy=True)
std_test_X = STD_SCALER.transform(test_X, copy=True)

## ICAで次元圧縮

In [9]:
# Number of dimensions to keep after compression.
COMPONENTS = 40

# Fit FastICA on the standardized training data only.
ica = FastICA(n_components=COMPONENTS, random_state=SEED).fit(std_train_X)
print('ica fit end')

ica fit end


In [10]:
# Compress the training, validation and test data with the fitted ICA model.
ica_train_X, ica_validation_X, ica_test_X = (
    ica.transform(split)
    for split in (std_train_X, std_validation_X, std_test_X)
)

print(f'圧縮前の次元サイズ:{std_train_X.shape[1]}')
print(f'圧縮後の次元サイズ:{ica_train_X.shape[1]}')

圧縮前の次元サイズ:54
圧縮後の次元サイズ:40


# XGBoost(XGBRFClassifier)でクラス分類モデル構築

In [11]:
# Mount Google Drive at /gdrive and change into it.
# Colab-only: the mount step interactively asks for an authorization code.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [12]:
cd My Drive/機械学習練習/次元圧縮手法_比較/experiment/XGBoost

/gdrive/My Drive/機械学習練習/次元圧縮手法_比較/experiment/XGBoost


In [13]:
# List the current directory to confirm the working directory is the experiment folder.
!ls 

compress_dim_ica.ipynb	compress_dim_nmf.ipynb	compress_dim_pca.ipynb


In [14]:
# Directory the trained model is written to (relative to the current working directory).
LOG_DIR = '../logs'
# Base name of the saved model file.
model_name = 'ica_xgboost'

# NOTE(review): XGBRFClassifier is xgboost's random-forest-style estimator rather than
# the gradient-boosted XGBClassifier — confirm this is the intended model.
# tree_method='gpu_hist' with gpu_id=0 presumably requires a GPU runtime — verify before running on CPU.
xgb_cls = xgb.XGBRFClassifier(random_state=SEED, tree_method='gpu_hist', gpu_id=0)

## モデルの学習

In [19]:
# Train on the ICA-compressed training data. Labels are shifted by -1 here
# (presumably so classes 1..7 become 0..6); the evaluation cells shift predictions back by +1.
# NOTE(review): the validation split created earlier is not used by this fit call.
xgb_cls.fit(X=ica_train_X, y=train_y-1)

XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, gpu_id=0, learning_rate=1,
                max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
                n_estimators=100, n_jobs=1, nthread=None,
                objective='multi:softprob', random_state=2, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
                subsample=0.8, tree_method='gpu_hist', verbosity=1)

## モデルの評価

In [24]:
# Score the model on the training and test splits; labels are shifted by -1
# to match the labels the model was fitted on.
train_score = xgb_cls.score(X=ica_train_X, y=train_y-1)
test_score = xgb_cls.score(X=ica_test_X, y=test_y-1)
print(f'train score:{train_score}')
print(f'test score:{test_score}')

train score:0.6123082880862291
test score:0.612755264494032


### 主な分類指標を示すレポート

In [25]:
# Predict classes for the test split. The input is passed positionally because the
# name of predict()'s first parameter is not the same in every xgboost release,
# so a `data=` keyword would tie this cell to one version.
predict_y = xgb_cls.predict(ica_test_X)

# The model was fitted on labels shifted by -1, so shift the predictions back
# by +1 before comparing them with the original test labels.
report = classification_report(y_true=test_y, y_pred=predict_y+1)
print(report)

              precision    recall  f1-score   support

           1       0.66      0.42      0.51     42488
           2       0.59      0.85      0.70     56473
           3       0.65      0.43      0.51      7355
           4       0.68      0.27      0.38       524
           5       0.70      0.13      0.22      1866
           6       0.54      0.02      0.04      3432
           7       0.73      0.43      0.54      4065

    accuracy                           0.61    116203
   macro avg       0.65      0.36      0.42    116203
weighted avg       0.62      0.61      0.58    116203



### 混同行列

In [26]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
# Predictions are shifted by +1 to undo the -1 shift applied to the training labels.
cm = confusion_matrix(y_true=test_y, y_pred=predict_y+1)
print(cm)

[[17642 24175    31     0    16     0   624]
 [ 7400 48214   727     3    85    19    25]
 [  623  3507  3127    56     1    41     0]
 [   11   262   107   140     0     4     0]
 [  158  1374    91     0   242     1     0]
 [  219  2419   711     8     0    75     0]
 [  849  1437    15     0     0     0  1764]]


### モデルの保存

In [29]:
# Timestamped file name so successive runs do not overwrite each other.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_file = os.path.join(LOG_DIR, model_name+'_'+timestamp+'.pkl')

# Create the log directory if it is missing; open() would otherwise fail.
os.makedirs(LOG_DIR, exist_ok=True)

# Stream the pickle directly into the file instead of building the whole
# byte string in memory first.
with open(model_file, 'wb') as f:
    cloudpickle.dump(xgb_cls, f)
# NOTE(review): only the classifier is saved; STD_SCALER and ica are also needed
# to preprocess new inputs — consider persisting them alongside the model.

In [30]:
!ls ../logs

compress_dim_ica_20200706-000503.h5  fit
compress_dim_pca_20200705-080246.h5  ica_xgboost_20200712-055216.pkl
compress_dim_pca_20200705-080838.h5  ica_xgboost.pkl
compress_dim_pca_20200706-122832.h5
