# データセット取得（Mice Protein Expression Data Set）
Mice Protein Expression Data Set

本サンプルでは教師無しの異常検知を実施する。※PyCaret自体は教師有りの異常検知も可能

データセットの詳細は<https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression>を参照。

In [1]:
# PyCaretチュートリアル用データセット取得
# 詳細は<https://pycaret.org/get-data/>を参照
from pycaret.datasets import get_data

dataset = get_data('mice')
dataset.to_csv('./dataset.csv')

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,Control,Memantine,C/S,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,Control,Memantine,C/S,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,Control,Memantine,C/S,c-CS-m


In [2]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MouseID          1080 non-null   object 
 1   DYRK1A_N         1077 non-null   float64
 2   ITSN1_N          1077 non-null   float64
 3   BDNF_N           1077 non-null   float64
 4   NR1_N            1077 non-null   float64
 5   NR2A_N           1077 non-null   float64
 6   pAKT_N           1077 non-null   float64
 7   pBRAF_N          1077 non-null   float64
 8   pCAMKII_N        1077 non-null   float64
 9   pCREB_N          1077 non-null   float64
 10  pELK_N           1077 non-null   float64
 11  pERK_N           1077 non-null   float64
 12  pJNK_N           1077 non-null   float64
 13  PKCA_N           1077 non-null   float64
 14  pMEK_N           1077 non-null   float64
 15  pNR1_N           1077 non-null   float64
 16  pNR2A_N          1077 non-null   float64
 17  pNR2B_N       

In [3]:
# 学習データ/テストデータ分割
train_data = dataset.sample(frac=0.95, random_state=42).reset_index(drop=True)
test_data = dataset.drop(train_data.index).reset_index(drop=True)

print('データ      ：' + str(dataset.shape) + ' ' + str(dataset.index))
print('学習データ  ：' + str(train_data.shape) + ' ' + str(train_data.index))
print('テストデータ：' + str(test_data.shape) + ' ' + str(test_data.index))

データ      ：(1080, 82) RangeIndex(start=0, stop=1080, step=1)
学習データ  ：(1026, 82) RangeIndex(start=0, stop=1026, step=1)
テストデータ：(54, 82) RangeIndex(start=0, stop=54, step=1)


# PyCaretでのデータセットアップ

In [4]:
# 異常検知用インポート
from pycaret.anomaly import *

In [5]:
# 目的変数を'default'に指定しデータのセットアップを実施
# session_idを指定することで欄数シードを固定
# セットアップが完了するとデータの情報や前処理のパイプラインの情報が表示される
# 詳細は<https://pycaret.org/setup/>を参照
# 
# また、欠損値の補間等の前処理もsetup()実施時に可能
# 前処理の詳細は<https://pycaret.org/preprocessing/>を参照
#
# MouseIDは個体のID値であり異常検知に悪影響を与える可能性があるため、無視するよう指定
exp = setup(train_data, normalize = True, ignore_features = ['MouseID'], session_id =42)


Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Original Data,"(1026, 82)"
2,Missing Values,True
3,Numeric Features,77
4,Categorical Features,5
5,Ordinal Features,False
6,High Cardinality Features,False
7,Transformed Data,"(1026, 91)"
8,Numeric Imputer,mean
9,Categorical Imputer,constant


In [6]:
print(exp)

(      DYRK1A_N   ITSN1_N    BDNF_N     NR1_N    NR2A_N    pAKT_N   pBRAF_N  \
0     0.259362  0.156719  0.424229  0.434794  0.826885  0.965815  1.370757   
1    -0.966722 -1.258253 -1.816209 -1.656435 -1.226890 -1.401395 -1.565385   
2    -0.547593 -0.405643 -0.659460 -0.733015 -0.569245 -0.346542  0.645484   
3     0.572315  0.509521 -0.145096  0.444585 -0.016626 -0.794055 -0.604064   
4    -0.552583 -0.374065  0.024163  0.372952 -0.271910  0.263408  0.278753   
...        ...       ...       ...       ...       ...       ...       ...   
1021 -0.070948  0.081524  0.286697  0.609830  1.135132  0.528333 -0.022506   
1022  0.000266  0.323667 -0.336361 -0.336539 -0.406771  0.646013 -0.113658   
1023 -0.520483 -0.655786 -0.815441 -1.705522 -1.611700 -0.546031  0.001961   
1024 -0.172737  0.031457  0.292890  0.175224 -0.129002  0.447574 -0.318476   
1025 -0.762474 -0.961390 -1.004757 -1.193196 -0.991396  0.321319  0.318207   

      pCAMKII_N   pCREB_N    pELK_N  ...  Behavior_C/S  Behavi

# モデル生成
以降では、例とてiForestでの生成を実施する。

ver1.0.0現在使用可能なモデルは以下の通り。

詳細は<https://pycaret.org/create-model/>を参照。

| モデル名 | 引数での指定 |
----|---- 
| Angle-base Outlier Detection | ‘abod’ |
| Isolation Forest | ‘iforest’ |
| Clustering-Based Local Outlier | ‘cluster’ |
| Connectivity-Based Outlier Factor | ‘cof’ |
| Histogram-based Outlier Detection | ‘histogram’ |
| k-Nearest Neighbors Detector | ‘knn’ |
| Local Outlier Factor | ‘lof’ |
| One-class SVM detector | ‘svm’ |
| Principal Component Analysis | ‘pca’ |
| Minimum Covariance Determinant | ‘mcd’ |
| Subspace Outlier Detection | ‘sod’ |
| Stochastic Outlier Selection | ‘sos |

In [7]:
# 引数で作成するモデルを指定
iforest_model = create_model('iforest')

In [8]:
print(iforest_model)

IForest(behaviour='new', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)


# モデルの割り当て
末尾にLabel列とScore列が追加される。

Label：0=正常、1=異常

Score：外れ値の場合はより大きなスコアになる

In [9]:
iforest_results = assign_model(iforest_model)
iforest_results.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Label,Score
0,50810F_4,0.492403,0.658379,0.339319,2.446823,4.613029,0.27325,0.218692,4.184162,0.26128,...,,,,1.45239,Control,Saline,C/S,c-CS-s,0,-0.054857
1,3516_9,0.182518,0.298969,0.229708,1.725425,2.699869,0.174822,0.139538,2.747931,0.187309,...,,0.220072,0.338278,1.090741,Control,Saline,S/C,c-SC-s,0,-0.010339
2,3411_12,0.28845,0.515536,0.286301,2.043971,3.312488,0.218683,0.19914,2.929255,0.226304,...,,0.286819,,1.152579,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.077329
3,3416_4,0.5715,0.747993,0.311465,2.450201,3.82727,0.200075,0.165454,2.424611,0.192925,...,0.14936,,,1.720202,Ts65Dn,Memantine,C/S,t-CS-m,0,-0.094895
4,J1291_2,0.287189,0.523557,0.319746,2.42549,3.589465,0.244044,0.189254,3.807835,0.250662,...,0.287167,0.127822,0.220443,1.372286,Ts65Dn,Saline,S/C,t-SC-s,0,-0.074807


# モデルプロット

In [10]:
# t-SNE
plot_model(iforest_model)

In [11]:
# UMAP
plot_model(iforest_model, plot='umap')

# 推論

In [12]:
predictions = predict_model(iforest_model, data=test_data)
predictions.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Label,Score
0,3517_7,0.361782,0.565987,0.376145,2.774771,4.45025,0.262906,0.238968,6.041007,0.229808,...,0.208611,,0.27082,0.782176,Ts65Dn,Saline,S/C,t-SC-s,1,0.020794
1,3517_8,0.329361,0.503216,0.326145,2.487893,3.925842,0.231177,0.195611,5.337306,0.210556,...,0.211478,,0.255582,0.882451,Ts65Dn,Saline,S/C,t-SC-s,0,-0.054447
2,3517_9,0.328314,0.500465,0.363265,2.40937,3.796988,0.22625,0.187024,5.330917,0.203198,...,0.201974,,0.241815,0.881029,Ts65Dn,Saline,S/C,t-SC-s,0,-0.076923
3,3517_10,0.37749,0.528331,0.37956,2.689263,3.978008,0.266753,0.229495,5.441138,0.227684,...,0.26262,,0.26262,0.781328,Ts65Dn,Saline,S/C,t-SC-s,1,0.04039
4,3517_11,0.367813,0.478322,0.346371,2.356975,3.561499,0.252121,0.226437,4.86852,0.204288,...,0.26467,,0.247165,0.783654,Ts65Dn,Saline,S/C,t-SC-s,0,-0.045889


# モデルの保存/読み込み

In [13]:
# モデル保存
save_model(iforest_model, 'iForestAnomalyDetectionModel')

Transformation Pipeline and Model Succesfully Saved


In [14]:
# モデル読み込み
load_iforest_model = load_model('iForestAnomalyDetectionModel')

Transformation Pipeline and Model Sucessfully Loaded


In [15]:
# ロードしたモデルでの推論
predictions = predict_model(load_iforest_model, data=test_data)
predictions.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Label,Score
0,3517_7,0.361782,0.565987,0.376145,2.774771,4.45025,0.262906,0.238968,6.041007,0.229808,...,0.208611,,0.27082,0.782176,Ts65Dn,Saline,S/C,t-SC-s,1,0.020794
1,3517_8,0.329361,0.503216,0.326145,2.487893,3.925842,0.231177,0.195611,5.337306,0.210556,...,0.211478,,0.255582,0.882451,Ts65Dn,Saline,S/C,t-SC-s,0,-0.054447
2,3517_9,0.328314,0.500465,0.363265,2.40937,3.796988,0.22625,0.187024,5.330917,0.203198,...,0.201974,,0.241815,0.881029,Ts65Dn,Saline,S/C,t-SC-s,0,-0.076923
3,3517_10,0.37749,0.528331,0.37956,2.689263,3.978008,0.266753,0.229495,5.441138,0.227684,...,0.26262,,0.26262,0.781328,Ts65Dn,Saline,S/C,t-SC-s,1,0.04039
4,3517_11,0.367813,0.478322,0.346371,2.356975,3.561499,0.252121,0.226437,4.86852,0.204288,...,0.26467,,0.247165,0.783654,Ts65Dn,Saline,S/C,t-SC-s,0,-0.045889


# 実験全体の保存/読み込み
すべての出力とモデルを含む実験全体を保存可能

In [16]:
# 保存
save_experiment('AnomalyDetectionExperiment')

Experiment Succesfully Saved


In [17]:
# 読み込み
experiment = load_experiment('AnomalyDetectionExperiment')

Unnamed: 0,Object
0,Anomaly Setup Config
1,Orignal Dataset
2,Transformed Dataset
3,Transformation Pipeline
4,Isolation Forest
5,Assigned Isolation Forest


In [18]:
# 任意の実行結果を取り出し
experiment_loaded_model = experiment[4]  # Isolation Forest

In [19]:
predictions = predict_model(experiment_loaded_model, data=test_data)
predictions.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Label,Score
0,3517_7,0.361782,0.565987,0.376145,2.774771,4.45025,0.262906,0.238968,6.041007,0.229808,...,0.208611,,0.27082,0.782176,Ts65Dn,Saline,S/C,t-SC-s,1,0.020794
1,3517_8,0.329361,0.503216,0.326145,2.487893,3.925842,0.231177,0.195611,5.337306,0.210556,...,0.211478,,0.255582,0.882451,Ts65Dn,Saline,S/C,t-SC-s,0,-0.054447
2,3517_9,0.328314,0.500465,0.363265,2.40937,3.796988,0.22625,0.187024,5.330917,0.203198,...,0.201974,,0.241815,0.881029,Ts65Dn,Saline,S/C,t-SC-s,0,-0.076923
3,3517_10,0.37749,0.528331,0.37956,2.689263,3.978008,0.266753,0.229495,5.441138,0.227684,...,0.26262,,0.26262,0.781328,Ts65Dn,Saline,S/C,t-SC-s,1,0.04039
4,3517_11,0.367813,0.478322,0.346371,2.356975,3.561499,0.252121,0.226437,4.86852,0.204288,...,0.26467,,0.247165,0.783654,Ts65Dn,Saline,S/C,t-SC-s,0,-0.045889


In [20]:
predictions = predict_model(iforest_model, data=test_data)
predictions.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Label,Score
0,3517_7,0.361782,0.565987,0.376145,2.774771,4.45025,0.262906,0.238968,6.041007,0.229808,...,0.208611,,0.27082,0.782176,Ts65Dn,Saline,S/C,t-SC-s,1,0.020794
1,3517_8,0.329361,0.503216,0.326145,2.487893,3.925842,0.231177,0.195611,5.337306,0.210556,...,0.211478,,0.255582,0.882451,Ts65Dn,Saline,S/C,t-SC-s,0,-0.054447
2,3517_9,0.328314,0.500465,0.363265,2.40937,3.796988,0.22625,0.187024,5.330917,0.203198,...,0.201974,,0.241815,0.881029,Ts65Dn,Saline,S/C,t-SC-s,0,-0.076923
3,3517_10,0.37749,0.528331,0.37956,2.689263,3.978008,0.266753,0.229495,5.441138,0.227684,...,0.26262,,0.26262,0.781328,Ts65Dn,Saline,S/C,t-SC-s,1,0.04039
4,3517_11,0.367813,0.478322,0.346371,2.356975,3.561499,0.252121,0.226437,4.86852,0.204288,...,0.26467,,0.247165,0.783654,Ts65Dn,Saline,S/C,t-SC-s,0,-0.045889
