# <font color="white"> 解析OJT用ツール(Jupyter Notebook)<font>

In [2]:
%%capture installer_log
!apt-get update && apt-get install -y build-essential
!python -m pip install shap
!pip install pymc3
!pip install tqdm
!pip install pandas-profiling
!pip install msoffcrypto-tool
!pip install japanize-matplotlib
!pip install beautifulsoup4
!pip install ipywidgets jupyterlab matplotlib
!pip install pubchempy
%matplotlib inline

In [3]:
# rdkit can't install by pip, so we use the following command to install.
! conda install -y -c rdkit rdkit # It takes just a few minites.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Retrieving notices: ...working... done


In [4]:
import os
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tybmilib import prep
from tybmilib import vis
from tybmilib import datamgmt
from tybmilib import modeling
from tybmilib import paramsearch
from tybmilib import chemembeding
from tybmilib import sampling
from tybmilib import myfilename as mfn
import pubchempy as pcp
from rdkit import rdBase, Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
import warnings
warnings.filterwarnings('ignore')
mfn.prepare_folder()

## 初期パラメーター設定（★要変更）

In [5]:
# 入力データは、このnotebookファイルと同じ階層に配置してください
input_filename = "paint_polymer_ac_mask.csv"

# 各ユーザーは、自身の情報を入力してください
department_name = "deloitte"
user_name = "yuhei_tachi"
experiment_ID = "00000"

# 分子構造組み込み利用者は
cas_filename = "cas_master.csv"

In [6]:
# S3の保存先
bucket_name = mfn.get_user_s3_bucket(department_name)
prefix_name = mfn.get_user_s3_prefix(user_name, experiment_ID)
datamgmt.create_user_bucket(bucket_name)

# 入力データのS3URL
s3uri_original_data = datamgmt.upload_file(bucket_name, input_filename, prefix_name)

# CASコードのマスターデータのS3URL
s3uri_master_data = datamgmt.upload_file(bucket_name, cas_filename, prefix_name)

An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


# <font color="white"> 1. データ投入・確認<font>

#### <font color="white"> データ読み込み<font>

In [7]:
# データ読み込み
# 実験データのExcelファイルにパスワードが設定されている場合、データ読み込み時に入力が必要となります。
df_s3 = prep.read_s3_bucket_data(s3uri_original_data,experiment_ID)

読み込みデータのカラム名: ID,A1,A2,A3,A4,A5,A6,A7,A8,G1,G2,G3,G4,G5,G6,G7,G8,G9,C1,C2,RV,Anneal_Temp,Anneal_Time,Tg,Tm,Jg


## パラメーター設定（★要変更）

In [8]:
# 本ステップで必要な設定項目
# 目的変数
objectives = ["Tg","Tm","Jg"]
# 入力データに含まれる不要カラムの指定
drop_cols = ["ID","RV","Anneal_Temp","Anneal_Time"]

#分子構造表現方法（'maccs': MACCS key, 'mfp': Morgan Finger Print）を選択、行わない場合は空欄（''）
structure_mode = ""
#'mfp'を選んだ場合は入力、そうでなければ適当な値を入力
radius = 3
bit_num = 4096
# データ可視化方法
# profiles = pandas-profiles / pairplot = N×N散布図行列 / pairplot_1by1 = 1×N散布図行列 / correlation_matrix = 相関変数行列
show_plot_method=['pairplot','correlation_matrix']

## 処理実行

#### <font color="white"> 不要列削除 & 可視化<font>

In [9]:
# 不要列の削除
# 実験データ内で不要カラムがある場合には、drop_colsのリスト内にカラム名を追加し削除して下さい。
df_reg = prep.drop_cols(df_s3,drop_cols,experiment_ID)

# データ可視化
vis.show_plot(df_reg, bucket_name, prefix_name, experiment_ID, show_plot_method)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

#### 分子構造組込の準備と、分子構造の可視化

##### <font color="red"> 分子構造組込を実行しない方も、以下のセルを実行してください。

In [10]:
#CASコードのSMILE表記取得
chem = chemembeding.Features(s3uri_master_data,df_reg.columns, experiment_ID,structure_mode,radius,bit_num)
mol_list, true_name_list = chem.get_smiles()
#Morgan Fingerprint指定時に、分子構造の特徴を一覧で可視化
chem.preview_chemical()

読み込みデータのカラム名: Source_Name,CAS


#### <font color="white"> 学習データ作成<font>

In [11]:
# 学習データの作成
df_chem = chem.generate_fingerprint_dataset(df_reg, objectives)
# サンプリング用のファイルを生成
prep.create_sampling_prepare(df_reg,experiment_ID,objectives=objectives)
# 目的変数分の学習データを作成し、S3のSageMaker用ディレクトリに格納
traindata_path_list = prep.create_multi_traindata(df_chem,experiment_ID,objectives=objectives)
s3_uri_list = datamgmt.upload_file_list(bucket_name, traindata_path_list, prefix_name)

# 次のステップで使用する情報を提示
role = sagemaker.get_execution_role()
prep.present_info(objectives,s3_uri_list,traindata_path_list,role,bucket_name,user_name,experiment_ID)

目的変数: Tg
学習データ: train_Tg.csv
目的変数: Tm
学習データ: train_Tm.csv
目的変数: Jg
学習データ: train_Jg.csv
説明変数のみデータ: train(only_x).csv
目的変数のみデータ: train(only_y).csv
------------------------------
objectives = ['Tg', 'Tm', 'Jg']
s3_uri_list = ['s3://mi-deloitte/yuhei_tachi/00000/train_Tg.csv', 's3://mi-deloitte/yuhei_tachi/00000/train_Tm.csv', 's3://mi-deloitte/yuhei_tachi/00000/train_Jg.csv']
traindata_path_list = ['/root/poc/doe/test_rv1_new_tool/data/train_Tg.csv', '/root/poc/doe/test_rv1_new_tool/data/train_Tm.csv', '/root/poc/doe/test_rv1_new_tool/data/train_Jg.csv']
role = 'arn:aws:iam::375869297825:role/AmazonSageMaker-ExecutionRole-20211228T101010'
bucket_name = 'mi-deloitte'
user_name = 'yuhei_tachi'
experiment_ID = '00000'


---

# <font color="white"> 2. モデル構築・評価<font>

## パラメーター設定（★要変更）

In [12]:
# 本ステップで必要な設定項目
# 機械学習モデルの指定
#'Regression|BinaryClassification|MulticlassClassification'
problemtype = 'Regression'

# モデル評価指標の設定
# Regressionの場合:'MSE' / BinaryClassificationの場合:'F1' / MulticlassClassificationの場合:'F1macro'を指定
metrics = 'MSE'

## 処理実行

#### <font color="white"> モデル学習<font>

In [13]:
mlmodel = modeling.SagemakerCtrl(bucket_name,role,experiment_ID,user_name,problemtype,metrics)
model_list = mlmodel.fit_multi_model(objectives, s3_uri_list)

目的変数：Tg
An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


  0%|          | 0/5 [00:00<?, ?it/s]

Metric: validation:mse
Value: 0.09029000252485275
モデル名: ml-yuhei-tachi-0-00000
------------------------------
目的変数：Tm
An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


  0%|          | 0/5 [00:00<?, ?it/s]

Metric: validation:mse
Value: 0.014030000194907188
モデル名: ml-yuhei-tachi-1-00000
------------------------------
目的変数：Jg
An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


  0%|          | 0/5 [00:00<?, ?it/s]

Metric: validation:mse
Value: 0.024150000885128975
モデル名: ml-yuhei-tachi-2-00000
------------------------------


#### <font color="white"> モデル評価<font>

In [14]:
# テストデータでの評価値
mlmodel.estimate_multi_testdata(objectives,model_list,traindata_path_list)
# 線形モデルでの係数
mlmodel.estimate_multi_coefficients(objectives,traindata_path_list)
# Shap値出力
mlmodel.analyze_multi_model(objectives,s3_uri_list,traindata_path_list,model_list)

# 次のステップで使用する情報を提示
mlmodel.present_info(objectives, model_list)

  0%|          | 0/3 [00:00<?, ?it/s]

=====目的変数:Tg デプロイモデルの性能評価=====
決定係数R2: 1.00 MAE: 0.09 MSE: 0.08 RMSE: 0.28
=====目的変数:Tm デプロイモデルの性能評価=====
決定係数R2: 1.00 MAE: 0.02 MSE: 0.01 RMSE: 0.11
=====目的変数:Jg デプロイモデルの性能評価=====
決定係数R2: 1.00 MAE: 0.03 MSE: 0.02 RMSE: 0.15
テストデータとの比較結果: test_Tg.png
テストデータとの比較結果: test_Tm.png
テストデータとの比較結果: test_Jg.png
====(参考)目的変数:Tg 線形モデルの性能評価====
決定係数R2:  0.92 MAE:  5.57 MSE:  84.86 RMSE:  9.21
coef値グラフ(全変数/重要変数): visulize_linear_(coef_all/coef_importance)_Tg.png
====(参考)目的変数:Tm 線形モデルの性能評価====
決定係数R2:  0.67 MAE:  9.04 MSE:  160.18 RMSE:  12.66
coef値グラフ(全変数/重要変数): visulize_linear_(coef_all/coef_importance)_Tm.png
====(参考)目的変数:Jg 線形モデルの性能評価====
決定係数R2:  0.60 MAE:  3.66 MSE:  22.54 RMSE:  4.75
coef値グラフ(全変数/重要変数): visulize_linear_(coef_all/coef_importance)_Jg.png
------------------------------


  0%|          | 0/3 [00:00<?, ?it/s]

=====目的変数:Tg Shap値レポート=====
/root/poc/doe/test_rv1_new_tool/output/report_Tg.html
=====目的変数:Tm Shap値レポート=====
/root/poc/doe/test_rv1_new_tool/output/report_Tm.html
=====目的変数:Jg Shap値レポート=====
/root/poc/doe/test_rv1_new_tool/output/report_Jg.html
------------------------------
objectives = ['Tg', 'Tm', 'Jg']
model_list = ['ml-yuhei-tachi-0-00000', 'ml-yuhei-tachi-1-00000', 'ml-yuhei-tachi-2-00000']
problemtype = 'Regression'
user_name = 'yuhei_tachi'
experiment_ID = '00000'


---

# <font color="white"> 3. テスト用サンプル作成/読み込み（ここから実験計画のPoC独自のコード）<font>

## パラメーター設定（★要変更）

#### <font color="white"> 途中実行する場合での引継ぎ情報<font>

In [None]:
#=========【途中再起動した場合、別セルに貼り付け、実行】以下の情報は、次セクションでも利用します。=========
model_list = ["ml-yuhei-tachi-0-00000", "ml-yuhei-tachi-1-00000", "ml-yuhei-tachi-2-00000"]

In [None]:
# テスト用のターゲット物性を入力
doe_test_target = 'Tg'
doe_test_target_value = 40

## 処理実行

#### <font color="white"> 実験サンプル生成（★要変更）<font>

In [15]:
# テストに使うターゲット物性を入力
doe_test_target = 'Tg'
# ターゲット物性値を入力（現状は、テストなのである数値以上のみ想定）
doe_test_target_value = 40
# 制約条件として、100にするグループ（現状は、テストなので重合で酸とグリコールの場合のみ想定）
acid_list = ['A1','A2','A3','A4','A5','A6','A7']
glycol_list = ['G1','G2','G3','G4','G5','G6','G7','G8','G9']
# 実験サンプルの出力ファイル名を入力
samples_file_name = 'samples_noise.csv'

In [16]:
# ノイズを加える前のデータを取得
samples = df_reg[df_reg[doe_test_target] >= doe_test_target_value].reset_index().drop(['index'] + objectives, axis=1)
samples.head(10)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,G1,G2,G3,G4,G5,G6,G7,G8,G9,C1,C2
0,66.0,34.0,0,0,0.0,0.0,0,0,14,0,0,0,0,55.0,0,31,0.0,0.0,0
1,83.0,17.0,0,0,0.0,0.0,0,0,7,0,0,0,0,64.0,0,29,0.0,0.0,0
2,86.0,14.0,0,0,0.0,0.0,0,0,0,0,0,0,0,64.0,0,36,0.0,0.0,0
3,86.0,14.0,0,0,0.0,0.0,0,0,10,0,0,0,0,59.0,0,32,0.0,0.0,0
4,81.0,19.0,0,0,0.0,0.0,0,0,5,0,0,0,0,65.0,0,30,0.0,0.0,0
5,81.0,19.0,0,0,0.0,0.0,0,0,0,0,0,0,0,67.0,7,26,0.0,0.0,0
6,81.0,14.0,5,0,0.0,0.0,0,0,12,0,0,0,0,59.0,0,29,0.0,0.0,0
7,80.0,19.0,0,0,0.0,0.0,0,1,7,0,0,0,0,62.0,0,31,0.0,0.0,0
8,81.0,19.0,0,0,0.0,0.0,0,0,15,0,0,0,0,54.0,0,31,0.0,0.0,0
9,71.0,19.0,10,0,0.0,0.0,0,0,4,0,0,0,0,67.0,0,29,0.0,0.0,0


In [17]:
# ノイズを加える
for i,column in enumerate(samples.columns):
    mu, sigma = 0, 0.05
    np.random.seed(i)
    noise = np.abs(np.random.normal(mu, sigma, (len(samples), 1))) * df_reg[column].mean()
    index = list(samples[samples[column] != 0.0].index)
    samples[column][index] = samples[column][index] + pd.DataFrame(noise)[0][index]

In [18]:
import random
random.seed(999)

for i,row in samples.iterrows():
    over_value = row[].sum() - 100
    row_booleans = list(row[acid_list] > 0)
    random_num = random.choice([i for i, x in enumerate(row_booleans) if x == True])
    row[acid_list].iloc[random_num] = row[acid_list][random_num] - over_value
    
    over_value = row[].sum() - 100
    row_booleans = list(row[glycol_list] > 0)
    random_num = random.choice([i for i, x in enumerate(row_booleans) if x == True])
    row[glycol_list].iloc[random_num] = row[glycol_list][random_num] - over_value
    
    samples.iloc[i] = row
samples.to_csv(samples_file_name)
samples.head(10)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,G1,G2,G3,G4,G5,G6,G7,G8,G9,C1,C2
0,72.277571,35.271327,0.0,0.0,0.0,0.0,0.0,0.0,14.031559,0.0,0.0,0.0,0.0,57.338503,0.0,31.237618,0.0,0.0,0.0
1,84.424003,17.478804,0.0,0.0,0.0,0.0,0.0,0.0,7.377609,0.0,0.0,0.0,0.0,66.474324,0.0,29.258126,0.0,0.0,0.0
2,89.482945,14.413384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.146086,0.0,36.118615,0.0,0.0,0.0
3,93.97446,14.839781,0.0,0.0,0.0,0.0,0.0,0.0,10.479708,0.0,0.0,0.0,0.0,60.483125,0.0,32.381759,0.0,0.0,0.0
4,87.645907,19.677329,0.0,0.0,0.0,0.0,0.0,0.0,5.794638,0.0,0.0,0.0,0.0,69.415449,0.0,30.17922,0.0,0.0,0.0
5,84.477749,20.801346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.74746,7.003206,27.341743,0.0,0.0,0.0
6,84.380992,15.365613,5.084576,0.0,0.0,0.0,0.0,0.0,12.597871,0.0,0.0,0.0,0.0,63.432145,0.0,29.833727,0.0,0.0,0.0
7,80.538621,19.595774,0.0,0.0,0.0,0.0,0.0,1.017946,7.762827,0.0,0.0,0.0,0.0,64.827024,0.0,31.827567,0.0,0.0,0.0
8,81.367315,19.249702,0.0,0.0,0.0,0.0,0.0,0.0,15.275028,0.0,0.0,0.0,0.0,58.853954,0.0,31.232172,0.0,0.0,0.0
9,72.461159,19.195175,10.152879,0.0,0.0,0.0,0.0,0.0,4.337864,0.0,0.0,0.0,0.0,70.431569,0.0,29.360426,0.0,0.0,0.0


#### モデル推論の実行

In [19]:
ifr = sampling.Inference(experiment_ID, user_name, bucket_name)
samples_inference = ifr.multi_model_inference(samples, objectives, model_list, chem)

  0%|          | 0/3 [00:00<?, ?it/s]

#### モデル推論結果の出力（★要変更）

In [20]:
# 予測結果の出力ファイル名
inference_file_name = 'samples_inference.xlsx'
samples_inference.to_excel(inference_file_name)

# <font color="white"> 4. 作成物一括削除<font>

## パラメーター設定（★要変更）

In [6]:
# 途中実行する場合、モデル評価での出力データをコピーし実行
#=========【途中再起動した場合、別セルに貼り付け、実行】以下の情報は、次セクションでも利用します。=========
department_name = "deloitte"
user_name = "ryota_matsubara"
experiment_ID = "005"
model_list = ["ml-ryota-matsubara-0-005", "ml-ryota-matsubara-1-005"]

## 処理実行

In [7]:
prep.delete_resources(department_name, user_name, experiment_ID, model_list)