프로젝트의 초기 단계에서는 PyCaret을 사용하여 빠르게 모델을 구축하고 최적의 모델을 선택한 다음, Optuna를 사용하여 선택한 모델의 하이퍼파라미터를 미세 조정하고 최적화한다.

In [107]:
import pandas as pd
import numpy as np
import datamol as dm

from rdkit.Chem import SaltRemover
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans.concat import FeatConcat
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from pycaret.regression import *

In [112]:
# Precomputed descriptor columns of the input CSVs; these are imputed and
# appended to the fingerprint features.
ETC_COLUMNS = [
    "AlogP",
    "Molecular_Weight",
    "Num_H_Acceptors",
    "Num_H_Donors",
    "Num_RotatableBonds",
    "LogD",
    "Molecular_PolarSurfaceArea",
]

# Fingerprint kinds passed one by one to FPVecTransformer; the resulting
# vectors are concatenated per molecule.
AVAILABLE_FPS = [
    "maccs",
    "avalon",
    "ecfp",
    "fcfp",
    "topological",
    "atompair",
    "rdkit",
    "pattern",
    "layered",
    "secfp",
    "erg",
    "estate",
    "avalon-count",
    "rdkit-count",
    "ecfp-count",
    "fcfp-count",
    "topological-count",
    "atompair-count",
    "cats2D",
    "pharm2D",
    "scaffoldkeys",
    "skeys",
]

# Version tag embedded in the saved model and submission file names.
MODEL = "v1_0"
# Seed shared by the imputer and the PyCaret session.
SEED = 42

# Silence RDKit log output through datamol.
dm.disable_rdkit_log()

In [109]:
def preprocess_mol(row):
    """Add a ``Standard_Smiles`` entry to ``row`` derived from its ``SMILES``.

    The SMILES string is parsed with datamol and then passed, in order,
    through ``fix_mol``, ``sanitize_mol`` and ``standardize_mol`` before being
    written back out as a SMILES string.

    Args:
        row: Record exposing a ``"SMILES"`` entry (a DataFrame row when used
            with ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object with ``"Standard_Smiles"`` set.
    """
    standardize_options = {
        "disconnect_metals": False,
        "normalize": True,
        "reionize": True,
        "uncharge": False,
        "stereo": True,
    }

    parsed = dm.to_mol(row["SMILES"], ordered=True)
    repaired = dm.fix_mol(parsed)
    sanitized = dm.sanitize_mol(repaired, sanifix=True, charge_neutral=False)
    standardized = dm.standardize_mol(sanitized, **standardize_options)

    # NOTE: salt stripping via
    # SaltRemover.SaltRemover().StripMol(mol, dontRemoveEverything=True)
    # is currently disabled.
    row["Standard_Smiles"] = dm.to_smiles(standardized)
    return row

def fill_na(df, imputer=None):
    """Impute the ``ETC_COLUMNS`` of ``df`` and return the filled frame.

    The input frame is left untouched: imputation is performed on a copy, so
    callers can keep using their original data (including its missing values)
    after this call.

    Args:
        df: DataFrame containing every column listed in ``ETC_COLUMNS``.
        imputer: An already fitted imputer exposing ``transform``. When
            ``None``, a new ``IterativeImputer`` backed by a
            ``RandomForestRegressor`` is created and fitted on ``df``.

    Returns:
        ``(filled_df, imputer)`` when ``imputer`` was ``None`` (fit mode),
        otherwise just ``filled_df`` (transform mode).
    """
    # Work on a copy so the caller's DataFrame is not modified in place.
    filled = df.copy()
    values = filled[ETC_COLUMNS].to_numpy()

    if imputer is None:
        # Fit mode: hand the fitted imputer back for reuse on other data.
        imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1), random_state=SEED)
        filled[ETC_COLUMNS] = imputer.fit_transform(values)
        return filled, imputer

    # Transform mode: reuse the imputer fitted earlier.
    filled[ETC_COLUMNS] = imputer.transform(values)
    return filled

def extract_features(df):
    """Build the model feature table for ``df``.

    Each row is first run through ``preprocess_mol`` to obtain a
    ``Standard_Smiles`` value. One ``FPVecTransformer`` per entry of
    ``AVAILABLE_FPS`` is combined with ``FeatConcat`` and applied to those
    SMILES; the ``ETC_COLUMNS`` values are then appended column-wise.

    Args:
        df: DataFrame with a ``"SMILES"`` column and all ``ETC_COLUMNS``.

    Returns:
        A new DataFrame with default integer column labels: featurizer output
        first, followed by the ``ETC_COLUMNS`` values.
    """
    standardized = df.apply(preprocess_mol, axis=1)

    transformers = [
        FPVecTransformer(kind, dtype=np.float64, n_jobs=-1)
        for kind in AVAILABLE_FPS
    ]
    featurizer = FeatConcat(transformers, dtype=np.float64)

    # assumes the featurizer returns a 2-D array with one row per SMILES
    fingerprint_block = featurizer(standardized["Standard_Smiles"].to_list())
    descriptor_block = standardized[ETC_COLUMNS].to_numpy()

    feature_matrix = np.concatenate([fingerprint_block, descriptor_block], axis=1)
    return pd.DataFrame(feature_matrix)

In [110]:
# Load the training data without its "id" column.
df_train = pd.read_csv("./data/train.csv")
df_train = df_train.drop(columns=["id"])

# Rows sharing a SMILES get the per-group maximum of each target, so that
# otherwise identical rows collapse in the de-duplication step below.
df_train[["MLM", "HLM"]] = df_train.groupby(by=["SMILES"])[["MLM", "HLM"]].transform("max")

# Remove rows that are identical across all columns and renumber from 0.
df_train = df_train.drop_duplicates(ignore_index=True)

In [111]:
# Impute the descriptor columns (keeping the fitted imputer for the test set),
# then turn the result into the numeric feature table.
df, imputer = fill_na(df_train)
df = extract_features(df)

# Attach both targets to the feature table; assignment aligns on the row index.
df["MLM"] = df_train["MLM"]
df["HLM"] = df_train["HLM"]

In [118]:
def train(df, target="MLM"):
    """Fit, tune, blend and save a PyCaret regression ensemble for one target.

    Args:
        df: Feature table that also contains the ``"MLM"`` and ``"HLM"``
            columns.
        target: Name of the column to model; the other target column is
            excluded from the training data.

    Returns:
        None. The blended model is written to
        ``./models/blender_{target}_{MODEL}`` via ``save_model``.
    """
    # Keep only the feature columns, then re-attach the target being modelled.
    training_frame = df.drop(columns=["MLM", "HLM"]).copy()
    training_frame[target] = df[target]

    setup(
        data=training_frame,
        target=target,
        train_size=0.8,
        session_id=SEED,
        transformation=False,
        normalize=False,
        use_gpu=False,
    )

    # Plain model comparison: keep the two candidates ranked best by RMSE.
    best_candidates = compare_models(
        sort="RMSE",
        include=["rf", "gbr", "lightgbm", "xgboost", "catboost"],
        n_select=2,
    )

    # Hyperparameter tuning of each selected candidate.
    tuned_models = []
    for candidate in best_candidates:
        tuned_models.append(tune_model(candidate, n_iter=5, optimize="RMSE"))

    # Ensemble of the tuned models.
    blender = blend_models(tuned_models, optimize="RMSE")
    save_model(blender, f"./models/blender_{target}_{MODEL}")

In [119]:
# Train and save one blended model per target, MLM first and then HLM.
for target_name in ("MLM", "HLM"):
    train(df, target=target_name)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,MLM
2,Target type,Regression
3,Original data shape,"(3471, 32842)"
4,Transformed data shape,"(3471, 32842)"
5,Transformed train set shape,"(2776, 32842)"
6,Transformed test set shape,"(695, 32842)"
7,Numeric features,32841
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,25.145,940.4745,30.6556,0.2683,1.5677,52.8839,161.369
rf,Random Forest Regressor,25.7463,948.5672,30.7909,0.2622,1.5993,53.4553,180.587
lightgbm,Light Gradient Boosting Machine,25.3519,971.2681,31.1595,0.2443,1.5687,50.241,4.37
gbr,Gradient Boosting Regressor,26.1612,973.8892,31.2005,0.2426,1.6219,58.1947,24.992
xgboost,Extreme Gradient Boosting,26.1235,1048.9545,32.3756,0.184,1.587,51.5317,139.161


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.2584,1004.2009,31.6891,0.2517,1.5699,59.889
1,27.9019,1049.523,32.3963,0.2356,1.6243,42.3664
2,27.0392,935.8171,30.5911,0.2731,1.6727,83.5475
3,26.9837,976.6244,31.251,0.264,1.6273,59.6118
4,27.1211,1021.9009,31.9672,0.1955,1.7842,56.7852
5,25.9095,942.5296,30.7006,0.2403,1.6372,52.5061
6,27.2297,1022.6779,31.9793,0.2277,1.6751,64.8183
7,27.0325,979.8425,31.3024,0.216,1.6953,48.7407
8,25.8838,953.8772,30.8849,0.2192,1.5627,55.2899
9,26.8862,976.4979,31.249,0.2044,1.7263,112.6107


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,30.6767,1184.6147,34.4182,0.1173,1.6888,77.1327
1,31.4336,1240.6305,35.2226,0.0964,1.735,51.0354
2,30.6372,1120.4055,33.4725,0.1298,1.7896,111.8127
3,31.1115,1178.4234,34.3282,0.1119,1.7439,61.5611
4,30.352,1147.8337,33.8797,0.0964,1.8894,69.1033
5,29.4975,1103.284,33.2157,0.1107,1.7526,61.7427
6,30.8143,1187.0467,34.4535,0.1035,1.7896,87.4609
7,30.6033,1147.0353,33.8679,0.0823,1.8163,61.0756
8,29.1061,1076.9467,32.8169,0.1185,1.6453,68.3467
9,29.9274,1108.9631,33.3011,0.0965,1.8114,116.7346


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,24.9807,926.9031,30.4451,0.3093,1.4505,40.8768
1,26.3386,1005.3734,31.7076,0.2678,1.5512,36.1268
2,25.0288,860.1437,29.3282,0.3319,1.6,61.8111
3,25.056,892.9238,29.8818,0.3271,1.5487,49.4093
4,25.7078,994.7505,31.5397,0.2169,1.7198,50.6245
5,24.3746,875.0207,29.5807,0.2947,1.5775,43.8632
6,25.5689,948.2303,30.7933,0.2839,1.5965,54.6193
7,25.5918,912.0729,30.2005,0.2703,1.6251,41.2451
8,24.5678,908.3688,30.1392,0.2565,1.5009,50.9305
9,25.1391,906.733,30.112,0.2613,1.635,101.547


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,42
1,Target,HLM
2,Target type,Regression
3,Original data shape,"(3471, 32842)"
4,Transformed data shape,"(3471, 32842)"
5,Transformed train set shape,"(2776, 32842)"
6,Transformed test set shape,"(695, 32842)"
7,Numeric features,32841
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,26.8617,995.7915,31.5394,0.2381,1.3138,35.7878,164.749
catboost,CatBoost Regressor,26.5277,999.526,31.596,0.2348,1.2918,33.6067,166.685
gbr,Gradient Boosting Regressor,27.0093,1006.6907,31.7044,0.2301,1.3225,35.6426,27.67
lightgbm,Light Gradient Boosting Machine,26.4219,1007.2418,31.7159,0.2291,1.2898,33.1395,4.89
xgboost,Extreme Gradient Boosting,27.2905,1116.9079,33.401,0.1451,1.2942,33.1167,152.697


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,31.62,1213.9272,34.8415,0.1193,1.4398,30.1368
1,31.808,1259.6691,35.4918,0.0705,1.4899,35.6244
2,31.1467,1214.8857,34.8552,0.1214,1.4836,87.1613
3,30.9896,1191.507,34.5182,0.094,1.3975,24.2123
4,30.6479,1206.309,34.732,0.0823,1.5246,70.0114
5,28.7061,1061.6791,32.5834,0.1179,1.3495,18.5665
6,31.2854,1227.6552,35.0379,0.0776,1.4937,48.1926
7,30.9091,1205.3121,34.7176,0.071,1.4306,23.7483
8,28.6768,1070.0991,32.7124,0.1137,1.2044,42.3456
9,29.8671,1155.1432,33.9874,0.1019,1.3941,51.8759


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.8751,992.8606,31.5097,0.2797,1.3533,22.0419
1,28.0826,1087.0464,32.9704,0.1978,1.4175,31.4809
2,28.1132,1034.7658,32.1678,0.2517,1.4222,78.4036
3,28.3493,1070.9164,32.7249,0.1857,1.3424,23.2559
4,27.7034,1071.1402,32.7283,0.1851,1.4596,63.0803
5,26.0886,914.8497,30.2465,0.2399,1.2741,15.4994
6,28.1876,1032.5446,32.1332,0.2242,1.413,35.9237
7,28.1553,1063.2811,32.608,0.1805,1.3416,16.6446
8,26.093,925.0425,30.4145,0.2339,1.1428,41.1023
9,27.08,1015.1935,31.8621,0.2107,1.3346,47.6955


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.2423,932.0263,30.5291,0.3238,1.2861,17.3589
1,27.4794,1073.4326,32.7633,0.2079,1.3794,28.336
2,26.7833,980.3481,31.3105,0.291,1.3724,71.7582
3,27.3029,1041.6402,32.2745,0.208,1.2942,20.7132
4,27.1025,1034.3338,32.1611,0.2131,1.416,60.0816
5,25.2556,868.6978,29.4737,0.2783,1.2213,13.8518
6,26.288,936.7068,30.6057,0.2962,1.363,34.3222
7,27.0892,1032.4283,32.1314,0.2042,1.2824,13.787
8,25.2688,895.7239,29.9286,0.2582,1.115,41.2224
9,26.296,963.6025,31.0419,0.2508,1.2829,45.4207


Transformation Pipeline and Model Successfully Saved


In [120]:
def predict(df):
    """Run both saved blended models on ``df``.

    Args:
        df: Feature table to score.

    Returns:
        ``(pred_MLM, pred_HLM)``: the outputs of ``predict_model`` for the MLM
        and HLM models, in that order.
    """
    model_paths = (
        f"./models/blender_MLM_{MODEL}",
        f"./models/blender_HLM_{MODEL}",
    )

    # Load both models before predicting with either of them.
    loaded_models = [load_model(path) for path in model_paths]
    mlm_output, hlm_output = [predict_model(model, df) for model in loaded_models]

    return mlm_output, hlm_output

In [121]:
# Load the test data without its "id" column.
df_test = pd.read_csv("./data/test.csv")
df_test = df_test.drop(columns=["id"])

# Reuse the imputer fitted on the training data, then featurize.
df = extract_features(fill_na(df_test, imputer))

pred_MLM, pred_HLM = predict(df)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [124]:
# Display the HLM prediction frame as the notebook cell output.
pred_HLM

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32832,32833,32834,32835,32836,32837,32838,32839,32840,prediction_label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.641,361.505005,4.0,2.0,7.0,2.635,92.760002,52.112990
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.585,370.398987,5.0,0.0,3.0,0.585,68.309998,82.081763
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.276,347.414001,4.0,4.0,5.0,4.290,92.860001,55.106593
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.795,345.358002,5.0,0.0,2.0,1.795,81.209999,66.607175
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.219,353.417999,4.0,0.0,2.0,0.169,61.150002,74.315488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.207,306.442993,2.0,1.0,7.0,4.207,55.130001,34.314965
479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.608,335.398010,5.0,0.0,1.0,-1.736,70.160004,87.786468
480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.792,349.382996,3.0,1.0,3.0,1.792,69.720001,65.751606
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.790,341.131989,3.0,2.0,2.0,0.423,69.639999,68.877401


In [125]:
# Fill the sample submission with the predicted labels (aligned on row index)
# and write it out under the current model version tag.
df_submission = pd.read_csv("./data/sample_submission.csv").assign(
    MLM=pred_MLM["prediction_label"],
    HLM=pred_HLM["prediction_label"],
)
df_submission.to_csv(f"./submissions/{MODEL}.csv", index=False)