In [5]:
!pip install optuna



In [1]:
import random
import os

import numpy as np
import pandas as pd
import datamol as dm

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import missingno
import joblib

from rdkit.Chem import SaltRemover
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans.concat import FeatConcat
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from pycaret.regression import *

# device 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)

seed = 42 # seed 값 설정
random.seed(seed) # 파이썬 난수 생성기
os.environ['PYTHONHASHSEED'] = str(seed) # 해시 시크릿값 고정
np.random.seed(seed) # 넘파이 난수 생성기

torch.manual_seed(seed) # 파이토치 CPU 난수 생성기
torch.backends.cudnn.deterministic = True # 확정적 연산 사용 설정
torch.backends.cudnn.benchmark = False   # 벤치마크 기능 사용 해제
torch.backends.cudnn.enabled = False        # cudnn 기능 사용 해제

if device == 'cuda':
    torch.cuda.manual_seed(seed) # 파이토치 GPU 난수 생성기
    torch.cuda.manual_seed_all(seed) # 파이토치 멀티 GPU 난수 생성기

cpu


In [27]:
ETC_COLUMNS = ["AlogP", "Molecular_Weight", "Num_H_Acceptors", "Num_H_Donors", "Num_RotatableBonds", "LogD", "Molecular_PolarSurfaceArea", 'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms','num_hydrogen_acceptors', 'num_hydrogen_donors']
AVAILABLE_FPS = ['maccs', 'avalon', 'ecfp', 'fcfp', 'topological', 'atompair', 'rdkit', 'pattern', 'layered', 'secfp', 'erg', 'estate', 'avalon-count', 'rdkit-count', 'ecfp-count', 'fcfp-count', 'topological-count', 'atompair-count',
                 'cats2D', 'pharm2D', 'scaffoldkeys', 'skeys']
MODEL = "v9_2"
SEED = 42
version = 0.2

dm.disable_rdkit_log()

In [17]:
def preprocess_mol(row):
    mol = dm.to_mol(row["SMILES"], ordered=True)
    mol = dm.fix_mol(mol)
    mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
    mol = dm.standardize_mol(
        mol,
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    #mol = SaltRemover.SaltRemover().StripMol(mol, dontRemoveEverything=True)    
    row["Standard_Smiles"] = dm.to_smiles(mol)
    
    return row

def fill_na(df, imputer=None):
    if imputer is None:
        imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1), random_state=SEED)        
        df[ETC_COLUMNS] = imputer.fit_transform(df[ETC_COLUMNS].to_numpy())
    
        return pd.DataFrame(df), imputer
    else:
        df[ETC_COLUMNS] = imputer.transform(df[ETC_COLUMNS].to_numpy())
    
        return pd.DataFrame(df)

def extract_features(df):    
    _df = df.apply(preprocess_mol, axis=1)
    
    fps = []
    for fp in AVAILABLE_FPS:
        fps.append(FPVecTransformer(fp, dtype=np.float64, n_jobs=-1))
    
    featurizer = FeatConcat(fps, dtype=np.float64)
    smiles = _df["Standard_Smiles"].to_list()
    descriptors = featurizer(smiles)
    
    etcs = _df[ETC_COLUMNS].to_numpy()
    
    return pd.DataFrame(np.concatenate([descriptors, etcs], axis=1))

In [18]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

def calculate_metabolic_stability_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    logP = Descriptors.MolLogP(mol)
    # 화합물의 친유성을 측정한 것으로 지질 또는 비극성 환경에서의 용해도를 나타냅니다. 생물학적 막을 통과하는 화합물의 능력을 반영합니다.
    apka = Descriptors.MolWt(mol)
    # 화합물의 산 해리 상수의 추정치로 다양한 pH 조건에서 이온화 거동에 대한 정보를 제공합니다.
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    # 화합물에서 회전 가능한 결합의 수입니다. 이것은 화합물의 유연성과 효소 또는 다른 분자와의 잠재적인 상호 작용에 대한 통찰력을 제공할 수 있습니다.
    num_heteroatoms = Descriptors.NumHeteroatoms(mol)
    # 분자 내 헤테로원자(탄소 및 수소 이외의 원자) 수. 이는 화합물의 반응성과 대사 안정성에 영향을 줄 수 있습니다.
    num_hydrogen_acceptors = Descriptors.NumHAcceptors(mol)
    # 분자 내 수소 결합 수용체의 수. 이들은 결합 및 반응성에 영향을 미치는 다른 분자의 수소 결합 기증자와 상호 작용할 수 있는 사이트입니다.
    num_hydrogen_donors = Descriptors.NumHDonors(mol)
    # 분자 내 수소 결합 기증자의 수입니다. 이들은 수소 결합 상호작용에서 수소 원자를 제공할 수 있는 사이트입니다.
    # morgan_fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    # morgan_fingerprint = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    # 분자 하위 구조의 이진 벡터 표현입니다. 이 열에는 화합물과 효소의 상호 작용 및 대사 안정성에 영향을 줄 수 있는 구조적 특징을 포착하는 이진 지문이 포함되어 있습니다.
    # morgan_array = np.zeros((1,), dtype=np.int8)
    # DataStructs.ConvertToNumpyArray(morgan_fingerprint, morgan_array)

    return logP, apka, num_rotatable_bonds, num_heteroatoms, num_hydrogen_acceptors, num_hydrogen_donors

In [19]:
df_train = pd.read_csv("./data/train.csv").drop(columns=["id"], axis=1)

# 중복값 제거 및 최댓값으로 설정
df_train["MLM"] = df_train.groupby(by=["SMILES"])["MLM"].transform("max")
df_train["HLM"] = df_train.groupby(by=["SMILES"])["HLM"].transform("max")
df_train = df_train.drop_duplicates().reset_index(drop=True)

df_train[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms', 'num_hydrogen_acceptors', 'num_hydrogen_donors'
]] = df_train['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

In [20]:
df, imputer = fill_na(df_train)
df = extract_features(df)

df[["MLM", "HLM"]] = df_train[["MLM", "HLM"]]

In [21]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32839,32840,32841,32842,32843,32844,32845,32846,MLM,HLM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.259,117.37,3.87744,400.504,8.0,8.0,6.0,2.0,26.010,50.680
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.172,73.47,3.35474,301.415,2.0,5.0,4.0,1.0,29.270,50.590
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.585,62.45,1.20450,297.366,3.0,7.0,7.0,0.0,5.586,80.892
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.475,92.60,3.89356,494.665,5.0,9.0,7.0,0.0,5.710,2.000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.337,42.43,2.81772,268.316,1.0,4.0,3.0,0.0,93.270,99.990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.409,64.74,2.74730,396.200,4.0,11.0,5.0,1.0,1.556,3.079
3467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.844,77.37,2.27630,359.389,3.0,7.0,5.0,1.0,35.560,47.630
3468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.124,70.14,2.04130,261.325,5.0,5.0,5.0,1.0,56.150,1.790
3469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.989,91.51,1.42720,284.699,4.0,7.0,6.0,1.0,0.030,2.770


In [23]:
# 학습용 데이터 만들기
_df = df.drop(columns=["MLM", "HLM"], axis=1).copy()
_MLM = df['MLM']
_HLM = df['HLM']

### MLM (RandomForest)

In [48]:
import optuna
from sklearn.ensemble import RandomForestRegressor  # RandomForest 임포트
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# RMSE를 사용하는 스코어 함수 정의
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# Objective 함수 정의
def objective(trial):
    param_space = {
        'n_jobs': -1,
        'n_estimators': 729,
        'max_depth': trial.suggest_int('max_depth', 20, 50),
        'min_samples_split': 4,
        'min_samples_leaf': 5,
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
    }

    # RandomForest 모델 생성
    model = RandomForestRegressor(**param_space, random_state=42)
    
    # 교차 검증 수행
    scores = cross_val_score(model, _df, _MLM, cv=5, scoring=make_scorer(custom_scorer))
    
    # 교차 검증 스코어의 평균을 사용하여 Objective 함수를 평가
    avg_rmse = np.mean(scores)
    print("Average RMSE:", avg_rmse)
    
    return avg_rmse

# Optuna 스터디 생성
study = optuna.create_study(direction='minimize')  # 최소화 문제로 설정

# 최적의 하이퍼파라미터 탐색
study.optimize(objective, n_trials=30)

# 최적의 하이퍼파라미터와 스코어 출력
best_params = study.best_params
best_score = study.best_value
print("Best Params:", best_params)
print("Best Score:", best_score)

[I 2023-09-22 22:16:36,850] A new study created in memory with name: no-name-1aaca52f-3067-4baa-9e8a-8ec2003804c0
[I 2023-09-22 23:59:10,668] Trial 0 finished with value: 15.372118243288705 and parameters: {'max_depth': 41, 'max_features': 0.8333155471619458}. Best is trial 0 with value: 15.372118243288705.


Average RMSE: 15.372118243288705


[I 2023-09-23 00:35:47,536] Trial 1 finished with value: 15.302729561190691 and parameters: {'max_depth': 40, 'max_features': 0.2879393353178585}. Best is trial 1 with value: 15.302729561190691.


Average RMSE: 15.302729561190691


[I 2023-09-23 01:22:56,837] Trial 2 finished with value: 15.311733443462135 and parameters: {'max_depth': 30, 'max_features': 0.4073590032926326}. Best is trial 1 with value: 15.302729561190691.


Average RMSE: 15.311733443462135


[I 2023-09-23 03:01:43,636] Trial 3 finished with value: 15.36607600370707 and parameters: {'max_depth': 46, 'max_features': 0.8069408392326028}. Best is trial 1 with value: 15.302729561190691.


Average RMSE: 15.36607600370707


[I 2023-09-23 04:29:09,539] Trial 4 finished with value: 15.375724140508193 and parameters: {'max_depth': 23, 'max_features': 0.8492884524711669}. Best is trial 1 with value: 15.302729561190691.


Average RMSE: 15.375724140508193


[I 2023-09-23 05:04:58,586] Trial 5 finished with value: 15.300992553570685 and parameters: {'max_depth': 35, 'max_features': 0.29333715053042586}. Best is trial 5 with value: 15.300992553570685.


Average RMSE: 15.300992553570685


[I 2023-09-23 06:45:10,048] Trial 6 finished with value: 15.372973806053452 and parameters: {'max_depth': 37, 'max_features': 0.9212340532836467}. Best is trial 5 with value: 15.300992553570685.


Average RMSE: 15.372973806053452


[I 2023-09-23 07:04:31,334] Trial 7 finished with value: 15.253878289683504 and parameters: {'max_depth': 41, 'max_features': 0.16898305935760013}. Best is trial 7 with value: 15.253878289683504.


Average RMSE: 15.253878289683504


[I 2023-09-23 07:40:59,850] Trial 8 finished with value: 15.300648051282753 and parameters: {'max_depth': 46, 'max_features': 0.3227278537137681}. Best is trial 7 with value: 15.253878289683504.


Average RMSE: 15.300648051282753


[I 2023-09-23 08:13:48,232] Trial 9 finished with value: 15.30867510825306 and parameters: {'max_depth': 24, 'max_features': 0.3335279457176186}. Best is trial 7 with value: 15.253878289683504.


Average RMSE: 15.30867510825306


[I 2023-09-23 08:30:44,373] Trial 10 finished with value: 15.27161169957078 and parameters: {'max_depth': 30, 'max_features': 0.1548928634229785}. Best is trial 7 with value: 15.253878289683504.


Average RMSE: 15.27161169957078


[I 2023-09-23 08:43:45,874] Trial 11 finished with value: 15.292689951122929 and parameters: {'max_depth': 32, 'max_features': 0.11672160213619276}. Best is trial 7 with value: 15.253878289683504.


Average RMSE: 15.292689951122929


[I 2023-09-23 08:57:20,475] Trial 12 finished with value: 15.29482433335742 and parameters: {'max_depth': 28, 'max_features': 0.11293896747685045}. Best is trial 7 with value: 15.253878289683504.


Average RMSE: 15.29482433335742


[W 2023-09-23 10:04:48,661] Trial 13 failed with parameters: {'max_depth': 49, 'max_features': 0.5283112511548668} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/7j/2czlfb0x36j_nmkftnnpyg3r0000gp/T/ipykernel_50225/1444339744.py", line 27, in objective
    scores = cross_val_score(model, _df, _MLM, cv=5, scoring=make_scorer(custom_scorer))
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 515, in cross_val_score
    cv_results = cross_validate(
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 266, in cross_validate
    results = parallel(
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__c

KeyboardInterrupt: 

In [34]:
# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
optuna.visualization.plot_param_importances(study)

In [35]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

In [36]:
# 최적의 하이퍼파라미터를 사용하여 RandomForest 모델 생성
# early-stopping 적용할지 말지 고려
best_MLM_RFmodel = RandomForestRegressor(**best_params, random_state=42)

# 데이터 분할 (훈련 세트와 검증 세트)
X_train, X_valid, y_train, y_valid = train_test_split(_df, _MLM, test_size=0.2, random_state=42)

# 모델 훈련
best_MLM_RFmodel.fit(X_train, y_train)

# 모델 저장
MLM_model_path = f"./models/MLM_RFmodel_{version:.1f}.joblib"
joblib.dump(best_MLM_RFmodel, MLM_model_path)

['./models/MLM_RFmodel_0.2.joblib']

In [37]:
# 저장된 모델 로드
MLM_loaded_model = RandomForestRegressor()
MLM_loaded_model = joblib.load(MLM_model_path)  # joblib로 모델 로드

# 로드된 모델을 사용하여 예측 수행
y_valid_pred = MLM_loaded_model.predict(X_valid)

# 검증 세트에서 RMSE 또는 다른 평가 지표 계산
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 30.04702695470916


### HLM (RandomForest)

In [51]:
import optuna
from sklearn.ensemble import RandomForestRegressor  # RandomForest 임포트
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# RMSE를 사용하는 스코어 함수 정의
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# Objective 함수 정의
def objective(trial):
    param_space = {
        'n_jobs': -1,
        'n_estimators': trial.suggest_int('n_estimators', 500, 1300),
        'max_depth': trial.suggest_int('max_depth', 20, 60),
        'min_samples_split': trial.suggest_int('min_samples_split', 1, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
    }

    # RandomForest 모델 생성
    model = RandomForestRegressor(**param_space, random_state=42)
    
    # 교차 검증 수행
    scores = cross_val_score(model, _df, _HLM, cv=5, scoring=make_scorer(custom_scorer))
    
    # 교차 검증 스코어의 평균을 사용하여 Objective 함수를 평가
    avg_rmse = np.mean(scores)
    print("Average RMSE:", avg_rmse)
    
    return avg_rmse

# Optuna 스터디 생성
study = optuna.create_study(direction='minimize')  # 최소화 문제로 설정

# 최적의 하이퍼파라미터 탐색
study.optimize(objective, n_trials=10)

# 최적의 하이퍼파라미터와 스코어 출력
best_params = study.best_params
best_score = study.best_value
print("Best Params:", best_params)
print("Best Score:", best_score)

[I 2023-09-23 10:18:54,127] A new study created in memory with name: no-name-899af180-a519-4610-9e7e-e99a4b3069eb
[I 2023-09-23 11:27:00,027] Trial 0 finished with value: 15.761635813087569 and parameters: {'n_estimators': 840, 'max_depth': 46, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 0.6579630181065937}. Best is trial 0 with value: 15.761635813087569.


Average RMSE: 15.761635813087569


[I 2023-09-23 12:05:54,533] Trial 1 finished with value: 15.734673365655544 and parameters: {'n_estimators': 640, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 0.486812472220273}. Best is trial 1 with value: 15.734673365655544.


Average RMSE: 15.734673365655544


[I 2023-09-23 12:58:38,502] Trial 2 finished with value: 15.779479932921557 and parameters: {'n_estimators': 612, 'max_depth': 34, 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_features': 0.7804653148438724}. Best is trial 1 with value: 15.734673365655544.


Average RMSE: 15.779479932921557


[I 2023-09-23 14:15:45,078] Trial 3 finished with value: 15.775599358374302 and parameters: {'n_estimators': 699, 'max_depth': 43, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.6748936963847364}. Best is trial 1 with value: 15.734673365655544.


Average RMSE: 15.775599358374302


[I 2023-09-23 15:27:39,272] Trial 4 finished with value: 15.764432965628277 and parameters: {'n_estimators': 1268, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 10, 'max_features': 0.6384399541164546}. Best is trial 1 with value: 15.734673365655544.


Average RMSE: 15.764432965628277


[I 2023-09-23 16:08:42,699] Trial 5 finished with value: 15.79950275533118 and parameters: {'n_estimators': 523, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 0.8763245247829335}. Best is trial 1 with value: 15.734673365655544.


Average RMSE: 15.79950275533118


[I 2023-09-23 17:10:18,738] Trial 6 finished with value: 15.782486270954326 and parameters: {'n_estimators': 851, 'max_depth': 22, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 0.7685550662729222}. Best is trial 1 with value: 15.734673365655544.


Average RMSE: 15.782486270954326


[W 2023-09-23 18:01:02,152] Trial 7 failed with parameters: {'n_estimators': 1266, 'max_depth': 50, 'min_samples_split': 7, 'min_samples_leaf': 8, 'max_features': 0.7515095750341627} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/7j/2czlfb0x36j_nmkftnnpyg3r0000gp/T/ipykernel_50225/2816451396.py", line 27, in objective
    scores = cross_val_score(model, _df, _HLM, cv=5, scoring=make_scorer(custom_scorer))
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 515, in cross_val_score
    cv_results = cross_validate(
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 266, in cross_validate
    results = parallel(
  File "/opt/homebrew/anaconda3/lib/python3.10/site-packages/skl

KeyboardInterrupt: 

In [39]:
# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
optuna.visualization.plot_param_importances(study)

In [40]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

In [41]:
# 최적의 하이퍼파라미터를 사용하여 RandomForest 모델 생성
# early-stopping 적용할지 말지 고려
best_HLM_RFmodel = RandomForestRegressor(**best_params, random_state=42)

# 데이터 분할 (훈련 세트와 검증 세트)
X_train, X_valid, y_train, y_valid = train_test_split(_df, _HLM, test_size=0.2, random_state=42)

# 모델 훈련
best_HLM_RFmodel.fit(X_train, y_train)

# 모델 저장
HLM_model_path = f"./models/HLM_RFmodel_{version:.1f}.joblib"
joblib.dump(best_HLM_RFmodel, HLM_model_path)

['./models/HLM_RFmodel_0.2.joblib']

In [42]:
# 저장된 모델 로드
HLM_loaded_model = RandomForestRegressor()
HLM_loaded_model = joblib.load(HLM_model_path)  # joblib로 모델 로드

# 로드된 모델을 사용하여 예측 수행
y_valid_pred = HLM_loaded_model.predict(X_valid)

# 검증 세트에서 RMSE 또는 다른 평가 지표 계산
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 31.67243088827376


### Test

In [45]:
df_test = pd.read_csv("./data/test.csv").drop(columns=["id"])
df_test[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms', 'num_hydrogen_acceptors', 'num_hydrogen_donors'
]] = df_test['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

test_df = fill_na(df_test, imputer)
test_df = extract_features(test_df)

pred_MLM = MLM_loaded_model.predict(test_df)
pred_HLM = HLM_loaded_model.predict(test_df)

In [46]:
submission = pd.read_csv('./data/sample_submission.csv')

submission['MLM'] = pred_MLM
submission['HLM'] = pred_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,32.040788,49.561658
1,TEST_001,63.779308,75.558401
2,TEST_002,32.659838,51.052286
3,TEST_003,46.238246,67.054067
4,TEST_004,54.454228,72.621162
...,...,...,...
478,TEST_478,15.533701,33.801334
479,TEST_479,74.103024,78.266426
480,TEST_480,43.950179,61.288224
481,TEST_481,57.961244,74.646280


In [47]:
submission.to_csv('./submissions/Optuna+RandomForest_submission_2.csv', index=False)