In [9]:
import os
import random

import numpy as np
import pandas as pd
import datamol as dm

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import missingno
import joblib

from rdkit.Chem import SaltRemover
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans.concat import FeatConcat
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from pycaret.regression import *
from catboost import CatBoostRegressor

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Single seed shared by every random number generator seeded below.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)  # export the hash seed through the environment

random.seed(seed)        # Python's built-in generator
np.random.seed(seed)     # NumPy's global generator
torch.manual_seed(seed)  # PyTorch CPU generator

# cuDNN settings: request deterministic kernels, turn off the auto-tuner,
# and finally disable cuDNN altogether.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

if device == 'cuda':
    # Seed the current GPU and then every visible GPU.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

cpu


In [2]:
# Precomputed descriptor columns that are read from the CSV files; these are
# the columns that get imputed and appended to the fingerprint features.
ETC_COLUMNS = [
    "AlogP",
    "Molecular_Weight",
    "Num_H_Acceptors",
    "Num_H_Donors",
    "Num_RotatableBonds",
    "LogD",
    "Molecular_PolarSurfaceArea",
]

# Fingerprint kinds handed to FPVecTransformer, one transformer per entry.
# NOTE(review): 'scaffoldkeys' and 'skeys' look like two names for the same
# featurizer -- confirm whether both are intended.
AVAILABLE_FPS = [
    'maccs',
    'avalon',
    'ecfp',
    'fcfp',
    'topological',
    'atompair',
    'rdkit',
    'pattern',
    'layered',
    'secfp',
    'erg',
    'estate',
    'avalon-count',
    'rdkit-count',
    'ecfp-count',
    'fcfp-count',
    'topological-count',
    'atompair-count',
    'cats2D',
    'pharm2D',
    'scaffoldkeys',
    'skeys',
]

# Model tag and the seed used by the imputer.
MODEL = "v9_2"
SEED = 42

# Turn off RDKit log output via datamol before any molecules are processed.
dm.disable_rdkit_log()

In [3]:
def preprocess_mol(row):
    """Attach a standardized SMILES string to one data row.

    The molecule is built from ``row["SMILES"]`` and passed through datamol's
    ``fix_mol``, ``sanitize_mol`` and ``standardize_mol``; the resulting SMILES
    is written to ``row["Standard_Smiles"]``.

    Salt stripping (RDKit ``SaltRemover``) is currently not applied.

    Args:
        row: A row (e.g. from ``DataFrame.apply(..., axis=1)``) with a
            ``"SMILES"`` entry.

    Returns:
        The same ``row`` object with the ``"Standard_Smiles"`` entry set.
    """
    standardize_options = {
        "disconnect_metals": False,
        "normalize": True,
        "reionize": True,
        "uncharge": False,
        "stereo": True,
    }

    molecule = dm.to_mol(row["SMILES"], ordered=True)
    molecule = dm.sanitize_mol(dm.fix_mol(molecule), sanifix=True, charge_neutral=False)
    molecule = dm.standardize_mol(molecule, **standardize_options)

    row["Standard_Smiles"] = dm.to_smiles(molecule)
    return row

def fill_na(df, imputer=None):
    """Impute missing values in the ``ETC_COLUMNS`` descriptor columns.

    The input frame is never modified: imputation is applied to a copy, so the
    caller's data stays as loaded and re-running a cell that calls this
    function always starts from the same input.

    Args:
        df: DataFrame containing every column listed in ``ETC_COLUMNS``.
        imputer: An already fitted imputer exposing ``transform``. When
            ``None``, a new ``IterativeImputer`` (random-forest estimator,
            ``random_state=SEED``) is fitted on ``df``.

    Returns:
        ``(filled_df, imputer)`` when ``imputer`` was ``None`` (fit mode),
        otherwise just ``filled_df`` (transform mode).
    """
    # Copy first: the previous implementation assigned into `df` directly and
    # thereby overwrote the caller's frame as a hidden side effect.
    filled = df.copy()
    values = filled[ETC_COLUMNS].to_numpy()

    if imputer is None:
        imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1), random_state=SEED)
        filled[ETC_COLUMNS] = imputer.fit_transform(values)
        return filled, imputer

    filled[ETC_COLUMNS] = imputer.transform(values)
    return filled

def extract_features(df):
    """Build the model feature matrix for every row of ``df``.

    Each row is first run through ``preprocess_mol``. One ``FPVecTransformer``
    per entry of ``AVAILABLE_FPS`` is then combined with ``FeatConcat`` and
    applied to the standardized SMILES. The ``ETC_COLUMNS`` values are appended
    to the right of the featurizer output.

    Args:
        df: DataFrame with a ``"SMILES"`` column and all ``ETC_COLUMNS``.

    Returns:
        A new DataFrame (default integer index and column labels) holding the
        featurizer output followed by the ``ETC_COLUMNS`` values.
    """
    standardized = df.apply(preprocess_mol, axis=1)

    transformers = [
        FPVecTransformer(kind, dtype=np.float64, n_jobs=-1)
        for kind in AVAILABLE_FPS
    ]
    concat_featurizer = FeatConcat(transformers, dtype=np.float64)

    fingerprint_matrix = concat_featurizer(standardized["Standard_Smiles"].to_list())
    descriptor_matrix = standardized[ETC_COLUMNS].to_numpy()

    combined = np.concatenate([fingerprint_matrix, descriptor_matrix], axis=1)
    return pd.DataFrame(combined)

In [4]:
# Load the training table; the "id" column is not used.
df_train = pd.read_csv("./data/train.csv").drop(columns=["id"])

# Handle duplicates: every row sharing a SMILES receives the maximum MLM and
# HLM observed for that SMILES, after which identical rows are dropped.
df_train[["MLM", "HLM"]] = df_train.groupby("SMILES")[["MLM", "HLM"]].transform("max")
df_train = df_train.drop_duplicates().reset_index(drop=True)

In [5]:
# Fit the imputer on the training table, featurize the imputed rows, and put
# the two targets back next to the features (aligned on the row index).
df, imputer = fill_na(df_train)
df = extract_features(df).assign(MLM=df_train["MLM"], HLM=df_train["HLM"])

In [6]:
# 학습용 데이터 만들기
_df = df.drop(columns=["MLM", "HLM"], axis=1).copy()
_MLM = df['MLM']
_HLM = df['HLM']

In [10]:
# Version tag embedded in every model file name below.
version = 0.2
HLM_RFmodel_path = f"./models/HLM_RFmodel_{version:.1f}.joblib"
MLM_RFmodel_path = f"./models/MLM_RFmodel_{version:.1f}.joblib"

MLM_Cmodel_path = f"./models/MLM_Cmodel_{version:.1f}.cbm"
HLM_Cmodel_path = f"./models/HLM_Cmodel_{version:.1f}.cbm"

# Load the saved random-forest models.
# joblib.load returns the stored estimator itself, so no RandomForestRegressor
# has to be instantiated beforehand (the previous placeholder instances were
# discarded immediately).
# NOTE(security): joblib.load unpickles the file and can therefore run
# arbitrary code -- only load model files from a trusted source.
MLM_loaded_RFmodel = joblib.load(MLM_RFmodel_path)
HLM_loaded_RFmodel = joblib.load(HLM_RFmodel_path)

# Load the saved CatBoost models into fresh estimator instances.
MLM_loaded_Cmodel = CatBoostRegressor()
MLM_loaded_Cmodel.load_model(MLM_Cmodel_path)

HLM_loaded_Cmodel = CatBoostRegressor()
HLM_loaded_Cmodel.load_model(HLM_Cmodel_path)

<catboost.core.CatBoostRegressor at 0x176e87340>

In [11]:
# Build one voting ensemble (random forest + CatBoost) per target.
# NOTE: VotingRegressor.fit() clones its estimators and fits the clones, so
# the fitted state of the models loaded from disk is NOT reused here -- only
# their parameters are. Both base models are trained again by the fit() calls
# below, which is what produces the CatBoost training log.
MLM_ensemble = VotingRegressor(estimators=[
    ('rf', MLM_loaded_RFmodel),
    ('cb', MLM_loaded_Cmodel),
])

HLM_ensemble = VotingRegressor(estimators=[
    ('rf', HLM_loaded_RFmodel),
    ('cb', HLM_loaded_Cmodel),
])

# Split once into training and validation rows. Passing both targets to a
# single call guarantees that MLM and HLM use exactly the same rows; the
# previous two calls with an identical random_state produced the same split
# twice.
(
    X_Mtrain, X_Mvalid,
    y_Mtrain, y_Mvalid,
    y_Htrain, y_Hvalid,
) = train_test_split(_df, _MLM, _HLM, test_size=0.2, random_state=SEED)

# The HLM feature split is identical to the MLM one; keep the names that
# later cells use.
X_Htrain, X_Hvalid = X_Mtrain, X_Mvalid

# Fit the MLM ensemble.
MLM_ensemble.fit(X_Mtrain, y_Mtrain)

# Fit the HLM ensemble.
HLM_ensemble.fit(X_Htrain, y_Htrain)


# Save the MLM ensemble.
MLM_ensemble_model_path = f"./models/MLM_ensemble_model_{version:.1f}.joblib"
joblib.dump(MLM_ensemble, MLM_ensemble_model_path)

# Save the HLM ensemble.
HLM_ensemble_model_path = f"./models/HLM_ensemble_model_{version:.1f}.joblib"
joblib.dump(HLM_ensemble, HLM_ensemble_model_path)

0:	learn: 35.8178698	total: 356ms	remaining: 5m 53s
1:	learn: 35.6743706	total: 659ms	remaining: 5m 26s
2:	learn: 35.5279308	total: 958ms	remaining: 5m 16s
3:	learn: 35.3828809	total: 1.28s	remaining: 5m 16s
4:	learn: 35.2563329	total: 1.58s	remaining: 5m 12s
5:	learn: 35.1349955	total: 1.88s	remaining: 5m 9s
6:	learn: 35.0423493	total: 2.17s	remaining: 5m 6s
7:	learn: 34.9088082	total: 2.48s	remaining: 5m 5s
8:	learn: 34.7878326	total: 2.78s	remaining: 5m 3s
9:	learn: 34.6664235	total: 3.08s	remaining: 5m 2s
10:	learn: 34.5432854	total: 3.37s	remaining: 5m 1s
11:	learn: 34.4417991	total: 3.67s	remaining: 5m
12:	learn: 34.3557420	total: 3.97s	remaining: 4m 59s
13:	learn: 34.2751083	total: 4.27s	remaining: 4m 59s
14:	learn: 34.1692468	total: 4.57s	remaining: 4m 58s
15:	learn: 34.0947666	total: 4.87s	remaining: 4m 57s
16:	learn: 33.9956249	total: 5.17s	remaining: 4m 56s
17:	learn: 33.8983778	total: 5.46s	remaining: 4m 56s
18:	learn: 33.8075439	total: 5.76s	remaining: 4m 55s
19:	learn: 33

155:	learn: 28.3309717	total: 47.1s	remaining: 4m 12s
156:	learn: 28.2960006	total: 47.4s	remaining: 4m 12s
157:	learn: 28.2663031	total: 47.7s	remaining: 4m 12s
158:	learn: 28.2531906	total: 48s	remaining: 4m 11s
159:	learn: 28.2236210	total: 48.3s	remaining: 4m 11s
160:	learn: 28.1912639	total: 48.6s	remaining: 4m 11s
161:	learn: 28.1796698	total: 48.9s	remaining: 4m 11s
162:	learn: 28.1536965	total: 49.2s	remaining: 4m 10s
163:	learn: 28.1340060	total: 49.5s	remaining: 4m 10s
164:	learn: 28.1009580	total: 49.8s	remaining: 4m 10s
165:	learn: 28.0680300	total: 50.1s	remaining: 4m 9s
166:	learn: 28.0486363	total: 50.4s	remaining: 4m 9s
167:	learn: 28.0364242	total: 50.7s	remaining: 4m 9s
168:	learn: 28.0182003	total: 51s	remaining: 4m 8s
169:	learn: 27.9814169	total: 51.3s	remaining: 4m 8s
170:	learn: 27.9624855	total: 51.6s	remaining: 4m 8s
171:	learn: 27.9503440	total: 51.9s	remaining: 4m 7s
172:	learn: 27.9332249	total: 52.1s	remaining: 4m 7s
173:	learn: 27.9110068	total: 52.4s	rema

307:	learn: 25.3938742	total: 1m 32s	remaining: 3m 26s
308:	learn: 25.3588875	total: 1m 32s	remaining: 3m 26s
309:	learn: 25.3486922	total: 1m 33s	remaining: 3m 25s
310:	learn: 25.3345999	total: 1m 33s	remaining: 3m 25s
311:	learn: 25.3078435	total: 1m 33s	remaining: 3m 25s
312:	learn: 25.2864229	total: 1m 34s	remaining: 3m 24s
313:	learn: 25.2696812	total: 1m 34s	remaining: 3m 24s
314:	learn: 25.2513236	total: 1m 34s	remaining: 3m 24s
315:	learn: 25.2322396	total: 1m 35s	remaining: 3m 23s
316:	learn: 25.2182252	total: 1m 35s	remaining: 3m 23s
317:	learn: 25.1930952	total: 1m 35s	remaining: 3m 23s
318:	learn: 25.1808342	total: 1m 35s	remaining: 3m 23s
319:	learn: 25.1574952	total: 1m 36s	remaining: 3m 22s
320:	learn: 25.1434457	total: 1m 36s	remaining: 3m 22s
321:	learn: 25.1298041	total: 1m 36s	remaining: 3m 22s
322:	learn: 25.1108054	total: 1m 37s	remaining: 3m 21s
323:	learn: 25.0803355	total: 1m 37s	remaining: 3m 21s
324:	learn: 25.0625311	total: 1m 37s	remaining: 3m 21s
325:	learn

458:	learn: 22.2056650	total: 2m 18s	remaining: 2m 40s
459:	learn: 22.1924253	total: 2m 18s	remaining: 2m 40s
460:	learn: 22.1772431	total: 2m 18s	remaining: 2m 40s
461:	learn: 22.1605925	total: 2m 18s	remaining: 2m 39s
462:	learn: 22.1414591	total: 2m 19s	remaining: 2m 39s
463:	learn: 22.1207634	total: 2m 19s	remaining: 2m 39s
464:	learn: 22.0978318	total: 2m 19s	remaining: 2m 39s
465:	learn: 22.0822201	total: 2m 20s	remaining: 2m 38s
466:	learn: 22.0657648	total: 2m 20s	remaining: 2m 38s
467:	learn: 22.0552308	total: 2m 20s	remaining: 2m 38s
468:	learn: 22.0375055	total: 2m 21s	remaining: 2m 37s
469:	learn: 22.0187482	total: 2m 21s	remaining: 2m 37s
470:	learn: 22.0008247	total: 2m 21s	remaining: 2m 37s
471:	learn: 21.9773404	total: 2m 22s	remaining: 2m 37s
472:	learn: 21.9631121	total: 2m 22s	remaining: 2m 36s
473:	learn: 21.9474796	total: 2m 22s	remaining: 2m 36s
474:	learn: 21.9208433	total: 2m 22s	remaining: 2m 36s
475:	learn: 21.8985869	total: 2m 23s	remaining: 2m 35s
476:	learn

609:	learn: 19.3906817	total: 3m 3s	remaining: 1m 55s
610:	learn: 19.3776474	total: 3m 4s	remaining: 1m 55s
611:	learn: 19.3656246	total: 3m 4s	remaining: 1m 55s
612:	learn: 19.3525152	total: 3m 4s	remaining: 1m 54s
613:	learn: 19.3361290	total: 3m 4s	remaining: 1m 54s
614:	learn: 19.3223439	total: 3m 5s	remaining: 1m 54s
615:	learn: 19.3055424	total: 3m 5s	remaining: 1m 53s
616:	learn: 19.2819528	total: 3m 5s	remaining: 1m 53s
617:	learn: 19.2644486	total: 3m 6s	remaining: 1m 53s
618:	learn: 19.2465223	total: 3m 6s	remaining: 1m 52s
619:	learn: 19.2352908	total: 3m 6s	remaining: 1m 52s
620:	learn: 19.2232032	total: 3m 7s	remaining: 1m 52s
621:	learn: 19.2061886	total: 3m 7s	remaining: 1m 52s
622:	learn: 19.1948410	total: 3m 7s	remaining: 1m 51s
623:	learn: 19.1821189	total: 3m 8s	remaining: 1m 51s
624:	learn: 19.1667027	total: 3m 8s	remaining: 1m 51s
625:	learn: 19.1523500	total: 3m 8s	remaining: 1m 50s
626:	learn: 19.1344191	total: 3m 8s	remaining: 1m 50s
627:	learn: 19.1220813	total

759:	learn: 17.1513839	total: 3m 49s	remaining: 1m 10s
760:	learn: 17.1380536	total: 3m 49s	remaining: 1m 10s
761:	learn: 17.1222451	total: 3m 50s	remaining: 1m 10s
762:	learn: 17.1131490	total: 3m 50s	remaining: 1m 9s
763:	learn: 17.0981870	total: 3m 50s	remaining: 1m 9s
764:	learn: 17.0820993	total: 3m 51s	remaining: 1m 9s
765:	learn: 17.0699362	total: 3m 51s	remaining: 1m 8s
766:	learn: 17.0490730	total: 3m 51s	remaining: 1m 8s
767:	learn: 17.0298392	total: 3m 51s	remaining: 1m 8s
768:	learn: 17.0221519	total: 3m 52s	remaining: 1m 7s
769:	learn: 17.0078304	total: 3m 52s	remaining: 1m 7s
770:	learn: 16.9905519	total: 3m 52s	remaining: 1m 7s
771:	learn: 16.9749518	total: 3m 53s	remaining: 1m 7s
772:	learn: 16.9588172	total: 3m 53s	remaining: 1m 6s
773:	learn: 16.9411847	total: 3m 53s	remaining: 1m 6s
774:	learn: 16.9241986	total: 3m 54s	remaining: 1m 6s
775:	learn: 16.9087111	total: 3m 54s	remaining: 1m 5s
776:	learn: 16.8944906	total: 3m 54s	remaining: 1m 5s
777:	learn: 16.8840709	to

913:	learn: 15.1986868	total: 4m 36s	remaining: 24.2s
914:	learn: 15.1873797	total: 4m 37s	remaining: 23.9s
915:	learn: 15.1738880	total: 4m 37s	remaining: 23.6s
916:	learn: 15.1582674	total: 4m 37s	remaining: 23.3s
917:	learn: 15.1510112	total: 4m 38s	remaining: 23s
918:	learn: 15.1417839	total: 4m 38s	remaining: 22.7s
919:	learn: 15.1346652	total: 4m 38s	remaining: 22.4s
920:	learn: 15.1284027	total: 4m 38s	remaining: 22.1s
921:	learn: 15.1119609	total: 4m 39s	remaining: 21.8s
922:	learn: 15.1041268	total: 4m 39s	remaining: 21.5s
923:	learn: 15.0968002	total: 4m 39s	remaining: 21.2s
924:	learn: 15.0882996	total: 4m 40s	remaining: 20.9s
925:	learn: 15.0841523	total: 4m 40s	remaining: 20.6s
926:	learn: 15.0703611	total: 4m 40s	remaining: 20.3s
927:	learn: 15.0610014	total: 4m 41s	remaining: 20s
928:	learn: 15.0456901	total: 4m 41s	remaining: 19.7s
929:	learn: 15.0301087	total: 4m 41s	remaining: 19.4s
930:	learn: 15.0208485	total: 4m 42s	remaining: 19.1s
931:	learn: 15.0082657	total: 4m

72:	learn: 30.5499891	total: 1m 18s	remaining: 22m 10s
73:	learn: 30.5044770	total: 1m 19s	remaining: 22m 9s
74:	learn: 30.4667078	total: 1m 21s	remaining: 22m 7s
75:	learn: 30.4354941	total: 1m 22s	remaining: 22m 7s
76:	learn: 30.4094037	total: 1m 23s	remaining: 22m 5s
77:	learn: 30.3672716	total: 1m 24s	remaining: 22m 4s
78:	learn: 30.3228517	total: 1m 25s	remaining: 22m 2s
79:	learn: 30.2709075	total: 1m 26s	remaining: 22m 1s
80:	learn: 30.2301062	total: 1m 27s	remaining: 21m 59s
81:	learn: 30.1815865	total: 1m 28s	remaining: 21m 58s
82:	learn: 30.1360540	total: 1m 29s	remaining: 21m 56s
83:	learn: 30.1019426	total: 1m 30s	remaining: 21m 54s
84:	learn: 30.0622840	total: 1m 31s	remaining: 21m 53s
85:	learn: 30.0323330	total: 1m 32s	remaining: 21m 53s
86:	learn: 29.9979931	total: 1m 33s	remaining: 21m 51s
87:	learn: 29.9545609	total: 1m 34s	remaining: 21m 50s
88:	learn: 29.9108353	total: 1m 35s	remaining: 21m 48s
89:	learn: 29.8514409	total: 1m 36s	remaining: 21m 47s
90:	learn: 29.803

220:	learn: 25.4291642	total: 3m 57s	remaining: 19m 22s
221:	learn: 25.3846110	total: 3m 58s	remaining: 19m 21s
222:	learn: 25.3580597	total: 3m 59s	remaining: 19m 19s
223:	learn: 25.3405771	total: 4m	remaining: 19m 18s
224:	learn: 25.3166264	total: 4m 1s	remaining: 19m 17s
225:	learn: 25.2957804	total: 4m 2s	remaining: 19m 16s
226:	learn: 25.2726073	total: 4m 3s	remaining: 19m 16s
227:	learn: 25.2460721	total: 4m 4s	remaining: 19m 15s
228:	learn: 25.2292129	total: 4m 5s	remaining: 19m 14s
229:	learn: 25.2115110	total: 4m 7s	remaining: 19m 13s
230:	learn: 25.1902085	total: 4m 8s	remaining: 19m 12s
231:	learn: 25.1580088	total: 4m 9s	remaining: 19m 10s
232:	learn: 25.1400928	total: 4m 10s	remaining: 19m 9s
233:	learn: 25.1208842	total: 4m 11s	remaining: 19m 8s
234:	learn: 25.0946993	total: 4m 12s	remaining: 19m 7s
235:	learn: 25.0616046	total: 4m 13s	remaining: 19m 6s
236:	learn: 25.0498079	total: 4m 14s	remaining: 19m 5s
237:	learn: 24.9997997	total: 4m 15s	remaining: 19m 4s
238:	learn

368:	learn: 22.1869385	total: 6m 35s	remaining: 16m 41s
369:	learn: 22.1768898	total: 6m 36s	remaining: 16m 40s
370:	learn: 22.1622160	total: 6m 37s	remaining: 16m 39s
371:	learn: 22.1388114	total: 6m 38s	remaining: 16m 39s
372:	learn: 22.1104703	total: 6m 39s	remaining: 16m 38s
373:	learn: 22.0674449	total: 6m 40s	remaining: 16m 37s
374:	learn: 22.0506097	total: 6m 42s	remaining: 16m 35s
375:	learn: 22.0318468	total: 6m 43s	remaining: 16m 34s
376:	learn: 22.0157421	total: 6m 44s	remaining: 16m 33s
377:	learn: 21.9694902	total: 6m 45s	remaining: 16m 32s
378:	learn: 21.9381457	total: 6m 46s	remaining: 16m 32s
379:	learn: 21.9215436	total: 6m 47s	remaining: 16m 31s
380:	learn: 21.9037002	total: 6m 48s	remaining: 16m 29s
381:	learn: 21.8647202	total: 6m 49s	remaining: 16m 28s
382:	learn: 21.8482390	total: 6m 50s	remaining: 16m 27s
383:	learn: 21.8367602	total: 6m 51s	remaining: 16m 26s
384:	learn: 21.8238358	total: 6m 52s	remaining: 16m 25s
385:	learn: 21.8167476	total: 6m 54s	remaining: 

516:	learn: 18.6746711	total: 9m 17s	remaining: 14m 7s
517:	learn: 18.6591824	total: 9m 18s	remaining: 14m 6s
518:	learn: 18.6224223	total: 9m 19s	remaining: 14m 5s
519:	learn: 18.6001315	total: 9m 20s	remaining: 14m 4s
520:	learn: 18.5755064	total: 9m 21s	remaining: 14m 3s
521:	learn: 18.5554313	total: 9m 22s	remaining: 14m 2s
522:	learn: 18.5333875	total: 9m 23s	remaining: 14m 1s
523:	learn: 18.4931298	total: 9m 24s	remaining: 14m
524:	learn: 18.4726137	total: 9m 25s	remaining: 13m 59s
525:	learn: 18.4485682	total: 9m 26s	remaining: 13m 57s
526:	learn: 18.4344384	total: 9m 27s	remaining: 13m 56s
527:	learn: 18.4063837	total: 9m 28s	remaining: 13m 55s
528:	learn: 18.3860228	total: 9m 29s	remaining: 13m 54s
529:	learn: 18.3545309	total: 9m 30s	remaining: 13m 53s
530:	learn: 18.3375453	total: 9m 32s	remaining: 13m 52s
531:	learn: 18.3074910	total: 9m 33s	remaining: 13m 52s
532:	learn: 18.2932446	total: 9m 34s	remaining: 13m 50s
533:	learn: 18.2624958	total: 9m 35s	remaining: 13m 49s
534

662:	learn: 15.5897413	total: 11m 57s	remaining: 11m 33s
663:	learn: 15.5784983	total: 11m 58s	remaining: 11m 32s
664:	learn: 15.5557650	total: 11m 59s	remaining: 11m 31s
665:	learn: 15.5391077	total: 12m	remaining: 11m 30s
666:	learn: 15.5254694	total: 12m 1s	remaining: 11m 29s
667:	learn: 15.5096788	total: 12m 2s	remaining: 11m 27s
668:	learn: 15.4769117	total: 12m 3s	remaining: 11m 26s
669:	learn: 15.4648573	total: 12m 4s	remaining: 11m 25s
670:	learn: 15.4346866	total: 12m 5s	remaining: 11m 24s
671:	learn: 15.4244333	total: 12m 6s	remaining: 11m 23s
672:	learn: 15.3969444	total: 12m 8s	remaining: 11m 22s
673:	learn: 15.3867958	total: 12m 9s	remaining: 11m 21s
674:	learn: 15.3746053	total: 12m 10s	remaining: 11m 20s
675:	learn: 15.3527841	total: 12m 11s	remaining: 11m 19s
676:	learn: 15.3269224	total: 12m 12s	remaining: 11m 18s
677:	learn: 15.3030837	total: 12m 13s	remaining: 11m 17s
678:	learn: 15.2800764	total: 12m 14s	remaining: 11m 16s
679:	learn: 15.2577484	total: 12m 15s	remai

809:	learn: 13.3005253	total: 14m 32s	remaining: 8m 52s
810:	learn: 13.2895418	total: 14m 33s	remaining: 8m 51s
811:	learn: 13.2764106	total: 14m 34s	remaining: 8m 50s
812:	learn: 13.2543590	total: 14m 35s	remaining: 8m 48s
813:	learn: 13.2512155	total: 14m 36s	remaining: 8m 47s
814:	learn: 13.2419869	total: 14m 38s	remaining: 8m 46s
815:	learn: 13.2252435	total: 14m 39s	remaining: 8m 45s
816:	learn: 13.2107185	total: 14m 40s	remaining: 8m 44s
817:	learn: 13.2024349	total: 14m 41s	remaining: 8m 43s
818:	learn: 13.1883300	total: 14m 42s	remaining: 8m 42s
819:	learn: 13.1802960	total: 14m 43s	remaining: 8m 41s
820:	learn: 13.1589714	total: 14m 44s	remaining: 8m 40s
821:	learn: 13.1498876	total: 14m 45s	remaining: 8m 39s
822:	learn: 13.1355903	total: 14m 46s	remaining: 8m 37s
823:	learn: 13.1258988	total: 14m 47s	remaining: 8m 36s
824:	learn: 13.1182282	total: 14m 48s	remaining: 8m 35s
825:	learn: 13.1023403	total: 14m 49s	remaining: 8m 34s
826:	learn: 13.0930650	total: 14m 50s	remaining:

957:	learn: 11.4207142	total: 17m 8s	remaining: 6m 11s
958:	learn: 11.4100563	total: 17m 9s	remaining: 6m 10s
959:	learn: 11.3997652	total: 17m 10s	remaining: 6m 9s
960:	learn: 11.3972869	total: 17m 11s	remaining: 6m 8s
961:	learn: 11.3913572	total: 17m 12s	remaining: 6m 7s
962:	learn: 11.3830355	total: 17m 13s	remaining: 6m 5s
963:	learn: 11.3750137	total: 17m 14s	remaining: 6m 4s
964:	learn: 11.3652249	total: 17m 15s	remaining: 6m 3s
965:	learn: 11.3621852	total: 17m 16s	remaining: 6m 2s
966:	learn: 11.3540539	total: 17m 17s	remaining: 6m 1s
967:	learn: 11.3393347	total: 17m 18s	remaining: 6m
968:	learn: 11.3369556	total: 17m 19s	remaining: 5m 59s
969:	learn: 11.3269493	total: 17m 21s	remaining: 5m 58s
970:	learn: 11.3131726	total: 17m 22s	remaining: 5m 57s
971:	learn: 11.2975721	total: 17m 23s	remaining: 5m 56s
972:	learn: 11.2829422	total: 17m 24s	remaining: 5m 55s
973:	learn: 11.2801533	total: 17m 25s	remaining: 5m 54s
974:	learn: 11.2620571	total: 17m 26s	remaining: 5m 53s
975:	l

1103:	learn: 9.8337393	total: 19m 45s	remaining: 3m 34s
1104:	learn: 9.8270206	total: 19m 46s	remaining: 3m 33s
1105:	learn: 9.8211821	total: 19m 47s	remaining: 3m 32s
1106:	learn: 9.8080207	total: 19m 48s	remaining: 3m 31s
1107:	learn: 9.8035817	total: 19m 49s	remaining: 3m 30s
1108:	learn: 9.7859304	total: 19m 50s	remaining: 3m 29s
1109:	learn: 9.7800453	total: 19m 51s	remaining: 3m 28s
1110:	learn: 9.7767580	total: 19m 52s	remaining: 3m 27s
1111:	learn: 9.7652902	total: 19m 53s	remaining: 3m 26s
1112:	learn: 9.7501437	total: 19m 54s	remaining: 3m 24s
1113:	learn: 9.7371360	total: 19m 55s	remaining: 3m 23s
1114:	learn: 9.7267765	total: 19m 56s	remaining: 3m 22s
1115:	learn: 9.7139646	total: 19m 57s	remaining: 3m 21s
1116:	learn: 9.7053148	total: 19m 58s	remaining: 3m 20s
1117:	learn: 9.7002270	total: 19m 59s	remaining: 3m 19s
1118:	learn: 9.6927290	total: 20m 1s	remaining: 3m 18s
1119:	learn: 9.6908727	total: 20m 2s	remaining: 3m 17s
1120:	learn: 9.6781947	total: 20m 3s	remaining: 3m

1251:	learn: 8.5388612	total: 22m 23s	remaining: 55.8s
1252:	learn: 8.5282620	total: 22m 24s	remaining: 54.7s
1253:	learn: 8.5183673	total: 22m 25s	remaining: 53.7s
1254:	learn: 8.5103781	total: 22m 26s	remaining: 52.6s
1255:	learn: 8.4930782	total: 22m 27s	remaining: 51.5s
1256:	learn: 8.4875287	total: 22m 28s	remaining: 50.4s
1257:	learn: 8.4754074	total: 22m 30s	remaining: 49.4s
1258:	learn: 8.4638160	total: 22m 31s	remaining: 48.3s
1259:	learn: 8.4549267	total: 22m 32s	remaining: 47.2s
1260:	learn: 8.4445161	total: 22m 33s	remaining: 46.1s
1261:	learn: 8.4345498	total: 22m 34s	remaining: 45.1s
1262:	learn: 8.4229735	total: 22m 35s	remaining: 44s
1263:	learn: 8.4182089	total: 22m 36s	remaining: 42.9s
1264:	learn: 8.4116420	total: 22m 37s	remaining: 41.9s
1265:	learn: 8.3982403	total: 22m 38s	remaining: 40.8s
1266:	learn: 8.3899019	total: 22m 39s	remaining: 39.7s
1267:	learn: 8.3739638	total: 22m 40s	remaining: 38.6s
1268:	learn: 8.3697792	total: 22m 41s	remaining: 37.6s
1269:	learn:

['./models/HLM_ensemble_model_0.2.joblib']

In [12]:
# Reload the MLM ensemble from MLM_ensemble_model_path and display it.
loaded_MLM_ensemble = joblib.load(MLM_ensemble_model_path)
loaded_MLM_ensemble

In [13]:
# Reload the HLM ensemble from HLM_ensemble_model_path and display it.
loaded_HLM_ensemble = joblib.load(HLM_ensemble_model_path)
loaded_HLM_ensemble

In [14]:
# 저장된 HLM 앙상블 모델 불러오기
loaded_MLM_ensemble = joblib.load(MLM_ensemble_model_path)
loaded_HLM_ensemble = joblib.load(HLM_ensemble_model_path)

# MLM 앙상블 모델 평가
MLM_ensemble_pred = loaded_MLM_ensemble.predict(X_Mvalid)
MLM_rmse = np.sqrt(mean_squared_error(y_Mvalid, MLM_ensemble_pred))
print("MLM Ensemble Validation RMSE:", MLM_rmse)

# HLM 앙상블 모델 평가
HLM_ensemble_pred = loaded_HLM_ensemble.predict(X_Hvalid)
HLM_rmse = np.sqrt(mean_squared_error(y_Hvalid, HLM_ensemble_pred))
print("HLM Ensemble Validation RMSE:", HLM_rmse)

MLM Ensemble Validation RMSE: 30.06378300964724
HLM Ensemble Validation RMSE: 31.62772936628081


### Test

In [15]:
# Load the test table, impute it with the imputer fitted on the training data,
# and build the same features as for training.
df_test = pd.read_csv("./data/test.csv").drop(columns=["id"])
test_df = extract_features(fill_na(df_test, imputer))

# Predict both targets (MLM first, then HLM) with the reloaded ensembles.
pred_MLM, pred_HLM = (
    ensemble.predict(test_df)
    for ensemble in (loaded_MLM_ensemble, loaded_HLM_ensemble)
)

In [16]:
# Start from the sample submission and fill in the predicted targets.
submission = pd.read_csv('./data/sample_submission.csv').assign(
    MLM=pred_MLM,
    HLM=pred_HLM,
)
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,32.849244,49.339513
1,TEST_001,66.042196,77.042494
2,TEST_002,37.719434,49.580332
3,TEST_003,48.596129,66.681940
4,TEST_004,50.667117,68.597392
...,...,...,...
478,TEST_478,15.433358,37.084261
479,TEST_479,72.764480,81.430540
480,TEST_480,43.814790,58.743133
481,TEST_481,64.370002,76.558741


In [17]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (otherwise a fresh checkout fails
# here, after all the training has already run).
os.makedirs('./submissions', exist_ok=True)
submission.to_csv('./submissions/Ensemble_CATBoost+RandomForest_submission_2.csv', index=False)