In [1]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import missingno

# device 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)

seed = 42 # seed 값 설정
random.seed(seed) # 파이썬 난수 생성기
os.environ['PYTHONHASHSEED'] = str(seed) # 해시 시크릿값 고정
np.random.seed(seed) # 넘파이 난수 생성기

torch.manual_seed(seed) # 파이토치 CPU 난수 생성기
torch.backends.cudnn.deterministic = True # 확정적 연산 사용 설정
torch.backends.cudnn.benchmark = False   # 벤치마크 기능 사용 해제
torch.backends.cudnn.enabled = False        # cudnn 기능 사용 해제

if device == 'cuda':
    torch.cuda.manual_seed(seed) # 파이토치 GPU 난수 생성기
    torch.cuda.manual_seed_all(seed) # 파이토치 멀티 GPU 난수 생성기

cpu


In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
!pip install rdkit-pypi



In [4]:
!pip install lightgbm



In [5]:
!pip install xgboost



In [6]:
!pip install scikit-optimize



In [7]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

def calculate_metabolic_stability_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    logP = Descriptors.MolLogP(mol)
    # 화합물의 친유성을 측정한 것으로 지질 또는 비극성 환경에서의 용해도를 나타냅니다. 생물학적 막을 통과하는 화합물의 능력을 반영합니다.
    apka = Descriptors.MolWt(mol)
    # 화합물의 산 해리 상수의 추정치로 다양한 pH 조건에서 이온화 거동에 대한 정보를 제공합니다.
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    # 화합물에서 회전 가능한 결합의 수입니다. 이것은 화합물의 유연성과 효소 또는 다른 분자와의 잠재적인 상호 작용에 대한 통찰력을 제공할 수 있습니다.
    num_heteroatoms = Descriptors.NumHeteroatoms(mol)
    # 분자 내 헤테로원자(탄소 및 수소 이외의 원자) 수. 이는 화합물의 반응성과 대사 안정성에 영향을 줄 수 있습니다.
    num_hydrogen_acceptors = Descriptors.NumHAcceptors(mol)
    # 분자 내 수소 결합 수용체의 수. 이들은 결합 및 반응성에 영향을 미치는 다른 분자의 수소 결합 기증자와 상호 작용할 수 있는 사이트입니다.
    num_hydrogen_donors = Descriptors.NumHDonors(mol)
    # 분자 내 수소 결합 기증자의 수입니다. 이들은 수소 결합 상호작용에서 수소 원자를 제공할 수 있는 사이트입니다.
    # morgan_fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    morgan_fingerprint = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    # 분자 하위 구조의 이진 벡터 표현입니다. 이 열에는 화합물과 효소의 상호 작용 및 대사 안정성에 영향을 줄 수 있는 구조적 특징을 포착하는 이진 지문이 포함되어 있습니다.
    morgan_array = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(morgan_fingerprint, morgan_array)

    return logP, apka, num_rotatable_bonds, num_heteroatoms, num_hydrogen_acceptors, num_hydrogen_donors, morgan_array

train[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint'
]] = train['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

test[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint'
]] = test['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

train


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,logP,apka,num_rotatable_bonds,num_heteroatoms,num_hydrogen_acceptors,num_hydrogen_donors,morgan_fingerprint
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37,3.87744,400.504,8,8,6,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47,3.35474,301.415,2,5,4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,1.20450,297.366,3,7,7,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60,3.89356,494.665,5,9,7,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43,2.81772,268.316,1,4,3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74,2.74730,396.200,4,11,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37,2.27630,359.389,3,7,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14,2.04130,261.325,5,5,5,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51,1.42720,284.699,4,7,6,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [8]:
train['AlogP'].fillna(train['AlogP'].median(), inplace=True)
test['AlogP'].fillna(test['AlogP'].median(), inplace=True)

In [9]:
class CustomDataset(Dataset):
    def __init__(self, data, target_col=None, transform=None, is_test=False):
        self.is_test = is_test
        self.transform = transform
        self.is_test = is_test

        if not self.is_test:
            self.data = self.transform.fit_transform(np.stack(data['morgan_fingerprint']))
        else: # test
            self.data = self.transform.transform(np.stack(data['morgan_fingerprint']))

        if target_col is not None and not self.is_test:
            self.target = data[target_col]

    def __getitem__(self, index):
        features = self.data[index]

        if hasattr(self, 'target'):
            target = self.target[index]
            return torch.tensor(features).to(device).float(), torch.tensor(target).to(device).float().unsqueeze(dim=-1)
        else:
            return torch.tensor(features).to(device).float()

    def __len__(self):
        return len(self.data)


In [10]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = CustomDataset(train, target_col='MLM', transform=transform, is_test=False)
train_HLM = CustomDataset(train, target_col='HLM', transform=transform, is_test=False)

### 디폴트 테스트
> 하이퍼 파라미터 튜닝을 통해 각 머신러닝 모델을 보다 최적화할 수 있지만, 우선 디폴트값으로 바로 예측 가능

In [11]:
import numpy as np # 각 모델에서 내부적으로 관련 라이브러리 사용 가능
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Define regression models
knn_model = KNeighborsRegressor()
linear_model = LinearRegression()
svr_model = SVR()
decision_model = DecisionTreeRegressor()
random_model = RandomForestRegressor()
extra_model = ExtraTreesRegressor()
gbm_model = GradientBoostingRegressor()
xgb_model = XGBRegressor(eval_metric='rmse')
lgbm_model = LGBMRegressor()

# Assuming train_MLM is the dataset you want to use
X_MLM_train = train_MLM.data
y_MLM_train = train_MLM.target

X_HLM_train = train_HLM.data
y_HLM_train = train_HLM.target

# Define the regression models in a list
regression_models = [
    knn_model,
    linear_model,
    svr_model,
    decision_model,
    random_model,
    extra_model,
    gbm_model,
    xgb_model,
    lgbm_model
]

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
results = dict()

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

# Training and evaluating regression models
for alg in regression_models:
    alg.fit(X_MLM_train, y_MLM_train)
    score = cross_val_score(alg, X_MLM_train, y_MLM_train, cv=k_fold, scoring=rmse_scorer)
    results[alg.__class__.__name__] = -np.mean(score)  # Negative MSE for evaluation


In [12]:
results

{'KNeighborsRegressor': 17.729745735154275,
 'LinearRegression': 16.369748457690815,
 'SVR': 17.401100142743594,
 'DecisionTreeRegressor': 22.251730171162826,
 'RandomForestRegressor': 16.106666954041916,
 'ExtraTreesRegressor': 16.40737265506526,
 'GradientBoostingRegressor': 16.08863215878475,
 'XGBRegressor': 16.590218001828255,
 'LGBMRegressor': 16.07126338414415}

In [13]:
sorted(results.items(), key=lambda x: x[1], reverse=True) # reverse=True 면 높은 순서대로 정렬

[('DecisionTreeRegressor', 22.251730171162826),
 ('KNeighborsRegressor', 17.729745735154275),
 ('SVR', 17.401100142743594),
 ('XGBRegressor', 16.590218001828255),
 ('ExtraTreesRegressor', 16.40737265506526),
 ('LinearRegression', 16.369748457690815),
 ('RandomForestRegressor', 16.106666954041916),
 ('GradientBoostingRegressor', 16.08863215878475),
 ('LGBMRegressor', 16.07126338414415)]

### Grid Search Random Forest Regressor 적용 (MLM)

In [28]:
from sklearn.model_selection import GridSearchCV

n_estimators = [800, 1000, 1200]
max_depth = [10, 12, 15]
min_samples_split = [4, 5, 6]
min_samples_leaf = [4, 5, 6]

hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth, 
               'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

gd_MLM = GridSearchCV(estimator = RandomForestRegressor(), param_grid = hyperparams, 
                verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1)

gd_MLM.fit(X_MLM_train, y_MLM_train)
print(gd_MLM.best_score_)
print(gd_MLM.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
-16.026424190110806
{'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 1000}


### Grid Search Random Forest Regressor 적용 (HLM)

In [29]:
from sklearn.model_selection import GridSearchCV

n_estimators = [800, 1000, 1200]
max_depth = [10, 12, 15]
min_samples_split = [4, 5, 6]
min_samples_leaf = [4, 5, 6]

hyperparams = {'n_estimators': n_estimators, 'max_depth': max_depth, 
               'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

gd_HLM = GridSearchCV(estimator = RandomForestRegressor(), param_grid = hyperparams, 
                verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1)

gd_HLM.fit(X_HLM_train, y_HLM_train)
print(gd_HLM.best_score_)
print(gd_HLM.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
-16.51600721720511
{'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 1000}


### Grid Search XGBoost Regressor 적용 (MLM)
#### 학습 시간이 너무 오래 걸림

In [None]:
from xgboost import XGBRegressor # 회귀트리 모델
from sklearn.model_selection import GridSearchCV

hyperparams = {
    'nthread': [4],
    'learning_rate': [0.05, 0.06, 0.1, 0.15],
    'max_depth': [4, 5, 6, 7],  # 더 깊은 트리를 시도해 볼 수 있습니다.
    'min_child_weight': [3, 4, 5, 6],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'n_estimators': [150, 250, 500, 750, 1000],  # 더 많은 추정자를 시도해 볼 수 있습니다.
    'gamma': [0, 0.1, 0.2],  # gamma 파라미터 추가
    'lambda': [0, 0.1, 0.2],  # L2 규제 파라미터 추가
    'alpha': [0, 0.1, 0.2]  # L1 규제 파라미터 추가
}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

gd_MLM = GridSearchCV(estimator = XGBRegressor(), param_grid = hyperparams, 
                verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1)

gd_MLM.fit(X_MLM_train, y_MLM_train)
print(gd_MLM.best_score_)
print(gd_MLM.best_params_)

Fitting 5 folds for each of 77760 candidates, totalling 388800 fits


### Grid Search XGBoost Regressor 적용 (HLM)
#### 학습 시간이 너무 오래 걸림

In [None]:
from xgboost import XGBRegressor # 회귀트리 모델
from sklearn.model_selection import GridSearchCV

hyperparams = {
    'nthread': [4],
    'learning_rate': [0.05, 0.06, 0.1, 0.15],
    'max_depth': [4, 5, 6, 7],  # 더 깊은 트리를 시도해 볼 수 있습니다.
    'min_child_weight': [3, 4, 5, 6],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'n_estimators': [150, 250, 500, 750, 1000],  # 더 많은 추정자를 시도해 볼 수 있습니다.
    'gamma': [0, 0.1, 0.2],  # gamma 파라미터 추가
    'lambda': [0, 0.1, 0.2],  # L2 규제 파라미터 추가
    'alpha': [0, 0.1, 0.2]  # L1 규제 파라미터 추가
}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

gd_HLM = GridSearchCV(estimator = XGBRegressor(), param_grid = hyperparams, 
                verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1)

gd_HLM.fit(X_HLM_train, y_HLM_train)
print(gd_HLM.best_score_)
print(gd_HLM.best_params_)

### Grid Search DecisionTree Regressor 적용 (MLM)

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# DecisionTreeRegressor 모델을 사용하기 위한 하이퍼파라미터 그리드 정의
hyperparams = {
    'max_depth': [None, 10, 20, 30],  # 결정 트리의 최대 깊이
    'min_samples_split': [2, 5, 10],  # 노드를 분할하기 위한 최소 샘플 수
    'min_samples_leaf': [1, 2, 4]    # 리프 노드의 최소 샘플 수
}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

# GridSearchCV 설정
gd_MLM = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=hyperparams,
    verbose=True,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# 학습 데이터에 대해 GridSearchCV를 수행
gd_MLM.fit(X_MLM_train, y_MLM_train)

# 최적의 스코어와 하이퍼파라미터 출력
print(gd_MLM.best_score_)
print(gd_MLM.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
-18.500933666965388
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}


### Grid Search DecisionTree Regressor 적용 (HLM)

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# DecisionTreeRegressor 모델을 사용하기 위한 하이퍼파라미터 그리드 정의
hyperparams = {
    'max_depth': [None, 10, 20, 30],  # 결정 트리의 최대 깊이
    'min_samples_split': [2, 5, 10],  # 노드를 분할하기 위한 최소 샘플 수
    'min_samples_leaf': [1, 2, 4]    # 리프 노드의 최소 샘플 수
}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 GridSearchCV에서 사용할 수 있도록 만들 수 있습니다.
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

# GridSearchCV 설정
gd_HLM = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=hyperparams,
    verbose=True,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# 학습 데이터에 대해 GridSearchCV를 수행
gd_HLM.fit(X_HLM_train, y_HLM_train)

# 최적의 스코어와 하이퍼파라미터 출력
print(gd_HLM.best_score_)
print(gd_HLM.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
-18.84128136146975
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}


### RandomizedSearchCV XGBoost Regressor 적용 (MLM)

In [21]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from scipy.stats import randint, uniform

hyperparams = {
    'nthread': [4],
    'learning_rate': uniform(0.05, 0.15),
    'max_depth': randint(4, 8),
    'min_child_weight': randint(3, 7),
    'subsample': uniform(0.6, 0.2),
    'colsample_bytree': uniform(0.6, 0.2),
    'n_estimators': [150, 250, 500, 750, 1000],
    'gamma': uniform(0, 0.2),
    'lambda': uniform(0, 0.2),
    'alpha': uniform(0, 0.2),
    'reg_lambda': uniform(0, 1.0),  # 추가: L2 규제 파라미터
    'reg_alpha': uniform(0, 1.0),  # 추가: L1 규제 파라미터
    'scale_pos_weight': uniform(1, 10)  # 추가: 클래스 불균형 조절 파라미터
}


# RandomizedSearchCV를 사용할 때 n_iter 파라미터로 샘플링할 하이퍼파라미터 조합의 수를 지정합니다.
n_iter = 100  # 예시로 100개의 조합을 시도합니다.

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 RandomizedSearchCV에서 사용할 수 있도록 만들 수 있습니다.
from sklearn.metrics import make_scorer
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

rs_MLM = RandomizedSearchCV(estimator=XGBRegressor(), param_distributions=hyperparams,
                            n_iter=n_iter, verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1, random_state=42)

rs_MLM.fit(X_MLM_train, y_MLM_train)
print(rs_MLM.best_score_)
print(rs_MLM.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
-15.930874076105676
{'alpha': 0.0067758320493189975, 'colsample_bytree': 0.6622946342265514, 'gamma': 0.15609914150921755, 'lambda': 0.055517434525883626, 'learning_rate': 0.08301458211216456, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'nthread': 4, 'reg_alpha': 0.17803596686975143, 'reg_lambda': 0.9610703174694551, 'scale_pos_weight': 2.4866272775311296, 'subsample': 0.6829248247454047}


### RandomizedSearchCV XGBoost Regressor 적용 (HLM)

In [22]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from scipy.stats import randint, uniform

hyperparams = {
    'nthread': [4],
    'learning_rate': uniform(0.05, 0.15),
    'max_depth': randint(4, 8),
    'min_child_weight': randint(3, 7),
    'subsample': uniform(0.6, 0.2),
    'colsample_bytree': uniform(0.6, 0.2),
    'n_estimators': [150, 250, 500, 750, 1000],
    'gamma': uniform(0, 0.2),
    'lambda': uniform(0, 0.2),
    'alpha': uniform(0, 0.2),
    'reg_lambda': uniform(0, 1.0),  # 추가: L2 규제 파라미터
    'reg_alpha': uniform(0, 1.0),  # 추가: L1 규제 파라미터
    'scale_pos_weight': uniform(1, 10)  # 추가: 클래스 불균형 조절 파라미터
}


# RandomizedSearchCV를 사용할 때 n_iter 파라미터로 샘플링할 하이퍼파라미터 조합의 수를 지정합니다.
n_iter = 100  # 예시로 100개의 조합을 시도합니다.

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 RandomizedSearchCV에서 사용할 수 있도록 만들 수 있습니다.
from sklearn.metrics import make_scorer
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

rs_HLM = RandomizedSearchCV(estimator=XGBRegressor(), param_distributions=hyperparams,
                            n_iter=n_iter, verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1, random_state=42)

rs_HLM.fit(X_HLM_train, y_HLM_train)
print(rs_HLM.best_score_)
print(rs_HLM.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
-16.452423783414634
{'alpha': 0.0067758320493189975, 'colsample_bytree': 0.6622946342265514, 'gamma': 0.15609914150921755, 'lambda': 0.055517434525883626, 'learning_rate': 0.08301458211216456, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'nthread': 4, 'reg_alpha': 0.17803596686975143, 'reg_lambda': 0.9610703174694551, 'scale_pos_weight': 2.4866272775311296, 'subsample': 0.6829248247454047}


### Bayesian OptimizationCV XGBoost Regressor 적용 (MLM)

In [None]:
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import numpy as np

# 하이퍼파라미터 공간 정의
param_space = {
    'nthread': [4],
    'learning_rate': [0.01],
    'max_depth': Integer(4, 8),  # 4에서 8 사이의 정수값
    'min_child_weight': Integer(5,10),  # 3에서 7 사이의 정수값
    'subsample': Real(0.6, 0.8, prior='uniform'),  # 0.6에서 0.8 사이의 실수값
    'colsample_bytree': Real(0.6, 0.8, prior='uniform'),  # 0.6에서 0.8 사이의 실수값
    'n_estimators': Categorical([50, 100, 150, 250, 500]),  # 주어진 후보값 중 하나 선택
    'gamma': Real(0, 0.5, prior='uniform'),  # 0에서 0.2 사이의 실수값
    'lambda': Real(0, 0.2, prior='uniform'),  # 0에서 0.2 사이의 실수값
    'alpha': Real(0, 0.2, prior='uniform'),  # 0에서 0.2 사이의 실수값
    'reg_lambda': Real(0.5, 2.0, prior='uniform'),  # 추가: L2 규제 파라미터
    'reg_alpha': Real(0, 1.0, prior='uniform'),  # 추가: L1 규제 파라미터
    'scale_pos_weight': Real(1, 10, prior='uniform')  # 추가: 클래스 불균형 조절 파라미터
}

# RMSE를 사용하는 스코어 함수 정의
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

# Bayesian Optimization을 위한 BayesSearchCV 객체 생성
bo_MLM = BayesSearchCV(
    XGBRegressor(),
    param_space,
    n_iter=500,  # 예시로 100번 탐색
    cv=5,
    scoring=rmse_scorer,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Bayesian Optimization 수행
bo_MLM.fit(X_MLM_train, y_MLM_train)

# 최적의 하이퍼파라미터와 스코어 출력
print(bo_MLM.best_params_)
print(bo_MLM.best_score_)


### Bayesian OptimizationCV XGBoost Regressor 적용 (HLM)

In [23]:
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import numpy as np

# 하이퍼파라미터 공간 정의
param_space = {
    'nthread': [4],
    'learning_rate': [0.01],
    'max_depth': Integer(2, 6),  # 4에서 8 사이의 정수값
    'min_child_weight': Integer(1, 5),  # 3에서 7 사이의 정수값
    'subsample': Real(0.5, 1.0, prior='uniform'),  # 0.6에서 0.8 사이의 실수값
    'colsample_bytree': Real(0.1, 0.6, prior='uniform'),  # 0.6에서 0.8 사이의 실수값
    'n_estimators': Categorical([150, 250, 500, 750]),  # 주어진 후보값 중 하나 선택
    'gamma': Real(0, 0.2, prior='uniform'),  # 0에서 0.2 사이의 실수값
    'lambda': Real(0.2, 0.5, prior='uniform'),  # 0에서 0.2 사이의 실수값
    'alpha': Real(0, 0.25, prior='uniform'),  # 0에서 0.2 사이의 실수값
    'reg_lambda': Real(1.5, 3, prior='uniform'),  # 추가: L2 규제 파라미터
    'reg_alpha': Real(0, 2.0, prior='uniform'),  # 추가: L1 규제 파라미터
    'scale_pos_weight': Real(1, 10, prior='uniform')  # 추가: 클래스 불균형 조절 파라미터
}

# RMSE를 사용하는 스코어 함수 정의
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

# Bayesian Optimization을 위한 BayesSearchCV 객체 생성
bo_HLM = BayesSearchCV(
    XGBRegressor(),
    param_space,
    n_iter=500,  # 예시로 100번 탐색
    cv=5,
    scoring=rmse_scorer,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Bayesian Optimization 수행
bo_HLM.fit(X_HLM_train, y_HLM_train)

# 최적의 하이퍼파라미터와 스코어 출력
print(bo_HLM.best_params_)
print(bo_HLM.best_score_)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

### RandomizedSearchCV Random Forest Regressor 적용 (MLM)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np

# 하이퍼파라미터 공간 정의
hyperparams = {
    'n_estimators': [800, 1000, 1200],
    'max_depth': [10, 12, 15],
    'min_samples_split': randint(2, 10),  # 2에서 9까지 무작위 정수값
    'min_samples_leaf': randint(1, 10),  # 1에서 9까지 무작위 정수값
    'max_features': ['auto', 'sqrt', 'log2'],  # 추가: 최대 특성 수 설정
    'bootstrap': [True, False]  # 추가: 부트스트랩 샘플링 여부
}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 RandomizedSearchCV에서 사용할 수 있도록 만들 수 있습니다.
from sklearn.metrics import make_scorer
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

rs_MLM = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=hyperparams,
                            n_iter=100, verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1, random_state=42)

rs_MLM.fit(X_MLM_train, y_MLM_train)

print(rs_MLM.best_score_)
print(rs_MLM.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


-16.01862387170444
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 800}


### RandomizedSearchCV Random Forest Regressor 적용 (HLM)

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np

# 하이퍼파라미터 공간 정의
hyperparams = {
    'n_estimators': [800, 1000, 1200],
    'max_depth': [10, 12, 15],
    'min_samples_split': randint(2, 10),  # 2에서 9까지 무작위 정수값
    'min_samples_leaf': randint(1, 10),  # 1에서 9까지 무작위 정수값
    'max_features': ['auto', 'sqrt', 'log2'],  # 추가: 최대 특성 수 설정
    'bootstrap': [True, False]  # 추가: 부트스트랩 샘플링 여부
}

# RMSE를 사용하는 스코어 함수 생성
def custom_scorer(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return 0.5 * rmse

# 이 스코어 함수를 make_scorer 함수를 사용하여 Scikit-Learn의 교차 검증과 RandomizedSearchCV에서 사용할 수 있도록 만들 수 있습니다.
from sklearn.metrics import make_scorer
rmse_scorer = make_scorer(custom_scorer, greater_is_better=False)

rs_HLM = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=hyperparams,
                            n_iter=100, verbose=True, scoring=rmse_scorer, cv=5, n_jobs=-1, random_state=42)

rs_HLM.fit(X_HLM_train, y_HLM_train)

print(rs_HLM.best_score_)
print(rs_HLM.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


-16.525128565533727
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 800}


### 예측값 구하기

In [24]:
test_MLM = CustomDataset(test, target_col=None, transform=transform, is_test=True)
test_HLM = CustomDataset(test, target_col=None, transform=transform, is_test=True)

X_MLM_test = test_MLM.data
X_HLM_test = test_HLM.data

In [25]:
# GridSearchCV를 사용하여 최적 하이퍼파라미터로 모델 학습 (이미 위에서 수행한 코드)

# 최적 모델 선택
best_MLM_model = bo_MLM.best_estimator_
best_HLM_model = bo_HLM.best_estimator_

# 테스트 데이터로 예측 생성
y_MLM_pred = best_MLM_model.predict(X_MLM_test)
y_HLM_pred = best_HLM_model.predict(X_HLM_test)

# 예측 결과를 저장하거나 활용
y_HLM_pred
y_MLM_pred

array([29.652998, 56.83143 , 37.777054, 34.46736 , 48.53841 , 45.679665,
       29.04333 , 38.90979 , 23.597385, 17.483755, 25.007118, 36.940075,
       54.28212 , 39.652245, 21.603998, 46.987217, 28.553984, 44.522667,
       72.812744, 56.491646, 38.050568, 20.172144, 23.505802, 32.832787,
       22.395302, 41.534   , 52.661   , 51.44407 , 40.495014, 28.40952 ,
       55.69158 , 28.978437, 24.304834, 40.47987 , 28.664436, 29.357706,
       24.379988, 42.189827, 36.83881 , 26.015606, 33.466263, 39.401855,
       20.164087, 18.094997, 38.479942, 62.22133 , 22.052124, 17.895939,
       52.115643, 43.489086, 56.505386, 23.974918, 28.632784, 25.273432,
       20.987068, 29.286604, 39.08383 , 24.243479, 35.31363 , 41.290756,
       27.403435, 43.1678  , 37.638577, 41.425705, 44.132397, 49.552578,
       34.469307, 49.00883 , 42.06248 , 36.952213, 31.956676, 32.434025,
       40.551384, 35.79158 , 23.8302  , 48.154175, 34.179146, 31.361357,
       43.68954 , 36.927902, 43.115273, 65.44331 , 

In [26]:
submission = pd.read_csv('./data/sample_submission.csv')

submission['MLM'] = y_MLM_pred
submission['HLM'] = y_HLM_pred
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,29.652998,52.105209
1,TEST_001,56.831429,73.295517
2,TEST_002,37.777054,53.823582
3,TEST_003,34.467361,59.225586
4,TEST_004,48.538410,63.683804
...,...,...,...
478,TEST_478,35.170895,53.322693
479,TEST_479,64.495758,76.954536
480,TEST_480,22.845003,39.991165
481,TEST_481,56.439724,71.295723


In [27]:
submission.to_csv('./submissions/XGBoost_submission_6.csv', index=False)