In [2]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import missingno

# device 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)

seed = 42 # seed 값 설정
random.seed(seed) # 파이썬 난수 생성기
os.environ['PYTHONHASHSEED'] = str(seed) # 해시 시크릿값 고정
np.random.seed(seed) # 넘파이 난수 생성기

torch.manual_seed(seed) # 파이토치 CPU 난수 생성기
torch.backends.cudnn.deterministic = True # 확정적 연산 사용 설정
torch.backends.cudnn.benchmark = False   # 벤치마크 기능 사용 해제
torch.backends.cudnn.enabled = False        # cudnn 기능 사용 해제

if device == 'cuda':
    torch.cuda.manual_seed(seed) # 파이토치 GPU 난수 생성기
    torch.cuda.manual_seed_all(seed) # 파이토치 멀티 GPU 난수 생성기

cpu


In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [4]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [38]:
test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,logP,apka,num_rotatable_bonds,num_heteroatoms,num_hydrogen_acceptors,num_hydrogen_donors,morgan_fingerprint
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,2.43160,361.515,7,7,7,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,1.82520,370.405,3,7,6,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86,3.27051,347.422,3,6,3,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,2.03830,345.366,2,8,7,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,1.27232,353.426,2,7,6,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13,3.81860,306.450,7,4,3,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16,0.01480,335.404,1,7,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ..."
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72,2.32600,349.390,3,6,3,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64,2.24480,341.136,2,7,4,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
!pip install rdkit-pypi



In [7]:
!pip install lightgbm



In [8]:
!pip install xgboost



In [9]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

def calculate_metabolic_stability_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    logP = Descriptors.MolLogP(mol)
    # 화합물의 친유성을 측정한 것으로 지질 또는 비극성 환경에서의 용해도를 나타냅니다. 생물학적 막을 통과하는 화합물의 능력을 반영합니다.
    apka = Descriptors.MolWt(mol)
    # 화합물의 산 해리 상수의 추정치로 다양한 pH 조건에서 이온화 거동에 대한 정보를 제공합니다.
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    # 화합물에서 회전 가능한 결합의 수입니다. 이것은 화합물의 유연성과 효소 또는 다른 분자와의 잠재적인 상호 작용에 대한 통찰력을 제공할 수 있습니다.
    num_heteroatoms = Descriptors.NumHeteroatoms(mol)
    # 분자 내 헤테로원자(탄소 및 수소 이외의 원자) 수. 이는 화합물의 반응성과 대사 안정성에 영향을 줄 수 있습니다.
    num_hydrogen_acceptors = Descriptors.NumHAcceptors(mol)
    # 분자 내 수소 결합 수용체의 수. 이들은 결합 및 반응성에 영향을 미치는 다른 분자의 수소 결합 기증자와 상호 작용할 수 있는 사이트입니다.
    num_hydrogen_donors = Descriptors.NumHDonors(mol)
    # 분자 내 수소 결합 기증자의 수입니다. 이들은 수소 결합 상호작용에서 수소 원자를 제공할 수 있는 사이트입니다.
    # morgan_fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    morgan_fingerprint = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    # 분자 하위 구조의 이진 벡터 표현입니다. 이 열에는 화합물과 효소의 상호 작용 및 대사 안정성에 영향을 줄 수 있는 구조적 특징을 포착하는 이진 지문이 포함되어 있습니다.
    morgan_array = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(morgan_fingerprint, morgan_array)

    return logP, apka, num_rotatable_bonds, num_heteroatoms, num_hydrogen_acceptors, num_hydrogen_donors, morgan_array

train[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint'
]] = train['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

test[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint'
]] = test['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

train


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,logP,apka,num_rotatable_bonds,num_heteroatoms,num_hydrogen_acceptors,num_hydrogen_donors,morgan_fingerprint
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37,3.87744,400.504,8,8,6,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47,3.35474,301.415,2,5,4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,1.20450,297.366,3,7,7,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60,3.89356,494.665,5,9,7,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43,2.81772,268.316,1,4,3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74,2.74730,396.200,4,11,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37,2.27630,359.389,3,7,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14,2.04130,261.325,5,5,5,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51,1.42720,284.699,4,7,6,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [10]:
train['AlogP'].fillna(train['AlogP'].median(), inplace=True)
test['AlogP'].fillna(test['AlogP'].median(), inplace=True)

In [11]:
class CustomDataset(Dataset):
    def __init__(self, data, target_col=None, transform=None, is_test=False):
        self.is_test = is_test
        self.transform = transform
        self.is_test = is_test

        if not self.is_test:
            self.data = self.transform.fit_transform(np.stack(data['morgan_fingerprint']))
        else: # test
            self.data = self.transform.transform(np.stack(data['morgan_fingerprint']))

        if target_col is not None and not self.is_test:
            self.target = data[target_col]

    def __getitem__(self, index):
        features = self.data[index]

        if hasattr(self, 'target'):
            target = self.target[index]
            return torch.tensor(features).to(device).float(), torch.tensor(target).to(device).float().unsqueeze(dim=-1)
        else:
            return torch.tensor(features).to(device).float()

    def __len__(self):
        return len(self.data)


In [12]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = CustomDataset(train, target_col='MLM', transform=transform, is_test=False)
train_HLM = CustomDataset(train, target_col='HLM', transform=transform, is_test=False)

251

In [36]:
len(train_HLM.target)

3498

### 디폴트 테스트
> 하이퍼 파라미터 튜닝을 통해 각 머신러닝 모델을 보다 최적화할 수 있지만, 우선 디폴트값으로 바로 예측 가능

In [14]:
import numpy as np # 각 모델에서 내부적으로 관련 라이브러리 사용 가능
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Define regression models
knn_model = KNeighborsRegressor()
linear_model = LinearRegression()
svr_model = SVR()
decision_model = DecisionTreeRegressor()
random_model = RandomForestRegressor()
extra_model = ExtraTreesRegressor()
gbm_model = GradientBoostingRegressor()
xgb_model = XGBRegressor(eval_metric='rmse')
lgbm_model = LGBMRegressor()

# Assuming train_MLM is the dataset you want to use
X_MLM_train = train_MLM.data
y_MLM_train = train_MLM.target

X_HLM_train = train_HLM.data
y_HLM_train = train_HLM.target

# Define the regression models in a list
regression_models = [
    knn_model,
    linear_model,
    svr_model,
    decision_model,
    random_model,
    extra_model,
    gbm_model,
    xgb_model,
    lgbm_model
]

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
results = dict()

# RMSE를 사용하는 스코어 함수 생성
rmse_scorer = make_scorer(lambda y_true, y_pred: -np.sqrt(np.mean((y_true - y_pred) ** 2)), greater_is_better=False)

# Training and evaluating regression models
for alg in regression_models:
    alg.fit(X_MLM_train, y_MLM_train)
    score = cross_val_score(alg, X_MLM_train, y_MLM_train, cv=k_fold, scoring=rmse_scorer)
    results[alg.__class__.__name__] = -np.mean(score)  # Negative MSE for evaluation


In [15]:
results

{'KNeighborsRegressor': 1258.438592820784,
 'LinearRegression': 1073.6081399593293,
 'SVR': 1213.1768267499413,
 'DecisionTreeRegressor': 1983.7823636885319,
 'RandomForestRegressor': 1038.6077412060572,
 'ExtraTreesRegressor': 1077.6726822720034,
 'GradientBoostingRegressor': 1036.3051223493233,
 'XGBRegressor': 1102.698864141333,
 'LGBMRegressor': 1034.2047848480465}

### 정확도 높은 순으로 정렬하기

In [16]:
sorted(results.items(), key=lambda x: x[1], reverse=True) # reverse=True 면 높은 순서대로 정렬

[('DecisionTreeRegressor', 1983.7823636885319),
 ('KNeighborsRegressor', 1258.438592820784),
 ('SVR', 1213.1768267499413),
 ('XGBRegressor', 1102.698864141333),
 ('ExtraTreesRegressor', 1077.6726822720034),
 ('LinearRegression', 1073.6081399593293),
 ('RandomForestRegressor', 1038.6077412060572),
 ('GradientBoostingRegressor', 1036.3051223493233),
 ('LGBMRegressor', 1034.2047848480465)]

### Grid Search XGBoost 적용 1단계 (MLM)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

learning_rate = [0.001, 0.005, 0.01, 0.05, 0.06, 0.1, 0.12, 0.15, 0.17, 0.2]
n_estimators = [10, 50, 60, 75, 85, 100, 125, 150, 200, 250, 500, 1000]

hyperparams = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators
}

# RMSE를 사용하는 스코어 함수 생성
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred) ** 2)), greater_is_better=False)

gd_MLM = GridSearchCV(
    estimator = XGBRegressor(random_state=1),
    param_grid = hyperparams,
    verbose=True,
    cv=5,
    scoring = rmse_scorer,
    n_jobs=-1
)

gd_MLM.fit(X_MLM_train, y_MLM_train)
print(gd_MLM.best_score_)
print(gd_MLM.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
-32.01350880628669
{'learning_rate': 0.06, 'n_estimators': 150}


### Grid Search XGBoost 적용 2단계 (MLM)

In [None]:
max_depth = [3, 4, 5, 6, 7, 8, 9, 10]
min_child_weight = [1, 2, 3, 4, 5, 6, 7]

hyperparams = {
    'max_depth': max_depth,
    'min_child_weight': min_child_weight
}

gd_MLM=GridSearchCV(
    estimator = XGBRegressor(learning_rate=0.06, n_estimators=150, random_state=1),
    param_grid = hyperparams,
    verbose=True,
    cv=5,
    scoring = rmse_scorer,
    n_jobs=-1
)

gd.fit(X_MLM_train, y_MLM_train)
print(gd_MLM.best_score_)
print(gd_MLM.best_params_)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
-31.946058242263472
{'max_depth': 5, 'min_child_weight': 6}


### Grid Search XGBoost 적용 3단계 (MLM)

In [17]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

gamma =  [i*0.1 for i in range(0,5)]
subsample = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
colsample_bytree = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
reg_alpha = [1e-5, 1e-2, 0.1, 1, 100]

hyperparams = {
    'gamma': gamma,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'reg_alpha': reg_alpha
}


# RMSE를 사용하는 스코어 함수 생성
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred) ** 2)), greater_is_better=False)

gd_MLM = GridSearchCV(
    estimator = XGBRegressor(
        learning_rate=0.06,
        n_estimators=150,
        max_depth=5,
        min_child_weight=6,
        random_state=1,
    ),
    param_grid = hyperparams,
    verbose=True,
    cv=5,
    scoring = rmse_scorer,
    n_jobs=-1
)

gd_MLM.fit(X_MLM_train, y_MLM_train)
print(gd_MLM.best_score_)
print(gd_MLM.best_params_)

Fitting 5 folds for each of 2025 candidates, totalling 10125 fits
-31.585677859861192
{'colsample_bytree': 0.6, 'gamma': 0.0, 'reg_alpha': 0.01, 'subsample': 0.6}


### Grid Search XGBoost 적용 1단계 (HLM)

In [18]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

learning_rate = [0.001, 0.005, 0.01, 0.05, 0.06, 0.1, 0.12, 0.15, 0.17, 0.2]
n_estimators = [10, 50, 60, 75, 85, 100, 125, 150, 200, 250, 500, 1000]

hyperparams = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators
}

# RMSE를 사용하는 스코어 함수 생성
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred) ** 2)), greater_is_better=False)

gd_HLM = GridSearchCV(
    estimator = XGBRegressor(random_state=1),
    param_grid = hyperparams,
    verbose=True,
    cv=5,
    scoring = rmse_scorer,
    n_jobs=-1
)

gd_HLM.fit(X_HLM_train, y_HLM_train)
print(gd_HLM.best_score_)
print(gd_HLM.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
-33.15123900352593
{'learning_rate': 0.005, 'n_estimators': 1000}


### Grid Search XGBoost 적용 2단계 (HLM)

In [19]:
max_depth = [3, 4, 5, 6, 7, 8, 9, 10]
min_child_weight = [1, 2, 3, 4, 5, 6, 7]

hyperparams = {
    'max_depth': max_depth,
    'min_child_weight': min_child_weight
}

gd_HLM = GridSearchCV(
    estimator = XGBRegressor(learning_rate=0.005, n_estimators=1000, random_state=1),
    param_grid = hyperparams,
    verbose=True,
    cv=5,
    scoring = rmse_scorer,
    n_jobs=-1
)

gd_HLM.fit(X_HLM_train, y_HLM_train)
print(gd_HLM.best_score_)
print(gd_HLM.best_params_)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
-33.0972776749168
{'max_depth': 7, 'min_child_weight': 2}


### Grid Search XGBoost 적용 3단계 (HLM)

In [20]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

gamma =  [i*0.1 for i in range(0,5)]
subsample = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
colsample_bytree = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
reg_alpha = [1e-5, 1e-2, 0.1, 1, 100]

hyperparams = {
    'gamma': gamma,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'reg_alpha': reg_alpha
}


# RMSE를 사용하는 스코어 함수 생성
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred) ** 2)), greater_is_better=False)

gd_HLM = GridSearchCV(
    estimator = XGBRegressor(
        learning_rate=0.06,
        n_estimators=150,
        max_depth=7,
        min_child_weight=2,
        random_state=1,
    ),
    param_grid = hyperparams,
    verbose=True,
    cv=5,
    scoring = rmse_scorer,
    n_jobs=-1
)

gd_HLM.fit(X_HLM_train, y_HLM_train)
print(gd_HLM.best_score_)
print(gd_HLM.best_params_)

Fitting 5 folds for each of 2025 candidates, totalling 10125 fits
-32.72859872046245
{'colsample_bytree': 0.7, 'gamma': 0.1, 'reg_alpha': 100, 'subsample': 0.7}


In [39]:
test_MLM = CustomDataset(test, target_col=None, transform=transform, is_test=True)
test_HLM = CustomDataset(test, target_col=None, transform=transform, is_test=True)

X_MLM_test = test_MLM.data
X_HLM_test = test_HLM.data


In [41]:
# GridSearchCV를 사용하여 최적 하이퍼파라미터로 모델 학습 (이미 위에서 수행한 코드)

# 최적 모델 선택
best_MLM_model = gd_MLM.best_estimator_
best_HLM_model = gd_HLM.best_estimator_

# 테스트 데이터로 예측 생성
y_MLM_pred = best_MLM_model.predict(X_MLM_test)
y_HLM_pred = best_HLM_model.predict(X_HLM_test)

# 예측 결과를 저장하거나 활용
y_MLM_pred

array([32.498413 , 63.314526 , 37.307056 , 35.736782 , 41.19404  ,
       45.544655 , 23.965065 , 36.39433  , 28.369818 , 17.321465 ,
       21.565899 , 30.368078 , 51.260845 , 32.924393 , 18.88596  ,
       52.834156 , 40.758064 , 46.372005 , 72.88903  , 52.177605 ,
       37.607136 , 25.966589 , 21.209915 , 27.133661 , 19.061777 ,
       37.20327  , 45.224037 , 57.51753  , 36.481083 , 32.632557 ,
       49.99327  , 29.83285  , 26.331156 , 37.47392  , 29.131905 ,
       31.899126 , 28.01601  , 43.017982 , 37.591118 , 23.26043  ,
       30.645182 , 39.672993 , 23.947403 , 18.986769 , 38.94439  ,
       64.95583  , 16.309208 , 19.078386 , 52.206207 , 50.112747 ,
       50.502758 , 22.751413 , 30.521042 , 29.349707 , 17.601301 ,
       25.54568  , 40.728558 , 28.332035 , 35.1954   , 37.992767 ,
       30.40296  , 37.890137 , 41.09887  , 44.906826 , 37.282154 ,
       43.370865 , 35.82449  , 44.237206 , 42.63974  , 34.995277 ,
       31.151976 , 28.1848   , 42.744133 , 36.26829  , 33.1960

In [44]:
submission = pd.read_csv('./data/sample_submission.csv')

submission['MLM'] = y_MLM_pred
submission['HLM'] = y_HLM_pred
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,32.498413,55.957294
1,TEST_001,63.314526,74.960510
2,TEST_002,37.307056,52.804085
3,TEST_003,35.736782,62.331223
4,TEST_004,41.194038,66.093102
...,...,...,...
478,TEST_478,39.669918,56.745899
479,TEST_479,65.164055,77.949089
480,TEST_480,20.394632,42.223988
481,TEST_481,52.609375,68.454201


In [45]:
submission.to_csv('./submissions/XGBoost_submission_1.csv', index=False)