In [None]:
import os

print(os.path.abspath(''))
os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('')), ''))

In [40]:
# 필요한 라이브러리 설치 (필요시)
# !pip install xgboost scikit-learn
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import sys
import os

# 현재 노트북의 디렉터리를 기준으로 루트 디렉터리 경로를 추가
project_root = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('')), ''))
print(project_root)
sys.path.append(project_root)

# 이제 모듈을 임포트할 수 있습니다
from dataset import SimpleDNNPreprocess, SimpleDNNDataset
from utils import set_seed

CFG = {
    'NBITS':2048,
    'SEED':42,
}

set_seed(CFG['SEED'])

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

# 1. 데이터 생성 또는 로드
# 예시 데이터셋 생성 (회귀 문제용)
# X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)
chembl_data = pd.read_csv('../data/train.csv')  # 예시 파일 이름

train = chembl_data[['Smiles', 'pIC50']]
# train = chembl_data[['Smiles', 'IC50_nM']] # 실험
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values
# train_y = train['IC50_nM'].values # 실험

# 학습 및 검증 데이터 분리
# train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)
preprocess = SimpleDNNPreprocess('../data')
train_df, valid_df, test_df = preprocess.split(.3)
train_x, train_y = np.stack(train_df['baseline_fingerprint'].values), train_df['pIC50'].values
val_x, val_y = np.stack(valid_df['baseline_fingerprint'].values), valid_df['pIC50'].values

# 3. XGBoost 회귀 모델 생성 및 학습
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5)

# 모델 학습
model.fit(train_x, train_y)

# 4. 추론 (테스트 데이터에 대한 예측)
y_pred = model.predict(val_x)

# 5. 모델 성능 평가 (평균 제곱 오차)
mse = mean_squared_error(val_y, y_pred)
rmse = mse ** 0.5

print(f"테스트 데이터에 대한 RMSE: {rmse:.4f}")


[32m2024-09-11 10:02:03.462[0m | [1mINFO    [0m | [36mutils[0m:[36mset_seed[0m:[36m14[0m - [1m[utils] set seed as 42...[0m


/Users/jsh/Projects/dacon/ic50-prediction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)
[32m2024-09-11 10:02:07.382[0m | [1mINFO    [0m | [36mdataset[0m:[36m_load_datas[0m:[36m30[0m - [1m[Preprocess] start loading datas...[0m
[32m2024-09-11 10:02:07.409[0m | [1mINFO    [0m | [36mdataset[0m:[36m_load_datas[0m:[36m33[0m - [1m[Preprocess] end loading datas...[0m
[32m2024-09-11 10:02:07.413[0m | [1mINFO    [0m | [36mdataset[0m:[36m_preprocess[0m:[36m151[0m - [1m[SimpleDNNPreprocess] baseline fingerprint...[0m
[32m2024-09-11 10:02:11.622[0m | [1mINFO    [0m | [36mdataset[0m:[36m_preprocess[0m:[36m163[0m - [1m[SimpleDNNPreprocess] Transform to Morgan Embedding...[0m
[32m2024-09-11 10:02:12.466[0m | [1

테스트 데이터에 대한 RMSE: 0.6579


In [None]:
# 6. test inference
test = pd.read_csv('../data/test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

test_y_pred = model.predict(test_x)

In [None]:
submit = pd.read_csv('../data/sample_submission.csv')
# submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit['IC50_nM'] = test_y_pred
submit.head()

In [None]:
submit.to_csv('../data/submissions/xgboost.csv', index=False)