In [2]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

In [5]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [6]:
import os

print(os.getcwd())

/home/wjstjrals417/Dacon/New_drug_development/Seokmin/src/baseline


In [7]:
from os.path import join
train_path = join('..', '..', '..', 'data', 'total_data', 'train.csv')

In [8]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv(train_path)  # 예시 파일 이름
chembl_data.head()

train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [9]:
# 랜덤 포레스트 모델 학습
model = RandomForestRegressor(random_state=CFG['SEED'])
model.fit(train_x, train_y)

In [10]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

# Validation 데이터로부터의 학습 모델 평가
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

In [11]:
mse

4707069.1709900815

In [12]:
rmse

2169.5781089857264

In [13]:
# Validation 데이터로부터의 학습 모델 평가
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 2169.5781089857264


In [15]:
test_path = join('..', '..', 'data', 'total_data', 'test.csv')
submission_path = join('..', '..', 'data', 'total_data', 'sample_submission.csv')

In [16]:
test = pd.read_csv(test_path)
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

test_y_pred = model.predict(test_x)

submit = pd.read_csv(submission_path)
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

submit.to_csv('./baseline_submit.csv', index=False)



### 평가 코드

In [14]:
import numpy as np

def calculate_score(y_true, y_pred):
    # IC50(nM) to pIC50 변환
    def to_pIC50(IC50):
        return -np.log10(IC50 * 1e-9)
    
    y_true_pIC50 = to_pIC50(y_true)
    y_pred_pIC50 = to_pIC50(y_pred)
    
    # Normalized RMSE 계산
    rmse = np.sqrt(np.mean((y_true - y_pred)**2))
    normalized_rmse = rmse / (np.max(y_true) - np.min(y_true))
    
    # Correct Ratio 계산
    absolute_errors_pIC50 = np.abs(y_true_pIC50 - y_pred_pIC50)
    correct_ratio = np.mean(absolute_errors_pIC50 <= 0.5)
    
    # 최종 점수 계산
    A = normalized_rmse
    B = correct_ratio
    score = 0.5 * (1 - min(A, 1)) + 0.5 * B
    
    return score, normalized_rmse, correct_ratio

# 사용 예시
y_true = np.array([100, 200, 300, 400, 500])  # 실제 IC50(nM) 값
y_pred = np.array([110, 190, 280, 420, 510])  # 예측된 IC50(nM) 값

final_score, normalized_rmse, correct_ratio = calculate_score(y_true, y_pred)

print(f"Normalized RMSE: {normalized_rmse:.4f}")
print(f"Correct Ratio: {correct_ratio:.4f}")
print(f"Final Score: {final_score:.4f}")

Normalized RMSE: 0.0371
Correct Ratio: 1.0000
Final Score: 0.9815


In [17]:
final_score, normalized_rmse, correct_ratio = calculate_score(val_y, val_y_pred)

In [18]:
print(f"Normalized RMSE: {normalized_rmse:.4f}")
print(f"Correct Ratio: {correct_ratio:.4f}")
print(f"Final Score: {final_score:.4f}")

Normalized RMSE: 0.1087
Correct Ratio: 1.0000
Final Score: 0.9457
