### Import

In [15]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import os
import random

In [2]:
CFG = {
    'NBITS': 2048,
    'SEED': 42
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED'])

### Data Load

In [4]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [5]:
def IC50_to_pIC50(ic50_nM):
    ic50_nM = np.clip(ic50_nM, 1e-10, None)
    return 9 - np.log10(ic50_nM)

In [7]:
chembl = pd.read_csv("../data/250708_raw/ChEMBL_ASK1(IC50).csv", sep=';')
pubchem = pd.read_csv("../data/250708_raw/Pubchem_ASK1.csv")

  pubchem = pd.read_csv("../data/250708_raw/Pubchem_ASK1.csv")


### Data Preprocessing

In [8]:
chembl.columns = chembl.columns.str.strip().str.replace('"', '')
chembl = chembl[chembl['Standard Type'] == 'IC50'] 
chembl = chembl[['Smiles', 'Standard Value']].rename(columns={'Smiles': 'smiles', 'Standard Value': 'ic50_nM'}).dropna()
chembl['ic50_nM'] = pd.to_numeric(chembl['ic50_nM'], errors='coerce')
chembl['pIC50'] = IC50_to_pIC50(chembl['ic50_nM'])

In [9]:
pubchem = pubchem[['SMILES', 'Activity_Value']].rename(columns={'SMILES': 'smiles', 'Activity_Value': 'ic50_nM'}).dropna()
pubchem['ic50_nM'] = pd.to_numeric(pubchem['ic50_nM'], errors='coerce')
pubchem['pIC50'] = IC50_to_pIC50(pubchem['ic50_nM'])

In [10]:
total = pd.concat([chembl, pubchem], ignore_index=True)
total = total.drop_duplicates(subset='smiles')
total = total[total['ic50_nM'] > 0].dropna()

In [11]:
total['Fingerprint'] = total['smiles'].apply(smiles_to_fingerprint)
total = total[total['Fingerprint'].notnull()]
X = np.stack(total['Fingerprint'].values)
y = total['pIC50'].values



In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=CFG['SEED'])

### Train

In [13]:
model = RandomForestRegressor(random_state=CFG['SEED'])
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
y_val_pred = model.predict(X_val)
rmse = root_mean_squared_error(IC50_to_pIC50(y_val), IC50_to_pIC50(y_val_pred))
print(f"\n Validation RMSE (IC50 scale): {rmse:.4f}\n")


 Validation RMSE (IC50 scale): 0.1077



### Predict

In [17]:
def pIC50_to_IC50(pIC50): 
    return 10 ** (9 - pIC50)

In [19]:
test = pd.read_csv("../data/250708_raw/test.csv") 

In [20]:
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)
test = test[test['Fingerprint'].notnull()]

X_test = np.stack(test['Fingerprint'].values)
test['pIC50_pred'] = model.predict(X_test)
test['ASK1_IC50_nM'] = pIC50_to_IC50(test['pIC50_pred'])



### Submission

In [21]:
submission = pd.read_csv('../data/250708_raw/sample_submission.csv') 

In [22]:
submission['ASK1_IC50_nM'] = test['ASK1_IC50_nM']

In [25]:
submission.to_csv("../submits/250709_baseline_submit.csv", index=False)