In [4]:
import pandas as pd, numpy as np, random, os
from rdkit import Chem  # 분자 구조 처리
from rdkit.Chem import AllChem, DataStructs, Descriptors  # 분자 특징 추출
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna

In [5]:
# Notebook-wide settings, kept in one place so they are easy to tune.
CFG = dict(
    NBITS=2048,    # passed as nBits when building Morgan fingerprints
    SEED=42,       # handed to seed_everything() for reproducibility
    N_SPLITES=5,   # presumably the number of K-fold splits -- TODO confirm against the CV cell
    N_TRIALS=50,   # presumably the number of Optuna trials -- TODO confirm against the tuning cell
)

In [6]:
def seed_everything(seed):
    """Seed the random number sources used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Seed value passed unchanged to ``random.seed`` and
            ``np.random.seed``.

    Returns:
        None.
    """
    hash_seed = str(seed)
    random.seed(seed)
    # NOTE: per the Python docs, PYTHONHASHSEED is read at interpreter start-up,
    # so this assignment affects child processes, not the hash randomization of
    # the kernel that is already running.
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)
    
# Apply the configured seed once, before any randomized work runs.
seed_everything(CFG['SEED'])

In [7]:
def load_and_preprocess_data(chembl_path='data/ChEMBL_ASK1(IC50).csv',
                             pubchem_path='data/Pubchem_ASK1.csv'):
    """Load the ChEMBL and PubChem activity tables and merge them.

    Both sources are reduced to two columns, ``smiles`` and ``ic50_nM``,
    concatenated (ChEMBL rows first), and cleaned:

    1. rows with a missing SMILES or a non-numeric / missing value are dropped;
    2. rows whose value is not strictly positive are dropped;
    3. duplicate SMILES are collapsed, keeping the first remaining occurrence;
    4. the index is reset so the result is numbered 0..n-1.

    Args:
        chembl_path: Path to the ChEMBL CSV. It must provide the columns
            ``Smiles``, ``Standard Type`` and ``Standard Value`` (after
            header clean-up).
        pubchem_path: Path to the PubChem CSV. It must provide the columns
            ``SMILES`` and ``Activity_Value``.

    Returns:
        pandas.DataFrame with columns ``smiles`` and ``ic50_nM`` and a
        contiguous ``RangeIndex``.

    Raises:
        FileNotFoundError: If either CSV file does not exist.
        KeyError: If an expected column is missing from a file.
    """
    chembl = pd.read_csv(chembl_path)
    pubchem = pd.read_csv(pubchem_path)

    # Normalize ChEMBL headers: trim whitespace and remove stray commas.
    # The previous call omitted the replacement argument, which raises a
    # TypeError; regex=False keeps the pattern a literal string.
    # NOTE(review): if the intent was to strip stray double quotes from the
    # export's headers instead, change the pattern -- confirm against the raw file.
    chembl.columns = chembl.columns.str.strip().str.replace(",", "", regex=False)

    # Keep only IC50 measurements, then project to the shared two-column schema.
    chembl = (
        chembl[chembl['Standard Type'] == 'IC50'][['Smiles', 'Standard Value']]
        .rename(columns={'Smiles': 'smiles', 'Standard Value': 'ic50_nM'})
    )
    chembl['ic50_nM'] = pd.to_numeric(chembl['ic50_nM'], errors='coerce')

    pubchem = pubchem[['SMILES', 'Activity_Value']].rename(
        columns={'SMILES': 'smiles', 'Activity_Value': 'ic50_nM'}
    )
    pubchem['ic50_nM'] = pd.to_numeric(pubchem['ic50_nM'], errors='coerce')

    df = pd.concat([chembl, pubchem], ignore_index=True).dropna(subset=['smiles', 'ic50_nM'])

    # Filter invalid values BEFORE de-duplicating so that a non-positive record
    # cannot shadow a later valid record for the same SMILES.
    df = df[df['ic50_nM'] > 0]

    # Reset the index last so the returned frame has no gaps left by filtering.
    df = df.drop_duplicates(subset='smiles').reset_index(drop=True)
    return df
    

In [None]:
def smiles_to_fingerprint(smiles):
    """Turn a SMILES string into a Morgan fingerprint array.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        A NumPy array filled from the fingerprint bit vector (radius 2,
        ``CFG['NBITS']`` bits), or ``None`` when RDKit returns no molecule
        for the input.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # No molecule could be built from this SMILES; signal it with None.
        return None

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=CFG['NBITS'])
    # Placeholder buffer; presumably resized by ConvertToNumpyArray to the
    # fingerprint length -- confirm against the RDKit documentation.
    bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits