### Import

In [4]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

---
### Atom Pairs

In [8]:
from rdkit import DataStructs, Chem
from rdkit.Chem import AllChem

m = Chem.MolFromSmiles('CCOC')

In [7]:
fpgen = AllChem.GetAtomPairGenerator()
fps = fpgen.GetSparseCountFingerprint(m)
fps.GetNonzeroElements()

{541731: 1, 558113: 1, 558114: 1, 1606689: 1, 1606690: 1, 1606721: 1}

---
### Morgan Fingerprints(Circular Fingerprints)

In [37]:
additional_output = AllChem.AdditionalOutput()
additional_output.CollectBitInfoMap()
morgan_gen = AllChem.GetMorganGenerator()
morgan_fps = morgan_gen.GetSparseCountFingerprint(m, additionalOutput=additional_output)

In [38]:
print(len(morgan_fps.GetNonzeroElements()))
info = additional_output.GetBitInfoMap()

8


In [39]:
len(info)

8

In [40]:
info

{864674487: ((2, 0),),
 2079181617: ((1, 2),),
 2222621677: ((2, 1),),
 2245384272: ((1, 0),),
 2246728737: ((0, 0), (3, 0)),
 3542456614: ((0, 1),),
 3975275337: ((3, 1),),
 3994088662: ((1, 1),)}

In [1]:
def smiles_to_morgan(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        additional_output = AllChem.AdditionalOutput()
        additional_output.CollectBitInfoMap()
        morgan_gen = AllChem.GetMorganGenerator()
        morgan_gen.GetSparseCountFingerprint(mol, additionalOutput=additional_output)
        info = additional_output.GetBitInfoMap()
        return info
    else:
        return np.zeros((CFG['NBITS'],))

In [2]:
import pandas as pd
from collections import Counter

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [5]:
train_df['morgan_info'] = train_df['Smiles'].apply(smiles_to_morgan)
test_df['morgan_info'] = test_df['Smiles'].apply(smiles_to_morgan)

In [6]:
total_morgan_key_set = set()
train_df['morgan_info'].apply(lambda info: total_morgan_key_set.update(info.keys()))
test_df['morgan_info'].apply(lambda info: total_morgan_key_set.update(info.keys()))

0      None
1      None
2      None
3      None
4      None
       ... 
108    None
109    None
110    None
111    None
112    None
Name: morgan_info, Length: 113, dtype: object

In [222]:
train_df['morgan_embedding'].describe()

In [7]:
total_morgan_key = sorted(list(total_morgan_key_set))
total_morgan_key2idx = {k:i for i, k in enumerate(total_morgan_key)}

In [8]:
def morgan_info_to_embedding(info, key_count=13_279, radius_count=4):
    import numpy as np
    from collections import Counter
    embedding = np.zeros((key_count, radius_count, ))

    for key, radius_list in info.items():
        radius_count = dict(Counter([radius[-1] for radius in radius_list]))
        for radius, count in radius_count.items():
            embedding[total_morgan_key2idx[key], radius] = count

    return embedding

In [9]:
train_df['morgan_embedding'] = train_df['morgan_info'].apply(morgan_info_to_embedding)
test_df['morgan_embedding'] = test_df['morgan_info'].apply(morgan_info_to_embedding)

In [11]:
def standardization(embedding):
    mean = np.mean(embedding.flatten())
    std = np.std(embedding.flatten())

    return embedding - mean / std

In [12]:
train_df['morgan_embedding'] = train_df['morgan_embedding'].apply(standardization)

In [13]:
train_df['morgan_embedding'].iloc[0]

array([[-0.04050447, -0.04050447, -0.04050447, -0.04050447],
       [-0.04050447, -0.04050447, -0.04050447, -0.04050447],
       [-0.04050447, -0.04050447, -0.04050447, -0.04050447],
       ...,
       [-0.04050447, -0.04050447, -0.04050447, -0.04050447],
       [-0.04050447, -0.04050447, -0.04050447, -0.04050447],
       [-0.04050447, -0.04050447, -0.04050447, -0.04050447]])

In [219]:
train_df['morgan_embedding'].iloc[0].flatten()[train_df['morgan_embedding'].iloc[0].flatten() > 5]

array([ 6.,  8., 12.,  9.,  7.])

In [194]:
train_df['morgan_embedding'].apply(lambda e: e.nonzero())

0       ([32, 35, 310, 535, 728, 868, 894, 925, 997, 1...
1       ([35, 159, 310, 539, 601, 822, 1019, 1116, 115...
2       ([35, 159, 310, 539, 601, 751, 822, 901, 1116,...
3       ([35, 159, 310, 539, 822, 1116, 1155, 1174, 12...
4       ([32, 35, 507, 515, 590, 868, 872, 924, 1047, ...
                              ...                        
1947    ([198, 310, 408, 890, 1506, 2736, 2738, 2798, ...
1948    ([310, 408, 484, 552, 635, 1296, 1300, 1506, 1...
1949    ([304, 310, 408, 890, 1122, 1506, 1987, 2057, ...
1950    ([28, 310, 533, 826, 872, 1025, 1624, 1661, 17...
1951    ([310, 1199, 1373, 2320, 2735, 2797, 2798, 307...
Name: morgan_embedding, Length: 1952, dtype: object

In [180]:
morgan_info_to_embedding(train_df['morgan_info'].iloc[0]).nonzero()

(array([   32,    35,   310,   535,   728,   868,   894,   925,   997,
         1037,  1047,  1110,  1153,  1171,  1199,  1217,  1256,  1299,
         1317,  1331,  1335,  1421,  1433,  1661,  1703,  1706,  1713,
         1887,  1969,  2032,  2178,  2308,  2464,  2735,  2736,  2797,
         2798,  2907,  3014,  3017,  3049,  3070,  3092,  3214,  3439,
         3541,  3667,  3779,  4058,  4133,  4237,  4246,  4336,  4406,
         4408,  4585,  4701,  4758,  4768,  4790,  4842,  4871,  4919,
         5060,  5094,  5227,  5296,  5404,  5409,  5633,  5667,  5816,
         5881,  6059,  6091,  6170,  6496,  6603,  6635,  6655,  6672,
         6736,  6740,  6745,  6797,  6833,  7005,  7045,  7051,  7078,
         7089,  7097,  7175,  7199,  7201,  7203,  7206,  7208,  7311,
         7325,  7408,  7544,  7629,  8019,  8150,  8181,  8272,  8313,
         8412,  8552,  8558,  8795,  8877,  8899,  8973,  8976,  9105,
         9135,  9354,  9378,  9546,  9577,  9772,  9847,  9921, 10087,
      

In [160]:
len(total_morgan_key2idx)

13279

In [133]:
train_df['morgan_keys'] = train_df['Smiles'].apply(lambda s: [*smiles_to_fingerprint(s).keys()])
train_df['morgan_values'] = train_df['Smiles'].apply(lambda s: [*smiles_to_fingerprint(s).values()])
test_df['morgan_keys'] = test_df['Smiles'].apply(lambda s: [*smiles_to_fingerprint(s).keys()])
test_df['morgan_values'] = test_df['Smiles'].apply(lambda s: [*smiles_to_fingerprint(s).values()])

In [128]:
train_df['morgan_values'].head()

0    [((37, 2),), ((38, 1),), ((62, 1), (63, 1)), (...
1    [((38, 1),), ((36, 3),), ((20, 1),), ((4, 3),)...
2    [((38, 1),), ((36, 3),), ((20, 1),), ((4, 3),)...
3    [((38, 1),), ((36, 3),), ((20, 1),), ((4, 3),)...
4    [((11, 2),), ((12, 1),), ((21, 2), (22, 2), (2...
Name: morgan_values, dtype: object

In [134]:
train_df['morgan_radius'] = train_df['morgan_values'].apply(lambda vs: [v[0][-1] for v in vs])
test_df['morgan_radius'] = test_df['morgan_values'].apply(lambda vs: [v[0][-1] for v in vs])

In [136]:
train_df['morgan_counters'] = train_df['morgan_radius'].apply(lambda v: dict(Counter(v)))
test_df['morgan_counters'] = test_df['morgan_radius'].apply(lambda v: dict(Counter(v)))


In [145]:
train_df['morgan_counters']

0       {2: 55, 1: 50, 3: 53, 0: 16}
1       {1: 33, 3: 27, 2: 30, 0: 15}
2       {1: 33, 3: 27, 2: 30, 0: 16}
3       {1: 31, 3: 27, 2: 29, 0: 14}
4       {2: 48, 1: 49, 3: 43, 0: 19}
                    ...             
1947     {1: 17, 2: 17, 0: 9, 3: 16}
1948    {1: 18, 2: 19, 3: 18, 0: 10}
1949    {3: 17, 1: 18, 2: 17, 0: 10}
1950     {3: 24, 1: 22, 2: 25, 0: 9}
1951       {1: 10, 2: 8, 3: 7, 0: 8}
Name: morgan_counters, Length: 1952, dtype: object

In [None]:
train_df

In [144]:
test_df['morgan_counters'].apply(lambda c: max(c.values())).describe()

count    113.000000
mean      27.938053
std        2.696834
min       19.000000
25%       26.000000
50%       28.000000
75%       30.000000
max       35.000000
Name: morgan_counters, dtype: float64

In [129]:
train_df['morgan_radius'].head()

0    [2, 1, 1, 2, 2, 2, 3, 2, 2, 3, 1, 2, 3, 2, 2, ...
1    [1, 3, 1, 3, 1, 3, 2, 2, 3, 3, 3, 1, 2, 1, 1, ...
2    [1, 3, 1, 3, 1, 3, 3, 3, 2, 3, 3, 3, 1, 2, 1, ...
3    [1, 3, 1, 3, 3, 2, 3, 3, 3, 2, 3, 1, 3, 2, 1, ...
4    [2, 1, 2, 3, 2, 2, 1, 3, 1, 2, 2, 2, 3, 1, 1, ...
Name: morgan_radius, dtype: object

In [126]:
train_df['morgan_radius'].apply(max).describe()

count    1952.0
mean        3.0
std         0.0
min         3.0
25%         3.0
50%         3.0
75%         3.0
max         3.0
Name: morgan_radius, dtype: float64

In [127]:
test_df['morgan_radius'].apply(max).describe()

count    113.0
mean       3.0
std        0.0
min        3.0
25%        3.0
50%        3.0
75%        3.0
max        3.0
Name: morgan_radius, dtype: float64

In [109]:
train_df['morgan_keys_len'] = train_df['morgan_keys'].apply(len)

In [110]:
train_df['morgan_keys_len'].describe()

count    1952.000000
mean       86.031250
std        12.698553
min        33.000000
25%        80.000000
50%        87.000000
75%        94.000000
max       174.000000
Name: morgan_keys_len, dtype: float64

In [102]:
87 / 12_825

0.006783625730994152

In [90]:
train_morgan_keys = set()
train_df['morgan_keys'].apply(lambda k: train_morgan_keys.update(k))

0       None
1       None
2       None
3       None
4       None
        ... 
1947    None
1948    None
1949    None
1950    None
1951    None
Name: morgan_keys, Length: 1952, dtype: object

In [87]:
len(train_morgan_keys), min(train_morgan_keys), max(train_morgan_keys)

(12825, 79728, 4294707339)

In [88]:
test_df = pd.read_csv('../data/test.csv')

In [103]:
test_df['morgan_keys'] = test_df['Smiles'].apply(smiles_to_fingerprint)
test_df['morgan_keys_len'] = test_df['morgan_keys'].apply(len)

In [104]:
test_df['morgan_keys_len'].describe()

count    113.000000
mean      90.646018
std        9.075989
min       64.000000
25%       86.000000
50%       92.000000
75%       97.000000
max      108.000000
Name: morgan_keys_len, dtype: float64

In [91]:
test_morgan_keys = set()
test_df['morgan_keys'].apply(lambda k: test_morgan_keys.update(k))

0      None
1      None
2      None
3      None
4      None
       ... 
108    None
109    None
110    None
111    None
112    None
Name: morgan_keys, Length: 113, dtype: object

In [99]:
len(train_morgan_keys | test_morgan_keys)

13279

In [95]:
len(test_morgan_keys)

1045

In [105]:
87 / 13279, 92 / 13279

(0.006551698170042925, 0.006928232547631598)

---
### MACCS keys

In [13]:
from rdkit.Chem import MACCSkeys

In [49]:
MACCSkeys.GenMACCSKeys(m)

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x12075c6d0>

### DataLoad

In [4]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [5]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('../data/train.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


### Data Pre-processing

In [6]:
train = chembl_data[['Smiles', 'pIC50']]
# train = chembl_data[['Smiles', 'IC50_nM']] # 실험
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values
# train_y = train['IC50_nM'].values

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [12]:
sample = train.loc[1:4,'Fingerprint']

In [18]:
sum(sample[1]), sum(sample[2]), sum(sample[1] * sample[2])

(np.int64(76), np.int64(78), np.int64(70))

### Train & Validation

In [26]:
# 랜덤 포레스트 모델 학습
model = RandomForestRegressor(random_state=CFG['SEED'])
model.fit(train_x, train_y)

In [27]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

# Validation 데이터로부터의 학습 모델 평가
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
# mse = mean_squared_error(val_y, val_y_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 2228.3071398231064


### Inference

In [28]:
test = pd.read_csv('../data/test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

test_y_pred = model.predict(test_x)



### Submission

In [31]:
submit = pd.read_csv('../data/sample_submission.csv')
# submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit['IC50_nM'] = test_y_pred
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,739.876333
1,TEST_001,559.90806
2,TEST_002,656.163567
3,TEST_003,394.9184
4,TEST_004,215.36049


In [32]:
submit.to_csv('../data/submissions/baseline_submit.csv', index=False)