In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [2]:
!pip install mol2vec

Collecting mol2vec
  Downloading mol2vec-0.2.2-py3-none-any.whl.metadata (5.5 kB)
Collecting jedi>=0.16 (from IPython->mol2vec)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading mol2vec-0.2.2-py3-none-any.whl (15 kB)
Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi, mol2vec
Successfully installed jedi-0.19.1 mol2vec-0.2.2


In [3]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# Import Mol2Vec-related modules
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, DfVec, sentences2vec

In [32]:
# Configuration settings
CFG = {
    'NBITS': 2048,
    'SEED': 42,  # 결정성(결과 재현성)을 위한 SEED 설정
    'EMBEDDING_SIZE': 300,  # 임베딩 벡터 크기
}


In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Seed 고정 함수 (이 부분을 추가하여 난수 발생을 고정함)
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)  # Python 환경 변수에 SEED 설정
    np.random.seed(seed)

seed_everything(CFG['SEED'])  # Seed 고정 (random, numpy, 그리고 os에 대해 적용)

In [36]:
# Custom Mol2Vec embedding conversion
def sentences2vec_custom(sentences, model, unseen=None):
    keys = set(model.wv.key_to_index)
    unseen_vec = np.zeros(model.vector_size)
    if unseen:
        unseen_vec = model.wv[unseen] if unseen in keys else np.zeros(model.vector_size)

    def to_vec(sentence):
        vecs = [model.wv[word] if word in keys else unseen_vec for word in sentence]
        return np.mean(vecs, axis=0)

    return np.array([to_vec(sentence) for sentence in sentences])

In [8]:
# SMILES 데이터를 Mol2Vec 임베딩으로 변환하는 함수
def smiles_to_mol2vec(smiles, model):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Canonical SMILES 사용하여 결정성 유지 (이 부분을 추가하여 일관된 SMILES 표현을 사용함)
        smiles = Chem.MolToSmiles(mol, canonical=True)
        sentence = mol2alt_sentence(Chem.MolFromSmiles(smiles), radius=1)  # Mol2Vec 문장 생성
        embedding = sentences2vec_custom([sentence], model, unseen='UNK')  # 벡터로 변환
        return np.array(embedding[0])
    else:
        return np.zeros((CFG['EMBEDDING_SIZE'],))

In [37]:
# 데이터 로드
chembl_data = pd.read_csv('/content/drive/MyDrive/신약개발 데이콘/open (2)/train.csv')
chembl_data

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.00,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.00,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,CHEMBL380009,IC50,'=',30000.000,nM,4.52,CHEMBL870864,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL1147720,30000.000,4.52,O=C(Nc1nc2cc[nH]cc-2n1)c1cccc([N+](=O)[O-])c1
1948,CHEMBL377654,IC50,'=',30000.000,nM,4.52,CHEMBL870864,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL1147720,30000.000,4.52,CCCCn1c(NC(=O)c2cccc(Cl)c2)nc2ccccc21
1949,CHEMBL208667,IC50,'=',30000.000,nM,4.52,CHEMBL870864,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL1147720,30000.000,4.52,O=C(Nc1nc2cc(F)c(F)cc2[nH]1)c1cccc([N+](=O)[O-...
1950,CHEMBL3403453,IC50,'=',42000.000,nM,4.38,CHEMBL3407004,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL3400143,42000.000,4.38,OC[C@H]1C[C@@H](Nc2nc(Nc3ccccc3)ncc2-c2nc3cccc...


In [38]:
# 모든 SMILES 데이터를 문장으로 변환
sentences = [mol2alt_sentence(Chem.MolFromSmiles(smiles), radius=1) for smiles in chembl_data['Smiles']]



In [39]:
# Word2Vec 모델 학습 (workers=1로 병렬 처리 고정)
mol2vec_model = word2vec.Word2Vec(sentences, vector_size=CFG['EMBEDDING_SIZE'], window=10, min_count=1, workers=1, seed=CFG['SEED'])
# workers=1로 설정하여 병렬 처리를 없앰으로써 결과의 일관성을 유지함

In [40]:
# 모델 저장 및 로드
mol2vec_model.save('model_300dim.pkl')
mol2vec_model = word2vec.Word2Vec.load('model_300dim.pkl')

In [41]:
# SMILES 데이터를 Mol2Vec 벡터로 변환
train = chembl_data[['Smiles', 'pIC50']]
train['Embedding'] = train['Smiles'].apply(lambda x: smiles_to_mol2vec(x, mol2vec_model))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Embedding'] = train['Smiles'].apply(lambda x: smiles_to_mol2vec(x, mol2vec_model))


In [42]:
train_x = np.stack(train['Embedding'].values)
train_y = train['pIC50'].values

In [43]:
# 학습 및 검증 데이터 분리 (random_state를 설정하여 데이터 분할 고정)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=CFG['SEED'])

In [68]:
# KNN 모델 하이퍼파라미터 튜닝
param_grid = {
    'n_neighbors': [ 8, 9, 10, 11],  # 이웃의 수
    'weights': ['uniform','distance'],  # 가중치 방식
    'p': [1, 2]  # 거리 측정 방식 (1: 맨해튼 거리)
}

In [69]:
# KFold를 사용해 데이터 분할 고정 (이 부분에서 cv의 결정성을 보장함)
cv = KFold(n_splits=5, shuffle=True, random_state=CFG['SEED'])

In [70]:
# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기 (cv 고정)
knn = KNeighborsRegressor()
grid_search = GridSearchCV(knn, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(train_x, train_y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [71]:
# 최적의 하이퍼파라미터 출력
print(f'Best Parameters: {grid_search.best_params_}')

Best Parameters: {'n_neighbors': 8, 'p': 1, 'weights': 'distance'}


In [72]:
# 최적의 모델로 학습
best_knn_model = grid_search.best_estimator_

In [73]:
# pIC50을 IC50으로 변환하는 함수
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [74]:
# Validation 데이터에서 모델 평가
val_y_pred = best_knn_model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)
print(f'Validation RMSE: {rmse}')

Validation RMSE: 2149.39227517569


In [75]:
# 테스트 데이터 로드 및 예측
test = pd.read_csv('/content/drive/MyDrive/신약개발 데이콘/open (2)/test.csv')
test['Embedding'] = test['Smiles'].apply(lambda x: smiles_to_mol2vec(x, mol2vec_model))
test_x = np.stack(test['Embedding'].values)



In [76]:
# 테스트 데이터 예측
test_y_pred = best_knn_model.predict(test_x)


In [77]:
# 결과 저장
submit = pd.read_csv('/content/drive/MyDrive/신약개발 데이콘/open (2)/sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('/content/drive/MyDrive/신약개발 데이콘/final/embedding(mol2vec)_KNN(n_8_p_1_w_d).csv', index=False)

In [78]:
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,90.765214
1,TEST_001,46.238789
2,TEST_002,13.091842
3,TEST_003,17.875613
4,TEST_004,41.877485
