- https://deepchem.io/models/
- https://www.kaggle.com/code/sdlee94/belka-molecule-representations-for-ml-tutorial
- https://dacon.io/en/competitions/official/236127/codeshare/8791

In [None]:
'''
!pip uninstall torch torch-geometric rdkit dpdata rdkit-pypi -y
!pip uninstall pydot dgllife deepchem lightning tf-keras -y
'''

In [None]:
'''
!pip install torch torch-geometric rdkit dpdata rdkit-pypi
!pip install pydot dgllife deepchem lightning
!pip install numpy pyarrow pydot torch tf-keras Pillow
'''

In [None]:
# Install DeepChem, DGL, DGL-LifeSci and Lightning, then swap whichever torch
# build is present for the pinned 2.2.1 release.
!pip install deepchem dgl dgllife lightning
!pip uninstall torch -y
!pip install torch==2.2.1
# NOTE(review): `torch.utils` is a submodule that ships inside torch itself;
# this pip name presumably resolves to an unrelated third-party project --
# confirm the install is actually needed.
!pip install torch.utils

# Clear the pip output from the Colab cell.
from google.colab import output
output.clear()

Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (553 bytes)
Collecting dgllife
  Downloading dgllife-0.3.2-py3-none-any.whl.metadata (667 bytes)
Collecting lightning
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting rdkit (from deepchem)
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting torchdata>=0.5.0 (from dgl)
  Downloading torchdata-0.8.0-cp310-cp310-manylinux1_x86_64.whl.metadata (5.4 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Downloading deepchem-2.8.0-py3-none-an

In [None]:
import os
# Lower TensorFlow's C++ log verbosity. The variable is set before the
# tensorflow import below. (Author's note: tf-keras v2.5.)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from rdkit import Chem, RDLogger
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage
from rdkit.Chem import PandasTools

# Suppress Python warnings and RDKit log output; nothing in this cell
# re-enables them afterwards.
warnings.filterwarnings("ignore")
RDLogger.DisableLog("rdApp.*")

# Seed NumPy and TensorFlow so random draws are repeatable.
np.random.seed(42)
tf.random.set_seed(42)

# DeepChem featurizers plus PyTorch, used by the cells that follow.
from deepchem.feat import ConvMolFeaturizer, WeaveFeaturizer, DMPNNFeaturizer
from deepchem.feat.mol_graphs import ConvMol
import torch
import deepchem as dc

1. Import dataset and Featurize it

In [None]:
# Read the training table (target + SMILES) and the SMILES column of the test set.
train = pd.read_csv('/content/train.csv', index_col=False)[['IC50_nM', 'Smiles']]
test = pd.read_csv('/content/test.csv')['Smiles']

# Inputs are the SMILES strings; the regression target is IC50 in nM.
X_train = train['Smiles']
y_train = train['IC50_nM']

'''
# Featurize data
conv_X_train, conv_X_test = ConvMolFeaturizer().featurize(X_train), ConvMolFeaturizer().featurize(test)
weave_X_train, weave_X_test = dc.feat.WeaveFeaturizer().featurize(X_train), dc.feat.WeaveFeaturizer().featurize(test)
'''
# Featurize both SMILES collections with DMPNNFeaturizer (a fresh featurizer
# instance per call, as before).
dmpnn_X_train = DMPNNFeaturizer().featurize(X_train)
dmpnn_X_test = DMPNNFeaturizer().featurize(test)

# Cell output: shape of the featurized training array.
dmpnn_X_train.shape

(1952,)

2. n_atoms 불규칙 -> reshape error -> padding 처리

'pair_features(keras v2.5)' 대신 'get_pair_features(keras v3.0)' 사용하기

3. Fit to the model

In [None]:
print(dc.__version__)  # Installed DeepChem version

MPNN Modeling Issue
- Data Featurization Handling : conv_X_train은 MPNN 모델 학습에 사용되지 않는 구조이므로, 이를 생성하면 불필요한 연산(wasted computation)이 발생할 수 있음
- conv_X_train은 ConvMolFeaturizer을 거쳤음 -> Tensor 구조 -> y_train은 reshape(converting)을 통해 NumpyDataset으로 결합 가능한 구조로 바뀌어야함.

AttributeError: 'ConvMol' object has no attribute 'get_pair_features'
- 'ConvMolFeaturizer'은 graph-based models에서 'atom feature'만을 생성한다. ('pair feature' 생성 안함)
- 'MPNN'은 'node', 'bond' 2가지 정보를 모두 이용하는 것이 큰 특징

ValueError: cannot reshape array of size N into shape ()
- 'MPNN'은 tensor reshape을 통해서 training을 수행한다.
- atom_nums가 분자마다 다르면(불규칙하면) reshape이 어려움 -> padding으로 해결 가능한지 확인 필요(미검증)

In [None]:
# Bundle the featurized molecules and their IC50 targets into a DeepChem dataset.
# NOTE(review): despite its name, `conv_train` holds the DMPNN-featurized inputs.
conv_train = dc.data.NumpyDataset(dmpnn_X_train, y_train)
# padded_conv_train = dc.data.NumpyDataset(padded_atoms, y_train)

# `MPNNModel` was never imported into the notebook namespace, which raised
# "NameError: name 'MPNNModel' is not defined". Resolve it through the
# already-imported `dc` package instead.
# NOTE(review): n_atom_feat / n_pair_feat have to match the featurizer output;
# MPNNModel is presumably meant to be fed WeaveFeaturizer features rather than
# DMPNNFeaturizer ones -- confirm before training.
model = dc.models.MPNNModel(
    n_tasks=1,                 # single prediction target
    mode='regression',         # regression, not classification
    n_atom_feat=75,            # per-atom feature width (marked "Error!!" by the author)
    n_pair_feat=6,             # per-atom-pair feature width

    batch_size=16,
    n_hidden=100,              # hidden units in each message-passing layer
    n_layers=3,                # number of message-passing layers (author's label)
    learning_rate=0.01
)

NameError: name 'MPNNModel' is not defined

In [None]:
# DeepChem's `fit` consumes a Dataset (features + labels), not a bare feature
# array, so pass the NumpyDataset that was built for this purpose.
model.fit(conv_train, nb_epoch=10)

In [None]:
!pip install dgl

In [None]:
# `!export ...` runs in a short-lived subshell, so the variable never reaches
# the notebook kernel. Set it in this process so a later `import dgl` sees it.
import os

os.environ["DGLBACKEND"] = "pytorch"

In [None]:
import dgl
import numpy as np
import pandas as pd
import torch
from dgl import DGLGraph
from dgl.nn import GATConv
from dmpnn import DMPNNFeaturizer, DMPNNModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn

# Preprocessing: SMILES strings are converted to features with DMPNNFeaturizer.
featurizer = DMPNNFeaturizer()

# Target values (IC50 in nM) and featurized inputs.
y = train['IC50_nM'].values
X = featurizer.featurize(train['Smiles'])

# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Convert the data into DGLGraph form.
def create_dgl_graphs(features):
    """Return a list with one newly constructed ``DGLGraph()`` per entry of *features*.

    The entries themselves are not read: building nodes and edges from each
    feature is still unimplemented, so every graph is exactly what the
    argument-less ``DGLGraph()`` constructor produces.
    """
    # TODO: add nodes and edges to each graph based on its feature.
    return [DGLGraph() for _ in features]

train_graphs = create_dgl_graphs(X_train)
test_graphs = create_dgl_graphs(X_test)

# MPNN model definition.
class MPNN(nn.Module):
    """Graph-level regressor: two GATConv layers, mean-node readout, linear head.

    Args:
        in_feats: Width of the node features passed to ``forward``.
        hidden_size: Per-head output width of the first attention layer
            (8 heads, concatenated before the second layer).
        out_feats: Output width of the second (single-head) attention layer.
    """

    def __init__(self, in_feats, hidden_size, out_feats):
        super().__init__()
        self.conv1 = GATConv(in_feats, hidden_size, num_heads=8)
        self.conv2 = GATConv(hidden_size * 8, out_feats, num_heads=1)
        self.fc = nn.Linear(out_feats, 1)

    def forward(self, g, features):
        """Predict one value per graph in *g* from its node *features*.

        Args:
            g: A DGL graph (or batched graph).
            features: Node feature tensor whose last dimension is ``in_feats``.

        Returns:
            The output of the final linear layer applied to the per-graph
            mean of the node embeddings.
        """
        # GATConv returns (num_nodes, num_heads, feats). Concatenate the heads
        # so the width matches conv2's declared input of hidden_size * 8.
        h = self.conv1(g, features).flatten(1)
        # conv2 has a single head; averaging over the head axis removes it.
        h = self.conv2(g, h).mean(dim=1)
        # dgl.mean_nodes takes the graph and the *name* of a node field, not a
        # tensor. local_scope keeps the temporary field from leaking into `g`.
        with g.local_scope():
            g.ndata['h'] = h
            readout = dgl.mean_nodes(g, 'h')
        return self.fc(readout)

# Model setup.
# Take the input width from the node features that are actually fed to the
# model. The previous literal 1952 equals the number of featurized training
# molecules printed earlier in the notebook, not a per-node feature width.
in_feats = train_graphs[0].ndata['features'].shape[-1]
model = MPNN(in_feats=in_feats, hidden_size=128, out_feats=64)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# The targets do not change between epochs, so build the tensor once.
targets = torch.tensor(y_train, dtype=torch.float32)

# Training: one full-batch gradient step per epoch.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # One forward pass per training graph.
    predictions = [model(g, g.ndata['features']) for g in train_graphs]

    # Flatten to (num_graphs,) so the loss is element-wise against `targets`;
    # an (N, 1) prediction against an (N,) target would broadcast to (N, N).
    loss = criterion(torch.cat(predictions).reshape(-1), targets)
    loss.backward()
    optimizer.step()

    # Report progress every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# Evaluation: switch to eval mode and run every held-out graph through the
# model with gradient tracking disabled.
model.eval()
with torch.no_grad():
    test_predictions = [model(g, g.ndata['features']) for g in test_graphs]

# Show the collected test predictions.
print("Test Predictions:", test_predictions)

https://github.com/deepchem/deepchem/issues/1505

In [None]:
!pip install rdkit

In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K

# 1. Load data
# CSV containing the 'IC50_nM' target and the 'Smiles' column.
data = pd.read_csv('/content/train.csv')  # path to the training CSV

# 2. Preprocessing
# Parse each SMILES string into an RDKit molecule.
data['Molecule'] = data['Smiles'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for SMILES it cannot parse, and the
# descriptor functions below fail on None, so drop those rows up front.
invalid = data['Molecule'].isna()
if invalid.any():
    print(f'Dropping {int(invalid.sum())} rows with unparsable SMILES')
    data = data[~invalid].reset_index(drop=True)

# Physicochemical descriptors from Lipinski's rule: molecular weight, logP,
# hydrogen-bond acceptors and hydrogen-bond donors.
data['Molecular_Weight'] = data['Molecule'].apply(Descriptors.MolWt)
data['LogP'] = data['Molecule'].apply(Descriptors.MolLogP)
data['HBA'] = data['Molecule'].apply(Descriptors.NumHAcceptors)
data['HBD'] = data['Molecule'].apply(Descriptors.NumHDonors)

# Model inputs and regression target.
features = data[['Molecular_Weight', 'LogP', 'HBA', 'HBD']]
target = data['IC50_nM']

# 3. Train/test split (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# 4. Standardization: fit the scaler on the training split only, then apply
# the same transform to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 5. Build the model: a small fully connected regressor over the scaled
# descriptor columns.
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(4, activation='relu'),
    layers.Dense(1)  # output layer: a single regression value
])

# 6. Compile
# This is a regression task, so track mean absolute error. 'accuracy' is a
# classification metric, and the evaluation below unpacks and prints the
# metric as MAE.
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

# 7. Train (20% of the training data is held out for validation).
model.fit(X_train_scaled, y_train, epochs=4000, validation_split=0.2)

# 8. Evaluate on the held-out split; the message prints the test loss and the
# mean absolute error.
loss, mae = model.evaluate(X_test_scaled, y_test)
print(f'테스트 손실: {loss:.4f}, 평균 절대 오차: {mae:.4f}')

In [None]:
from sklearn.metrics import mean_squared_error

# Grid-search a multiplicative factor for the predictions that minimizes the
# MSE on the held-out split.

# Predict once, on the *scaled* features the model was trained on; predicting
# inside the loop on the unscaled frame was both wrong and repeated work.
test_pred = model.predict(X_test_scaled).ravel()

# n = [index into nums of the best factor so far, its MSE].
n, nums = [0, float('inf')], np.linspace(0, 1, 20)
for i, factor in enumerate(nums[1:], start=1):
    mse = mean_squared_error(y_test, test_pred * factor)
    # Compare against the best MSE found so far. The earlier version compared
    # against nums[1], which is a scale factor rather than an error value.
    if mse < n[1]:
        n = [i, mse]

print(n)

In [None]:
# The model was trained on standardized inputs, so evaluate it on the
# standardized held-out features rather than the raw descriptor frame.
mean_squared_error(y_test, model.predict(X_test_scaled) * 1.5)

In [None]:
# Load the competition test table.
X_test = pd.read_csv('/content/test.csv')  # path to the test CSV

# 2. Preprocessing
# Parse each SMILES string into an RDKit molecule.
X_test['Molecule'] = X_test['Smiles'].apply(Chem.MolFromSmiles)

# Add the Lipinski-rule physicochemical descriptors, one column per
# descriptor function, in the same order as the training features.
descriptor_columns = {
    'Molecular_Weight': Descriptors.MolWt,
    'LogP': Descriptors.MolLogP,
    'HBA': Descriptors.NumHAcceptors,
    'HBD': Descriptors.NumHDonors,
}
for column_name, descriptor_fn in descriptor_columns.items():
    X_test[column_name] = X_test['Molecule'].apply(descriptor_fn)

# Keep the IDs for the submission file, then narrow the frame to the model inputs.
X_id = X_test['ID']
X_test = X_test[['Molecular_Weight', 'LogP', 'HBA', 'HBD']]

In [None]:
# Apply the scaler that was fitted on the training split to the test features.
X_test_scaled = scaler.transform(X_test)

pred = model.predict(X_test_scaled)
# Flatten the predictions into a plain list of Python floats. Calling float()
# on each one-element row relied on NumPy's deprecated array-to-scalar
# conversion and indexed through range(len(...)).
pred_list = pred.ravel().tolist()

# One row per test ID with its predicted IC50 (nM).
submission = pd.DataFrame({'ID':X_id, 'IC50_nM':pred_list})
submission.to_csv('rf_submission.csv', index=False)