https://dacon.io/en/competitions/official/236127/codeshare/8791?page=1&dtype=tag&fType=&category=codeshare

train에서 'MLM', 'HLM', 'AlogP' 등등은 모두 int64 자료형 -> Regression

In [None]:
# Environment setup kept for reference only: the pip commands below are inside
# a bare string literal, so they are NOT executed when this cell runs.
'''
!pip install deepchem dgl dgllife lightning
!pip uninstall torch -y
!pip install torch==2.2.1
!pip install torch.utils
!pip install torchvision torchaudio
'''
# Colab helper used only to wipe the cell's output area.
from google.colab import output
output.clear()

In [8]:
# !pip uninstall dgl -y
# NOTE(review): the find-links URL below ends with a stray "." and points at
# the cu102 wheel index — presumably a copy/paste slip; confirm the correct
# index for the installed torch/CUDA build on the DGL install page.
!pip install dgl -f https://data.dgl.ai/wheels/cu102/repo.html.
!sudo apt-get -qq install graphviz
!pip install deepchem

# torch, dgl : 2.4.0, 2.1.0(support torch up to 2.2.1)
# https://discuss.dgl.ai/t/filenotfounderror-cannot-find-dgl-c-graphbolt-library/4302/4
# Clear the installer output, then import dgl to check that the install is usable
# (the error pasted below this cell is what a torch/dgl version mismatch produced).
output.clear()
import dgl

FileNotFoundError: Cannot find DGL C++ graphbolt library at /usr/local/lib/python3.10/dist-packages/dgl/graphbolt/libgraphbolt_pytorch_2.4.0.so

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import deepchem as dc
import lightning as L

from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from lightning.pytorch.callbacks import ModelCheckpoint
from deepchem.models.torch_models import MPNNModel
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Notebook-wide constants: loader batch size, RNG seed and number of CV folds.
BATCH_SIZE = 32
SEED = 42
K_FOLD = 5

# Only the regression target (IC50_nM) and the SMILES string are kept from the
# training table; the test table is loaded unchanged.
_train_columns = ['IC50_nM', 'Smiles']
train_df = pd.read_csv('/content/train.csv')[_train_columns]
test_df = pd.read_csv('/content/test.csv')

In [None]:
L.seed_everything(SEED) # Set the Lightning seed (SEED is defined with the other constants)

Molecule structure에 따라서 k_fold 기법을 이용해 train/val로 분할하기

In [None]:
def smiles_split(df, smiles, seed=42, k_fold=5, splitter='scaffold'):
    """Split ``df`` into ``k_fold`` (train, validation) DataFrame pairs.

    The split itself is delegated to a DeepChem splitter that works on the
    SMILES strings; the row positions of ``df`` travel through the splitter as
    the dataset's ``X`` column so the selected rows can be recovered afterwards.

    Args:
        df: Table to split; row ``i`` must correspond to ``smiles[i]``.
        smiles: Sequence of SMILES strings, one per row of ``df``.
        seed: Seed forwarded to the splitter's ``k_fold_split``.
        k_fold: Number of folds to generate.
        splitter: ``'random'``, ``'scaffold'`` or ``'fingerprints'``. Any
            non-string value is used as the splitter object as-is.

    Returns:
        A list of ``(train_df, val_df)`` tuples, one per fold, each with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If ``splitter`` is an unknown name, or if ``smiles`` and
            ``df`` differ in length.
    """
    splitter_class_names = {
        'random': 'RandomSplitter',
        'scaffold': 'ScaffoldSplitter',
        'fingerprints': 'FingerprintSplitter',
    }

    # Validate the cheap arguments first so bad input fails with a clear
    # message before DeepChem is imported or anything is written to disk.
    if isinstance(splitter, str) and splitter not in splitter_class_names:
        raise ValueError(
            f"Unknown splitter {splitter!r}; expected one of "
            f"{sorted(splitter_class_names)}"
        )
    n_samples = len(smiles)
    if n_samples != len(df):
        raise ValueError(
            f"smiles has {n_samples} entries but df has {len(df)} rows"
        )

    import deepchem as dc

    if isinstance(splitter, str):
        splitter = getattr(dc.splits, splitter_class_names[splitter])()

    # X holds the row positions (arange); y and w are dummy columns that the
    # splitter never uses for anything but bookkeeping.
    dataset = dc.data.DiskDataset.from_numpy(
        X=np.arange(n_samples),
        y=np.ones(n_samples),
        w=np.zeros(n_samples),
        ids=smiles,
    )

    folds = splitter.k_fold_split(dataset, k=k_fold, seed=seed)
    dfs = []
    for train_part, val_part in folds:
        # The stored X values are the original row positions of each subset.
        train_indices = np.asarray(train_part.X, dtype=int).ravel()
        val_indices = np.asarray(val_part.X, dtype=int).ravel()
        train_df = df.iloc[train_indices].reset_index(drop=True)
        val_df = df.iloc[val_indices].reset_index(drop=True)
        dfs.append((train_df, val_df))
    return dfs

In [None]:
# Only the first generated fold is used for this run.
train_fold, val_fold = next(iter(
    smiles_split(train_df, train_df['Smiles'].tolist(), seed=SEED, k_fold=K_FOLD, splitter='fingerprints')
))

output.clear()

In [None]:
len(train_fold), len(val_fold) # Row counts of the resulting train/validation split

In [None]:
# Preview the first two rows of the validation fold.
val_fold.head(2)

In [None]:
# Optional: apply a MinMaxScaler to IC50_nM (disabled by default).
# The scaler is fitted on the training fold only and then *applied* to the
# validation fold. The earlier recipe called fit_transform on the validation
# fold as well, which put the two folds on different scales.
# To enable, uncomment:
# scaler = MinMaxScaler()
# train_fold['IC50_nM'] = scaler.fit_transform(train_fold[['IC50_nM']]).ravel()
# val_fold['IC50_nM'] = scaler.transform(val_fold[['IC50_nM']]).ravel()

val_fold.head(2)

In [None]:
# Featurizer used for the train, validation and test SMILES below
# (use_edges=True presumably adds bond/edge features — confirm in the DeepChem docs).
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

In [None]:
# Featurize the SMILES strings and wrap them in DeepChem datasets.
#
# The `w` slot is not used as a sample weight in this notebook: the
# LightningModule reads it as batch[2] and concatenates it to the graph
# embedding as an extra input column (`xp`). It was previously filled with
# IC50_nM itself, which handed the target to the model during training and
# validation while the test set received 0. A constant column is used for all
# three splits so that no label information reaches the model's inputs.
train_X = featurizer.featurize(train_fold['Smiles'].tolist())
train_w = np.ones(len(train_fold), dtype=np.float32)
train_dataset = dc.data.NumpyDataset(X=train_X, y=train_fold['IC50_nM'].values, w=train_w)

val_X = featurizer.featurize(val_fold['Smiles'].tolist())
val_w = np.ones(len(val_fold), dtype=np.float32)
val_dataset = dc.data.NumpyDataset(X=val_X, y=val_fold['IC50_nM'].values, w=val_w)

test_df['IC50_nM'] = 0 # Placeholder target; the test labels are unknown
test_X = featurizer.featurize(test_df['Smiles'].tolist())
test_w = np.ones(len(test_df), dtype=np.float32)
test_dataset = dc.data.NumpyDataset(X=test_X, y=test_df['IC50_nM'].values, w=test_w)

Define the collate function and the DataLoaders

In [None]:
def collate_fn(samples):
    """Collate ``(x, y, w)`` samples into one batch.

    Args:
        samples: Sequence of ``(x, y, w)`` tuples. ``x`` is kept as an opaque
            Python object; ``y`` and ``w`` must be numeric scalars or
            equally-shaped arrays.

    Returns:
        ``([X], y, w)`` where ``X`` is the list of the samples' ``x`` objects
        (wrapped in a one-element list) and ``y`` / ``w`` are float32 tensors
        whose first dimension is the batch.
    """
    graphs = [sample[0] for sample in samples]
    # Stack through NumPy first: building a tensor straight from a list of
    # ndarrays converts element by element and is much slower.
    y = torch.as_tensor(np.asarray([sample[1] for sample in samples]), dtype=torch.float32)
    w = torch.as_tensor(np.asarray([sample[2] for sample in samples]), dtype=torch.float32)
    return ([graphs], y, w)

In [None]:
# Materialise each DeepChem dataset as a list of (x, y, w) samples so that a
# plain torch DataLoader can index into it.
train_datas = list(zip(train_dataset.X, train_dataset.y, train_dataset.w))
val_datas = list(zip(val_dataset.X, val_dataset.y, val_dataset.w))
test_datas = list(zip(test_dataset.X, test_dataset.y, test_dataset.w))

# The training loader is shuffled every epoch; the rows otherwise arrive in the
# order produced by the splitter. Validation/test keep their order (the test
# predictions must stay aligned with test_df) and use a doubled batch size.
train_dataloader = DataLoader(train_datas, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_datas, batch_size=BATCH_SIZE*2, collate_fn=collate_fn)
test_dataloader = DataLoader(test_datas, batch_size=BATCH_SIZE*2, collate_fn=collate_fn)

print(train_dataloader)

In [None]:
class LitMPNNSMILESClassification(L.LightningModule):
    """Lightning wrapper around DeepChem's ``MPNNModel`` for SMILES regression.

    Despite the (kept for compatibility) name, the wrapped model is built with
    ``mode='regression'``. The DeepChem network is used as a backbone whose
    prediction head is replaced by ``nn.Identity``; its output is concatenated
    with the auxiliary column(s) ``xp`` (``batch[2]``) and passed through a
    small MLP head, ``self.classifier``.

    Targets are divided by 100 for the training loss and predictions are
    multiplied by 100 again for validation and prediction.

    Args:
        batch_size: Forwarded to ``MPNNModel`` and used as the ``batch_size``
            hint when logging the training loss.
        node_out_feats: Forwarded to ``MPNNModel``; also sets the hidden width
            of the head (``node_out_feats // 2``).
        n_tasks: Number of regression targets (output columns).
    """

    def __init__(self, batch_size, node_out_feats=64, n_tasks=1):
        super().__init__()
        model = MPNNModel(
            mode='regression',
            n_tasks=n_tasks,
            node_out_feats=node_out_feats,
            batch_size=batch_size,
        )
        # Reuse the DeepChem wrapper's own batch preparation for the inputs.
        self._prepare_batch = lambda batch: model._prepare_batch(batch)
        self.model = model.model
        # Drop the built-in prediction head; self.classifier takes its place.
        self.model.model.predict = nn.Identity()
        self.batch_size = batch_size
        self.n_tasks = n_tasks
        # Lazy layers infer their input width on the first forward pass, so the
        # size of the backbone output plus xp does not have to be hard-coded.
        self.classifier = nn.Sequential(
            nn.LazyLinear(node_out_feats // 2),
            nn.ReLU(),
            nn.LazyLinear(n_tasks),
        )

        # Per-sample squared errors collected over one validation epoch.
        self.validation_step_outputs = []

    @staticmethod
    def _squared_errors(y_pred, y_true):
        """Return element-wise squared errors with the shape of ``y_pred``."""
        # Labels may arrive as (batch,) or (batch, n_tasks); align them with
        # the prediction's shape, dtype and device before subtracting.
        y_true = y_true.reshape(y_pred.shape).to(y_pred)
        return (y_pred - y_true) ** 2

    @staticmethod
    def _mean_task_rmse(squared_errors):
        """Return the RMSE of every task (column), averaged over the tasks."""
        return squared_errors.mean(dim=0).sqrt().mean()

    def forward(self, x, xp):
        """Embed the prepared graph batch ``x`` and predict from ``[embedding, xp]``."""
        x = self.model(x)
        if xp.dim() == 1:
            # A flat (batch,) auxiliary vector becomes a single feature column.
            xp = xp.unsqueeze(-1)
        x = torch.cat([x, xp.to(dtype=x.dtype, device=x.device)], dim=-1)
        return self.classifier(x)

    def training_step(self, batch, batch_idx):
        """Return the task-averaged RMSE on targets scaled down by 100."""
        x, *_ = self._prepare_batch(batch)
        y_true, xp = batch[1] / 100, batch[2]
        y_pred = self(x, xp)
        # Works for any n_tasks; the previous code indexed columns 0 and 1 and
        # failed with the default n_tasks=1.
        loss = self._mean_task_rmse(self._squared_errors(y_pred, y_true))
        self.log_dict({"train_loss": loss}, on_step=True, prog_bar=True, batch_size=self.batch_size)
        return loss

    def validation_step(self, batch, batch_idx):
        """Store this batch's squared errors (original target scale)."""
        x, *_ = self._prepare_batch(batch)
        y_true, xp = batch[1], batch[2]
        y_pred = self(x, xp) * 100
        squared_errors = self._squared_errors(y_pred, y_true).detach()
        self.validation_step_outputs.append(squared_errors)
        # Per-task MSE of this batch.
        return squared_errors.mean(dim=0)

    def on_validation_epoch_end(self):
        """Log ``val_loss``: the task-averaged RMSE over the whole epoch."""
        if not self.validation_step_outputs:
            return
        # Concatenating per-sample errors weights every sample equally, even
        # when the last batch is smaller than the others.
        squared_errors = torch.cat(self.validation_step_outputs, dim=0)
        loss = self._mean_task_rmse(squared_errors)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.validation_step_outputs.clear()

    def predict_step(self, batch, batch_idx):
        """Return predictions on the original target scale (labels are ignored)."""
        x, *_ = self._prepare_batch(batch)
        xp = batch[2]
        return self(x, xp) * 100

    def configure_optimizers(self):
        """Return an AdamW optimizer over the backbone AND the head."""
        # self.parameters() also covers self.classifier; passing only
        # self.model.parameters() left the head at its random initialisation.
        # NOTE(review): the head's lazy parameters are still uninitialised here
        # and are expected to be materialised in place on the first forward —
        # confirm against the PyTorch lazy-module documentation.
        optimizer = AdamW(self.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)
        return optimizer

In [None]:
# Build the LightningModule with the notebook-wide batch size; the remaining
# constructor arguments keep their defaults.
lit_model = LitMPNNSMILESClassification(batch_size=BATCH_SIZE)

In [None]:
# Checkpointing: monitor the logged "val_loss" and keep a single checkpoint
# (save_top_k=1) under ./checkpoint/, with the metrics embedded in the file name.
_checkpoint_options = dict(
    dirpath='./checkpoint/',
    filename='MPNN-{epoch:02d}-{train_loss:.2f}-{val_loss:.2f}',
    monitor='val_loss',
    save_top_k=1,
)
checkpoint_callback = ModelCheckpoint(**_checkpoint_options)

In [None]:
# Trainer used for fitting: GPU accelerator, at most 100 epochs, with the
# checkpoint callback attached. Mixed precision is left disabled.
trainer = L.Trainer(
    accelerator='gpu',
#     precision='bf16-mixed',
    max_epochs=100,
    callbacks=[checkpoint_callback],
)

In [None]:
# Train on the training loader and validate on the validation loader.
trainer.fit(lit_model, train_dataloader, val_dataloader)

In [None]:
# Reload the best checkpoint before predicting.
# Prefer the path recorded by the checkpoint callback during this session; its
# best_model_path is empty when no training has been run, in which case the
# checkpoint from the earlier recorded run is used.
best_ckpt_path = (
    checkpoint_callback.best_model_path
    or 'checkpoint/MPNN-epoch=19-train_loss=0.29-val_loss=32.97.ckpt'
)
# batch_size must be passed again because the module does not save its
# constructor arguments as hyper-parameters.
lit_model = LitMPNNSMILESClassification.load_from_checkpoint(
    best_ckpt_path,
    batch_size=BATCH_SIZE
)

In [None]:
# Fresh trainer used only for inference (no callbacks, no epoch limit).
trainer = L.Trainer(
    accelerator='gpu',
)

In [None]:
# Run predict_step over the test loader; the result is collected per batch.
preds = trainer.predict(lit_model, test_dataloader)

In [None]:
# `preds` holds one tensor per test batch; join them along the batch dimension
# and convert to NumPy. (`torch` is a module and cannot be called directly.)
sub_array = torch.cat(preds).cpu().numpy()

In [None]:
# Display the prediction array.
sub_array