In [1]:
# Step up one directory so the relative "data/..." and "submissions/..." paths
# used by later cells resolve. Not idempotent: re-running this cell moves up
# one more level each time.
%cd ..

/home/nikita/edu/competitions/admet


In [2]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F

from lightning import pytorch as pl

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from chemprop import data, featurizers, models, nn

In [3]:
# Read the training and test tables, taking the first CSV column as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train_admet.csv", "data/test_data.csv")
)
# The submission template is read with pandas' default integer index.
sample = pd.read_csv("data/sample.csv")

In [4]:
# Build one train / validation / test frame per value of the `property` column.
# The three lists are index-aligned with `properties`.
SPLIT_SEED = 75  # one seed for both the split and the oversampling

df_trains = []
df_vals = []
df_tests = []
properties = df_train.property.unique()

for prop in properties:
    subset_train = df_train[df_train.property == prop]
    # Hold out 20% for validation, stratified on the label column Y.
    subset_train, subset_val = train_test_split(
        subset_train, test_size=0.2, random_state=SPLIT_SEED, stratify=subset_train.Y
    )
    # Oversample only the training part, after the split, so the validation
    # frame is left untouched. The sampler is seeded so the resampled rows are
    # the same on every run (it was previously unseeded).
    sampler = RandomOverSampler(random_state=SPLIT_SEED)
    subset_train = sampler.fit_resample(subset_train, subset_train.Y)[0]
    df_trains.append(subset_train)
    df_vals.append(subset_val)
    df_tests.append(df_test[df_test.property == prop])

In [5]:
def _labelled_datapoints(frame):
    """Turn each (Drug, Y) row of `frame` into a datapoint with a one-element target list."""
    return [
        data.MoleculeDatapoint.from_smi(smiles, [label])
        for smiles, label in zip(frame["Drug"], frame["Y"])
    ]


def _unlabelled_datapoints(frame):
    """Turn each Drug entry of `frame` into a datapoint without a target."""
    return [data.MoleculeDatapoint.from_smi(smiles) for smiles in frame["Drug"]]


# One list of datapoints per property, in the same order as df_trains/df_vals/df_tests.
train_data_total = [_labelled_datapoints(frame) for frame in df_trains]
val_data_total = [_labelled_datapoints(frame) for frame in df_vals]
test_data_total = [_unlabelled_datapoints(frame) for frame in df_tests]



In [6]:
# A single featurizer instance is shared by every dataset below.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()


def _to_datasets(datapoint_groups):
    """Wrap each list of datapoints in a MoleculeDataset using the shared featurizer."""
    return [data.MoleculeDataset(points, featurizer) for points in datapoint_groups]


train_datasets = _to_datasets(train_data_total)
val_datasets = _to_datasets(val_data_total)
test_datasets = _to_datasets(test_data_total)

# Training loaders use build_dataloader's defaults; validation and test loaders
# explicitly disable shuffling.
train_loaders = [data.build_dataloader(dataset) for dataset in train_datasets]
val_loaders = [data.build_dataloader(dataset, shuffle=False) for dataset in val_datasets]
test_loaders = [data.build_dataloader(dataset, shuffle=False) for dataset in test_datasets]

In [19]:
# Seed the Python, NumPy and torch RNGs before any weights are created so that
# model initialisation is repeatable across kernel restarts (it was unseeded).
pl.seed_everything(75, workers=True)


def _build_mpnn():
    """Create one fresh binary-classification MPNN with its evaluation metrics."""
    mp = nn.BondMessagePassing()
    agg = nn.MeanAggregation()
    ffn = nn.BinaryClassificationFFN()
    metric_list = [
        nn.metrics.BinaryAUROCMetric(),
        nn.metrics.BinaryAccuracyMetric(),
        nn.metrics.BCEMetric(),
    ]
    # Keyword arguments make it explicit which constructor options are being set.
    return models.MPNN(mp, agg, ffn, batch_norm=True, metrics=metric_list)


# One independent model per per-property training dataset.
mpnns = [_build_mpnn() for _ in train_datasets]

AttributeError: module 'chemprop.nn' has no attribute 'ModuleList'

In [17]:
# Settings shared by every trainer: no logger, checkpointing and progress bar
# enabled, a single automatically selected device, 25 epochs at most.
trainer_options = dict(
    logger=False,
    enable_checkpointing=True,
    enable_progress_bar=True,
    accelerator="auto",
    devices=1,
    max_epochs=25,
)

# One Trainer per per-property training dataset.
trainers = [pl.Trainer(**trainer_options) for _ in train_datasets]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [18]:
# Fit each property's model with its own trainer, training loader and validation loader.
for trainer, model, train_loader, val_loader in zip(
    trainers, mpnns, train_loaders, val_loaders
):
    trainer.fit(model, train_loader, val_loader)

/home/nikita/edu/competitions/admet/.conda/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/nikita/edu/competitions/admet/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/nikita/edu/competitions/admet/.conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type                         | Params | Mode 
-------------------------------------------------------------------------
0 | message_passing | MulticomponentMessagePassing | 455 K  | train
1 | agg             | MeanAggregation              | 0      | train
2 | bn              | BatchNorm1d                  | 1.2 K  

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/nikita/edu/competitions/admet/.conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


TypeError: 'BatchMolGraph' object is not iterable

In [12]:
# Report test-loop metrics for each model. Note that the loaders passed here are
# the validation loaders, so these numbers describe the held-out validation split.
for trainer, model, val_loader in zip(trainers, mpnns, val_loaders):
    trainer.test(model, val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/nikita/edu/competitions/admet/.conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 16/16 [00:00<00:00, 43.71it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
batch_averaged_test/accuracy     0.8164867758750916
  batch_averaged_test/bce        0.5922560691833496
  batch_averaged_test/roc        0.8900040984153748
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Testing DataLoader 0: 100%|██████████| 5/5 [00:00<00:00, 31.93it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
batch_averaged_test/accuracy     0.8315789699554443
  batch_averaged_tes

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 5/5 [00:00<00:00, 36.58it/s] 
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
batch_averaged_test/accuracy     0.8736842274665833
  batch_averaged_test/bce       0.46433115005493164
  batch_averaged_test/roc        0.9130865931510925
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [75]:
# Predict each property's test subset with that property's model. The original
# code concatenated the list returned by predict(), so it is treated here as one
# tensor per batch and joined along the first axis.
per_property_preds = [
    torch.cat(trainer.predict(model, loader), dim=0).cpu().numpy()
    for trainer, model, loader in zip(trainers, mpnns, test_loaders)
]
grouped_preds = np.concatenate(per_property_preds, axis=0)

# grouped_preds is ordered property-by-property (all rows of properties[0],
# then properties[1], ...). Scatter the rows back to their original positions
# in df_test so that a later positional assignment lines up with the test file
# even when its rows are not already grouped by property. When they are already
# grouped in that order this is a no-op.
# NOTE(review): assumes the submission template follows df_test row order - confirm.
row_positions = np.concatenate(
    [np.flatnonzero((df_test.property == prop).to_numpy()) for prop in properties]
)
if len(row_positions) != len(grouped_preds):
    raise ValueError(
        f"Got {len(grouped_preds)} predictions for {len(row_positions)} test rows"
    )

# Rows whose property never occurs in `properties` have no model and stay NaN.
preds = np.full(
    (len(df_test),) + grouped_preds.shape[1:], np.nan, dtype=grouped_preds.dtype
)
preds[row_positions] = grouped_preds

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/nikita/edu/competitions/admet/.conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 58.19it/s] 

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 34.69it/s] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 39.07it/s]


In [80]:
# Store the predictions in the template's "Y" column (this mutates `sample` in
# place) and write the frame out without its index column.
# NOTE(review): the assignment is positional, so `preds` must be in the same
# row order as `sample` - confirm.
sample["Y"] = preds
sample.to_csv("submissions/chemprop.csv", index=False)