In [1]:

# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing code, so edits to local .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [45]:
# Instantiate QM9 with the ../data/qm9/ directory as its only argument.
# NOTE(review): `QM9` is not imported in any cell visible here, so this cell
# fails on a fresh kernel. Presumably torch_geometric.datasets.QM9 — confirm
# and add the import.
data = QM9(r"../data/qm9/")

In [53]:
from rdkit import Chem

import pandas as pd
import numpy as np

# Build a toy regression file: every row of qm9_smiles.csv receives two
# synthetic, standard-normal targets and the result is written to
# qm9_target.csv, indexed by the "smiles" column.
path = "qm9_smiles.csv"

# NOTE(review): `suppl` is not used again in this cell; kept in case another
# cell relies on it.
suppl = Chem.SmilesMolSupplier(path)

# Seeded generator: without a fixed seed the targets (and therefore the file
# written below) would differ on every run of this cell.
TARGET_SEED = 42
rng = np.random.default_rng(TARGET_SEED)

smiles_df = pd.read_csv(path)

# assign/set_index return new frames, so re-running the cell never mutates a
# frame that was already displayed or reused elsewhere.
df = smiles_df.assign(
    target1=rng.standard_normal(len(smiles_df)),
    target2=rng.standard_normal(len(smiles_df)),
).set_index("smiles")

df.to_csv("qm9_target.csv")

## Testing SmilesDataset

In [7]:
from smiles_dataset import SmilesDataset
from smiles_lightning_data_module import SmilesDataModule
from lightning_model import LightningClassicGNN
import pytorch_lightning as pl
from torch_geometric.transforms import distance
from torch_geometric.loader import DataLoader
import os
import torch
# making sure we are as determinstic as possibe
torch.use_deterministic_algorithms(True)
import numpy as np

! rm -rf ../data/test/processed
! rm -rf ../data/test_hydrogen/processed


seed=42
## pytorch lighting takes of seeding everything
pl.seed_everything(seed=seed, workers=True)
# create from csv a pytorch dataset
dataset = SmilesDataset(root=r"../data/test", filename="qm9_target.csv", add_hydrogen=True, transform=distance.Distance())
# from torch dataset, create lightning data module to make sure training splits are always done the same ways
data_module = SmilesDataModule(dataset=dataset, seed=seed)


num_node_features = data_module.num_node_features
num_edge_features= data_module.num_edge_features

gnn_model = LightningClassicGNN(classification=False, output_dim=2, num_node_features=num_node_features, num_edge_features=num_edge_features)

num_epochs=1


# from pytorch_ligthing import loggers
# logger = loggers.WandbLogger()

# default root dir is where the logs and weights are logged
# useful when debugging is limit_train_batches
# by default uses TensorBoardLogger, can be configured 
# Plugins allows us to connect to arbitrary cluster
# can set max_epochs
# can use precision to specify number of bit floating points to reduce memory footprint ()
# can use accumulate_grad_batches to speed-up training too
trainer = pl.Trainer(deterministic=True, auto_lr_find=True, default_root_dir=os.getcwd(), precision="bf16", max_epochs=num_epochs)



Global seed set to 42
Processing...
100%|██████████| 100/100 [00:00<00:00, 105.91it/s]
Done!
Global seed set to 42
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [8]:
# Run the trainer's tuning step to find the learning rate (the trainer was
# created with auto_lr_find=True).
# NOTE(review): in the recorded output the LR finder stopped after 2 steps
# with a diverging loss and no suggestion could be computed; the message
# recommends increasing the loop iteration limits or the dataset size.
trainer.tune(gnn_model,datamodule=data_module)

  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 2 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/harold/epfl/chem-ml-repr/code/.lr_find_ef32d5bb-2f32-433c-adc9-58cebf7beeee.ckpt


{'lr_find': <pytorch_lightning.tuner.lr_finder._LRFinder at 0x7fcce733e460>}

In [5]:

# Fit the model using the data module.
# Training can be resumed from a checkpoint with
# trainer.fit(..., ckpt_path="some/path/to/my_checkpoint.ckpt").
# NOTE(review): the stored output "(5, 5)" does not look like it was produced
# by this call — probably stale; re-run to confirm.
trainer.fit(gnn_model, datamodule=data_module)

(5, 5)

In [6]:
# Scratch check: an inclusive last position has to be bumped by one before it
# can serve as the (exclusive) end of a slice.
a = [1, 2, 3]

last_inclusive = len(a) - 1
end_index = last_inclusive + 1

a[:end_index]

[1, 2, 3]

In [11]:
from datasets import BaceDataset


%load_ext autoreload
%autoreload 2

root= "../data/bace"
! rm -rf ../data/bace/processed

data = BaceDataset(root=root)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Processing...
  3%|█▏                                       | 42/1513 [00:14<08:42,  2.81it/s]

Unexpected exception formatting exception. Falling back to standard exception



Traceback (most recent call last):
  File "/home/harold/miniconda3/envs/chem/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_19613/1858521360.py", line 10, in <cell line: 10>
    data = BaceDataset(root=root)
  File "/home/harold/epfl/chem-ml-repr/code/datasets.py", line 138, in BaceDataset
    dataset = SmilesDataset(root=root, filename=filename, add_hydrogen=add_hydrogen, seed=seed, begin_index=begin_index, end_index=end_index,
  File "/home/harold/epfl/chem-ml-repr/code/smiles_dataset.py", line 62, in __init__
    super().__init__(root, transform, pre_transform, pre_filter)
  File "/home/harold/miniconda3/envs/chem/lib/python3.9/site-packages/torch_geometric/data/in_memory_dataset.py", line 50, in __init__
    super().__init__(root, transform, pre_transform, pre_filter)
  File "/home/harold/miniconda3/envs/chem/lib/python3.9/site-packages/torch_geometric/data/dataset.p

In [1]:
import pandas as pd
import torch


%load_ext autoreload
%autoreload 2

df = pd.read_csv("../data/bace/raw/bace.csv")

BACE_REGRESSION_TASKS = "pIC50"
BACE_CLASSIFICATION_TASKS = "Class"

df = df[["mol", BACE_CLASSIFICATION_TASKS, BACE_REGRESSION_TASKS]]
df.set_index("mol", drop=True, inplace=True)

torch.tensor(df.values)

tensor([[1.0000, 9.1549],
        [1.0000, 8.8539],
        [1.0000, 8.6990],
        ...,
        [0.0000, 2.9531],
        [0.0000, 2.7333],
        [0.0000, 2.5445]], dtype=torch.float64)

array([[1.       , 9.1549015],
       [1.       , 8.8538723],
       [1.       , 8.6989698],
       ...,
       [0.       , 2.9531147],
       [0.       , 2.7332981],
       [0.       , 2.5445461]])

In [7]:
# Encode the "Class" column as integer codes together with the unique values.
df["Class"].factorize()

(array([0, 0, 0, ..., 1, 1, 1]), Int64Index([1, 0], dtype='int64'))