Creating a synthetic CSV dataset

In [8]:
# import torch
# import pandas as pd

# # Missing property labels are stored as NaN (see the last 'logP' entry)
# data = {
#     'smiles': ['CCO', 'CO', 'C', 'O'] * 32,
#     'logP': torch.randn(127).numpy().tolist() + [float('nan')],
#     'logD': torch.randn(128).numpy().tolist()
# }
# df = pd.DataFrame(data)
# df.to_csv('syn_data.csv', index=False)

In [9]:
from dgllife.utils import atom_type_one_hot, atom_degree_one_hot, \
    atom_hybridization_one_hot, atom_is_aromatic_one_hot, \
    atom_chiral_tag_one_hot, atom_formal_charge_one_hot, atom_mass, \
    atom_implicit_valence_one_hot, BaseAtomFeaturizer, \
    ConcatFeaturizer, CanonicalBondFeaturizer
from functools import partial
from rdkit import Chem

# Allowable value sets for the one-hot atom descriptors assembled below.
_ATOM_DEGREES = [1, 2, 3, 4, 6]
_ATOM_SYMBOLS = ['B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P', 'S', 'Se', 'Si']
_FORMAL_CHARGES = [-1, 0, 1]
_HYBRIDIZATIONS = [
    Chem.rdchem.HybridizationType.S,
    Chem.rdchem.HybridizationType.SP,
    Chem.rdchem.HybridizationType.SP2,
    Chem.rdchem.HybridizationType.SP3,
    Chem.rdchem.HybridizationType.SP3D2,
]
_IMPLICIT_VALENCES = [0, 1, 2, 3]

# Per-atom descriptor functions, listed in the order they are handed to
# ConcatFeaturizer. Chirality, aromaticity and mass use the library defaults;
# the remaining descriptors are restricted to the value sets above.
_atom_descriptor_funcs = [
    partial(atom_degree_one_hot, allowable_set=_ATOM_DEGREES),
    partial(atom_type_one_hot, allowable_set=_ATOM_SYMBOLS),
    atom_chiral_tag_one_hot,
    partial(atom_formal_charge_one_hot, allowable_set=_FORMAL_CHARGES),
    partial(atom_hybridization_one_hot, allowable_set=_HYBRIDIZATIONS),
    partial(atom_implicit_valence_one_hot, allowable_set=_IMPLICIT_VALENCES),
    atom_is_aromatic_one_hot,
    atom_mass,
]

# Node featurizer registered under the single feature field 'hv'.
atom_featurizer = BaseAtomFeaturizer(
    featurizer_funcs={'hv': ConcatFeaturizer(_atom_descriptor_funcs)}
)

if __name__ == '__main__':
    # Entry point: build a molecule dataset from a CSV file, split it, and hand
    # everything to the project's training routine (`run.main`).
    import pandas as pd

    from dgllife.data import MoleculeCSVDataset
    from dgllife.utils import SMILESToBigraph, RandomSplitter

    from configure import configs
    from run import main
    from utils import mkdir_p, setup

    # Experiment options, written out by hand instead of being parsed from the
    # command line.
    # NOTE(review): both paths are machine-specific; 'csv-path' is absolute while
    # 'result_path' is relative to the current working directory -- confirm the
    # directory this cell is expected to be run from.
    # NOTE(review): 'csv-path' and 'smiles-column' use hyphens whereas the other
    # keys use underscores; the lookups below use the same spelling.
    args = {
        'csv-path': '/Users/kseniiakholina/Desktop/dgl-lifesci-Kseniia/MTL/syn_data.csv',
        'model': 'GCN',
        'mode': 'bypass',
        'num_epochs': 4000,
        'result_path': 'Desktop/dgl-lifesci-Kseniia/MTL',
        'smiles-column': 'smiles',
        'tasks': ['logP', 'logD'],
    }

    # The experiment name ("<model>_<mode>") selects the configuration entry
    # that is merged into the options.
    args['exp_name'] = '{}_{}'.format(args['model'], args['mode'])
    args.update(configs[args['exp_name']])

    # Prepare the results directory through the project's helper before
    # anything is written into it.
    mkdir_p(args['result_path'])

    # Graph construction: bond features go to the 'he' field, and self loops
    # are enabled both in the bond featurizer and in the graph builder.
    edge_featurizer = CanonicalBondFeaturizer(bond_data_field='he', self_loop=True)
    smiles_to_g = SMILESToBigraph(
        add_self_loop=True,
        node_featurizer=atom_featurizer,
        edge_featurizer=edge_featurizer,
    )

    # Load the table and turn it into a graph dataset; the cache file is
    # placed inside the results directory.
    df = pd.read_csv(args['csv-path'])
    dataset = MoleculeCSVDataset(
        df,
        smiles_to_g,
        smiles_column=args['smiles-column'],
        cache_file_path='{}/graph.bin'.format(args['result_path']),
        task_names=args['tasks'],
    )

    # Use the task list reported by the dataset from here on, then let the
    # project's `setup` helper finish preparing the options.
    args['tasks'] = dataset.task_names
    args = setup(args)

    # 80% / 10% / 10% random split with a fixed random_state.
    train_set, val_set, test_set = RandomSplitter.train_val_test_split(
        dataset,
        frac_train=0.8,
        frac_val=0.1,
        frac_test=0.1,
        random_state=0,
    )

    main(args, atom_featurizer, edge_featurizer, train_set, val_set, test_set)


Directory Desktop/dgl-lifesci-Kseniia/MTL already exists.
Processing dgl graphs from scratch...
Epoch 1/4000 | training | averaged loss 0.5594 | averaged r2 0.0186 | averaged mae 0.9643
Epoch 1/4000 | validation | current r2 0.1384 | best r2 0.1384 | mae 0.8105
Epoch 2/4000 | training | averaged loss 0.4908 | averaged r2 0.0254 | averaged mae 0.8799
EarlyStopping counter: 1 out of 30
Epoch 2/4000 | validation | current r2 0.0803 | best r2 0.1384 | mae 0.8113
Epoch 3/4000 | training | averaged loss 0.4389 | averaged r2 0.0407 | averaged mae 0.8219
EarlyStopping counter: 2 out of 30
Epoch 3/4000 | validation | current r2 0.1165 | best r2 0.1384 | mae 0.8163
Epoch 4/4000 | training | averaged loss 0.4200 | averaged r2 0.0537 | averaged mae 0.8007
Epoch 4/4000 | validation | current r2 0.1762 | best r2 0.1762 | mae 0.8204
Epoch 5/4000 | training | averaged loss 0.4150 | averaged r2 0.0535 | averaged mae 0.7958
Epoch 5/4000 | validation | current r2 0.1925 | best r2 0.1925 | mae 0.8251
Epoc

To run the same experiment through the main.py script instead:

In [10]:
# !python main.py -c /Users/kseniiakholina/Desktop/dgl-lifesci-Kseniia/MTL/syn_data.csv -m GCN --mode bypass -p Desktop/dgl-lifesci-Kseniia/MTL -s smiles -t logP,logD
