In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf
import deepchem as dc
from deepchem.feat.mol_graphs import WeaveMol
from deepchem.data.datasets import DiskDataset,NumpyDataset
import inspect
import os
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [22]:
dataset_dir = '/home/matthew/Programming/Kaggle/MolecularProperties/dataset'

In [23]:
def load_data(shard,shardStart):
    X = []
    y = []
    ids = []
    cid = 0
    for mol in tqdm(shard):
        nodes = np.load(os.path.join(dataset_dir,'dc_graphs','%s_nodes.npy' % mol)).astype('int64')
        pairs = np.load(os.path.join(dataset_dir,'dc_graphs','%s_pairs.npy' % mol)).astype('int64')
        if nodes.shape[0] > 50:
            continue
        mol = WeaveMol(nodes, pairs)
        coupling_constant = train_df[train_df.molecule_name == mol].scalar_coupling_constant.values
        if len(coupling_constant) != 1:
            continue
        X.append(mol)
        y.append(coupling_constant[0])
        ids.append(shardStart+cid)
        cid += 1
    X = np.array(X)
    y = np.array(y)
    w = np.ones_like(y)
    ids = np.array(ids)
    y = np.expand_dims(y,-1)
    w = np.expand_dims(w,-1)
    return X,y,w,ids

def generator(pdbs):
    shardID = 0
    shardSize = 2000
    shardStart = shardID * shardSize
    shardEnd = (shardID+1) * shardSize
    while shardEnd < len(pdbs):
        shard = pdbs[shardStart:shardEnd]
        X, y, w, ids = load_data(shard,shardStart)
        shardID += 1
        shardStart = shardID * shardSize
        shardEnd = (shardID+1) * shardSize
        yield (X, y, w, ids)
    
    shard = pdbs[shardStart:]
    X, y, w, ids = load_data(shard,shardStart)
    yield (X, y, w, ids)

In [24]:
train_df = pd.read_csv('/home/matthew/Programming/Kaggle/MolecularProperties/dataset/train.csv')

In [25]:
val_sele = np.random.choice(train_df.id.values,int(len(train_df) * 0.33))
train_split_df = train_df[~train_df.id.isin(val_sele)]
val_split_df = train_df[train_df.id.isin(val_sele)]

In [26]:
train_dataset = DiskDataset.create_dataset(generator(train_split_df.molecule_name.values),data_dir='train',tasks=['scalar_coupling_constant'])
val_dataset = DiskDataset.create_dataset(generator(val_split_df.molecule_name.values),data_dir='val',tasks=['scalar_coupling_constant'])

100%|██████████| 2000/2000 [11:41<00:00,  2.88it/s]
  1%|          | 11/2000 [00:03<11:51,  2.80it/s]


KeyboardInterrupt: 