In [17]:
"""
    IMPORTING LIBS
"""
import dgl

import numpy as np
import os
import socket
import time
import random
import glob
import argparse, json
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.utils.data import DataLoader

from tensorboardX import SummaryWriter
from tqdm import tqdm

class DotDict(dict):
    """Dictionary whose entries can be read and written as attributes too."""

    def __init__(self, **kwds):
        super().__init__()
        # Alias the instance namespace to the dict itself so that
        # ``d.key`` and ``d['key']`` share a single storage.
        self.__dict__ = self
        self.update(kwds)
        
# """
#     AUTORELOAD IPYTHON EXTENSION FOR RELOADING IMPORTED MODULES
# """

def in_ipynb():
    """Tell whether the code is executing under IPython/Jupyter.

    Returns:
        bool: True when ``get_ipython`` is defined and its ``config``
        attribute can be read; False when looking up ``get_ipython``
        raises NameError.
    """
    try:
        get_ipython().config
    except NameError:
        return False
    return True
    
notebook_mode = in_ipynb()
print(notebook_mode)

if notebook_mode == True:
    %load_ext autoreload
    %autoreload 2
    
"""
    IMPORTING CUSTOM MODULES/METHODS
"""
# from original_graphs_specific.nets.molecules_graph_regression.load_net import gnn_model # import all GNNS
# from original_graphs_specific.data.data import LoadData # import dataset

from nets.molecules_graph_regression.load_net import gnn_model # import all GNNS
from data.data import LoadData # import dataset

"""
    GPU Setup
"""
def gpu_setup(use_gpu, gpu_id):
    """Expose the requested CUDA device and pick the torch device to run on.

    Args:
        use_gpu: truthy to request the GPU.
        gpu_id: written (as a string) to ``CUDA_VISIBLE_DEVICES``.

    Returns:
        torch.device: ``cuda`` when CUDA is available and ``use_gpu`` is
        truthy, ``cpu`` otherwise.
    """
    os.environ.update({
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": str(gpu_id),
    })

    # Guard clause: anything short of "CUDA present and requested" runs on CPU.
    if not (torch.cuda.is_available() and use_gpu):
        print('cuda not available')
        return torch.device("cpu")

    print('cuda available with GPU:',torch.cuda.get_device_name(0))
    return torch.device("cuda")


# select GPU or CPU
use_gpu = True; gpu_id = 0 # default GPU
# use_gpu = False; gpu_id = -1 # CPU

# Keep the device returned by gpu_setup: previously the result was discarded,
# so `device` stayed None and None was stored in net_params['device'].
device = gpu_setup(use_gpu, gpu_id)

# Notebook-only setup: choose model and dataset, derive output dirs, load the data.
if notebook_mode == True:
    MODEL_NAME = 'GCN'
#     MODEL_NAME = 'GraphSage'
#     MODEL_NAME = 'GIN'

    # Reduced graphs (currently selected)
    DATASET_NAME = 'toy_reduced_dataset'

    # Original graphs
#     DATASET_NAME = 'toy_original_dataset'

    out_dir = 'out/molecules_graph_regression/'
    # Build the run suffix once: two separate strftime calls could straddle a
    # second boundary and give the log and checkpoint dirs different names.
    run_tag = MODEL_NAME + "_" + DATASET_NAME + "_" + time.strftime('%Hh%Mm%Ss_on_%b_%d_%Y')
    root_log_dir = out_dir + 'logs/' + run_tag
    root_ckpt_dir = out_dir + 'checkpoints/' + run_tag

    print("[I] Loading data (notebook) ...")
    dataset = LoadData(DATASET_NAME)
    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    print("[I] Finished loading.")
    
# Notebook-only hyper-parameters and the net_params dict handed to gnn_model.
if notebook_mode == True:

    # Shared defaults.
    n_heads = pseudo_dim_MoNet = kernel = gnn_per_block = -1
    embedding_dim = pool_ratio = n_mlp_GIN = -1
    edge_feat = True
    gated = False
    self_loop = False
    #self_loop = True
    max_time = 12
    pos_enc = True
    #pos_enc = False
    pos_enc_dim = 8

    # Per-model training and architecture settings.
    if MODEL_NAME == 'GCN':
        seed = 44
        epochs = 1000
        batch_size = 5
        init_lr = 5e-5
        lr_reduce_factor = 0.5
        lr_schedule_patience = 25
        min_lr = 1e-6
        weight_decay = 0
        L = 4
        hidden_dim = 145
        out_dim = hidden_dim
        in_dim = dataset.num_atom_type
        edim = 1
        dropout = 0.0
        readout = 'mean'
        dgl_builtin = True
    elif MODEL_NAME == 'GIN':
        seed = 40
        epochs = 1000
        batch_size = 50
        init_lr = 5e-4
        lr_reduce_factor = 0.5
        lr_schedule_patience = 25
        min_lr = 1e-6
        weight_decay = 0
        L = 4
        hidden_dim = 110
        out_dim = hidden_dim
        in_dim = dataset.num_atom_type
        dropout = 0.0
        readout = 'mean'
        n_mlp_GIN = 2
        learn_eps_GIN = True
        neighbor_aggr_GIN = 'sum'
        dgl_builtin = True
    elif MODEL_NAME == 'GraphSage':
        seed = 44
        epochs = 1000
        batch_size = 50
        init_lr = 5e-5
        lr_reduce_factor = 0.5
        lr_schedule_patience = 25
        min_lr = 1e-6
        weight_decay = 0
        L = 4
        hidden_dim = 108
        out_dim = hidden_dim
        in_dim = dataset.num_atom_type
        dropout = 0.0
        readout = 'mean'
        dgl_builtin = True

    # NOTE(review): 'readout', 'dropout', 'learn_eps_GIN' and
    # 'neighbor_aggr_GIN' below are literals; the per-model variables of the
    # same names set above are not what ends up in net_params -- confirm intended.
    net_params = {
        # generic new_params
        'device': device,
        'num_atom_type': dataset.num_atom_type,
        'num_bond_type': dataset.num_bond_type,
        'residual': True,
        'hidden_dim': hidden_dim,
        'out_dim': out_dim,
        'in_dim': in_dim,
        # 'edim': edim,
        'n_heads': n_heads,
        'L': L,  # min L should be 2
        'readout': "sum",
        'layer_norm': True,
        'batch_norm': True,
        'in_feat_dropout': 0.0,
        'dropout': 0.0,
        'edge_feat': edge_feat,
        'self_loop': self_loop,
        'dgl_builtin': dgl_builtin,
        # specific for GIN
        'n_mlp_GIN': n_mlp_GIN,
        'learn_eps_GIN': True,
        'neighbor_aggr_GIN': 'sum',
        # specific for graphsage
        'sage_aggregator': 'mean',
        # for MLPNet
        'gated': gated,
    }
    
"""
    VIEWING MODEL CONFIG AND PARAMS
"""
def view_model_param(MODEL_NAME, net_params):
    """Instantiate the model and report how many parameter elements it holds.

    Args:
        MODEL_NAME: model key forwarded to ``gnn_model``.
        net_params: network configuration forwarded to ``gnn_model``.

    Returns:
        The sum, over ``model.parameters()``, of the product of each
        parameter's dimensions.
    """
    net = gnn_model(MODEL_NAME, net_params)
    print("MODEL DETAILS:\n")
    # Element count of one tensor = product of its dimensions.
    total_param = sum(
        (np.prod(list(weights.data.size())) for weights in net.parameters()),
        0,
    )
    print('MODEL/Total parameters:', MODEL_NAME, total_param)
    return total_param


# Notebook-only sanity check: build the selected model once and print its
# parameter count (the returned total is not stored here).
if notebook_mode == True:
    view_model_param(MODEL_NAME, net_params)

"""
    TRAINING CODE
"""

def train_val_pipeline(MODEL_NAME, dataset, params, net_params):
    """Build the GNN, run one forward pass on the training graph, return its weights.

    Despite the name, no optimisation happens here: the training call is
    commented out, so the optimizer and scheduler are created but never stepped.

    Args:
        MODEL_NAME: model key forwarded to ``gnn_model``.
        dataset: object exposing ``train`` and ``_add_self_loops()``;
            ``dataset.train`` is used as a single graph (``ndata['feat']``,
            ``edata['feat']`` and ``.to(device)`` are called on it).
        params: dict; reads 'seed', 'init_lr', 'weight_decay',
            'lr_reduce_factor' and 'lr_schedule_patience'.
        net_params: dict; reads 'self_loop' and 'device', and is forwarded
            to ``gnn_model``.

    Returns:
        ``model.state_dict()`` taken after the forward pass.
    """
    trainset = dataset.train
    if MODEL_NAME in ['GCN']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    device = net_params['device']
    
    # setting seeds (must precede model construction so initial weights are reproducible)
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    # The optimizer, scheduler, drop_last and the local import below are only
    # referenced by the commented-out training lines; they are kept so those
    # lines can be re-enabled as they stand.
    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'], weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    drop_last = False

#     train_loader = DataLoader(trainset, batch_size=1, shuffle=True, drop_last=drop_last, collate_fn=dataset.collate)
    
    from train.train_molecules_graph_regression import train_epoch_sparse as train_epoch, evaluate_network_sparse as evaluate_network
    
#     train_loss, train_mae, optimizer = train_epoch(model, optimizer, device, train_loader, 1, MODEL_NAME)
    batch_x = trainset.ndata['feat'].to(device)  # num x feat
    batch_e = trainset.edata['feat'].to(device)
    # Call the module instead of .forward() so registered hooks run; the
    # output itself is not used.
    # NOTE(review): if the model holds layers that update buffers in training
    # mode (e.g. batch norm), this pass changes the returned state dict -- confirm intended.
    model(trainset.to(device), batch_x, batch_e)
    return model.state_dict()

        
def main(notebook_mode=False,config=None):
    """Load the dataset and device described by ``config`` and run the pipeline.

    Only the notebook (config-driven) mode is implemented in this function.

    Args:
        notebook_mode: must be truthy; the terminal mode has no implementation here.
        config: dict with keys 'params', 'dataset', 'gpu' ({'use', 'id'}),
            'out_dir', 'model' and 'net_params'. ``config['net_params']`` is
            updated in place with 'device', 'gpu_id', 'batch_size',
            'num_atom_type', 'num_bond_type' and 'total_param'.

    Returns:
        Whatever ``train_val_pipeline`` returns.

    Raises:
        NotImplementedError: if ``notebook_mode`` is falsy (previously this
            surfaced as an UnboundLocalError on ``net_params``).
        TypeError: if ``config`` is None.
    """
    # terminal mode
    if not notebook_mode:
        raise NotImplementedError(
            "main() only supports notebook_mode=True with an explicit config; "
            "terminal mode is not implemented."
        )
    if config is None:
        raise TypeError("main() requires a config dict when notebook_mode is True, got None")

    # notebook mode
    # parameters
    params = config['params']

    # dataset
    DATASET_NAME = config['dataset']
    dataset = LoadData(DATASET_NAME)

    # device
    device = gpu_setup(config['gpu']['use'], config['gpu']['id'])
    print("device:", device)
    out_dir = config['out_dir']  # read from the config but not used further in this function

    # GNN model
    MODEL_NAME = config['model']

    # network parameters (same dict object as config['net_params'], mutated in place)
    net_params = config['net_params']
    net_params['device'] = device
    net_params['gpu_id'] = config['gpu']['id']
    net_params['batch_size'] = params['batch_size']

    # ZINC
    net_params['num_atom_type'] = dataset.num_atom_type
    net_params['num_bond_type'] = dataset.num_bond_type

    net_params['total_param'] = view_model_param(MODEL_NAME, net_params)
    return train_val_pipeline(MODEL_NAME, dataset, params, net_params)
  
# Assemble the run configuration consumed by main().
# gpu config
gpu = {'use': use_gpu, 'id': gpu_id}

# parameters
params = {
    'seed': seed,
    'epochs': epochs,
    'batch_size': batch_size,
    'init_lr': init_lr,
    'lr_reduce_factor': lr_reduce_factor,
    'lr_schedule_patience': lr_schedule_patience,
    'min_lr': min_lr,
    'weight_decay': weight_decay,
    'print_epoch_interval': 5,
    'max_time': max_time,
}

config = {
    'gpu': gpu,
    # GNN model, dataset, out_dir
    'model': MODEL_NAME,
    'dataset': DATASET_NAME,
    'out_dir': out_dir,
    'params': params,
    # network parameters
    'net_params': net_params,
}

# convert to .py format
# Import the one name that is used instead of `import *`, which could
# silently shadow notebook globals.
from utils.cleaner_main import cleaner_main
cleaner_main('main_molecules_graph_regression')

state_dict = main(True,config)

    

True
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
cuda available with GPU: GeForce GTX 1080 Ti
[I] Loading data (notebook) ...
[I] Loading dataset toy_reduced_dataset...
train, test, val sizes : 5 5 5
[I] Finished loading.
[I] Data load time: 0.0019s
[I] Finished loading.
MODEL DETAILS:

MODEL/Total parameters: GCN 103222
Convert main_molecules_graph_regression.ipynb to main_molecules_graph_regression.py
Clean main_molecules_graph_regression.py
Done. 
[I] Loading dataset toy_reduced_dataset...
train, test, val sizes : 5 5 5
[I] Finished loading.
[I] Data load time: 0.0018s
cuda available with GPU: GeForce GTX 1080 Ti
device: cuda
MODEL DETAILS:

MODEL/Total parameters: GCN 103222


TypeError: forward() got an unexpected keyword argument 'edge_weight'

In [7]:
# Persist the weights returned by main().
# NOTE(review): the file is named "*_original" while the run configured above
# uses DATASET_NAME = 'toy_reduced_dataset' -- confirm the name matches the run.
with open('state_dict_original.pkl', 'wb') as f:
    f.write(pickle.dumps(state_dict))

In [9]:
def prepare_original_reformatted_graph(graph, num_atom_types=28):
    """Replace per-node atom-type values with one-hot rows, in place.

    Args:
        graph: object exposing ``number_of_nodes()`` and an ``ndata`` mapping;
            ``ndata['feat']`` must hold exactly one atom-type value per node.
        num_atom_types: width of the one-hot encoding. Defaults to 28, the
            value that used to be hard-coded.

    Returns:
        None. ``graph.ndata['feat']`` is overwritten with a float32 tensor of
        shape ``(num_nodes, num_atom_types)``; the new tensor is also printed.

    Raises:
        IndexError: if an atom type does not fit in ``num_atom_types`` columns.
    """
    num_nodes = graph.number_of_nodes()
    new_features = torch.zeros((num_nodes, num_atom_types))
    # Flatten to one value per node and truncate to integer indices, matching
    # the former per-node ``int(feat[j].item())``.
    atom_types = graph.ndata['feat'].reshape(-1).to(dtype=torch.long, device=new_features.device)
    # Set column atom_types[j] of row j to 1 for all nodes at once, instead of
    # building each row in a Python loop.
    new_features[torch.arange(num_nodes), atom_types] = 1
    print(new_features)
    graph.ndata['feat'] = new_features
def prepare_reduced_reformatted_graph(dataset, num_atom_types=28):
    """Split (atom type, weight) node features into one-hot features and weights, in place.

    Args:
        dataset: the graph to convert (the parameter keeps its historical
            name so keyword callers still work). It must expose
            ``number_of_nodes()``, ``ndata`` and ``edata``; column 0 of
            ``ndata['feat']`` is read as the atom type and column 1 as the
            node weight.
        num_atom_types: width of the one-hot encoding. Defaults to 28, the
            value that used to be hard-coded.

    Returns:
        None. On the passed graph:
        ``ndata['feat']`` becomes a float32 ``(num_nodes, num_atom_types)`` one-hot tensor,
        ``ndata['weight']`` becomes a float32 ``(num_nodes, 1)`` tensor,
        ``edata['feat']`` is cast to float32.

    Raises:
        IndexError: if an atom type does not fit in ``num_atom_types`` columns.
    """
    # Fix: the body used to read a module-level ``graph`` and ignore its
    # argument; operate on the object that was actually passed in.
    graph = dataset
    num_nodes = graph.number_of_nodes()
    node_feat = graph.ndata['feat']

    new_features = torch.zeros((num_nodes, num_atom_types))
    # Truncate column 0 to integer indices, like the former ``int(...item())``.
    atom_types = node_feat[:, 0].to(dtype=torch.long, device=new_features.device)
    new_features[torch.arange(num_nodes), atom_types] = 1

    # Copy column 1 into a fresh (num_nodes, 1) float32 tensor.
    weight_features = torch.zeros(num_nodes, 1)
    weight_features[:, 0] = node_feat[:, 1]

    graph.ndata['feat'] = new_features
    graph.ndata['weight'] = weight_features
    graph.edata['feat'] = graph.edata['feat'].to(torch.float32)

In [12]:
# graph = LoadData(DATASET_NAME).train
# prepare_original_reformatted_graph(graph)

# Load the pickled toy graph into `graph`.
# Unpickling can execute arbitrary code -- only load files from a trusted source.
with open('data/molecules/toy_example_reduced.pkl', 'rb') as f:
    graph = pickle.loads(f.read())

In [13]:
# Converts `graph` in place: ndata['feat'] becomes one-hot rows, ndata['weight']
# is added and edata['feat'] is cast to float32.
# Not idempotent: a second run would read columns 0 and 1 of the one-hot rows
# as (atom type, weight). Reload the pickle before re-running this cell.
prepare_reduced_reformatted_graph(graph)

In [14]:
# Inspect the converted node features (one row per node, 28 one-hot columns).
print(graph.ndata['feat'])

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [15]:
# Write the toy dataset file: the same graph object three times, followed by 28 and 4.
# NOTE(review): presumably [train, val, test, num_atom_type, num_bond_type] as
# expected by LoadData -- confirm the order against the loader.
with open('data/molecules/toy_reduced_dataset.pkl', 'wb') as f:
    f.write(pickle.dumps([graph, graph, graph, 28,4]))