In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from rdkit import Chem
from torch_geometric.data import Data
from sklearn.metrics import mean_squared_error
import torch.nn as nn
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from rdkit.Chem import Draw
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
import pandas as pd
from random import randrange
import itertools
import random
import os
from pickle import dump, load
from sklearn.metrics import mean_absolute_error
import pickle
import gzip, pickle
from torch_geometric.data import DataLoader
import gnn_utils
import gnn_model
from gnn_model import GNN, BaselineGCN1, BaselineGCN2
import config
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import sys,torch;print('py',sys.version);print('torch',torch.__version__,'cuda',torch.version.cuda,'gpu?',torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

py 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 05:35:01) [MSC v.1916 64 bit (AMD64)]
torch 1.5.0+cu101 cuda 10.1 gpu? True
Using device: cuda


In [8]:
def run():
    
    # create data
    # gnn_utils.create_data()
    print("start create data")
    with gzip.open(f"{config.data_dir}train.pkl.gz", "rb") as f:
        train_X = pickle.load(f)
    with gzip.open(f"{config.data_dir}val.pkl.gz", "rb") as f:
        val_X = pickle.load(f)
    with gzip.open(f"{config.data_dir}test.pkl.gz", "rb") as f:
        test_X = pickle.load(f)

    print("start define model")
    # define model
    n_features = config.n_features # number of node features
    bs = config.bs

    train_loader = DataLoader(train_X, batch_size=bs, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_X, batch_size=bs, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_X, batch_size=bs, shuffle=False, drop_last=False)

    train_loader_no_shuffle = DataLoader(train_X, batch_size = bs, shuffle=False, drop_last=False)
    val_loader_no_shuffle = DataLoader(val_X, batch_size = bs, shuffle=False, drop_last=False)


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BaselineGCN2(n_features = n_features).to(device)
    adam = torch.optim.Adam(model.parameters(), lr = config.lr )
    optimizer = adam
    early_stopping = gnn_utils.EarlyStopping(patience = config.patience, verbose=True, chkpoint_name = config.best_model2)
    criterion = nn.MSELoss()
    n_epochs = config.max_epochs

    print("start train")
    # train the model
    hist = {"train_rmse":[], "val_rmse":[]}
    for epoch in range(0, n_epochs):
        print("start epoch", epoch)
        model.train()
        loss_all = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            output = output.reshape(-1,)

            loss = criterion(output, data.y)
            loss.backward()
            optimizer.step()


        train_rmse = gnn_utils.test_fn(train_loader, model, device)
        val_rmse = gnn_utils.test_fn(val_loader, model, device)
        early_stopping(val_rmse, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        hist["train_rmse"].append(train_rmse)
        hist["val_rmse"].append(val_rmse)
        print(f'Epoch: {epoch}, Train_rmse: {train_rmse:.3}, Val_rmse: {val_rmse:.3}')

    print(f"training completed at {datetime.datetime.now()}")
    
if __name__ == "__main__":
    run()

start create data
start define model
start train
start epoch 0
Validation loss decreased (inf --> 1.502345).  Saving model ...
Epoch: 0, Train_rmse: 1.48, Val_rmse: 1.5
start epoch 1
Validation loss decreased (1.502345 --> 1.430080).  Saving model ...
Epoch: 1, Train_rmse: 1.38, Val_rmse: 1.43
start epoch 2
Validation loss decreased (1.430080 --> 1.123766).  Saving model ...
Epoch: 2, Train_rmse: 1.21, Val_rmse: 1.12
start epoch 3
EarlyStopping counter: 1 out of 25
Epoch: 3, Train_rmse: 1.44, Val_rmse: 1.49
start epoch 4
Validation loss decreased (1.123766 --> 1.093496).  Saving model ...
Epoch: 4, Train_rmse: 1.13, Val_rmse: 1.09
start epoch 5
EarlyStopping counter: 1 out of 25
Epoch: 5, Train_rmse: 1.21, Val_rmse: 1.18
start epoch 6
EarlyStopping counter: 2 out of 25
Epoch: 6, Train_rmse: 1.24, Val_rmse: 1.11
start epoch 7
EarlyStopping counter: 3 out of 25
Epoch: 7, Train_rmse: 1.2, Val_rmse: 1.13
start epoch 8
Validation loss decreased (1.093496 --> 1.060161).  Saving model ...
Epo

In [None]:
def run_L2_regularization():
    
    # create data
    # gnn_utils.create_data()
    print("start create data")
    with gzip.open(f"{config.data_dir}train.pkl.gz", "rb") as f:
        train_X = pickle.load(f)
    with gzip.open(f"{config.data_dir}val.pkl.gz", "rb") as f:
        val_X = pickle.load(f)
    with gzip.open(f"{config.data_dir}test.pkl.gz", "rb") as f:
        test_X = pickle.load(f)

    print("start define model")
    # define model
    n_features = config.n_features # number of node features
    bs = config.bs

    train_loader = DataLoader(train_X, batch_size=bs, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_X, batch_size=bs, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_X, batch_size=bs, shuffle=False, drop_last=False)

    train_loader_no_shuffle = DataLoader(train_X, batch_size = bs, shuffle=False, drop_last=False)
    val_loader_no_shuffle = DataLoader(val_X, batch_size = bs, shuffle=False, drop_last=False)


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BaselineGCN2(n_features = n_features).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.l2_lambda)
    early_stopping = gnn_utils.EarlyStopping(patience = config.patience, verbose=True, chkpoint_name = config.best_model3)
    criterion = nn.MSELoss()
    n_epochs = config.max_epochs

    print("start train")
    # train the model
    hist = {"train_rmse":[], "val_rmse":[]}
    for epoch in range(0, n_epochs):
        print("start epoch", epoch)
        model.train()
        loss_all = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            output = output.reshape(-1,)

            loss = criterion(output, data.y)
            # L2 regularization
            l2_reg = sum(p.pow(2.0).sum() for p in model.parameters())
            loss = loss + config.l2_lambda * l2_reg
            loss.backward()
            optimizer.step()


        train_rmse = gnn_utils.test_fn(train_loader, model, device)
        val_rmse = gnn_utils.test_fn(val_loader, model, device)
        early_stopping(val_rmse, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        hist["train_rmse"].append(train_rmse)
        hist["val_rmse"].append(val_rmse)
        print(f'Epoch: {epoch}, Train_rmse: {train_rmse:.3}, Val_rmse: {val_rmse:.3}')

    print(f"training completed at {datetime.datetime.now()}")
    
if __name__ == "__main__":
    run_L2_regularization()