In [1]:
import torch.optim as optim
import torch
import argparse
import numpy as np
import random
import os
from torch.nn.functional import cosine_similarity
import matplotlib.pyplot as plt
import argparse
from ogb.graphproppred import PygGraphPropPredDataset
from torch_geometric.loader import DataLoader
import pandas as pd
from probing import *
from utils.general import *

def get_args_parser():
    """Build the command-line parser holding every setting used in this notebook.

    Returns:
        argparse.ArgumentParser: parser with the general run settings
        (seed, device, batching, dataset), the backbone configuration,
        the SEM bottleneck sizes and the head type. Call ``parse_args`` on
        it to obtain the namespace.
    """
    # ======= Usually default settings
    parser = argparse.ArgumentParser(description='GNN baselines on ogbgmol* data with Pytorch Geometrics')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='input batch size for training (default: 64)')
    # Help text now states the actual default (it previously claimed 0).
    parser.add_argument('--num_workers', type=int, default=2,
                        help='number of workers (default: 2)')
    parser.add_argument('--dataset_name', type=str, default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv/moltox21/molpcba)')
    parser.add_argument('--feature', type=str, default="full",
                        help='full feature or simple feature')
    parser.add_argument('--bottle_type', type=str, default='std',
                        help='bottleneck type, can be std or sem')

    # ==== Model Structure ======
    # ----- Backbone
    parser.add_argument('--backbone_type', type=str, default='gcn',
                        help='backbone type, can be gcn, gin, gcn_virtual, gin_virtual')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    # Help text now states the actual default (it previously claimed 5).
    parser.add_argument('--num_layer', type=int, default=3,
                        help='number of GNN message passing layers (default: 3)')

    # ---- SEM
    parser.add_argument('--L', type=int, default=15,
                        help='No. word in SEM')
    parser.add_argument('--V', type=int, default=20,
                        help='word size in SEM')

    # ---- Head-type
    parser.add_argument('--head_type', type=str, default='linear',
                        help='Head type in interaction, linear or mlp')
    return parser


# Parse an explicit empty argv so the notebook always runs with the defaults
# declared in get_args_parser() (switch to parse_args() when run as a script).
args = get_args_parser().parse_args(args=[])

# Replace the integer GPU index with a torch.device, falling back to the CPU
# when CUDA is not available.
if torch.cuda.is_available():
    args.device = torch.device("cuda:" + str(args.device))
else:
    args.device = torch.device("cpu")

## Prepare the probing data

In [2]:
# Molecular descriptors that the probes are asked to predict.
selected_prop = ['NumSaturatedRings', 'NumAromaticRings', 'NumAromaticCarbocycles', 'fr_aniline', 'fr_ketone_Topliss', 
                 'fr_ketone', 'fr_bicyclic', 'fr_methoxy', 'fr_para_hydroxylation', 'fr_pyridine', 'fr_benzene']

# ====== Load the OGB dataset and its predefined split
dataset = PygGraphPropPredDataset(name = args.dataset_name)
args.num_tasks = dataset.num_tasks
split_idx = dataset.get_idx_split()

if args.dataset_name == 'ogbg-molhiv':
    # NOTE(review): 4113 presumably equals the size of the molhiv valid/test
    # splits, so that each split is served as a single batch -- confirm.
    args.batch_size = 4113

# Locate the SMILES mapping inside the folder of the dataset that was just
# loaded instead of relying on a machine-specific absolute path. This also
# covers dataset names beyond molhiv/molpcba, for which smiles_path used to be
# left undefined.
smiles_path = os.path.join(dataset.root, 'mapping', 'mol.csv.gz')
if not os.path.exists(smiles_path):
    raise FileNotFoundError(
        "SMILES mapping not found at %r for dataset %r" % (smiles_path, args.dataset_name))

valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False, drop_last=False,
                        num_workers=args.num_workers)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False, drop_last=False,
                        num_workers=args.num_workers)

# Read the mapping table once and slice it per split, keeping the same order
# as the (unshuffled) loaders above.
mol_table = pd.read_csv(smiles_path)
valid_smiles = mol_table.iloc[split_idx['valid']].smiles.values.tolist()
test_smiles = mol_table.iloc[split_idx['test']].smiles.values.tolist()

# compute_properties comes from the project's star imports; it is used here as
# returning (descriptor names, per-molecule property table).
valid_desc_names, valid_properties = compute_properties(valid_smiles)
test_desc_names, test_properties = compute_properties(test_smiles)

## Prepare the model

In [3]:
# ====== Generate features
def get_features(args, model, loader):
    """Run ``model`` over every batch of ``loader`` and return the flattened messages.

    The previous implementation overwrote ``msg`` on each iteration and
    therefore returned the features of the last batch only; all batches are
    now collected and concatenated.

    Args:
        args: Namespace providing ``device``; each batch is moved there.
        model: Object exposing ``task_forward(batch)`` returning ``(msg, hid)``.
            If it has a truthy ``training`` attribute it is switched to eval
            mode for the duration of the call and restored afterwards.
        loader: Iterable of batches, each supporting ``.to(device)``.

    Returns:
        CPU tensor obtained by reshaping each batch's ``msg`` to
        ``(msg.shape[0], -1)`` and concatenating the results along dim 0.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    # Stochastic layers (e.g. dropout) must be disabled while extracting
    # representations; remember the mode so the caller's model is unchanged.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    chunks = []
    try:
        with torch.no_grad():
            for batch in loader:
                batch = batch.to(args.device)
                msg, _hid = model.task_forward(batch)
                # Move each chunk to the CPU right away to keep device memory flat.
                chunks.append(msg.reshape(msg.shape[0], -1).cpu())
    finally:
        if was_training:
            model.train()

    if not chunks:
        raise ValueError("get_features received an empty loader")
    return torch.cat(chunks, dim=0)

In [4]:
# ====== Build the network through the project helper get_init_net
# NOTE(review): no checkpoint loading is visible in this cell -- confirm that
# get_init_net restores trained parameters if that is intended.
model = get_init_net(args)

# ====== Extract representations for the validation and test splits (in that order)
embd_valid, embd_test = (
    get_features(args, model, split_loader)
    for split_loader in (valid_loader, test_loader)
)

In [5]:
# ====== Linear probing of every selected descriptor
# For each property a probe is fitted on the validation-split embeddings and
# scored on the test-split embeddings (performs). The same probe is then run
# with permuted labels to obtain a chance-level baseline (rnd_performs).
performs, rnd_performs = [], []

# Dedicated, seeded generator so the shuffled-label baseline is reproducible.
shuffle_rng = np.random.default_rng(args.seed)

for prop in tqdm(selected_prop):
    x_train = embd_valid
    y_train = valid_properties[prop].values.copy()
    x_test = embd_test
    y_test = test_properties[prop].values.copy()

    # Targets whose range is exactly [0, 1] go to the classification probe;
    # everything else is treated as a regression target.
    is_binary = y_train.min() == 0 and y_train.max() == 1
    probe = linear_probing if is_binary else linear_probing_regression

    performs.append(probe(embedding_train=x_train, y_train=y_train, embeding_test=x_test,
                          y_test=y_test, seed=args.seed, scale=True))

    # permutation() returns shuffled copies, so y_train / y_test keep their
    # true order for any later cell that reuses them.
    rnd_performs.append(probe(embedding_train=x_train, y_train=shuffle_rng.permutation(y_train),
                              embeding_test=x_test, y_test=shuffle_rng.permutation(y_test),
                              seed=args.seed, scale=True))


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00,  3.42it/s]


In [6]:
# Debug check: does the y_train left in the kernel span exactly [0, 1], i.e. the
# condition the probing loop uses to select the classification probe?
y_train.min()==0 and y_train.max()==1

False

In [7]:
# Put the true-label scores and the shuffled-label baseline side by side,
# one row per probed property.
probing_performance = np.array(performs)
random_performance = np.array(rnd_performs)
perf = pd.DataFrame(
    data=np.column_stack([probing_performance, random_performance]),
    columns=['MAE', 'AUC', 'rnd_MAE', 'rnd_AUC'],
    index=selected_prop,
)

In [8]:
# Display the probing scores next to their shuffled-label baseline.
perf

Unnamed: 0,MAE,AUC,rnd_MAE,rnd_AUC
NumSaturatedRings,0.642593,0.0,1.053616,0.0
NumAromaticRings,0.852974,0.0,1.367183,0.0
NumAromaticCarbocycles,0.800669,0.0,1.18113,0.0
fr_aniline,0.547053,0.0,0.590593,0.0
fr_ketone_Topliss,0.374211,0.0,0.363322,0.0
fr_ketone,0.474599,0.0,0.485068,0.0
fr_bicyclic,1.41467,0.0,1.588079,0.0
fr_methoxy,0.466182,0.0,0.460634,0.0
fr_para_hydroxylation,0.49797,0.0,0.525304,0.0
fr_pyridine,0.31775,0.0,0.307901,0.0


In [73]:
from sklearn.linear_model import LogisticRegression, LinearRegression
# Ad-hoc sanity check: ordinary least squares on the x_train / y_train currently
# held by the kernel.
# NOTE(review): these names are leftovers of whichever loop cell ran last, so
# the probed property (and whether y_train was shuffled) depends on execution
# order -- confirm before trusting the number.
lr = LinearRegression().fit(x_train, y_train)
pred_test = lr.predict(x_test)

In [76]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the ad-hoc regression predictions against y_test.
mean_absolute_error(y_true=y_test, y_pred=pred_test)

0.6187111632234998

In [9]:
# Print the (min, max) of the validation-split labels for every probed property.
# The loop also rebinds x_train / y_train / x_test / y_test, which later cells reuse.
for prop in tqdm(selected_prop):
    x_train, y_train = embd_valid, valid_properties[prop].values.copy()
    x_test, y_test = embd_test, test_properties[prop].values.copy()
    label_min, label_max = y_train.min(), y_train.max()
    print(label_min, label_max)

100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 5429.84it/s]

0 10
0 15
0 12
0 6
0 10
0 10
0 25
0 8
0 6
0 6
0 12





In [78]:
# Debug check: smallest label in the y_train currently held by the kernel.
y_train.min()

0

In [None]:
# Create an evaluator for the selected dataset (this cell has not been executed).
# NOTE(review): Evaluator is not imported explicitly in the visible cells; it
# presumably arrives through one of the star imports -- confirm.
evaluator = Evaluator(args.dataset_name)