# Load local dataset

In [1]:
import importlib
import random
import argparse
import configparser
import numpy as np
import networkx as nx
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_sparse
from torch import Tensor
from torch.nn import Linear
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.optim as optim

from torch_geometric.utils import negative_sampling, to_networkx
from typing import Union, Tuple
from torch_geometric.typing import OptPairTensor, Adj, OptTensor, Size
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn.conv import MessagePassing

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator


import networkx as nx
import seaborn as sns
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

import scipy
import math


from dataset_utils import node_feature_utils
from dataset_utils.node_feature_utils import *
import my_utils as utils

importlib.reload(utils)

load node feauture: imdb_degree_dist_shuffled.npy



<module 'my_utils' from '/li_zhengdao/github/GenerativeGNN/my_utils.py'>

In [19]:
# Load specific dataset:

import sys,os
sys.path.append(os.getcwd())


from PrepareDatasets import DATASETS
import my_utils
import dataset_utils


print(DATASETS.keys())
"""
    'REDDIT-BINARY': RedditBinary,
    'REDDIT-MULTI-5K': Reddit5K,
    'COLLAB': Collab,
    'IMDB-BINARY': IMDBBinary,
    'IMDB-MULTI': IMDBMulti,
    'NCI1': NCI1,
    'ENZYMES': Enzymes,
    'PROTEINS': Proteins,
    'DD': DD,
    "MUTAG": Mutag,
    'CSL': CSL
"""

data_names = ['MUTAG']
# data_names = ['IMDB-BINARY']
datasets_obj = {}
for k, v in DATASETS.items():
    if k not in data_names:
        continue
    
    print('loaded dataset, name:', k)
    dat = v(use_node_attrs=True)
    datasets_obj[k] = dat
    print(type(dat.dataset.get_data()))


dict_keys(['REDDIT-BINARY', 'REDDIT-MULTI-5K', 'COLLAB', 'IMDB-BINARY', 'IMDB-MULTI', 'NCI1', 'ENZYMES', 'PROTEINS', 'DD', 'MUTAG', 'CSL'])
loaded dataset, name: MUTAG
processed_dir:  DATA/MUTAG/processed
load dataset !
dataset len:  188
load splits: DATA/MUTAG/processed/MUTAG_splits.json
split counts: 10
<class 'list'>


In [20]:
da = datasets_obj['MUTAG'].get_test_fold(1, batch_size=1, shuffle=True).dataset[0]

from torch_geometric import utils as pyg_utils


G = pyg_utils.to_networkx(da)

In [26]:
def get_each_folder(data_name, fold_id, batch_size=1):
    
    fold_test = datasets_obj[data_name].get_test_fold(fold_id, batch_size=batch_size, shuffle=True).dataset
    fold_train, fold_val = datasets_obj[data_name].get_model_selection_fold(fold_id, inner_idx=None,
                                                                          batch_size=batch_size, shuffle=True)
    fold_train = fold_train.dataset
    fold_val = fold_val.dataset
    
    # train_G = [pyg_utils.to_networkx(d, node_attrs=['x']) for d in fold_train.get_subset()]
    # test_G = [pyg_utils.to_networkx(d, node_attrs=['x']) for d in fold_test.get_subset()]
    # print('x: ',train_G[0].nodes[0]['x'])
    
    train_adjs, test_adjs = [], []
    train_y, test_y = [], []
    
    def node_fea_to_dict(node_fea):
        res = {}
        for i in range(node_fea.shape[0]):
            res[i] = node_fea[i]
        return res
        
    for d in fold_train.get_subset():
        train_y.append(d.y.item())
        train_adjs.append([d.to_numpy_array()])

    for d in fold_test.get_subset():
        test_y.append(d.y.item())
        test_adjs.append([d.to_numpy_array()])
        
    return train_adjs, test_adjs, train_y, test_y
    # do not use val for kernel methods.
#     for d in fold.dataset.get_subset():

In [22]:
# Transform from networkx
from grakel.utils import graph_from_networkx

In [23]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from grakel.datasets import fetch_dataset
from grakel.kernels import ShortestPath

# Loads the MUTAG dataset



# MUTAG = fetch_dataset("MUTAG", verbose=False)
# G, y = MUTAG.data, MUTAG.target
# print('G10:', G[0])

def train_with_kernel(gk):
    res=[]
    for i in range(10):
        G_train, G_test, y_train, y_test = get_each_folder(i)
        
        # G_train = [g for g in graph_from_networkx(G_train,node_labels_tag='x')]
        # G_test = [g for g in graph_from_networkx(G_test,node_labels_tag='x')]
        # print('G_train 10:',G_train[:10])
        
        # G_train, G_test, y_train, y_test = train_test_split(G_train, y_train, test_size=0.1)
        # Uses the shortest path kernel to generate the kernel matrices
        
        K_train = gk.fit_transform(G_train)
        K_test = gk.transform(G_test)

        # Uses the SVM classifier to perform classification
        clf = SVC(kernel="precomputed")
        clf.fit(K_train, y_train)
        y_pred = clf.predict(K_test)

        # Computes and prints the classification accuracy
        acc = accuracy_score(y_test, y_pred)
        res.append(acc)
        # print("Accuracy:", str(round(acc*100, 2)) + "%")
        
    res = np.array(res)
    print(f'Acc, mean: {round(np.mean(res)*100, 4)}, std: {round(100*np.std(res),4)}')

In [24]:
# ShortestPathKernel
gk = ShortestPath(normalize=True, with_labels=False)
train_with_kernel(gk)

KeyError: 'IMDB-BINARY'

In [25]:
from grakel.kernels import RandomWalk

# TODO: other kernel
rw_gk = RandomWalk()

train_with_kernel(rw_gk)

KeyError: 'IMDB-BINARY'

# SVM for $|V|+\alpha|E|$

In [27]:
from dataset_utils.node_feature_utils import graph_invariant

def train_simple_svm(kernel_name):
    res = []
    for i in range(10):
        train_adjs, test_adjs, train_y, test_y= get_each_folder(data_names[0], i)
        # NOTE: adj -> graph_features
        
        train_x = [graph_invariant(adj=adj[0]) for adj in train_adjs]
        test_x = [graph_invariant(adj=adj[0]) for adj in test_adjs]
        Classifier = SVC(kernel=kernel_name)
        Classifier.fit(train_x, train_y)
        y_pred = Classifier.predict(test_x)
        # Computes and prints the classification accuracy
        acc = accuracy_score(test_y, y_pred)

        res.append(acc)
        # print("Accuracy:", str(round(acc*100, 2)) + "%")
        
    res = np.array(res)
    print(f'Acc, mean: {round(np.mean(res)*100, 4)}, std: {round(100*np.std(res),4)}')



In [28]:

for kr in ['linear', 'poly', 'rbf', 'sigmoid']:
    train_simple_svm(kr)

Acc, mean: 85.117, std: 8.0719
Acc, mean: 82.9532, std: 7.7978
Acc, mean: 86.2281, std: 8.5031
Acc, mean: 66.4912, std: 2.2807
