In [1]:
import os.path
import os
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid,TUDataset
from torch_geometric.nn import GATConv, GCNConv,GINConv
import numpy as np
import igraph as ig
from torch_geometric.nn import global_mean_pool,global_add_pool
from functools import reduce
import pickle
from sklearn.model_selection import KFold,StratifiedKFold
import torch.optim as optim
import csv
import pandas as pd
from torch_geometric.loader import DataLoader
from torch_geometric.utils import degree
from torch_geometric.utils.convert import to_networkx

import networkx as nx


In [2]:
import utils

# 1. Load dataset

### A. dataset_name

Supported values for `dataset_name`: `ENZYMES`, `PROTEINS_full`.
### B. Check data information

In [3]:
# Download (if necessary) and load the TU benchmark together with its
# continuous node attributes.
dataset_name = 'PROTEINS_full'
dataset = TUDataset(root='dataset', name=dataset_name, use_node_attr=True)

# Basic dataset summary.
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

print('=============================================================')

# Cycle statistics accumulated over every graph in the dataset.
total_cycle = 0            # sum of per-graph cycle ranks
total_magnitude_cycle = 0  # sum of per-graph mean cycle lengths
nonzero_count = 0          # graphs whose cycle basis is non-empty
for graph_idx in range(len(dataset)):
    graph = to_networkx(dataset[graph_idx], to_undirected=True)

    # Cycle rank of the undirected graph: |E| - |V| + #connected components.
    total_cycle += (
        graph.number_of_edges()
        - graph.number_of_nodes()
        + nx.number_connected_components(graph)
    )

    # Mean number of nodes per cycle in a cycle basis; 0 for acyclic graphs.
    basis = nx.cycle_basis(graph)
    if basis:
        mean_cycle_length = sum(len(cycle) for cycle in basis) / len(basis)
        nonzero_count += 1
    else:
        mean_cycle_length = 0

    total_magnitude_cycle += mean_cycle_length

# Both averages are taken over ALL graphs, including acyclic ones.
avg_total_cycle = total_cycle / len(dataset)
avg_total_magnitude = total_magnitude_cycle / len(dataset)
print(f'AVERAGE # H1 CYCLES: {avg_total_cycle}') 
print(f'AVERAGE MAGNITUDE # CYCLES: {avg_total_magnitude}') 
print(f'# GRAPH WITH CYCLES: {nonzero_count}') 

Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS_full.zip
Extracting dataset/PROTEINS_full/PROTEINS_full.zip
Processing...
Done!



Dataset: PROTEINS_full(1113):
Number of graphs: 1113
Number of features: 32
Number of classes: 2
AVERAGE # H1 CYCLES: 34.83647798742138
AVERAGE MAGNITUDE # CYCLES: 3.7217758564407077
# GRAPH WITH CYCLES: 1112


# 2. Preprocessing
## A. Normalize 
- ENZYMES,PROTEINS_full (normalize=True)

In [4]:
# Pass the loaded dataset through the project's loader with normalisation on.
# NOTE(review): this rebinds `dataset`, so re-running this cell without first
# re-running the loading cell feeds already-processed data back into
# utils.data_load -- confirm that helper is idempotent.
dataset=utils.data_load(dataset,normalize=True)
# Presumably the largest node count over all graphs (TODO confirm in utils);
# it is used later as the side length of the dense adjacency matrices.
max_node=utils.max_node_dataset(dataset)

In [5]:
# Build (or load from cache) NEWDATA: one graph object per dataset entry,
# extended with
#   * xx          -- node features without their last three columns, and
#   * cycle_index -- a (2, E_c) index tensor of the non-zero entries of the
#                    cycle adjacency returned by utils.make_cycle_adj_speed_nosl.
file = f'./dataset/{dataset_name}/H1_ver2'

if os.path.isfile(file):
    # NOTE(review): torch.load unpickles arbitrary Python objects, so only load
    # a cache written by this notebook. Recent PyTorch releases may also need
    # an explicit `weights_only=False` to load pickled graph objects -- confirm
    # against the installed version.
    NEWDATA = torch.load(file)
    print('file')

else:
    NEWDATA = []
    for i in range(len(dataset)):
        data = dataset[i]
        src = data.edge_index[0, :]
        dst = data.edge_index[1, :]

        # Dense 0/1 adjacency, zero-padded to max_node x max_node.
        adj = torch.zeros((max_node, max_node))
        adj[src, dst] = 1
        adj = adj.numpy()

        # The original evaluated this symmetry test and discarded the result;
        # report a violation instead of silently ignoring it.
        if not (adj == adj.T).all():
            print(f'error: adjacency of graph {i} is not symmetric')

        # Only the last return value (the cycle adjacency) is used here.
        _, _, _, _, sum_sub_adj = utils.make_cycle_adj_speed_nosl(adj, data)

        if i % 100 == 0:
            print(i)

        # Stack into shape (2, N, N): [0] = graph adjacency, [1] = cycle
        # adjacency (all zeros when the helper returns nothing).
        if len(sum_sub_adj) > 0:
            new_adj = np.stack((adj, sum_sub_adj), 0)
        else:
            sum_sub_adj = np.zeros((1, adj.shape[0], adj.shape[1]))
            new_adj = np.concatenate(
                (adj.reshape(1, adj.shape[0], adj.shape[1]), sum_sub_adj), 0
            )

        # ------ merge the cycle information into a fresh copy of the graph
        data = dataset[i]

        # Sanity check: the dense adjacency must contain exactly the edges of
        # edge_index. Edges are compared one by one after sorting into the
        # row-major order used by np.where, so mismatches cannot cancel out.
        adj_rows, adj_cols = np.where(new_adj[0] == 1)
        edges = data.edge_index.numpy()
        row_major = np.lexsort((edges[1], edges[0]))
        if not np.array_equal(edges[:, row_major], np.stack((adj_rows, adj_cols))):
            print('error')

        # Drop the last three feature columns; the training cells pass
        # num_node_features-3 as the input width accordingly.
        data.xx = data.x[:, :-3]

        cyc_rows, cyc_cols = np.where(new_adj[1] != 0)
        data.cycle_index = torch.stack(
            (torch.LongTensor(cyc_rows), torch.LongTensor(cyc_cols)), 0
        )
        # (An optional `cycle_attr` holding the non-zero cycle weights was left
        # disabled in the original; it would have to be a FloatTensor.)
        NEWDATA.append(data)

    torch.save(NEWDATA, file)




0
100
200
300
400
500
600
700
800
900
1000
1100


## C. stratified 10-fold

In [6]:
# Nested stratified 10-fold split.
#
# Outer level: 10 (train_total, test) partitions of the whole dataset.
# Inner level: each train_total part is split again into 10 (train, valid)
# partitions. All index sets are written as text files under
# ./dataset/<dataset_name>/kfold_data; when that folder already exists the
# stored files are only re-read and sanity-checked.

# Graph-level class label of every graph as a flat integer array
# (assumes each graph carries a single label in `y`).
dataset_class = np.array([int(dataset[i].y) for i in range(len(dataset))])

N_FOLDS = 10
# Fixed seed so that regenerating the folds reproduces the same partition.
SPLIT_SEED = 0


def _read_index(path):
    """Read a one-column index file and return it as a torch.long tensor."""
    return torch.as_tensor(np.loadtxt(path, dtype=np.int32), dtype=torch.long)


folder = f'./dataset/{dataset_name}/kfold_data'
if os.path.isdir(folder):
    print('folder_on')

    for j in range(N_FOLDS):
        print(j)
        test_index = _read_index(f'{folder}/test_idx-{j}.txt')
        for k in range(N_FOLDS):
            train_index = _read_index(f'{folder}/train_total_{j}/train_idx-{k}.txt')
            valid_index = _read_index(f'{folder}/train_total_{j}/valid_idx-{k}.txt')
            # The three parts must cover the dataset ...
            all_index = reduce(np.union1d, (train_index, valid_index, test_index))
            assert len(dataset) == len(all_index)
            # ... and must not overlap (no leakage between train/valid/test).
            assert len(train_index) + len(valid_index) + len(test_index) == len(dataset)

else:
    os.makedirs(folder)
    kkf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SPLIT_SEED)
    kkf2 = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SPLIT_SEED)
    print(kkf)

    # StratifiedKFold only needs the number of samples from X, so a zero
    # placeholder is passed instead of the graph objects themselves.
    outer_split = kkf.split(np.zeros(len(dataset_class)), dataset_class)
    for j, (train_total_index, test_index) in enumerate(outer_split):
        np.savetxt(f'{folder}/train_total_idx-{j}.txt', train_total_index.astype(np.int64), fmt='%i', delimiter='\t')
        np.savetxt(f'{folder}/test_idx-{j}.txt', test_index.astype(np.int64), fmt='%i', delimiter='\t')
        assert len(dataset) == len(np.union1d(test_index, train_total_index))

        inner_folder = f'{folder}/train_total_{j}'
        os.mkdir(inner_folder)

        # Labels of the outer training part, used to stratify the inner split.
        dataset_class_train = dataset_class[train_total_index]
        inner_split = kkf2.split(np.zeros(len(train_total_index)), dataset_class_train)
        for k, (ii, jj) in enumerate(inner_split):
            # ii / jj index into train_total_index; map back to dataset indices.
            train_index = train_total_index[ii]
            valid_index = train_total_index[jj]
            np.savetxt(f'{inner_folder}/valid_idx-{k}.txt', valid_index.astype(np.int64), fmt='%i', delimiter='\t')
            np.savetxt(f'{inner_folder}/train_idx-{k}.txt', train_index.astype(np.int64), fmt='%i', delimiter='\t')
            assert len(train_total_index) == len(np.union1d(valid_index, train_index))




folder_on
0
1
2
3
4
5
6
7
8
9


# 3. Train & Test

### Baseline-GNNs
- from nets import GCN, GAT, GIN
- Option Cy2C=False

### Cy2C-GNNs
- from nets import Cy2C_GCN, Cy2C_GAT, Cy2C_GIN
- Option Cy2C=True(default)

## A. Cy2C-GNNs 

In [7]:
from nets import Cy2C_GCN, Cy2C_GAT, Cy2C_GIN
from Trainer_part import Trainer
# The experiments were run on GPU index 2. Fall back to the first GPU when the
# machine exposes fewer than three devices, and to the CPU when CUDA is not
# available, instead of failing later with an invalid device ordinal.
if torch.cuda.is_available():
    device = torch.device('cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0')
else:
    device = torch.device('cpu')

# Parallel lists: the label used in run names and the matching model class.
model_name=['Cy2C_GCN', 'Cy2C_GAT', 'Cy2C_GIN']
class_name=[Cy2C_GCN, Cy2C_GAT, Cy2C_GIN]



In [8]:
# Result record from an earlier run, kept verbatim:
#(BF)Cy2C_GAT_3_drop0.0(0.3)_deacy1e-05_besttest_0.8035714285714286_0.0357142857142857_finaltest_0.7767857142857143_0.008928571428571397
# Learning rate and mini-batch size handed to Trainer by the Cy2C sweep below.
# NOTE(review): the baseline section rebinds both names, so run this cell again
# before re-running the Cy2C sweep in the same kernel.
lr=1e-3
batch_size=32

In [None]:
# Exhaustive hyper-parameter sweep for the Cy2C models: one Trainer run per
# (model, decay, hidden_dim, drop_ini, drop_mid, n_layer) combination.
# NOTE(review): drop_ini only appears in the run name; it is not passed to
# Trainer here, so runs differing only in drop_ini receive identical arguments
# -- confirm whether Trainer should be given it.
decay_grid = [0.0, 0.0001]
hidden_grid = [64, 128]
drop_ini_grid = [0.0, 0.2, 0.4]
drop_mid_grid = [0.0, 0.2, 0.4]
layer_grid = [1, 2, 3, 4, 5]
banner = '====================================='

for i, CLASS_NAME in enumerate(class_name):
    for decay in decay_grid:
        for hidden_dim in hidden_grid:
            for drop_ini in drop_ini_grid:
                for drop_mid in drop_mid_grid:
                    for n_layer in layer_grid:
                        print(n_layer)
                        name = f'(10FOLD){model_name[i]}_{n_layer}({hidden_dim})_drop{drop_ini}({drop_mid})_deacy{decay}'
                        print(banner)
                        print('=====', name, '=====', dataset_name, '=====')
                        print(banner)
                        # Input width excludes the three feature columns
                        # that are not used by the models.
                        trainer = Trainer(
                            name,
                            dataset_name,
                            NEWDATA,
                            device,
                            CLASS_NAME,
                            dataset.num_node_features - 3,
                            dataset.num_classes,
                            lr=lr,
                            hidden_dim=hidden_dim,
                            n_layer=n_layer,
                            num_workers=2,
                            drop_mid=drop_mid,
                            small_fold=1,
                            batch_size=batch_size,
                            decay=decay,
                            main_fold=10,
                            Cy2C=True,
                        )
                        trainer.train()

## B. Baseline-GNNs

In [9]:
from nets import GCN, GAT, GIN
from Trainer_part import Trainer
# The experiments were run on GPU index 2. Fall back to the first GPU when the
# machine exposes fewer than three devices, and to the CPU when CUDA is not
# available, instead of failing later with an invalid device ordinal.
if torch.cuda.is_available():
    device = torch.device('cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0')
else:
    device = torch.device('cpu')

# Parallel lists: the label used in run names and the matching model class.
model_name=['GCN', 'GAT', 'GIN']
class_name=[GCN, GAT, GIN]

In [10]:
# Learning rate and mini-batch size handed to Trainer by the baseline sweep
# below. NOTE(review): these rebind the names set for the Cy2C sweep, so the
# value in effect depends on which cell ran last.
lr = 1e-4
batch_size=32

In [None]:
# Sweep for the baseline GNNs: only the number of layers varies, every other
# hyper-parameter has a single value.
# NOTE(review): drop_ini only appears in the run name; it is not passed to
# Trainer here -- confirm whether Trainer should be given it.
decay_grid = [0.0]
hidden_grid = [136]
drop_ini_grid = [0.0]
drop_mid_grid = [0.0]
layer_grid = [1, 2, 3, 4, 5]
banner = '====================================='

for i, CLASS_NAME in enumerate(class_name):
    for decay in decay_grid:
        for hidden_dim in hidden_grid:
            for drop_ini in drop_ini_grid:
                for drop_mid in drop_mid_grid:
                    for n_layer in layer_grid:
                        print(n_layer)
                        name = f'{model_name[i]}_{n_layer}({hidden_dim})_drop{drop_ini}({drop_mid})_deacy{decay}'
                        print(banner)
                        print('=====', name, '=====', dataset_name, '=====')
                        print(banner)
                        # Input width excludes the three feature columns
                        # that are not used by the models.
                        trainer = Trainer(
                            name,
                            dataset_name,
                            NEWDATA,
                            device,
                            CLASS_NAME,
                            dataset.num_node_features - 3,
                            dataset.num_classes,
                            lr=lr,
                            hidden_dim=hidden_dim,
                            n_layer=n_layer,
                            num_workers=2,
                            drop_mid=drop_mid,
                            small_fold=1,
                            batch_size=batch_size,
                            decay=decay,
                            main_fold=10,
                            Cy2C=False,
                        )
                        trainer.train()

1
===== GCN_1(136)_drop0.0(0.0)_deacy0.0 ===== PROTEINS_full =====
load mainfold, subfold== 0 0
