In [99]:
# Modules

import warnings
import torch
import scipy.sparse as sp
import numpy as np
import os
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid, WikipediaNetwork, Actor, WebKB, Amazon, Coauthor, WikiCS
from torch_geometric.utils import remove_self_loops

from collections import defaultdict
import numpy as np 
import itertools
from collections import defaultdict
import math

import dgl
import pandas as pd

warnings.simplefilter("ignore")

In [100]:
# Load dataset using pyg

def load_data_with_pyg(dataset_name):
    path = os.path.join('.', 'data', dataset_name)

    if dataset_name in ['cora', 'citeseer', 'pubmed']:
        dataset = Planetoid(path, dataset_name)
    elif dataset_name in ['chameleon']:
        dataset = WikipediaNetwork(path, dataset_name)
    elif dataset_name in ['squirrel']:
        dataset = WikipediaNetwork(path, dataset_name, transform=T.NormalizeFeatures())
    elif dataset_name in ['actor']:
        dataset = Actor(path)
    elif dataset_name in ['cornell', 'texas', 'wisconsin']:
        dataset = WebKB(path, dataset_name)
    elif dataset_name in ['computers', 'photo']:
        dataset = Amazon(path, dataset_name, transform=T.NormalizeFeatures())
    elif dataset_name in ['cs', 'physics']:
        dataset = Coauthor(path, dataset_name, transform=T.NormalizeFeatures())
    elif dataset_name in ['wikics']:
        dataset = WikiCS(path)

    data = dataset[0]
    return data

name = 'pubmed'
data = load_data_with_pyg(name)

In [101]:
# Initialzie dataset dictiionary which becomes dataset series later

def get_dataset_dict(data):
    edges_without_self_loops = remove_self_loops(data.edge_index)[0]

    features = data.x
    labels = data.y
    num_nodes = len(labels) 
    num_edges = len(edges_without_self_loops[0])
    dataset_dict = {'name': name, 'num_nodes': num_nodes, 'num_edges': num_edges,
                    'features': features, 'labels': labels, 
                    'edges': edges_without_self_loops}

    dataset_dict['src'] = dataset_dict['edges'][0]
    dataset_dict['dst'] = dataset_dict['edges'][1]

    dataset_dict['graph'] = dgl.graph((dataset_dict['src'], dataset_dict['dst']), 
                                      num_nodes=num_nodes, idtype=torch.int)

    return dataset_dict

dataset_dict = get_dataset_dict(data)

In [102]:
# Functions to calculate Label Informations and Adjusted Homophily

def get_p_bar_k(dataset_dict):
    labels = dataset_dict['labels']
    num_edges = dataset_dict['num_edges']
    num_nodes = dataset_dict['num_nodes']
    degree_dict = defaultdict(int)
    norm_degree_dict = defaultdict(int)

    for node in range(num_nodes):
        label = labels[node].item()
        degrees = dataset_dict['graph'].in_degrees(node) + dataset_dict['graph'].out_degrees(node)
        degree_dict[label] += degrees
    for label, degree in degree_dict.items():
        norm_degree_dict[label] = degree / (2 * num_edges)
    
    return norm_degree_dict


def get_p_c1_and_c2(dataset_dict):
    unique_labels = np.unique(dataset_dict['labels'])
    unq_labels_product = list(itertools.product(unique_labels, repeat=2))
    num_edges = dataset_dict['num_edges']

    src = dataset_dict['src'].numpy()
    dst = dataset_dict['dst'].numpy()
    src_labels = []
    dst_labels = []
    

    for v in src:
        src_labels.append(dataset_dict['labels'][v].item())
    for v in dst:
        dst_labels.append(dataset_dict['labels'][v].item())

    edge_label_pairs = list(zip(src_labels, dst_labels))
    label_pairs_dict = defaultdict(int)
    norm_label_pairs_dict = defaultdict(int)

    for product in unq_labels_product:
        c1, c2 = product
        for edge_label_pair in edge_label_pairs:
            y_u, y_v = edge_label_pair
            if y_u == c1 and y_v == c2:
                label_pairs_dict[(c1, c2)] += 1
    for pair, cnt in label_pairs_dict.items():
        norm_label_pairs_dict[pair] = cnt / (num_edges)

    return norm_label_pairs_dict


def get_p_k(dataset_dict):
    labels = list(dataset_dict['labels'].numpy())
    edges_num = dataset_dict['num_edges']
    nodes_num = dataset_dict['num_nodes']
    label_dict = defaultdict(int)
    norm_label_dict = defaultdict(int)

    for label in np.unique(labels):
        label_dict[label] = labels.count(label)
    for label, cnt in label_dict.items():
        norm_label_dict[label] = cnt / nodes_num
    
    return norm_label_dict


def check_summation(norm_degree_dict, norm_label_dict, norm_label_pairs_dict):
    checker_dict = defaultdict(int)
    checker_dict['sum_of_degree'] = {sum(norm_degree_dict.values())}
    checker_dict['sum_of_num_class'] = {sum(norm_label_dict.values())}
    checker_dict['sum_of_class_pairs'] = {sum(norm_label_pairs_dict.values())}

    return checker_dict


def calc_label_information(dataset_dict):
    numerator = 0
    denominator = 0

    norm_degree_dict = get_p_bar_k(dataset_dict)
    norm_label_dict = get_p_k(dataset_dict)
    norm_label_pairs_dict = get_p_c1_and_c2(dataset_dict)

    checker_dict = check_summation(norm_degree_dict, norm_label_dict, norm_label_pairs_dict)

    for c1_c2, p_c1_c2 in norm_label_pairs_dict.items():
        c1, c2 = c1_c2
        p_bar_c1 = norm_degree_dict[c1]
        p_bar_c2 = norm_degree_dict[c2]
        numerator += p_c1_c2 * math.log2(p_c1_c2 / (p_bar_c1 * p_bar_c2))
    for p_bar_c in norm_degree_dict.values():
        denominator += p_bar_c * math.log2(p_bar_c)

    label_information = - (numerator / denominator)

    return label_information, checker_dict


def calc_edge_homophily(dataset_dict):   # yandex_dataloader
    src = dataset_dict['src'].numpy()
    dst = dataset_dict['dst'].numpy()
    pairs = list(zip(src, dst))
    labels = dataset_dict['labels'].numpy()

    homophily_count = 0
    for pair in pairs:
        u, v = pair
        if labels[u] == labels[v]:
            homophily_count += 1

    edge_homophily = homophily_count / len(src)
    return edge_homophily


def calc_adjusted_homophily(dataset_dict):  # yandex_dataloader
    label_degree_cnt = defaultdict(int)
    labels = dataset_dict['labels'].numpy()

    for node in range(len(labels)):
        label = labels[node]
        degree = (dataset_dict['graph'].in_degrees(node) + dataset_dict['graph'].out_degrees(node))
        label_degree_cnt[label] += degree  # D_k

    total = 0
    num_edges = dataset_dict['num_edges']
    num_edges = (2 * num_edges) * (2 * num_edges)

    for degree_cnt in label_degree_cnt.values():
        degree_cnt = degree_cnt * degree_cnt
        total += (degree_cnt / num_edges)

    edge_hm = calc_edge_homophily(dataset_dict)
    adjusted_homophily = (edge_hm - total) / (1.0 - total)
    return adjusted_homophily

In [103]:
label_information, checker_dict = calc_label_information(dataset_dict)

for value in checker_dict.values():
    print(value.pop)
    print(type(value.pop))
    # if value.pop < 0.9:
    #     print('Something is wrong!!!')
    #     assert(False)    

edge_homophily = calc_edge_homophily(dataset_dict)
adjusted_homophily = calc_adjusted_homophily(dataset_dict)

dataset_dict['label_information'] = label_information
dataset_dict['edge_homo'] = edge_homophily
dataset_dict['adjusted_homo'] = adjusted_homophily

<built-in method pop of set object at 0x7f5d388e3060>
<class 'builtin_function_or_method'>
<built-in method pop of set object at 0x7f5d25119a80>
<class 'builtin_function_or_method'>
<built-in method pop of set object at 0x7f5d25119b60>
<class 'builtin_function_or_method'>


In [104]:
# Generate dataset series 

def get_series(dataset_dict):
    keys = dataset_dict.keys()
    values = dataset_dict.values()
    df = pd.Series(values, index=keys)
    return df

ser = get_series(dataset_dict)
ser.drop(labels=['features', 'labels', 'edges', 
                            'src', 'dst', 'graph'], 
                    inplace=True)

In [105]:
ser.head(10)  # last test
csv_path = 'series_' + name + '.csv'
ser.to_csv(csv_path, index=True)

In [106]:
# Generate dataframe with series

ser_cora = pd.read_csv('series_cora.csv')
ser_citeseer = pd.read_csv('series_citeseer.csv')
ser_pubmed = pd.read_csv('series_pubmed.csv')

In [115]:
# It has to be changed function format

index_list = []
value_list = []
column_list = []
for row in ser_cora.values:
    key, value = row
    if key == 'name':
        column_list.append(value)
        continue
    else:
        index_list.append(key)
    value_list.append(value)

df_cora = pd.DataFrame(value_list, index=index_list, columns=column_list)
df_cora.head(10)

Unnamed: 0,cora
num_nodes,2708.0
num_edges,10556.0
label_information,0.5903741288693318
edge_homo,0.8099658961727927
adjusted_homo,0.7710854223002092


In [121]:
df_all = pd.concat([df_cora, df_citeseer, df_pubmed], axis=1)

Unnamed: 0,cora,citeseer,pubmed
num_nodes,2708.0,3327.0,19717.0
num_edges,10556.0,9104.0,88648.0
label_information,0.5903741288693318,0.4507603192321097,0.409284486559879
edge_homo,0.8099658961727927,0.7355008787346221,0.8023869686851367
adjusted_homo,0.7710854223002092,0.6706593983410405,0.6860214783089259


In [122]:
df_all.plot.bar(y='label_information', rot=1)

KeyError: 'label_information'