### Global analysis (no marginalization)
This notebook analyses and plots the global results of mode1 and mode4 with no marginalization on graph properties.


The results are based on

-----------
OBS: Uncomment lines when mode4 is ready to be analysed. Feel free to add what you want

In [1]:
import pandas as pd
import json
from evaluation_utils import read_processed_shards, get_best_configuration_per_model
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
from matplotlib.ticker import PercentFormatter
from scipy.stats import f_oneway
from tqdm import tqdm
import pylab as pl
import matplotlib as mpl
from collections import OrderedDict
import itertools



##### CONSTANTS
These constants are used to specify different views of our results which might be interesting

In [2]:
# --------- DATA ---------

MODE_1_PROCESSED_DIR = f'/home/data_shares/scara/graphworld/results/mode1/processed' # mode1
#MODE_4_PROCESSED_DIR = f'results/mode1/processed' # mode4

# --------- MODELS ---------
GENERATION_FEATURES = ['AttributeMask', 'CorruptedEmbeddingsReconstruction', 'CorruptedFeaturesReconstruction']
GENERATION_STRUCTURE = ['EdgeMask', 'GAE']
GENERATION_ALL = GENERATION_FEATURES + GENERATION_STRUCTURE
AUXILIARY_ALL = ['NodeClusteringWithAlignment', 'S2GRL', 'PairwiseAttrSim', 'GraphPartitioning']
CONTRAST_SAME_SCALE = ['BGRL', 'GBT', 'GCA', 'SelfGNNPPR', 'SelfGNNSplit', 'MERIT']
CONTRAST_CROSS_SCALE = ['DeepGraphInfomax', 'GraphInfoClust', 'SUBGCON']
CONTRAST_ALL = CONTRAST_SAME_SCALE + CONTRAST_CROSS_SCALE
HYBRID_ALL = ['G_Zoom', 'MEtAl', 'MVMI_FT']
ALL_MODELS = GENERATION_ALL + AUXILIARY_ALL + CONTRAST_ALL + HYBRID_ALL
N_MODELS = len(ALL_MODELS)

# --------- ENCODERS ---------
ENCODERS = ['GCN', 'GAT', 'GIN']

# --------- TRAINING SCHEMES ---------
TRAINING_SCHEMES = ['JL', 'PF', 'URL']

# --------- DATA SPLIT ---------
DATA_SPLIT = ['train', 'val', 'test']

# --------- BASELINES ---------
BASELINES = ['GCN', 'GAT', 'GIN']

# --------- TEST METRIC ---------
TEST_METRIC = 'test_rocauc_ovr'

##### INDEXING
General scheme of a column: `Encoder_model_scheme`
Example of a column: `GCN_AttributeMask_JL`

##### READING DATA
We read the data and remove graphs where any model has failed / crashed

In [3]:
# --------- READING DATA ---------
df1 = read_processed_shards(MODE_1_PROCESSED_DIR) # mode1
df1.drop(['marginal_param', 'fixed_params'], axis=1, inplace=True)
df1.dropna(axis=0, inplace=True)

#df4 = read_processed_shards(PROCESSED_DIR_4) # mode4
#df4.drop(['marginal_param', 'fixed_params'], axis=1, inplace=True)
#df4.dropna(axis=0, inplace=True)

1.ndjson
2.ndjson
3.ndjson
4.ndjson
5.ndjson
6.ndjson
7.ndjson
8.ndjson
9.ndjson
10.ndjson
11.ndjson
concatenating


In [9]:
df1

Unnamed: 0,nvertex,avg_degree,feature_center_distance,feature_dim,edge_center_distance,edge_feature_dim,p_to_q_ratio,num_clusters,cluster_size_slope,power_exponent,...,GIN_MVMI_FT_URL_encoder_in_channels,GIN_MVMI_FT_URL_encoder_hidden_channels,GIN_MVMI_FT_URL_encoder_num_layers,GIN_MVMI_FT_URL_encoder_dropout,GIN_MVMI_FT_URL_encoder_out_channels,GIN_MVMI_FT_URL_pretext_k,GIN_MVMI_FT_URL_pretext_disagreement_regularization,GIN_MVMI_FT_URL_pretext_common_representation_regularization,GIN_MVMI_FT_URL_skipped,graph_id
0,261,4.273973,3.941096,16,2.0,2,3.316129,2,0.234167,0.611330,...,16,8,1,0.3,2,2,0.10,0.1,False,0
1,458,9.460917,4.515301,16,2.0,2,7.717303,4,0.159576,0.748894,...,16,8,1,0.5,4,6,0.01,0.5,False,1
4,485,8.039024,4.484327,16,2.0,2,1.102266,2,0.067285,0.974368,...,16,8,1,0.0,2,3,0.10,0.3,False,4
5,462,11.402913,0.605213,16,2.0,2,1.365196,2,0.200492,0.641867,...,16,8,2,0.0,2,6,1.00,0.5,False,5
6,330,13.775578,2.933209,16,2.0,2,6.046855,4,0.210481,0.656033,...,16,8,1,0.8,4,2,1.00,0.3,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,470,13.603729,3.361385,16,2.0,2,4.370031,2,0.200980,0.956175,...,16,16,2,0.3,2,3,0.01,0.5,False,99995
99996,246,11.672269,2.388899,16,2.0,2,6.737449,5,0.198418,0.774095,...,16,16,1,0.5,5,6,0.10,0.1,False,99996
99997,183,2.491525,3.235314,16,2.0,2,9.344038,2,0.191659,0.783611,...,16,16,2,0.8,2,3,0.10,0.1,False,99997
99998,335,9.326530,2.034517,16,2.0,2,8.981967,2,0.141679,0.672916,...,16,8,1,0.0,2,3,0.01,0.3,False,99998


In [4]:
def unpivot_ssl_model_experiments(model : str, df : pd.DataFrame):
    '''
    Unpivot SSL model experimments. Each row corresponds to a single SSL method with an encoder and training scheme on a graph.
    '''
    cols = [col for col in df.columns if model in col]
    frames = []

    for (encoder, training_scheme) in itertools.product(*[ENCODERS, TRAINING_SCHEMES]):
        prefix = f'{encoder}_{model}_{training_scheme}'
        single_experiment = [col for col in cols if prefix in col]
        if not (len(single_experiment) == 0):
            experiment_df = df[single_experiment + ['graph_id']].rename(columns=lambda col: col.replace(f'{prefix}_', ''))
            experiment_df['encoder'] = encoder
            experiment_df['training_scheme'] = training_scheme
            frames += [experiment_df]

    ssl_df = pd.concat(frames)

    return ssl_df, cols

In [5]:
def unpivot_baseline_model_experiments(model : str, df : pd.DataFrame):
    '''
    Unpivot baseline model experimments. Each row corresponds to a single baseline model on a graph.
    '''
    model_cols = [col for col in df.columns if model in col]
    frames = []

    
    for training_scheme in TRAINING_SCHEMES:
        prefix = f'{model}__{training_scheme}'
        single_experiment = [col for col in model_cols if prefix in col]
        experiment_df = df[single_experiment + ['graph_id']].rename(columns=lambda col: col.replace(f'{prefix}_', ''))
        frames += [experiment_df]

    baseline_df = pd.concat(frames)

    return baseline_df, model_cols

In [6]:
df1['graph_id'] = df1.index.values
ssl_experiments = {}
baseline_experiments = {}

ssl_cols = []
baseline_cols = []


# Build all ssl models
for model in ALL_MODELS:
    ssl_df, model_cols = unpivot_ssl_model_experiments(model, df1)
    ssl_cols += model_cols
    ssl_experiments[model] = ssl_df



# All columns except for those with the SSL methods.
cols_minus_ssl_cols = list(set(df1.columns.tolist()) - set(ssl_cols))

# Build all baseline models
for model in BASELINES:
    baseline_df, model_cols = unpivot_baseline_model_experiments(model, df1[cols_minus_ssl_cols])
    baseline_cols += model_cols
    baseline_experiments[model] = baseline_df


# Build graphs
cols_minus_ssl_and_baselines = list(set(df1.columns.tolist()) - set(ssl_cols + baseline_cols))

graphs = df1[cols_minus_ssl_and_baselines]

### Sanity checks

In [7]:
for ssl_method in ssl_experiments:
    df = ssl_experiments[ssl_method]
    if (df.encoder == 'GAT').any():
        assert not (df[df.encoder == 'GAT'].encoder_heads.isna().any())
        assert df[df.encoder == 'GCN'].encoder_heads.isna().all()
        assert df[df.encoder == 'GIN'].encoder_heads.isna().all()
        print(ssl_method, '|', 'Yes GAT')
    else:
        print(ssl_method, '|', 'No GAT')

AttributeMask | Yes GAT
CorruptedEmbeddingsReconstruction | Yes GAT
CorruptedFeaturesReconstruction | Yes GAT
EdgeMask | Yes GAT
GAE | Yes GAT
NodeClusteringWithAlignment | Yes GAT
S2GRL | Yes GAT
PairwiseAttrSim | Yes GAT
GraphPartitioning | Yes GAT
BGRL | Yes GAT
GBT | Yes GAT
GCA | Yes GAT
SelfGNNPPR | No GAT
SelfGNNSplit | Yes GAT
MERIT | No GAT
DeepGraphInfomax | Yes GAT
GraphInfoClust | Yes GAT
SUBGCON | Yes GAT
G_Zoom | No GAT
MEtAl | Yes GAT
MVMI_FT | Yes GAT


In [8]:
ssl_experiments['MVMI_FT']

Unnamed: 0,val_accuracy,val_f1_micro,val_f1_macro,val_rocauc_ovr,val_rocauc_ovo,val_logloss,test_accuracy,test_f1_micro,test_f1_macro,test_rocauc_ovr,...,pretext_k,pretext_disagreement_regularization,pretext_common_representation_regularization,skipped,graph_id,encoder,training_scheme,train_pretext_epochs,train_pretext_lr,encoder_heads
0,1.00,1.00,1.000000,1.000000,1.000000,0.693147,0.991701,0.991701,0.991579,0.990654,...,3,0.10,0.5,False,0,GCN,JL,,,
1,0.25,0.25,0.100000,0.500000,0.500000,9.733307,0.198565,0.198565,0.082834,0.500000,...,2,1.00,0.1,False,1,GCN,JL,,,
4,1.00,1.00,1.000000,1.000000,1.000000,0.554518,1.000000,1.000000,1.000000,1.000000,...,3,0.10,0.1,False,4,GCN,JL,,,
5,1.00,1.00,1.000000,1.000000,1.000000,0.693147,0.911765,0.911765,0.911743,0.918554,...,2,1.00,0.5,False,5,GCN,JL,,,
6,0.85,0.85,0.845328,0.900000,0.900000,0.554255,0.675862,0.675862,0.603859,0.762859,...,3,0.01,0.5,False,6,GCN,JL,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.00,1.00,1.000000,1.000000,1.000000,0.061525,1.000000,1.000000,1.000000,1.000000,...,3,0.01,0.5,False,99995,GIN,URL,100.0,0.0100,
99996,0.84,0.84,0.834921,0.900000,0.900000,4.239043,0.806122,0.806122,0.771307,0.863135,...,6,0.10,0.1,False,99996,GIN,URL,50.0,0.0001,
99997,0.90,0.90,0.898990,0.900000,0.900000,3.486280,0.901840,0.901840,0.901837,0.908974,...,3,0.10,0.1,False,99997,GIN,URL,100.0,0.0100,
99998,1.00,1.00,1.000000,1.000000,1.000000,0.418593,0.965079,0.965079,0.964715,0.962585,...,3,0.01,0.3,False,99998,GIN,URL,100.0,0.0010,


In [88]:
all_models = []
for model in ssl_experiments:
    df_ssl = ssl_experiments[model][['test_rocauc_ovr', 'graph_id']]
    df_ssl['model'] = model
    all_models += [df_ssl]

df = pd.concat(all_models)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ssl['model'] = model
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ssl['model'] = model
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ssl['model'] = model
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [91]:
df.groupby(['model', 'graph_id']).test_rocauc_ovr.mean().reset_index()

Unnamed: 0,model,graph_id,test_rocauc_ovr
0,AttributeMask,0,0.898339
1,AttributeMask,1,0.812127
2,AttributeMask,4,0.695355
3,AttributeMask,5,0.723714
4,AttributeMask,6,0.845869
...,...,...,...
2056798,SelfGNNSplit,99995,0.908749
2056799,SelfGNNSplit,99996,0.838981
2056800,SelfGNNSplit,99997,0.951657
2056801,SelfGNNSplit,99998,0.908069


##### Mean and std global results for all models
Here we report the mean and std test metric of all models per main category:
- Generation-based
- Auxiliary-based
- Contrast-based
- Hybrid


In [None]:
# --------- Mode1 ---------
for m in ALL_MODELS:
    
    
df1.groupby()