In [None]:
%load_ext jupyter_spaces

In [None]:
from deeprobust.graph.data import Dataset
import pandas
import numpy as np
import scipy.sparse
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict, OrderedDict, deque
import signac
import pickle
import itertools
from jupyter_spaces import get_spaces
import jupyter_spaces
from scipy.special import softmax
import warnings
import itertools
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from IPython.display import HTML
import re
from ipywidgets import interact, fixed, Textarea, Layout
import ipywidgets as widgets
from pathlib import Path
from io import StringIO
import sys
import copy

In [None]:
# Resolve the repository root (two directories above the notebook's working directory)
# and fail early if it does not contain the HeteroRobust package.
HRPath = Path("../../").resolve()
assert (HRPath / "HeteroRobust" / "__init__.py").exists()
# Make the package importable from the notebook. Re-running this cell appends the same path again.
sys.path.append(str(HRPath))
import HeteroRobust
from HeteroRobust.attacks.modules.sparse_smoothing.cert import binary_certificate

In [None]:
# Open the signac project located two directories up; jobs are looked up by ID from it later on.
project = signac.get_project("../../")
datasetRoot = "../../datasets/data"
# Pre-load the datasets used as a fallback when a job does not ship its own data file.
# Only "citeseer" and "cora" are loaded here.
datasetDict = dict()
for datasetName in ["citeseer", "cora"]:
    datasetDict[datasetName] = Dataset(root=datasetRoot, name=datasetName)

In [None]:
%pwd

In [None]:
# Load the experiment index. With keep_default_na=False and na_values=[""], only empty
# fields are parsed as NaN; strings such as "NA" or "None" are kept literally.
df_expRun = pandas.read_csv("sparse-smoothing-cert.csv", index_col=0, keep_default_na=False, na_values=[""])

# Drop the evasion / poison job ID columns.
# `columns=` is used because passing `axis` positionally to DataFrame.drop was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df_expRun = df_expRun.drop(columns=["evasionJobID", "poisonJobID"])

# Rows without a clean job ID are treated as unfinished experiments and excluded below.
incomplete_mask = df_expRun.cleanJobID.isnull()
if incomplete_mask.sum() > 0:
    warnings.warn(f"{incomplete_mask.sum()} experiments are incomplete!")
# Keep the unfiltered frame (including incomplete rows) for reference.
df_expRun_Original = df_expRun.copy()
# Explicit copy: later cells assign into this frame with `.loc[...] = ...`, which on a
# plain slice triggers SettingWithCopyWarning and may not write through reliably.
df_expRun = df_expRun.loc[~incomplete_mask].copy()

In [None]:
# Empty `model_arg` fields were parsed as NaN by read_csv; normalise them to "" so the
# regex matching in the following cells can call `.search` on plain strings.
df_expRun.loc[df_expRun.model_arg.isna(), "model_arg"] = ""

In [None]:
# Runs launched with the "H2GCN" code base are relabelled according to their `model_arg`.
# The "H2GCN" mask is recomputed before every step because each step rewrites the
# `model` column, so rows that were already relabelled are not matched again.

# Step 1: the pattern `$^` effectively selects rows whose `model_arg` is empty -> "H2GCN-2".
h2gcn_code_mask = (df_expRun.model == "H2GCN")
re_h2gcn2 = re.compile(r"$^")
h2gcn2_mask = df_expRun.model_arg.apply(lambda x: re_h2gcn2.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[h2gcn2_mask, "model"] = "H2GCN-2"

# Step 2: this network setup (any numeric value after "D") -> "H2GCN-1".
h2gcn_code_mask = (df_expRun.model == "H2GCN")
re_h2gcn1 = re.compile(r"--network_setup M64-R-T1-G-V-C1-D[\d\.]*-MO")
h2gcn1_mask = df_expRun.model_arg.apply(lambda x: re_h2gcn1.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[h2gcn1_mask, "model"] = "H2GCN-1"

# Step 3: this network setup -> "GraphSAGE".
h2gcn_code_mask = (df_expRun.model == "H2GCN")
re_model = re.compile(r"--network_setup I-T1-G-V-C1-M64-R-T2-G-V-C2-MO-R")
model_mask = df_expRun.model_arg.apply(lambda x: re_model.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[model_mask, "model"] = "GraphSAGE"

# Step 4: this network setup -> "MLP".
h2gcn_code_mask = (df_expRun.model == "H2GCN")
re_mlp = re.compile(r"--network_setup M64-R-D0.5-MO")
mlp_mask = df_expRun.model_arg.apply(lambda x: re_mlp.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[mlp_mask, "model"] = "MLP"

# "MultiLayerGCN" is reported simply as "GCN".
df_expRun.loc[df_expRun.model.isin(["MultiLayerGCN"]), "model"] = "GCN"

In [None]:
# Split "CPGNN" runs into variants by network setup.
# Note: the mask variable keeps the name `h2gcn_code_mask` although it selects CPGNN rows here.
h2gcn_code_mask = (df_expRun.model == "CPGNN")
re_model = re.compile(r"--network_setup GGM64-VS-R-G-GMO-VS-E-BP2")
model_mask = df_expRun.model_arg.apply(lambda x: re_model.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[model_mask, "model"] = "CPGNN-Cheby"

# Recompute the mask: rows relabelled above no longer carry the "CPGNN" label.
h2gcn_code_mask = (df_expRun.model == "CPGNN")
re_model = re.compile(r"--network_setup M64-R-MO-E-BP2")
model_mask = df_expRun.model_arg.apply(lambda x: re_model.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[model_mask, "model"] = "CPGNN-MLP"

In [None]:
# "GPRGNN" runs whose arguments contain "--dropout 0.5" are reported as a separate variant.
# Note: the mask variable keeps the name `h2gcn_code_mask` although it selects GPRGNN rows here.
h2gcn_code_mask = (df_expRun.model == "GPRGNN")
re_model = re.compile(r"--dropout 0.5")
model_mask = df_expRun.model_arg.apply(lambda x: re_model.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[model_mask, "model"] = "GPRGNN-D0.5"

In [None]:
# "FAGCN" runs with hidden size 64, an explicit --eps and dropout 0.5 are reported as "FAGCN-Tune".
# Note: the mask variable keeps the name `h2gcn_code_mask` although it selects FAGCN rows here.
# NOTE(review): the character class `[0-8\.]` excludes the digit 9, so an --eps value
# containing a 9 would not match -- confirm whether this is intended or a typo for `[0-9\.]`.
h2gcn_code_mask = (df_expRun.model == "FAGCN")
re_model = re.compile(r"--nhid 64 --eps [0-8\.]* --dropout 0.5")
model_mask = df_expRun.model_arg.apply(lambda x: re_model.search(x) is not None) & h2gcn_code_mask
df_expRun.loc[model_mask, "model"] = "FAGCN-Tune"

In [None]:
# Inspect the experiment table after relabelling (renders the whole frame).
display(df_expRun)

In [None]:
# Group the runs by (AttackSession, SessionConfig, datasetName, model, model_arg) and
# collapse the clean job IDs of each group into one comma-separated string.
df_expRun_pivot = df_expRun.pivot_table(index=["AttackSession", "SessionConfig", "datasetName", "model", "model_arg"], 
                      aggfunc=dict(
                          cleanJobID=lambda x: ",".join(x)
                      ))
# Flat version (index levels as columns); used as the default table of pivotACRadii.
df_expRun_pivot_flat = df_expRun_pivot.reset_index()

# Robustness Certification

## Basic Function

In [None]:
# Adapted from https://github.com/sigeisler/reliable_gnn_via_robust_aggregation/blob/main/experiment_smoothing.py

def calc_certification_ratio(smoothing_result: dict, idx_selected: np.ndarray, labels: np.ndarray,
                             mask: np.ndarray = None) -> "tuple[np.ndarray, np.ndarray]":
    """Calculation of the certification ratio. `R(r_a, r_d)` in our paper.

    A node counts as certified at a grid cell when its `grid_lower` entry is strictly
    greater than its `grid_upper` entry AND its majority vote matches the label.

    Parameters
    ----------
    smoothing_result : Dict[str, Any]
        Dictionary with smoothing results. The keys 'grid_lower', 'grid_upper' and
        'votes' are read; each is indexed by node along its first axis.
    idx_selected : np.ndarray
        Array containing the indices of e.g. the test nodes.
    labels : np.ndarray
        Ground truth class labels (indexed by node).
    mask : np.ndarray, optional
        To select only a subset of nodes e.g. by degree, by default None.
        It is indexed with `idx_selected` first and then applied to the selected nodes
        (presumably a boolean array over all nodes -- verify against callers).

    Returns
    -------
    heatmap_loup : np.ndarray
        Bivariate certification ratio R(r_a, r_d): per grid cell, the fraction of the
        selected nodes that are both correctly classified and certified. Entry
        [0, 0] is overwritten with the plain accuracy of the selected nodes.
    correctly_classified : np.ndarray
        Boolean array over the selected (and masked) nodes, True where the majority
        vote equals the label.
    """
    # Resolve the optional node filter once instead of re-indexing `mask` for every array.
    node_filter = None if mask is None else mask[idx_selected]

    def _select(per_node: np.ndarray) -> np.ndarray:
        """Restrict a per-node array to the selected (and optionally masked) nodes."""
        picked = per_node[idx_selected]
        return picked if node_filter is None else picked[node_filter]

    grid_lower = _select(smoothing_result['grid_lower'])
    grid_upper = _select(smoothing_result['grid_upper'])
    correctly_classified = _select(smoothing_result['votes']).argmax(1) == _select(labels)

    # Broadcast the per-node correctness over the two grid axes.
    heatmap_loup = (
        (grid_lower > grid_upper)
        & correctly_classified[:, None, None]
    )

    heatmap_loup = heatmap_loup.mean(0)
    # The zero-perturbation cell reports the accuracy itself.
    heatmap_loup[0, 0] = correctly_classified.mean()

    return heatmap_loup, correctly_classified

In [None]:
# In-memory cache: jobID -> dict with the certification data loaded from that job.
certGridDict = dict()
def getCertGrid(jobID, key="rho_grid", get_binary_cert=True):
    """Return one field of a job's certification data, loading and caching it on first use.

    Parameters
    ----------
    jobID : str
        ID of the signac job to open in the global `project`.
    key : str, optional
        Name of the cached field to return. Available fields: "rho_grid", "max_ra",
        "max_rd", "heatmap", "datasetName", "preVotes", "votes", "job",
        "grid_base", "grid_lower", "grid_upper".
    get_binary_cert : bool, optional
        If True (default), the binary certificate grids are made available: they are
        read from the job's data store when the job document flags them as present,
        otherwise computed with `binary_certificate` and written back to the job.
        If False, the three grid fields are stored as None.

    Returns
    -------
    The cached value stored under `key` for this job.

    Raises
    ------
    AssertionError
        If the stored certificate grid type is not "separate", or if the job uses
        non-zero attribute flip probabilities when the certificate must be computed.
    """
    cached = certGridDict.get(jobID)
    # Load when the job is not cached yet, or when it was cached by a call with
    # get_binary_cert=False (grids stored as None) and the certificate is requested now.
    # Without the second condition such a stale entry would be served with None grids.
    if cached is None or (get_binary_cert and cached.get("grid_base") is None):
        job = project.open_job(id=jobID)
        with job.data:
            assert job.data.certGrid.type.decode() == "separate"

            rho_grid = np.array(job.data.certGrid.rho_grid)
            max_ra = job.data.certGrid.max_ra
            max_rd = job.data.certGrid.max_rd
            # Fraction of entries along the first axis whose rho value exceeds 0.5.
            heatmap = (rho_grid > 0.5).mean(0)

            preVotes = np.array(job.data.preVotes)
            votes = np.array(job.data.votes)

            if get_binary_cert:
                if job.doc.get("binary_certificate_nb", False):
                    # Certificate was computed by an earlier call and stored with the job.
                    grid_base = np.array(job.data.binary_certificate_nb.grid_base)
                    grid_lower = np.array(job.data.binary_certificate_nb.grid_lower)
                    grid_upper = np.array(job.data.binary_certificate_nb.grid_upper)
                else:
                    conf_alpha = job.sp.conf_alpha
                    # Only adjacency perturbations are supported here.
                    assert job.sp.sampleConfig.votes.pf_plus_att == 0 and job.sp.sampleConfig.votes.pf_minus_att == 0
                    pf_plus = job.sp.sampleConfig.votes.pf_plus_adj
                    pf_minus = job.sp.sampleConfig.votes.pf_minus_adj
                    n_samples = job.sp.sampleConfig.votes.n_samples

                    grid_base, grid_lower, grid_upper = binary_certificate(votes, preVotes, n_samples, conf_alpha, pf_plus, pf_minus)
                    # Persist the result with the job so later sessions can skip the computation.
                    job.data.binary_certificate_nb = dict(
                        grid_base=grid_base,
                        grid_lower=grid_lower,
                        grid_upper=grid_upper
                    )
                    job.doc.binary_certificate_nb = True
            else:
                grid_base = None
                grid_lower = None
                grid_upper = None

        certGridDict[jobID] = dict(
            rho_grid=rho_grid,
            max_ra=max_ra,
            max_rd=max_rd,
            heatmap=heatmap,
            datasetName=job.sp.datasetName,
            preVotes=preVotes,
            votes=votes,
            job=job,
            grid_base=grid_base,
            grid_lower=grid_lower,
            grid_upper=grid_upper
        )
    return certGridDict[jobID][key]

In [None]:
def pivotACRadii(task_ind_list, df=df_expRun_pivot_flat):
    """Summarise certification results over the repeated runs of each selected experiment.

    For every selected row of `df`, each job listed in its comma-separated `cleanJobID`
    is evaluated on the test nodes and the per-job results are aggregated (mean / std).

    Parameters
    ----------
    task_ind_list : iterable of int
        Positional row indices into `df` (used with `.iloc`).
    df : pandas.DataFrame, optional
        Table with at least the columns `cleanJobID`, `model` and `model_arg`.
        The default is bound when this function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per selected experiment with the columns `model`, `model_arg`,
        `exp_count`, `max_ra`, `max_rd` and mean/std pairs for:
        `AC` (sum of the certification-ratio grid excluding its [0, 0] entry),
        `ra` / `rd` (average over correctly classified test nodes of the largest
        first-axis / second-axis grid index holding a certified cell) and
        `acc` (fraction of correctly classified test nodes).
    """
    print(task_ind_list)
    plotDfList = []

    for task_ind in task_ind_list:
        jobItem = df.iloc[task_ind]

        jobIDList = jobItem.cleanJobID.split(",")
        rdDataDict = dict()
        maxRa = 0
        maxRd = 0

        for jobID in jobIDList:
            preVotes = getCertGrid(jobID, key="preVotes")
            votes = getCertGrid(jobID, key="votes")
            datasetName = getCertGrid(jobID, key="datasetName")
            job = getCertGrid(jobID, key="job")
            # `with job:` is signac's context manager that switches into the job's
            # workspace directory, so the relative "data.pkl" path resolves there.
            with job:
                if Path("data.pkl").exists():
                    # NOTE: pickle.load can execute arbitrary code; only use this with
                    # job workspaces produced by trusted runs.
                    with open("data.pkl", "rb") as dataFile:
                        data = pickle.load(dataFile)
                else:
                    # No job-specific data file: fall back to the pre-loaded dataset.
                    data = datasetDict[datasetName]

            grid_base = getCertGrid(jobID, key="grid_base")
            grid_lower = getCertGrid(jobID, key="grid_lower")
            grid_upper = getCertGrid(jobID, key="grid_upper")
            smoothing_result = {
                'grid_base': grid_base,
                'grid_lower': grid_lower,
                'grid_upper': grid_upper,
                'votes': votes,
                'pre_votes': preVotes
            }
            heatmap, preVoteCorrectMask = calc_certification_ratio(smoothing_result, data.idx_test, data.labels)
            certResultNodes = (grid_lower > grid_upper)[data.idx_test]

            # Running maximum over the runs. The previous `max(x, maxRa + 1)` added 1 to
            # the accumulator on every run, so the reported value grew with the number
            # of runs instead of reflecting the largest grid bound.
            maxRa = max(getCertGrid(jobID, key="max_ra"), maxRa)
            maxRd = max(getCertGrid(jobID, key="max_rd"), maxRd)

            # Boolean-mask indexing yields a copy, so the in-place write below does not
            # touch the cached grids.
            certResultNodesCorrect = certResultNodes[preVoteCorrectMask, :, :]
            radiiMat = np.zeros((certResultNodesCorrect.shape[0], 2))
            for i in range(certResultNodesCorrect.shape[0]):
                _cert_slice = certResultNodesCorrect[i, :, :]
                # A correctly classified node is always counted as certified at (0, 0),
                # which also guarantees np.where returns at least one cell.
                _cert_slice[0, 0] = True
                wRes = np.where(_cert_slice)
                radiiMat[i, :] = (wRes[0].max(), wRes[1].max())
            avgRadii = radiiMat.mean(0)

            rdDataDict[jobID] = (heatmap, avgRadii, preVoteCorrectMask)

        acScoreMat = np.zeros(len(rdDataDict))
        avgRadiiMat = np.zeros((len(rdDataDict), 2))
        avgAccMat = np.zeros(len(rdDataDict))
        for i, (jobHeatmap, jobAvgRadii, jobCorrectMask) in enumerate(rdDataDict.values()):
            # The [0, 0] entry holds the plain accuracy and is excluded from the score.
            acScoreMat[i] = jobHeatmap.sum() - jobHeatmap[0, 0]
            avgRadiiMat[i, :] = jobAvgRadii
            avgAccMat[i] = jobCorrectMask.mean()
        avgRadiiMatMean = avgRadiiMat.mean(0)
        avgRadiiMatStd = avgRadiiMat.std(0)
        plotDf = pandas.Series({
            "model": jobItem.model,
            "model_arg": jobItem.model_arg,
            "exp_count": len(rdDataDict),
            "max_ra": maxRa,
            "max_rd": maxRd,
            "AC_mean": acScoreMat.mean(),
            "AC_std": acScoreMat.std(),
            "ra_mean": avgRadiiMatMean[0],
            "ra_std": avgRadiiMatStd[0],
            "rd_mean": avgRadiiMatMean[1],
            "rd_std": avgRadiiMatStd[1],
            "acc_mean": avgAccMat.mean(),
            "acc_std": avgAccMat.std()
        })
        plotDfList.append(plotDf)

    resultDf = pandas.DataFrame(data=plotDfList)
    return resultDf

## Results on Multiple Seeds

- Ratio of **all nodes** that are certified to be robust without conditioning on correctness
- Ratio of **all nodes** that are **both** correctly classified and certified to be robust
- Ratio of **correctly classified nodes** that are certified to be robust

### LaTeX Table

In [None]:
# LaTeX row headers per model, in the order the rows are emitted by genTeXTable.
# Keys starting with "#" are not models: their value is emitted verbatim as a separator.
modelHeadDict = {
    "H2GCN-2": r"\textbf{H$_2$GCN} & \checkmark & &",
    "GraphSAGE": r"\textbf{GraphSAGE} & \checkmark & &",
    "CPGNN-MLP": r"\textbf{CPGNN} & \checkmark & &", 
    "GPRGNN": r"\textbf{GPR-GNN} & \checkmark & &",
    "FAGCN": r"\textbf{FAGCN} & \checkmark & &", 
    "#sep": r"""\noalign{\vskip 0.25ex}
\cdashline{1-3}[0.8pt/2pt]
\cdashline{5-8}[0.8pt/2pt]
\cdashline{10-13}[0.8pt/2pt]
\noalign{\vskip 0.25ex}""",
    "GAT": r"\textbf{GAT} & & &",
    "GCN": r"\textbf{GCN} & & &"
}
# Template for the dataset label cell; genTeXTable substitutes the placeholders
# "modelCount" and "datasetName" by plain string replacement.
datasetNameStr = r"\multirow{modelCount}{*}{\rotatebox[origin=c]{90}{\textbf{datasetName}}}"
# Template for a "mean +/- std" cell; genTeXTable substitutes "acc" and "std".
accuracyStdStr = r"acc\tiny{$\pm$std}"
# Cell colours: "1st" marks the best mean of a column, "incomplete" marks missing or partial results.
cellColorDict = {
    "1st": r"\cellcolor{blue!20}",
    "incomplete": r"\cellcolor{red!20}"
}


In [None]:
def genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, datasetTableDict, cellColorDict=None, showTable=False,
                expectedExpCount=3):
    """Render the result tables as LaTeX table rows and show them in a text area widget.

    Parameters
    ----------
    modelHeadDict : dict
        Maps model name -> LaTeX row header, in output row order. Keys starting with
        "#" are emitted verbatim (e.g. separators) and carry no results.
    datasetNameStr : str
        Template of the dataset label cell; the substrings "datasetName" and
        "modelCount" are replaced.
    accuracyStdStr : str
        Template of a result cell; the substrings "acc" and "std" are replaced by the
        formatted mean and standard deviation.
    datasetTableDict : dict
        Maps dataset label -> result table indexed by model, with the columns
        `exp_count` and `{AC,ra,rd,acc}_{mean,std}`. A `None` label produces a column
        group without a dataset label cell.
    cellColorDict : dict, optional
        Colour commands under the keys "1st" (best mean of a column) and "incomplete"
        (missing model or unexpected number of runs). No colouring if None/empty.
    showTable : bool, optional
        If True, print each dataset label and display its (unfiltered) table.
    expectedExpCount : int, optional
        Number of runs a complete experiment must have; cells of models with a
        different `exp_count` are marked as incomplete. Defaults to 3.

    Returns
    -------
    None
        The LaTeX text is displayed in a `Textarea` widget.
    """
    modelKeys = [key for key in modelHeadDict if not key.startswith("#")]
    datasetResultDict = dict()
    for datasetName, table in datasetTableDict.items():
        if showTable:
            print(datasetName)
            display(table)
        # Keep only the models that are reported. A boolean `isin` filter is used instead
        # of `.loc[<set>]`: set indexers are rejected by pandas >= 2.0, and label lookup
        # raises KeyError for absent models, which made the "missing model" branch
        # below unreachable.
        table = table.loc[table.index.isin(modelKeys)]
        tableResultDict = {model: deque() for model in modelHeadDict}
        if datasetName is not None:
            dNameStr = datasetNameStr.replace("datasetName", datasetName)
            # NOTE(review): the count includes "#"-prefixed separator entries -- confirm
            # this matches the intended \multirow span.
            dNameStr = "\n" + dNameStr.replace("modelCount", str(len(modelHeadDict))) + "\n"
        else:
            dNameStr = "\n"
        for ind, model in enumerate(modelHeadDict):
            if model.startswith("#"):
                continue
            for col in ["AC", "ra", "rd", "acc"]:
                # Accuracy is shown in percent with two decimals, hence four decimals here.
                decimals = 4 if col == "acc" else 2
                dTable = table.round({f"{col}_mean": decimals, f"{col}_std": decimals})

                # Mean and STD
                if model not in dTable.index:
                    modelResultStr = f'{accuracyStdStr.replace("acc", "nan").replace("std", "nan")}'
                    if cellColorDict:
                        modelResultStr += cellColorDict["incomplete"]
                else:
                    mean_value = dTable.loc[model, f"{col}_mean"]
                    std_value = dTable.loc[model, f"{col}_std"]
                    if col in ["ra", "rd"] and mean_value == 0 and std_value == 0:
                        # Nothing certified beyond the origin: show a dash, and make sure
                        # the cell can never be highlighted as the best value.
                        modelResultStr = "-"
                        mean_value = "-"
                    elif col != "acc":
                        modelResultStr = f'{accuracyStdStr.replace("acc", f"{mean_value:.2f}").replace("std", f"{std_value:.2f}")}'
                    else:
                        modelResultStr = f'{accuracyStdStr.replace("acc", f"{mean_value*100:.2f}").replace("std", f"{std_value*100:.2f}")}'
                    if dTable.loc[model, "exp_count"] != expectedExpCount and cellColorDict:
                        modelResultStr += cellColorDict["incomplete"]
                    elif mean_value == dTable[f"{col}_mean"].max() and cellColorDict:
                        modelResultStr += cellColorDict["1st"]

                tableResultDict[model].append(modelResultStr)

            # The first row of each dataset group carries the dataset label cell.
            if ind == 0:
                tableResultDict[model].appendleft(dNameStr)
            else:
                tableResultDict[model].appendleft("\n")

        datasetResultDict[datasetName] = tableResultDict

    output = ""
    for model, value in modelHeadDict.items():
        modelStr = value
        if not model.startswith("#"):
            for ind, tableResultDict in enumerate(datasetResultDict.values()):
                modelStr += " & ".join(tableResultDict[model])
                if ind != len(datasetResultDict) - 1:
                    modelStr += " & "
            modelStr += r" \\"
        output += modelStr + "\n\n"
    print("===")

    display(Textarea(
        value=output,
        layout=Layout(width="auto"),
        rows=output.count("\n") + 5
    ))

#### Main Paper - Top

In [None]:
# Cora / Citeseer with both edge-addition (0.001) and edge-deletion (0.4) flip probabilities.
# Selecting the first three index levels leaves (model, model_arg), which reset_index()
# turns into columns. np.where(...)[0] yields the positional row indices expected by
# pivotACRadii; "H2GCN-1" and "CPGNN-Cheby" are excluded.
df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "cora"]].reset_index()
cora_df = pivotACRadii(np.where(~df_expRun_set.model.isin(["H2GCN-1", "CPGNN-Cheby"]))[0], df=df_expRun_set).set_index("model")

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "citeseer"]].reset_index()
citeseer_df = pivotACRadii(np.where(~df_expRun_set.model.isin(["H2GCN-1", "CPGNN-Cheby"]))[0], df=df_expRun_set).set_index("model")

genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, {
    "Cora": cora_df,
    "Citeseer": citeseer_df
}, cellColorDict=cellColorDict, showTable=True)

#### Main Paper - Bottom

In [None]:
# Same configuration as the previous table, for "fb100" and "snap-patents-downsampled".
# Note: `datasetDict` only pre-loads "citeseer" and "cora", so these jobs depend on a
# "data.pkl" in each job workspace (pivotACRadii falls back to `datasetDict` otherwise).
df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "fb100"]].reset_index()
fb100_df = pivotACRadii(np.where(~df_expRun_set.model.isin(["H2GCN-1", "CPGNN-Cheby"]))[0], df=df_expRun_set).set_index("model")

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "snap-patents-downsampled"]].reset_index()
snap_df = pivotACRadii(np.where(~df_expRun_set.model.isin(["H2GCN-1", "CPGNN-Cheby"]))[0], df=df_expRun_set).set_index("model")

genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, {
    "FB100": fb100_df,
    "Snap": snap_df
}, cellColorDict=cellColorDict, showTable=True)

#### Appendix

In [None]:
# Cora, one perturbation type at a time: addition only (pf_plus_adj 0.001, pf_minus_adj 0.0)
# and deletion only (pf_plus_adj 0.0, pf_minus_adj 0.4). Only "H2GCN-1" is excluded here;
# genTeXTable keeps just the models listed in modelHeadDict.
df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.0 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "cora"]].reset_index()
cora_df_plus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.0 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "cora"]].reset_index()
cora_df_minus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

# The `None` key makes genTeXTable emit the second column group without a dataset label cell.
genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, {
    "Cora": cora_df_plus,
    None: cora_df_minus
}, cellColorDict=cellColorDict, showTable=True)

In [None]:
# Citeseer, one perturbation type at a time: addition only, then deletion only.
df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.0 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "citeseer"]].reset_index()
df_plus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.0 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", "citeseer"]].reset_index()
df_minus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

# The `None` key makes genTeXTable emit the second column group without a dataset label cell.
genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, {
    "Citeseer": df_plus,
    None: df_minus
}, cellColorDict=cellColorDict, showTable=True)

In [None]:
# FB100, one perturbation type at a time: addition only, then deletion only.
# `df_plus` / `df_minus` are reused names and overwrite the values of the previous cell.
datasetName = "fb100"

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.0 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", datasetName]].reset_index()
df_plus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.0 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", datasetName]].reset_index()
df_minus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

# The `None` key makes genTeXTable emit the second column group without a dataset label cell.
genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, {
    "FB100": df_plus,
    None: df_minus
}, cellColorDict=cellColorDict, showTable=True)

In [None]:
# Snap-patents (downsampled), one perturbation type at a time: addition only, then deletion only.
# `df_plus` / `df_minus` are reused names and overwrite the values of the previous cell.
datasetName = "snap-patents-downsampled"

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.001 --pf_minus_adj 0.0 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", datasetName]].reset_index()
df_plus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

df_expRun_set = df_expRun_pivot.loc[pandas.IndexSlice["SparseSmoothingSession", "--pf_plus_adj 0.0 --pf_minus_adj 0.4 --pf_plus_att 0.0 --pf_minus_att 0.0 --conf_alpha 0.01", datasetName]].reset_index()
df_minus = pivotACRadii(np.where(df_expRun_set.model != "H2GCN-1")[0], df=df_expRun_set).set_index("model")

# The `None` key makes genTeXTable emit the second column group without a dataset label cell.
genTeXTable(modelHeadDict, datasetNameStr, accuracyStdStr, {
    "Snap": df_plus,
    None: df_minus
}, cellColorDict=cellColorDict, showTable=True)