In [87]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.data
import pandas as pd
import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from collections import Counter
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy 

In this notebook, we train a victim model using NodeDrop. 

We use 10% of the CORA dataset for training the model using NodeDrop. The validation and test datasets are generated using 20% of the dataset each.

In [88]:
dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

  known = data[existing]
  known = data[existing]


In [90]:
train_labels, test_labels = model_selection.train_test_split(
    node_subjects, train_size=0.1, random_state=12
)

val_labels, test_labels = model_selection.train_test_split(
    test_labels, train_size=0.2, test_size=0.2, random_state=15,
)

Counter({'Neural_Networks': 79, 'Probabilistic_Methods': 49, 'Genetic_Algorithms': 48, 'Theory': 29, 'Reinforcement_Learning': 28, 'Case_Based': 26, 'Rule_Learning': 11})
270
Counter({'Neural_Networks': 149, 'Probabilistic_Methods': 72, 'Theory': 66, 'Case_Based': 66, 'Genetic_Algorithms': 65, 'Rule_Learning': 35, 'Reinforcement_Learning': 34})
487
Counter({'Neural_Networks': 155, 'Probabilistic_Methods': 82, 'Genetic_Algorithms': 68, 'Theory': 55, 'Case_Based': 49, 'Reinforcement_Learning': 46, 'Rule_Learning': 33})
488


In [91]:
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_labels)
val_targets = target_encoding.transform(val_labels)
test_targets = target_encoding.transform(test_labels)

In [92]:
# number of nodes per batch
batch_size = 50

# number of neighbours per layer
num_samples = [10, 5]

# generator
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

In [94]:
# GraphSage stellargraph model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32], 
    generator=generator,
    bias=True, 
    dropout=0.2,
)

# get input and output tensors
x_inp, x_out = graphsage_model.in_out_tensors()

prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

# build and compile
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.binary_crossentropy,
    metrics=[metrics.AUC(num_thresholds=200, curve='ROC'), 'acc'],
)
# model.summary()

We choose n=4 (as the average degree of the dataset is 4.006) and c=50 (randomly chosen value which produced good results).

In order to randomly drop 'c' nodes with degree below 'n', we must first label each node in the dataset with it's degree. The package I used here (StellarGraph) does not give us the degree of each node individually. Therefore, I wrote a separate program to determine the degree of each node and generated an edgelist (cora/deg_labelled.csv) that contains the degrees of each node.

We train for 20 epochs. Between each epoch, 'c' random nodes are dropped from the dataset using the algorithm below.

In [102]:
df_degs = pd.read_csv("cora/deg_labelled.csv", header=None)
X_degs = df_degs.iloc[:, 0:-1].values.tolist()
y_degs = df_degs.iloc[:, -1].values.tolist()

deg_inds_train = []

# The below for loop prints out the edgelist after it is imported in
# the code above.
# for i in range(0, sum(Counter(train_labels).values())):
#     print("{}, {}".format(train_labels.index[i],X_degs.index(train_labels.index[i])))
#     deg_inds_train.append(X_degs.index(train_labels.index[i]))

y_degs_train = []
for i in range(len(deg_inds_train)):
    y_degs_train.append(y_degs[deg_inds_train[i]])

listcomp = [idx for idx, element in enumerate(y_degs_train) if element <= 4]

for i in range(20):
    y_degs_chosen = random.sample(listcomp, 50)
    y_degs_chosen.sort()

    train_labels_new = train_labels.drop(train_labels.index[[y_degs_chosen]])
    train_targets_new = numpy.delete(train_targets, y_degs_chosen, 0)
    train_gen = generator.flow(train_labels_new.index, train_targets_new, shuffle=True)
    val_gen = generator.flow(val_labels.index, val_targets)
    test_gen = generator.flow(test_labels.index, test_targets)

    history = model.fit(
        train_gen, 
        epochs=1, 
        validation_data=val_gen, 
        verbose=1, 
        shuffle=False)

132806, 1010
28456, 431
31483, 466
156977, 1096
1129111, 2384
643221, 1712
1131464, 2474
54132, 644
562067, 1543
427606, 1469
628668, 1671
18832, 323
1118120, 2207
1120084, 2241
5086, 100
523394, 1521
96847, 876
102406, 899
1137466, 2572
853118, 1874
1152663, 2614
1129518, 2393
948299, 1889
36167, 514
733167, 1831
1120880, 2259
358866, 1410
28254, 417
348305, 1403
1071981, 1913
56709, 656
1107455, 2032
1120197, 2245
1109199, 2059
12195, 224
248425, 1287
28641, 442
45189, 577
70444, 745
1115471, 2164
753070, 1846
1026, 20
273949, 1330
66594, 715
3222, 69
285675, 1339
1125092, 2299
628764, 1673
62274, 685
1108728, 2054
1140231, 2585
190698, 1174
417017, 1464
561581, 1533
1153896, 2674
1102550, 1920
645016, 1731
64484, 705
1153866, 2669
242663, 1279
1121659, 2271
44121, 570
59244, 671
330148, 1389
56167, 654
593201, 1614
1113551, 2126
1134865, 2539
1104055, 1946
345340, 1400
561568, 1532
189577, 1165
66564, 714
385572, 1433
20593, 339
22566, 357
85452, 824
289885, 1351
12182, 222
642930, 

  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




  result = getitem(key)




In [103]:
# Checking the test accuracy.
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.1666
	auc_3: 0.9600
	acc: 0.7971


In [104]:
# We save the predictions of the model for the training data and prediction data.
# These outcomes are saved to separate edgelists and are used to simulate shadow
# model attacks.

predictions = model.predict(train_gen)
prediction_train = pd.DataFrame(predictions).to_csv('prediction_target_final.csv')
print("done")
predictions_test = model.predict(test_gen)
pd.DataFrame(predictions_test).to_csv('prediction_target_test_final.csv')
print("done")

done
done
