In [44]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.data
import pandas as pd
import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from collections import Counter
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

Using backend: pytorch


In [45]:
dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

  known = data[existing]
  known = data[existing]


This notebook builds the shadow model and attack model. The shadow model is trained with the exact same architecture as the target model, but only because the models are rather basic. If the target model had had an advanced architecture, it would be necessary to explain how the adversary knew it. 

The attack model is a simple MLP that learns from the confidence outcomes and predictions of the shadow model, and predicts whether or not a piece of data was used to train the model or not based on the confidence outcomes.

In [46]:
train_labels, test_labels = model_selection.train_test_split(
    node_subjects, train_size=0.1, random_state=37
)

val_labels, test_labels = model_selection.train_test_split(
    test_labels, train_size=0.2, test_size=0.2, random_state=48,
)

print(Counter(train_labels))
print(sum(Counter(train_labels).values()))
print(Counter(val_labels))
print(sum(Counter(val_labels).values()))
print(Counter(test_labels))
print(sum(Counter(test_labels).values()))

Counter({'Neural_Networks': 79, 'Genetic_Algorithms': 50, 'Probabilistic_Methods': 38, 'Case_Based': 36, 'Theory': 28, 'Reinforcement_Learning': 22, 'Rule_Learning': 17})
270
Counter({'Neural_Networks': 148, 'Probabilistic_Methods': 74, 'Genetic_Algorithms': 72, 'Theory': 58, 'Case_Based': 54, 'Reinforcement_Learning': 41, 'Rule_Learning': 40})
487
Counter({'Neural_Networks': 139, 'Probabilistic_Methods': 82, 'Genetic_Algorithms': 80, 'Theory': 72, 'Case_Based': 44, 'Reinforcement_Learning': 41, 'Rule_Learning': 30})
488


In [47]:
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_labels)
val_targets = target_encoding.transform(val_labels)
test_targets = target_encoding.transform(test_labels)

In [48]:
# number of nodes per batch
batch_size = 50

# number of neighbours per layer
num_samples = [10, 5]

# generator
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

# Generators for all the data sets
train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)
val_gen = generator.flow(val_labels.index, val_targets)
test_gen = generator.flow(test_labels.index, test_targets)


In [49]:
# GraphSage stellargraph model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32], 
    generator=generator,
    bias=True, 
    dropout=0.2,
)

# get input and output tensors
x_inp, x_out = graphsage_model.in_out_tensors()
# pass the output tensor through the classification layer
# prediction = layers.Dense(1, activation="sigmoid")(x_out)
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)
# build and compile
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.binary_crossentropy,
    metrics=[metrics.AUC(num_thresholds=200, curve='ROC'), 'acc'],
)
# model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [50]:

history = model.fit(
    train_gen, 
    epochs=20, 
    validation_data=val_gen, 
    verbose=1, 
    shuffle=False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [51]:
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.1763
	auc: 0.9468
	acc: 0.7705


In [52]:
predictions = model.predict(train_gen)
pd.DataFrame(predictions).to_csv('predictions_shad_final.csv')

predictions_test = model.predict(test_gen)
pd.DataFrame(predictions_test).to_csv('prediction_test_shad_final.csv')
print("done")

done


In [53]:
# Now, we prepare the attack model using the combined csvs from above
# as the training data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [54]:
df = pd.read_csv("predictions_shad_final.csv")
df.head()
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.33, random_state=69)

In [55]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [61]:
EPOCHS = 200
BATCH_SIZE = 128
LEARNING_RATE = 0.001

In [57]:
## train dataloader
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = TrainData(torch.FloatTensor(X_train), torch.FloatTensor(y_train.values))

## test dataloader
class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)

test_data = TestData(torch.FloatTensor(X_test))


In [58]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [59]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 7.
        self.layer_1 = nn.Linear(7, 128) 
        self.layer_2 = nn.Linear(128, 128)
        self.layer_out = nn.Linear(128, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(128)
        self.batchnorm2 = nn.BatchNorm1d(128)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [60]:
model = BinaryClassification()
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model)

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

BinaryClassification(
  (layer_1): Linear(in_features=7, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [62]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.67408 | Acc: 58.000
Epoch 002: | Loss: 0.59585 | Acc: 67.750
Epoch 003: | Loss: 0.56126 | Acc: 69.250
Epoch 004: | Loss: 0.53457 | Acc: 70.750
Epoch 005: | Loss: 0.52317 | Acc: 71.250
Epoch 006: | Loss: 0.51299 | Acc: 71.250
Epoch 007: | Loss: 0.50117 | Acc: 72.500
Epoch 008: | Loss: 0.47805 | Acc: 74.750
Epoch 009: | Loss: 0.46152 | Acc: 74.000
Epoch 010: | Loss: 0.45979 | Acc: 76.250
Epoch 011: | Loss: 0.44285 | Acc: 77.750
Epoch 012: | Loss: 0.45414 | Acc: 75.750
Epoch 013: | Loss: 0.44710 | Acc: 78.250
Epoch 014: | Loss: 0.43631 | Acc: 78.000
Epoch 015: | Loss: 0.43352 | Acc: 80.000
Epoch 016: | Loss: 0.42003 | Acc: 79.750
Epoch 017: | Loss: 0.40765 | Acc: 80.000
Epoch 018: | Loss: 0.39476 | Acc: 83.250
Epoch 019: | Loss: 0.40425 | Acc: 80.000
Epoch 020: | Loss: 0.37814 | Acc: 81.750
Epoch 021: | Loss: 0.40227 | Acc: 80.500
Epoch 022: | Loss: 0.41077 | Acc: 81.000
Epoch 023: | Loss: 0.39294 | Acc: 79.500
Epoch 024: | Loss: 0.38835 | Acc: 81.000
Epoch 025: | Los

In [63]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [64]:
confusion_matrix(y_test, y_pred_list)

array([[ 80,   7],
       [ 28, 135]], dtype=int64)

In [65]:
df = pd.read_csv("prediction_target_final.csv")
df.head()
X_tar = df.iloc[:, 0:-1]
y_tar = df.iloc[:, -1]
X_tar = scaler.transform(X_tar)

test_tar_data = TestData(torch.FloatTensor(X_tar))
test_tar_loader = DataLoader(dataset=test_tar_data, batch_size=1)

y_pred_tar_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_tar_loader:
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_tar_list.append(y_pred_tag.cpu().numpy())
        
y_pred_tar_list = [a.squeeze().tolist() for a in y_pred_tar_list]
confusion_matrix(y_tar, y_pred_tar_list)
# print(classification_report(y_test, y_pred_list))


array([[  8, 211],
       [  1, 487]], dtype=int64)

In [66]:
print(classification_report(y_tar, y_pred_tar_list))

              precision    recall  f1-score   support

           0       0.89      0.04      0.07       219
           1       0.70      1.00      0.82       488

    accuracy                           0.70       707
   macro avg       0.79      0.52      0.45       707
weighted avg       0.76      0.70      0.59       707

