In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import DataLoader, Dataset
from torch_geometric.nn import GCNConv, GraphSAGE, GATConv, GINConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import networkx as nx
import numpy as np
from torch_geometric.datasets import Planetoid


In [2]:
import torch
from torch_geometric.datasets import Planetoid
from sklearn.model_selection import train_test_split
import numpy as np
import random

# Set random seed for reproducibility
seed = 42  # You can change this number, but it should be the same across all runs
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Load the PubMed dataset
dataset = Planetoid(root='/tmp/PubMed', name='PubMed')
data = dataset[0]

# Split nodes into target and shadow sets
nodes = np.arange(data.num_nodes)
target_nodes, shadow_nodes = train_test_split(nodes, test_size=0.5, random_state=seed)
target_train_nodes, target_test_nodes = train_test_split(target_nodes, test_size=0.2, random_state=seed)

# # Verify disjoint sets
# overlap = np.intersect1d(target_nodes, shadow_nodes)
# if len(overlap) == 0:
#     print("The target and shadow datasets are disjoint.")
# else:
#     print(f"The target and shadow datasets are not disjoint. Overlapping nodes: {len(overlap)}")


# Create training masks
data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.train_mask[target_nodes] = True

shadow_data = data.clone()
shadow_data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
shadow_data.train_mask[shadow_nodes] = True

# Inspect dataset
print(f"Dataset: {dataset}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")
print(f"Number of features: {data.num_features}")
print(f"Number of classes: {dataset.num_classes}")
print(f"Train Mask: {data.train_mask.sum()} nodes")
print(f"Test Mask: {data.test_mask.sum()} nodes")
print(f"Target Dataset Nodes: {len(target_nodes)}")
print(f"Shadow Dataset Nodes: {len(shadow_nodes)}")

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index


Dataset: PubMed()
Number of nodes: 19717
Number of edges: 88648
Number of features: 500
Number of classes: 3
Train Mask: 9858 nodes
Test Mask: 1000 nodes
Target Dataset Nodes: 9858
Shadow Dataset Nodes: 9859


Processing...
Done!


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)  # 0.5 dropout rate
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)  # Log-softmax for classification


In [4]:
# Hyperparameters
hidden_channels = 128
learning_rate = 0.001
epochs = 200

# Initialize models
target_model = GCN(dataset.num_node_features, hidden_channels, dataset.num_classes)
shadow_model = GCN(dataset.num_node_features, hidden_channels, dataset.num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
target_optimizer = torch.optim.Adam(target_model.parameters(), lr=learning_rate)
shadow_optimizer = torch.optim.Adam(shadow_model.parameters(), lr=learning_rate)

# Function to train a model
def train_model(model, optimizer, data, epochs):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Masked training loss
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 50 == 0:  # Log progress every 50 epochs
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

# Train the target and shadow models
print("Training Target Model...")
train_model(target_model, target_optimizer, data, epochs)

print("Training Shadow Model...")
train_model(shadow_model, shadow_optimizer, shadow_data, epochs)

print("Models trained successfully.")


Training Target Model...
Epoch 50/200, Loss: 0.8289
Epoch 100/200, Loss: 0.5806
Epoch 150/200, Loss: 0.4636
Epoch 200/200, Loss: 0.4121
Training Shadow Model...
Epoch 50/200, Loss: 0.8350
Epoch 100/200, Loss: 0.5727
Epoch 150/200, Loss: 0.4567
Epoch 200/200, Loss: 0.4047
Models trained successfully.


In [5]:
def evaluate_attack(model, data):
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum().item()
    accuracy = correct / data.test_mask.sum().item()
    print(f"Accuracy: {accuracy:.4f}")
    # AUC calculation would require a link prediction setup, which is not included here

evaluate_attack(target_model, data)
evaluate_attack(shadow_model, shadow_data)

Accuracy: 0.8640
Accuracy: 0.8650


In [6]:
from sklearn.metrics import roc_auc_score
import torch

# Function to predict links (links that are predicted to be connected)
def predict_links(model, data, positive_pairs, negative_pairs):
    model.eval()
    pred_scores = []
    
    # Get predictions for positive and negative pairs
    for pair in positive_pairs + negative_pairs:
        node1, node2 = pair
        edge_index = torch.tensor([[node1, node2], [node2, node1]], dtype=torch.long).t()
        out = model(data)
        score = out[node1].dot(out[node2])  # Cosine similarity or dot product
        pred_scores.append(score.item())
    
    return pred_scores

# Generate positive and negative pairs
def generate_link_pairs(data, num_pairs=1000):
    # Positive pairs (edges)
    positive_pairs = [(u, v) for u, v in zip(*data.edge_index)]
    
    # Negative pairs (non-edges)
    negative_pairs = []
    while len(negative_pairs) < num_pairs:
        node1, node2 = torch.randint(0, data.num_nodes, (2,))
        if not torch.any((data.edge_index[0] == node1) & (data.edge_index[1] == node2)):
            negative_pairs.append((node1.item(), node2.item()))
    
    return positive_pairs[:num_pairs], negative_pairs

# AUC Calculation
def calculate_auc(pred_scores, positive_labels):
    return roc_auc_score(positive_labels, pred_scores)

# Example usage:
positive_pairs, negative_pairs = generate_link_pairs(data)
positive_labels = [1] * len(positive_pairs) + [0] * len(negative_pairs)

# Get prediction scores for target model
target_pred_scores = predict_links(target_model, data, positive_pairs, negative_pairs)

# Calculate AUC for the target model
target_auc = calculate_auc(target_pred_scores, positive_labels)
print(f"Target Model AUC: {target_auc:.4f}")

# Get prediction scores for shadow model
shadow_pred_scores = predict_links(shadow_model, shadow_data, positive_pairs, negative_pairs)

# Calculate AUC for the shadow model
shadow_auc = calculate_auc(shadow_pred_scores, positive_labels)
print(f"Shadow Model AUC: {shadow_auc:.4f}")


Target Model AUC: 0.8594
Shadow Model AUC: 0.8744


In [7]:
# Link Stealing Attack: Use shadow model to predict links for the target graph
def link_stealing_attack(shadow_model, target_data):
    shadow_model.eval()
    stolen_links = []
    
    # Use shadow model to predict links on the target data
    for pair in generate_link_pairs(target_data)[0]:  # Get positive pairs from target graph
        node1, node2 = pair
        score = predict_links(shadow_model, target_data, [pair], [])[0]
        stolen_links.append((node1, node2, score))
        
        # # Sort stolen links by score in descending order
        # stolen_links = sorted(stolen_links, key=lambda x: x[2], reverse=True)
        # return stolen_links
    
    # Sort stolen links by score
    stolen_links.sort(key=lambda x: x[2], reverse=True)
    return stolen_links

# Evaluate Link Stealing Attack
stolen_links = link_stealing_attack(shadow_model, data)
print("Stolen Links (sorted by prediction score):", stolen_links[:5])

Stolen Links (sorted by prediction score): [(tensor(11894), tensor(62), 270.33013916015625), (tensor(11450), tensor(62), 248.64093017578125), (tensor(1205), tensor(60), 234.48582458496094), (tensor(11894), tensor(88), 204.99546813964844), (tensor(13940), tensor(117), 188.9935302734375)]


In [8]:
print("Top 10 Stolen Links:") 
for i, link in enumerate(stolen_links[:10]): print(f"Link {i+1}: Node {link[0]} ↔ Node {link[1]}, Score: {link[2]:.4f}")

Top 10 Stolen Links:
Link 1: Node 11894 ↔ Node 62, Score: 270.3301
Link 2: Node 11450 ↔ Node 62, Score: 248.6409
Link 3: Node 1205 ↔ Node 60, Score: 234.4858
Link 4: Node 11894 ↔ Node 88, Score: 204.9955
Link 5: Node 13940 ↔ Node 117, Score: 188.9935
Link 6: Node 4058 ↔ Node 88, Score: 164.6518
Link 7: Node 12019 ↔ Node 105, Score: 161.6323
Link 8: Node 11894 ↔ Node 147, Score: 152.0048
Link 9: Node 8106 ↔ Node 117, Score: 141.9617
Link 10: Node 9046 ↔ Node 117, Score: 135.9112


In [9]:
def evaluate_attack_success(stolen_links, actual_positive_pairs): 
    matched_links = [(u, v) for u, v, _ in stolen_links if (u, v) in actual_positive_pairs] 
    attack_success_rate = len(matched_links) / len(actual_positive_pairs) 
    print(f"Attack Success Rate: {attack_success_rate:.4f}")

In [10]:
actual_positive_pairs = [(u, v) for u, v in zip(*data.edge_index)]

In [11]:
evaluate_attack_success(stolen_links, actual_positive_pairs)

Attack Success Rate: 0.0113


In [12]:

class Baseline0(nn.Module):
    def __init__(self, in_features):
        super(Baseline0, self).__init__()
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 2)  # Binary classification

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)  # Dropout rate = 0.5
        x = F.relu(self.fc2(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)  # Log-softmax for binary classification


In [13]:
class Baseline1(nn.Module):
    def __init__(self, in_features):
        super(Baseline1, self).__init__()
        self.fc1 = nn.Linear(in_features, 16)
        self.fc2 = nn.Linear(16, 2)  # Binary classification

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


In [14]:
class Baseline2(nn.Module):
    def __init__(self, node_features, graph_features):
        super(Baseline2, self).__init__()
        # Sub-network for node attributes
        self.node_fc1 = nn.Linear(node_features, 256)
        self.node_fc2 = nn.Linear(256, 64)
        self.node_fc3 = nn.Linear(64, 8)

        # Sub-network for graph features
        self.graph_fc = nn.Linear(graph_features, 1)

        # Final layer
        self.final_fc = nn.Linear(8 + 1, 2)  # Concatenated inputs, binary classification

    def forward(self, node_x, graph_x):
        # Node attributes sub-network
        x1 = F.relu(self.node_fc1(node_x))
        x1 = F.dropout(x1, p=0.5, training=self.training)
        x1 = F.relu(self.node_fc2(x1))
        x1 = F.dropout(x1, p=0.5, training=self.training)
        x1 = F.relu(self.node_fc3(x1))

        # Graph features sub-network
        x2 = F.relu(self.graph_fc(graph_x))

        # Concatenate and pass through final layer
        x = torch.cat([x1, x2], dim=1)
        x = self.final_fc(x)
        return F.log_softmax(x, dim=1)


In [15]:
# Define Baseline Models
baseline0 = Baseline0(in_features=data.x.size(1))
baseline1 = Baseline1(in_features=data.x.size(1))
baseline2 = Baseline2(node_features=data.x.size(1), graph_features=1)


# Define loss function
criterion = nn.CrossEntropyLoss()

# Define optimizers for each baseline model
learning_rate = 0.001
optimizer_baseline0 = torch.optim.Adam(baseline0.parameters(), lr=learning_rate)
optimizer_baseline1 = torch.optim.Adam(baseline1.parameters(), lr=learning_rate)
optimizer_baseline2 = torch.optim.Adam(baseline2.parameters(), lr=learning_rate)

# Define cosine annealing schedulers
epochs = 200
scheduler_baseline0 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_baseline0, T_max=epochs)
scheduler_baseline1 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_baseline1, T_max=epochs)
scheduler_baseline2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_baseline2, T_max=epochs)

print("Baseline models, optimizers, and schedulers initialized.")
print(scheduler_baseline0)


Baseline models, optimizers, and schedulers initialized.
<torch.optim.lr_scheduler.CosineAnnealingLR object at 0x153ee5220>
