In [40]:
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
import torchvision.transforms as transforms
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader 
from art.attacks.inference.membership_inference import MembershipInferenceBlackBoxRuleBased
from art.estimators.classification import PyTorchClassifier
from art.attacks.evasion import HopSkipJump
from torchvision.datasets.mnist import MNIST
import lenet
import torch.nn.functional as F
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
device = 'cuda' if torch.cuda.is_available() else 'cpu'
criterion = nn.CrossEntropyLoss()


In [221]:
def accuracy(output, target, topk=(1,)):
    '''
    topk = 1이라면 가장 높은 예측 확률을 가진 레이블과 실제 레이블이 동일한지 계산 
    topk = (1, 5)라면, 가장 높은 예측 확률을 가진 레이블과 실제 레이블이 동일한 경우를 계산하여
    top1 정확도 구하고, 그 다음으로 높은 5개의 예측 확률을 가진 레이블 중 실제 레이블이 포함되는지 확인하여 top5 정확도 구함
    
    더욱 모델의 성능을 상세하게 평가하기 위한 방법으로, 모델의 성능을 다각도로 이해하고 평가하는 데 도움됨
    '''
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.reshape(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [222]:
data_train = MNIST('/cache/data/',
                       download=True,
                       transform=transforms.Compose([
                           transforms.Resize((32, 32)),
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                           ]))
data_test = MNIST('/cache/data/',
                download=True,
                train=False,
                transform=transforms.Compose([
                    transforms.Resize((32, 32)),
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))
                    ]))

In [223]:
test_loader = DataLoader(data_train, batch_size=1024, num_workers=8)

In [224]:
distilled = torch.load("./DFKD_pths/LeNet5Half_MNIST_TestSet_distilled")
synthetic = torch.load("./DFKD_pths/LeNet5Half_Synthetic_MNIST")
adv_synthetic = torch.load("./DFKD_pths/LeNet5Half_Synthetic_Adv_MNIST")

  distilled = torch.load("./DFKD_pths/LeNet5Half_MNIST_TestSet_distilled")
  synthetic = torch.load("./DFKD_pths/LeNet5Half_Synthetic_MNIST")
  adv_synthetic = torch.load("./DFKD_pths/LeNet5Half_Synthetic_Adv_MNIST")


In [225]:
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_acc1 = 0.0
    total_samples = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            batch_size = images.size(0)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            acc1, _ = accuracy(outputs, labels, topk=(1, 5))
            total_loss += loss.item() * batch_size
            total_acc1 += acc1.item() * batch_size
            total_samples += batch_size
            
    val_loss = total_loss / total_samples
    val_acc1 = total_acc1 / total_samples
    return val_loss, val_acc1

In [206]:
val_loss, val_acc1 = evaluate_model(distilled, test_loader, criterion, device)
print(f"Test Loss: {val_loss:.4f}, Test Accuracy: {val_acc1 / 100:.4%}")

Test Loss: 0.1537, Test Accuracy: 95.6867%


In [207]:
val_loss, val_acc1 = evaluate_model(synthetic, test_loader, criterion, device)
print(f"Test Loss: {val_loss:.4f}, Test Accuracy: {val_acc1 / 100:.2%}")

Test Loss: 0.2492, Test Accuracy: 92.68%


In [226]:
val_loss, val_acc1 = evaluate_model(adv_synthetic, test_loader, criterion, device)
print(f"Test Loss: {val_loss:.4f}, Test Accuracy: {val_acc1 / 100:.2%}")

Test Loss: 0.3131, Test Accuracy: 92.02%


In [230]:
train_data_loader = DataLoader(data_train, batch_size=1000, shuffle=True)
x_train, y_train = next(iter(train_data_loader))

test_data_loader = DataLoader(data_test, batch_size=500, shuffle=True)
x_test, y_test = next(iter(test_data_loader))

In [231]:
def Rule_based_Attack(model, batch_size=1000):
    
    model.eval()
    
    x_member = x_train[:batch_size]
    y_member = y_train[:batch_size]
    x_nonmember = x_test[:(batch_size // 2)]
    y_nonmember = y_test[:(batch_size // 2)]

    x_target = np.concatenate([x_member, x_nonmember], axis=0)
    y_target = np.concatenate([y_member, y_nonmember], axis=0)
    
    x_tensor = torch.from_numpy(x_target).to(device)
    with torch.no_grad():
        logits = model(x_tensor)
        pred_labels = logits.argmax(dim=1).cpu().numpy()
    
    predictions = (pred_labels == y_target).astype(np.int32)

    true_membership = np.array([1] * batch_size + [0] * (batch_size // 2))
    
    MIA_ACC = (predictions == true_membership).mean()
    return MIA_ACC

In [232]:
print(Rule_based_Attack(distilled))
print(Rule_based_Attack(synthetic))
print(Rule_based_Attack(adv_synthetic))

0.6446666666666667
0.6333333333333333
0.63


In [545]:
def Conf_Rule_based_Attack(model, batch_size=1000):
    
    model.eval()
    threshold = 0.98
    
    x_member = x_train[:batch_size]
    y_member = y_train[:batch_size]
    x_nonmember = x_test[:400]
    y_nonmember = y_test[:400]

    x_target = np.concatenate([x_member, x_nonmember], axis=0)
    y_target = np.concatenate([y_member, y_nonmember], axis=0)
    
    x_tensor = torch.from_numpy(x_target).to(device)
    with torch.no_grad():
        logits = model(x_tensor)
        prob = F.softmax(logits, dim=1)
        max_confidence = prob.max(dim=1).values.cpu().numpy()
        
    mia_pred = (max_confidence > threshold).astype(np.int32)
    true_membership = np.array([1] * batch_size + [0] * 400)
    
    MIA_ACC = (mia_pred == true_membership).mean()
        
    return MIA_ACC

In [528]:
MIA_ACC = Conf_Rule_based_Attack(distilled)
print(MIA_ACC)

0.7383333333333333


In [529]:
MIA_ACC = Conf_Rule_based_Attack(synthetic)
print(MIA_ACC)

0.6725


In [547]:
MIA_ACC = Conf_Rule_based_Attack(adv_synthetic)
print(MIA_ACC)

0.6092857142857143


In [584]:
def collect_logits_non_loader(model, data):
    '''
    x_tensor = torch.from_numpy(x_target).to(device)
    with torch.no_grad():
        logits = model(x_tensor)
        pred_labels = logits.argmax(dim=1).cpu().numpy()
    '''
    data = data.to(device)
    with torch.no_grad():
        return model(data)


def train_attack_model(logits_in, logits_out):
    probs_in = F.softmax(logits_in, dim=1).cpu().numpy()
    probs_out = F.softmax(logits_out, dim=1).cpu().numpy()
    
    x = np.concatenate([probs_in, probs_out], axis=0)
    y = np.concatenate([np.ones(len(probs_in)), np.zeros(len(probs_out))], axis=0)
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8)
    
    attack_model = LogisticRegression(max_iter=5000)
    # attack_model = RandomForestClassifier(n_estimators=150)
    attack_model.fit(x_train, y_train)
    
    y_pred = attack_model.predict(x_test)
    y_score = attack_model.predict_proba(x_test)[:, 1]
    
    MIA_ACC = accuracy_score(y_test, y_pred)
    
    return MIA_ACC
    

> ### Logistic Classifier Shadow Model

In [608]:
dist_logits_in = collect_logits_non_loader(distilled, x_train)
dist_logits_out = collect_logits_non_loader(distilled, x_test[:300])
print(train_attack_model(dist_logits_in, dist_logits_out))

0.7692307692307693


In [617]:
syn_logits_in = collect_logits_non_loader(synthetic, x_train)
syn_logits_out = collect_logits_non_loader(synthetic, x_test[:400])
print(train_attack_model(syn_logits_in, syn_logits_out))

0.7223214285714286


In [587]:
adv_syn_logits_in = collect_logits_non_loader(adv_synthetic, x_train)
adv_syn_logits_out = collect_logits_non_loader(adv_synthetic, x_test[:400])
print(train_attack_model(adv_syn_logits_in, adv_syn_logits_out))

0.7035714285714286


> ### RandomForest Shadow Model

In [578]:
dist_logits_in = collect_logits_non_loader(distilled, x_train)
dist_logits_out = collect_logits_non_loader(distilled, x_test[:300])
print(train_attack_model(dist_logits_in, dist_logits_out))

syn_logits_in = collect_logits_non_loader(synthetic, x_train)
syn_logits_out = collect_logits_non_loader(synthetic, x_test[:300])
print(train_attack_model(syn_logits_in, syn_logits_out))

adv_syn_logits_in = collect_logits_non_loader(adv_synthetic, x_train)
adv_syn_logits_out = collect_logits_non_loader(adv_synthetic, x_test[:400])
print(train_attack_model(adv_syn_logits_in, adv_syn_logits_out))

0.7567307692307692
0.7211538461538461
0.6732142857142858


In [516]:
def label_only_decision_boundary(model, x_target, n_perturb=20, epsilon=0.1, threshold_ratio=0.2):
    '''
    n_perturb : perturbation 반복 횟수
    epsilon : perturbation 강도
    threshold_ratio : label 변화 허용 비율 (ex: 0.25 -> 20회 중 5회 이상 바뀌면 non-member)
    '''
    model.eval()
    N = x_target.size(0)
    x_target = x_target.float().to(device)
    with torch.no_grad():
        pred_clean = model(x_target).argmax(dim=1).cpu().numpy()
        
    boundary_score = np.zeros(N)
    for t in range(n_perturb):
        noise = (torch.rand_like(x_target) - 0.5) * 2 * epsilon
        x_perturbed = torch.clamp(x_target + noise, 0, 1)
        with torch.no_grad():
            pred_noisy = model(x_perturbed).argmax(dim=1).cpu().numpy()
        boundary_score += (pred_noisy != pred_clean)
    
    threshold = int(n_perturb * threshold_ratio)
    inferred_membership = (boundary_score <= threshold).astype(np.int32)
    return inferred_membership, boundary_score

In [517]:
x_all = torch.cat([x_train, x_test], dim=0)
inferred_membership, boundary_score = label_only_decision_boundary(distilled, x_all)
true_membership = np.array([1] * 1000 + [0] * 500)
MIA_ACC = (inferred_membership == true_membership).mean()
print(f"Label-Only Decision Boundary MIA accuracy: {MIA_ACC:.4f}")

Label-Only Decision Boundary MIA accuracy: 0.6433


In [518]:
x_all = torch.cat([x_train, x_test], dim=0)
inferred_membership, boundary_score = label_only_decision_boundary(synthetic, x_all)
true_membership = np.array([1] * 1000 + [0] * 500)
MIA_ACC = (inferred_membership == true_membership).mean()
print(f"Label-Only Decision Boundary MIA accuracy: {MIA_ACC:.4f}")

Label-Only Decision Boundary MIA accuracy: 0.5847


In [527]:
x_all = torch.cat([x_train, x_test], dim=0)
inferred_membership, boundary_score = label_only_decision_boundary(adv_synthetic, x_all)
true_membership = np.array([1] * 1000 + [0] * 500)
MIA_ACC = (inferred_membership == true_membership).mean()
print(f"Label-Only Decision Boundary MIA accuracy: {MIA_ACC:.4f}")

Label-Only Decision Boundary MIA accuracy: 0.5973
