Цель моей работы — исследовать Membership Inference Attacks из статьи [Membership Inference Attacks against Machine Learning Models](https://arxiv.org/abs/1610.05820) и попробовать провести атаку против случайного леса из sklearn, рассматривая модель как чёрный ящик, возвращающий вероятности.

В качестве целевой модели, на которую будет производиться атака, взята модель для распознавания рака груди из работы [Breast Cancer Prediction](https://www.kaggle.com/code/kanuriviveknag/breast-cancer-prediction).

Атака подразумевает создание атакующей и теневых моделей: атакующая модель учится распознавать тренировочные данные теневых моделей и благодаря обобщающей способности сможет определять тренировочные данные целевой модели. Так как теневые модели должны имитировать оригинальную, все модели являются случайными лесами.

# Импорт

In [None]:
!pip install adversarial-robustness-toolbox



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import random
from art.attacks.evasion import BoundaryAttack, HopSkipJump, ZooAttack
from art.estimators.classification import SklearnClassifier, PyTorchClassifier
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from google.colab import drive
drive.mount('/content/drive')
import logging
logging.getLogger("art").setLevel(logging.ERROR)
path = 'drive/MyDrive/Colab Notebooks/ID_leak/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Сведения о данных из датасета

In [None]:
df=pd.read_csv(path+"breast-cancer.csv")
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [None]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
df.shape

(569, 32)

In [None]:
df.nunique()

Unnamed: 0,0
id,569
diagnosis,2
radius_mean,456
texture_mean,479
perimeter_mean,522
area_mean,539
smoothness_mean,474
compactness_mean,537
concavity_mean,537
concave points_mean,542


In [None]:
df.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [None]:
from sklearn.preprocessing import LabelEncoder
l=LabelEncoder()
df['diagnosis']=l.fit_transform(df.diagnosis)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Обучение атакуемой оригинальной модели

In [None]:
def print_score(yt,yp,text ='',only_acc = True):
  """Print a short classification summary for two label arrays.

  Both arrays are rounded to the nearest integer first, so probabilities
  can be passed as well as hard 0/1 labels.

  NOTE: despite the parameter names, ``yp`` is forwarded to the sklearn
  metrics as ``y_true`` and ``yt`` as ``y_pred``. Accuracy and F1 do not
  depend on the order, but precision and recall swap when the arguments
  are swapped, so pass predictions first and ground truth second.

  Args:
    yt: labels or probabilities, forwarded to sklearn as ``y_pred``.
    yp: labels or probabilities, forwarded to sklearn as ``y_true``.
    text: optional header printed before the metrics; skipped when empty.
    only_acc: when True only accuracy is printed, otherwise precision,
      recall and F1 are printed as well.
  """
  yp = np.round(yp).astype('int')
  yt = np.round(yt).astype('int')

  def as_percent(score):
    # Scale first and round afterwards: round(score, 2) * 100 can print
    # float artefacts such as 56.99999999999999 instead of 57.0.
    return round(float(score) * 100, 0)

  if text!='':
    print(text)
  print("accuracy is ",as_percent(accuracy_score(yp,yt)),'%')
  if not only_acc:
    print ("precision is ",as_percent(precision_score(yp,yt)),'%')
    print("recall is ",as_percent(recall_score(yp,yt)),'%')
    print("f1 is ",as_percent(f1_score(yp,yt)),'%')


In [None]:
# Train the target ("original") model that the attack is aimed at.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# The forest is created without random_state, so its metrics vary between runs.
original_model=RandomForestClassifier(n_estimators=100)
# NOTE(review): only 'diagnosis' is dropped, so the 'id' column remains in x
# as an input feature — confirm this is intended.
x=df.drop('diagnosis',axis=1)
y=df['diagnosis']
# 80/20 split with a fixed seed; the xtrain rows are the "members" for the attack.
xtrain,xtest,ytrain,ytest=train_test_split(np.array(x),np.array(y),test_size=0.2,random_state=42)
original_model.fit(xtrain,ytrain)
y_pred=original_model.predict(xtest)
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
# print_score expects predictions first and ground truth second.
print_score(y_pred,ytest,only_acc=False,text = "model name is "+str(original_model))
print("************************************************************************************************************")
# From here on y_pred holds class probabilities instead of hard labels.
y_pred=original_model.predict_proba(xtest)
ytrain_pred = original_model.predict_proba(xtrain)

model name is RandomForestClassifier()
accuracy is  96.0 %
precision is  98.0 %
recall is  93.0 %
f1 is  95.0 %
************************************************************************************************************


## model name is  RandomForestClassifier()
## accuracy is  96.0 %
## precision is  93.0 %
## recall score is  97.0 %
## f1 score is  95.0 %

# Прямая атака на оригинальную модель с состязательными примерами

Для начала я попробовал обучить атакующую модель напрямую на ответах оригинальной модели, чтобы определить подход атаки. Я создаю несколько состязательных примеров для одного экземпляра данных, и атакующая модель делает вывод на основе предсказанных вероятностей целевой модели.

Состязательные примеры, сгенерированные путём поочерёдного обнуления каждого признака, позволяют достичь точности 90%.

Другая попытка заключалась в том, чтобы генерировать состязательные примеры с помощью атак из библиотеки ART, но на таких примерах атакующая модель не смогла различить предсказания для тестовых и тренировочных наборов.

Отличие в поведении модели на тренировочных и тестовых данных может заключаться в том, что на тренировочных данных модель сильнее зависит от конкретных признаков.

In [None]:
def get_noise(shape = (114, 31),
              e = 0,  # 0 zeroes the chosen features; otherwise each gets a random factor in [1, 1+e)
              n = 4,  # Number of features to alter
              ri = 0  # 0 picks features at random; otherwise the 1-based index of the single feature to alter
             ):
  """Build a 1-D multiplicative mask of ones with a few entries altered.

  The mask has ``shape[1]`` entries and is meant to be broadcast over the
  rows of a feature matrix (``x * mask``), so every row receives the same
  perturbation.

  Args:
    shape: 2-D shape of the data the mask is built for; only ``shape[1]``
      (the number of features) determines the result.
    e: noise level. ``0`` sets the selected entries to 0. Any other value
      sets each selected entry to ``1 + random.random() * e``.
    n: how many distinct features to alter when ``ri`` is 0. Values above
      the number of features are clipped; ``n <= 0`` alters nothing.
    ri: when non-zero, only the feature with index ``ri - 1`` is altered.

  Returns:
    A 1-D numpy array of length ``shape[1]``: integer dtype when ``e`` is
    0, float dtype otherwise.

  NOTE(review): the original comment described the factor as "1+-e",
  while the code only ever scaled upwards. The upward-only behaviour is
  kept here — confirm which one was intended.
  """
  n_features = shape[1]
  noisy = e != 0
  # An integer mask would truncate every factor in [1, 1+e) back to 1 and
  # silently disable the noise, so use floats whenever noise is requested.
  mask = np.ones(n_features, dtype=float if noisy else int)

  if n <= 0:
    return mask

  if ri:
    indices = [ri - 1]
  else:
    # Sample without replacement so that exactly n distinct features change.
    indices = np.random.choice(n_features, size=min(n, n_features), replace=False)

  for index in indices:
    # Each factor is drawn from the caller's e; the result is never fed
    # back into e, so the noise level does not compound between draws.
    mask[index] = (1 + random.random() * e) if noisy else 0

  return mask

Генерация состязательных примеров

In [None]:
def make_advisorial(model,xtrain,xtest,ytrain,ytest,verbose = True):
  """Build attack features by zeroing each input feature in turn.

  For every feature index ``s``, column ``s`` is set to zero in the member
  candidates (``xtrain`` truncated to ``len(xtest)`` rows) and in the
  non-member candidates (``xtest``). The class-1 probability returned by
  ``model.predict_proba`` is recorded for both.

  Both sets receive the identical perturbation. If they were perturbed
  differently, an attack model trained on the result would learn to tell
  the perturbations apart instead of learning membership.

  Args:
    model: fitted classifier exposing ``predict_proba``; column 1 of its
      output is used.
    xtrain: 2-D array of rows the model was trained on.
    xtest: 2-D array of rows the model was not trained on.
    ytrain: labels for ``xtrain``; only used when ``verbose`` is True.
    ytest: labels for ``xtest``; only used when ``verbose`` is True.
    verbose: when True, print the model's accuracy on every perturbed set.

  Returns:
    Two lists ``(xattack_train, xattack_test)``, each with one 1-D array
    of probabilities per feature.
  """
  xtrain = np.asarray(xtrain)
  xtest = np.asarray(xtest)
  # Use as many member rows as there are non-member rows so that the two
  # classes of the attack data set stay balanced.
  members = xtrain[:len(xtest)]

  xattack_train = []
  xattack_test = []

  for s in range(xtest.shape[1]):
    adv = members.copy()
    adv[:, s] = 0
    adv_test = xtest.copy()
    adv_test[:, s] = 0

    xattack_train.append(model.predict_proba(adv)[:,1])
    xattack_test.append(model.predict_proba(adv_test)[:,1])

    if verbose:
      # Report the predictions that were just appended for feature s.
      print_score(xattack_train[-1],ytrain[:len(ytest)],'Тренировочные '+str(s))
      print_score(xattack_test[-1],ytest,'Тестовые '+str(s))
  return   xattack_train, xattack_test

In [None]:
# Генерация
xattack_train, xattack_test = make_advisorial(original_model,
                                              xtrain=xtrain,
                                              xtest=xtest,
                                              ytrain = ytrain,
                                              ytest = ytest,
                                              verbose=False
                              )
xattack = np.hstack([xattack_train,xattack_test]).T
yattack = np.hstack([np.ones(len(xattack)//2),np.zeros(len(xattack)//2)])
xattack, xattack_val, yattack, yattack_val = train_test_split(xattack,yattack,test_size=0.3)
xattack.shape, yattack.shape

((159, 31), (159,))

Генерация атак с разной инициализацией RandomForestClassifier

In [None]:
# Fit the attack model five times on the same data; only the forest's own
# (unseeded) randomness differs between iterations.
for i in range(5):
  print(i+1,'========================')

  attack_model = RandomForestClassifier(n_estimators=10,max_depth=3)
  attack_model.fit(xattack,yattack)
  # Accuracy on the data the attack model was fitted on.
  p = attack_model.predict(xattack)
  print_score(p,yattack,"Attacking RandomForestClassifier on it's Train")
  # Accuracy on the held-out 30% of the attack data set.
  p = attack_model.predict(xattack_val)
  print_score(p,yattack_val,"Attacking RandomForestClassifier on it's Test")

Attacking RandomForestClassifier on it's Train
accuracy is  94.0 %
Attacking RandomForestClassifier on it's Test
accuracy is  97.0 %
Attacking RandomForestClassifier on it's Train
accuracy is  95.0 %
Attacking RandomForestClassifier on it's Test
accuracy is  97.0 %
Attacking RandomForestClassifier on it's Train
accuracy is  94.0 %
Attacking RandomForestClassifier on it's Test
accuracy is  96.0 %
Attacking RandomForestClassifier on it's Train
accuracy is  95.0 %
Attacking RandomForestClassifier on it's Test
accuracy is  97.0 %
Attacking RandomForestClassifier on it's Train
accuracy is  94.0 %
Attacking RandomForestClassifier on it's Test
accuracy is  97.0 %


# Атака с теневыми моделями

## Генерация тренировочных данных для теневых моделей

Реализую сценарий Noisy real data из статьи [Membership Inference Attacks Against
Machine Learning Models](https://arxiv.org/abs/1610.05820). Предполагается имитация зашумлённых данных, которые могут быть доступны злоумышленнику как реальные, но взятые из другого датасета: шум добавляется к 10–20% признаков примеров из реального датасета атакуемой модели.

> This scenario models the case where the training data for the
target and shadow models are not sampled from exactly the
same population, or else sampled in a non-uniform way.



Я решил взять только часть примеров из реального датасета, чтобы проверить, что впоследствии атакующая модель сможет распознать тренировочные данные оригинальной модели, которые для теневых моделей будут являться тестовыми. Это будет свидетельствовать, что тренировочные данные теневых моделей достаточно отличаются от настоящих, и их можно рассматривать как реальные данные, доступные злоумышленнику.

In [None]:
x.shape,y.shape

((569, 31), (569,))

In [None]:
# Build the noisy data set and the per-model splits for the shadow models.
x,y = np.array(x),np.array(y)
# get_noise returns a single 1-D mask, so the same mask is broadcast over
# every row it is multiplied with.
noise = get_noise(shape = x[:len(y)].shape,
                  e=0.2,   # Noise level
                  n=6      # Number of features to alter
                 )
# NOTE(review): x[:100] is taken from the full data set, before the
# train/test split of the target model, so these rows may include rows the
# target model was trained on — confirm this matches the intended setup.
shadowx = x[:100]*noise
# Shadow labels come from querying the target model, not from y.
shadowy = original_model.predict(shadowx)
print('Резмерность данных для теневых моделей',shadowx.shape,shadowy.shape)
print('Размерность реальных данных',x.shape,y.shape)
strainx = []
stestx = []
stesty = []
strainy = []
shadow_models = []

# 25 shadow models, each with its own random 70/30 split of the same
# shadow data.
for _ in range(25):
  #shadow_models.append(ShadowClassifier())
  shadow_models.append(RandomForestClassifier(n_estimators=100) )
  # Split of the shadow data that the attack model will later be trained on
  strx, sttx, stry,stty = train_test_split(shadowx,shadowy,test_size=0.3,shuffle = True)
  strainx.append(strx)
  strainy.append(stry)
  stestx.append(sttx)
  stesty.append(stty)

Резмерность данных для теневых моделей (100, 31) (100,)
Размерность реальных данных (569, 31) (569,)


## Тренировка теневых моделей

In [None]:
# Fit every shadow model on its own training split and report its accuracy
# on its own test split.
for i in range(len(shadow_models)):

   #train_shadow_model(shadow_models[i],strainx[i],strainy[i],verbose=True)
   shadow_models[i].fit(strainx[i],strainy[i])
   # After fitting, the training split is truncated to the size of the test
   # split so that the attack data set built from it is class-balanced.
   strainx[i],strainy[i] = strainx[i][:stestx[i].shape[0]],strainy[i][:stestx[i].shape[0]]
   #test_shadow_model(shadow_models[i],stestx[i],stesty[i],'Зашумленные данные')
   #test_shadow_model(shadow_models[i],xtest,ytest,'Реальные данные')
   # Predictions first, reference labels second (labels come from the target model).
   print_score(shadow_models[i].predict(stestx[i]),stesty[i])

accuracy is  97.0 %
accuracy is  100.0 %
accuracy is  87.0 %
accuracy is  93.0 %
accuracy is  93.0 %
accuracy is  87.0 %
accuracy is  90.0 %
accuracy is  93.0 %
accuracy is  93.0 %
accuracy is  93.0 %
accuracy is  97.0 %
accuracy is  93.0 %
accuracy is  93.0 %
accuracy is  87.0 %
accuracy is  100.0 %
accuracy is  93.0 %
accuracy is  90.0 %
accuracy is  93.0 %
accuracy is  93.0 %
accuracy is  87.0 %
accuracy is  93.0 %
accuracy is  90.0 %
accuracy is  93.0 %
accuracy is  90.0 %
accuracy is  97.0 %


In [None]:
strainx[0].shape

(30, 31)

## Создание adversarial examples для теневых моделей

In [None]:
# Collect attack features from every shadow model: one entry per model,
# each entry being the pair of lists returned by make_advisorial.
xattack_train = []
xattack_test = []
for si in range(len(shadow_models)): # Iterate over all shadow models
  sh_model = shadow_models[si]
  # Positional order is (model, xtrain, xtest, ytrain, ytest).
  xattack_train_si, xattack_test_si = make_advisorial(sh_model,
                                                      strainx[si],
                                                      stestx[si],
                                                      strainy[si],
                                                      stesty[si],
                                                      verbose = False
                                                     )
  xattack_train.append(xattack_train_si)
  xattack_test.append(xattack_test_si)

In [None]:
np.array(xattack_train).shape, np.array(xattack_test).shape

((25, 31, 30), (25, 31, 30))

In [None]:
xattack_train = np.hstack(xattack_train)
xattack_test = np.hstack(xattack_test)
xattack_train.shape, xattack_test.shape

((31, 750), (31, 750))

In [None]:
# Rows = samples, columns = perturbations; members first, non-members second.
xattack = np.hstack([xattack_train,xattack_test]).T
# Label the first half 1 (member) and the second half 0 (non-member);
# this assumes both halves contain the same number of rows.
yattack = np.hstack([np.ones(len(xattack)//2),np.zeros(len(xattack)//2)])
# Unseeded 70/30 split; xattack and yattack are overwritten with the 70% part.
xattack, xattack_val, yattack, yattack_val = train_test_split(xattack,yattack,test_size=0.3)

In [None]:
xattack.shape, yattack.shape

((1050, 31), (1050,))

## Тренировка атакующей модели с разными инициализациями

In [None]:
# Train the attack model five times, each on a fresh random 80/20 split of
# the shadow-derived attack data. Only the model fitted in the last
# iteration stays bound to attack_model for later use.
for i in range(5):
  print(i+1,'===============')
  axtrain, axtest, aytrain, aytest = train_test_split(xattack,yattack,test_size=0.2,shuffle = True)
  attack_model = RandomForestClassifier(n_estimators=150,max_depth =5)
  attack_model.fit(axtrain,aytrain)
  # Accuracy on the split the attack model was fitted on.
  p = attack_model.predict(axtrain)
  print_score(p,aytrain,'TRAIN')
  # Accuracy on the held-out 20% of the same shadow-derived data.
  p = attack_model.predict(axtest)
  print_score(p,aytest,'TEST')

TRAIN
accuracy is  92.0 %
TEST
accuracy is  94.0 %
TRAIN
accuracy is  93.0 %
TEST
accuracy is  91.0 %
TRAIN
accuracy is  93.0 %
TEST
accuracy is  93.0 %
TRAIN
accuracy is  92.0 %
TEST
accuracy is  94.0 %
TRAIN
accuracy is  91.0 %
TEST
accuracy is  88.0 %


# Атака на целевую модель

## Атака на весь набор данных

In [None]:
# Build attack features from the target model itself: its training rows are
# the members, its test rows the non-members.
xattack_train, xattack_test = make_advisorial(original_model,
                                              xtrain=xtrain,
                                              xtest=xtest,
                                              ytrain = ytrain,
                                              ytest = ytest,
                                              verbose=False
                                             )
xattack = np.hstack([xattack_train,xattack_test]).T
# First half labelled 1 (member), second half 0 (non-member).
yattack = np.hstack([np.ones(len(xattack)//2),np.zeros(len(xattack)//2)])
len(np.array(xattack_train).T), len(np.array(xattack_test).T), xattack.shape # The number of examples from the test and training sets is the same

(114, 114, (228, 31))

In [None]:
p = attack_model.predict(xattack)
print_score(p,yattack)

accuracy is  90.0 %


## Атака на данные целевой модели, которые не попали в зашумлённый тренировочный набор теневых моделей

In [None]:
# Repeat the attack on the first 100 rows of the target's train and test splits.
# NOTE(review): the shadow data was built from x[:100] (rows of the full,
# unsplit data set), whereas xtrain[:100] / xtest[:100] are the first rows
# of the split arrays. These slices are therefore not guaranteed to exclude
# the rows used for the shadow models — confirm the selection.
xattack_train, xattack_test = make_advisorial(original_model,
                                              xtrain=xtrain[:100],
                                              xtest=xtest[:100],
                                              ytrain = ytrain[:100],
                                              ytest = ytest[:100],
                                              verbose=False
                                             )
xattack = np.hstack([xattack_train,xattack_test]).T
# First half labelled 1 (member), second half 0 (non-member).
yattack = np.hstack([np.ones(len(xattack)//2),np.zeros(len(xattack)//2)])
xattack.shape, yattack.shape # The number of examples from the test and training sets is the same

((200, 31), (200,))

In [None]:
p = attack_model.predict(xattack)
print_score(p,yattack)

accuracy is  89.0 %


* С 10 теневыми моделями получилась точность 73%;
* 25 моделей — 89%;
* 50 моделей — 89%.

# Вывод
Атакующая модель достигает точности 89% на данных, которые не были тренировочными для теневых моделей; следовательно, обобщающая способность атаки хорошая, и она будет работать с любыми другими тренировочными данными целевой модели.

