# Privacy attack simulation with the CIFAR-10 dataset
Authors : Johan Jublanc / Vincent Heng

### Imports

In [None]:
! pip install xgboost

In [None]:
import os
import tarfile
import urllib.request
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create the local "data" folder if it does not exist yet. os.makedirs with
# exist_ok=True is idempotent and does not depend on a shell `mkdir` command.
os.makedirs("data", exist_ok=True)

### Get the data

In [None]:
# Local path under ./data where the downloaded archive is stored.
data_file_name = "./data/cifar10.tar.gz"

In [None]:
url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'

# Skip the download when the archive is already on disk, so re-running the
# notebook does not fetch it again. The file is first written under a
# temporary name and renamed afterwards, so an interrupted transfer never
# leaves a truncated archive under the final name.
if not isfile(data_file_name):
    partial_file_name = data_file_name + ".part"
    urllib.request.urlretrieve(url, partial_file_name)
    os.replace(partial_file_name, data_file_name)

In [None]:
# Extract the downloaded archive into ./data. The context manager closes the
# archive even when extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on an archive obtained from a trusted source.
with tarfile.open(data_file_name, "r:gz") as tar:
    tar.extractall("./data")

In [None]:
# Names of the files whose name starts with "data_batch" in the extracted
# folder. listdir() returns entries in arbitrary order and later cells pick
# entries by position, so the list is sorted to make those picks deterministic.
data_batches_names = sorted(
    f
    for f in listdir("./data/cifar-10-batches-py")
    if f.split("_")[0:2] == ["data", "batch"]
)

In [None]:
def unpickle(file):
    """Load and return the object pickled in the file at path ``file``.

    The file is opened in binary mode and loaded with ``encoding='bytes'``,
    which makes pickle return 8-bit strings written by Python 2 as ``bytes``
    objects (this is why the notebook indexes the result with keys such as
    ``b"data"``).

    NOTE: unpickling can execute arbitrary code; only call this on files
    coming from a trusted source.
    """
    import pickle

    with open(file, 'rb') as fo:
        # Return the loaded object directly rather than binding it to a local
        # called ``dict``, which would shadow the builtin type.
        return pickle.load(fo, encoding='bytes')

In [None]:
# Load the batch file listed first in data_batches_names.
data_batch = unpickle("./data/cifar-10-batches-py/" + data_batches_names[0])

In [None]:
# `data` is the name under which this batch is used in the following cells.
data = data_batch

## Build up a model to predict the category

We first build a model that is trained on the dataset $data_b$.

In [None]:
from sklearn.model_selection import train_test_split

# Features and labels of the loaded batch.
x = data[b"data"]
y = data[b"labels"]

# Split the batch into two halves, "a" and "b".
x_a, x_b, y_a, y_b = train_test_split(x, y, test_size=0.5)

# Inside the "b" half, keep 20% aside as an evaluation set.
x_b1, x_b2, y_b1, y_b2 = train_test_split(x_b, y_b, test_size=0.2)

A first model is trained on 80% of $data_b$ and tested on the remaining 20%.

In [None]:
# Fit a random forest on the 80% split and report its accuracy on the
# 20% that was held out.
clf = RandomForestClassifier(n_estimators=400)
clf.fit(x_b1, y_b1)

y_pred = clf.predict(x_b2)
print(
    "Accuracy:",
    metrics.accuracy_score(y_b2, y_pred),
)

Then the model is trained on the complete dataset $data_b$.

In [None]:
# Start from a fresh forest and fit it on the whole "b" half (x_b, y_b).
clf = RandomForestClassifier(n_estimators=400)
clf.fit(x_b, y_b)

# Get a shadow dataset

### Get information about the dataset D

Here the attacker knows another dataset that is similar to D.

In [None]:
# Load the batch file listed second in data_batches_names and pull out its
# features and labels; the following cells use them to build the shadow model.
data_prim = unpickle("./data/cifar-10-batches-py/" + data_batches_names[1])
x_prim = data_prim[b"data"]
y_prim = data_prim[b"labels"]

In [None]:
# Display one record of the batch as a picture: the flat row is reshaped to
# (3, 32, 32) and its axes reordered to (32, 32, 3) before being drawn.
i = 2805
fig, axes = plt.subplots(1, 1, figsize=(1.5, 1.5))
plt.imshow(
    np.reshape(data[b'data'][i], (3, 32, 32)).transpose(1, 2, 0)
)

# Generation of an attack model training set

In [None]:
# Split the attacker's data into an "in" half and an "out" half.
x_prim_in, x_prim_out, y_prim_in, y_prim_out = train_test_split(
    x_prim, y_prim, test_size=0.5
)

# Inside the "in" half, keep 20% aside as an evaluation set.
(
    x_prim_in_train,
    x_prim_in_test,
    y_prim_in_train,
    y_prim_in_test,
) = train_test_split(x_prim_in, y_prim_in, test_size=0.2)

In [None]:
# Fit the shadow model on the training part of the "in" half and report its
# accuracy on the part that was held out.
clf_prim = RandomForestClassifier(n_estimators=400)
clf_prim.fit(x_prim_in_train, y_prim_in_train)
print(
    "Accuracy:",
    metrics.accuracy_score(y_prim_in_test, clf_prim.predict(x_prim_in_test)),
)

In [None]:
# Refit a fresh shadow model on the whole "in" half (x_prim_in, y_prim_in);
# the "out" half is never used for fitting it.
clf_prim = RandomForestClassifier(n_estimators=400)
clf_prim.fit(x_prim_in, y_prim_in)

In [None]:
# Shadow-model probability vectors for the records it was fitted on,
# each labelled 1.
x_star_in = clf_prim.predict_proba(x_prim_in)
y_star_in = [1] * len(x_star_in)

# Shadow-model probability vectors for the records it was not fitted on,
# each labelled 0.
x_star_out = clf_prim.predict_proba(x_prim_out)
y_star_out = [0] * len(x_star_out)

In [None]:
# Stack the "in" rows on top of the "out" rows (axis 0 is the default) to
# obtain the attack model's features and labels.
x_star = np.concatenate((x_star_in, x_star_out))
y_star = np.concatenate((y_star_in, y_star_out))

# Build the attack model

ref : https://www.datacamp.com/community/tutorials/xgboost-in-python#apply

In [None]:
# Keep 20% of the attack data aside to evaluate the attack model.
x_star_train, x_star_test, y_star_train, y_star_test = train_test_split(
    x_star, y_star, test_size=0.2
)

In [None]:
# Attack model: a binary classifier over probability vectors (labels are 0/1).
# 'binary:logistic' is the objective for binary classification;
# 'reg:squarederror' is a regression objective and does not belong on a
# classifier.
clf_attack = xgb.XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)

# Fit on the training split and report accuracy on the held-out split.
clf_attack.fit(x_star_train, y_star_train)
print("Accuracy:", metrics.accuracy_score(y_star_test, clf_attack.predict(x_star_test)))

In [None]:
# Final attack model, fitted on all of x_star / y_star (labels are 0/1).
# 'binary:logistic' is the objective for binary classification;
# 'reg:squarederror' is a regression objective and does not belong on a
# classifier.
clf_attack = xgb.XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
clf_attack.fit(x_star, y_star)

# Test the attack against the true data set D

In [None]:
# Records of x_a were never used to fit clf, so their true label is 0.
proba_a = clf.predict_proba(x_a)  # information we have thanks to the API
predict_a = clf_attack.predict(proba_a)  # model we trained with little information
result_a = pd.DataFrame(
    zip(predict_a, [0] * len(proba_a)),
    columns=("y_pred", "y"),
)

In [None]:
# Records of x_b were used to fit clf, so their true label is 1.
proba_b = clf.predict_proba(x_b)  # information we have thanks to the API
predict_b = clf_attack.predict(proba_b)  # model we trained with little information
result_b = pd.DataFrame(
    zip(predict_b, [1] * len(proba_b)),
    columns=("y_pred", "y"),
)

In [None]:
# Stack both result tables and renumber the rows from 0, discarding the
# original per-table index.
attack_results = pd.concat([result_a, result_b]).reset_index(drop=True)

In [None]:
# Share of records for which the attack model's prediction matches the true label.
print("Accuracy:", metrics.accuracy_score(attack_results["y"], attack_results["y_pred"]))

## Is this picture in the training dataset ?

In [None]:
# Draw a random record from x_b, the data the target model `clf` was fitted
# on, so the correct answer to the membership question is always YES.
i = np.random.randint(0, len(x_b))

# Display the very record that is queried below.
fig, axs = plt.subplots(1, 1, figsize=(1.5, 1.5))
plt.imshow(np.reshape(x_b[i], (3, 32, 32)).transpose(1, 2, 0))

# Query the target model for this record and pass the returned probability
# vector to the attack model.
proba = clf.predict_proba([x_b[i]])
predict = clf_attack.predict(proba)

# `predict` holds exactly one prediction (one record was queried); test that
# element rather than a whole array, whose truth value is ambiguous.
if predict[0] == 1:
    title = "We predict YES and we are right"
else:
    title = "We predict no but we are wrong (loooser;)"


plt.title(title, size=20)
plt.show()