# Privacy attack simulation with Iris dataset
Authors : Johan Jublanc / Vincent Heng

## Importing dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets 

In [None]:
# Load the Iris bunch once; the feature names are reused later as DataFrame column labels.
iris = datasets.load_iris()
x_columns = iris.feature_names

## Testing dataset with Pandas

In [None]:
# One row per sample: the feature columns followed by a final "target" column.
data1 = pd.DataFrame(
    data=np.column_stack((iris["data"], iris["target"])),
    columns=list(iris["feature_names"]) + ["target"],
)

## Splitting data

In [None]:
from sklearn.model_selection import train_test_split

# Features and labels of the complete dataset.
x = iris.data
y = iris.target

# First split: two halves, `a` and `b`.
x_a, x_b, y_a, y_b = train_test_split(x, y, test_size=0.5)

# Second split, inside the `b` half only: 80% (`b1`) and 20% (`b2`).
x_b1, x_b2, y_b1, y_b2 = train_test_split(x_b, y_b, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the b1 split.
clf = RandomForestClassifier(n_estimators=100).fit(x_b1, y_b1)

# Predict the b2 split, which the model has not been fitted on.
y_pred = clf.predict(x_b2)

In [None]:
from sklearn import metrics

# Share of b2 samples whose predicted class equals the true class.
validation_accuracy = metrics.accuracy_score(y_b2, y_pred)
print("Accuracy:", validation_accuracy)

In [None]:
# Build a fresh 100-tree random forest (it replaces the previous `clf`).
clf = RandomForestClassifier(n_estimators=100)

# Fit it on the whole `b` half of the data.
clf.fit(x_b, y_b)

# Generating a shadow dataset

### Get macro information about the dataset D

The attacker has only limited public information about the original training dataset. The information held by the attacker is denoted $I$. Here it consists only of the mean and standard deviation of each variable.

In [None]:
# Summary statistics of the first four columns of data1, reduced to the
# "mean" and "std" rows (one column per variable).
I = data1.iloc[:, 0:4].describe().loc[["mean", "std"], :]

Query the model API (here `clf`) with simulated points and use its predictions to build the shadow dataset.

In [None]:
def get_sample_from_I(I):
    """Draw one synthetic point from the summary statistics ``I``.

    Each coordinate is drawn independently from a normal distribution whose
    location and scale are the "mean" and "std" rows of the matching column.

    Args:
        I: DataFrame with rows labelled "mean" and "std", one column per variable.

    Returns:
        A list with one sampled value per column of ``I``, in column order.
    """
    return [
        np.random.normal(I.loc["mean", name], I.loc["std", name], 1)[0]
        for name in I.columns
    ]

In [None]:
def choose_a_point(clf, I, threshold=0.7):
    """Sample points from ``I`` until the model is confident about one.

    A candidate is accepted as soon as the largest probability returned by
    ``clf.predict_proba`` is strictly greater than ``threshold``. The model is
    queried exactly once per candidate, and the accepted candidate's
    probabilities are reused to derive its class.

    Args:
        clf: Object exposing ``predict_proba`` for a list of points.
        I: DataFrame with "mean" and "std" rows, passed to ``get_sample_from_I``.
        threshold: Minimum (exclusive) top probability for a point to be kept.

    Returns:
        A tuple ``(d_s, predicted_class)`` where ``d_s`` is the accepted point
        (a list of values) and ``predicted_class`` is the position of the
        largest probability in the model's output.

    Note:
        There is no retry limit: the loop runs until a point is accepted, so it
        never returns if the model cannot exceed ``threshold``.
    """
    while True:
        d_s = get_sample_from_I(I)
        proba = clf.predict_proba([d_s])
        if np.max(proba) > threshold:
            return d_s, np.argmax(proba)

In [None]:
def sample_in_an_hypercube(d_f, I):
    """Draw one point uniformly from a box centred on ``d_f``.

    Along each coordinate the box spans half a standard deviation on either
    side of the centre, using the "std" row of ``I`` for the column at the
    same position.

    Args:
        d_f: Sequence of coordinates giving the centre of the box.
        I: DataFrame with a "std" row; columns are matched to ``d_f`` by position.

    Returns:
        A list with one sampled value per coordinate of ``d_f``.
    """
    neighbour = []
    for position, centre in enumerate(d_f):
        half_width = I.loc["std", I.columns[position]] / 2
        neighbour.append(
            np.random.uniform(centre - half_width, centre + half_width, 1)[0]
        )
    return neighbour

In [None]:
def generate_D_prim(clf, I, threshold=0.7, total_size = 300, sample_by_hypercube = 10):
    """Build a labelled synthetic ("shadow") dataset by querying ``clf``.

    Each round picks a seed point on which the model is confident
    (``choose_a_point``), then draws ``sample_by_hypercube`` neighbours around
    it (``sample_in_an_hypercube``) and labels them with the model's most
    probable class. Rounds repeat until at least ``total_size`` rows exist;
    the result is then trimmed to exactly ``total_size`` rows.

    Args:
        clf: Object exposing ``predict_proba``.
        I: DataFrame with "mean" and "std" rows, one column per variable.
        threshold: Confidence threshold forwarded to ``choose_a_point``.
        total_size: Number of rows in the returned dataset.
        sample_by_hypercube: Number of neighbours drawn around each seed point.

    Returns:
        DataFrame with the columns of ``I`` followed by a "label" column.
    """
    rows = []

    while len(rows) < total_size:
        # Seed point: a sample for which the model's top probability exceeds the threshold.
        d_f, class_ = choose_a_point(clf, I, threshold)
        rows.append(d_f + [class_])

        # Neighbours of the seed, labelled with a single batched model query.
        neighbours = [sample_in_an_hypercube(d_f, I) for _ in range(sample_by_hypercube)]
        if neighbours:
            labels = np.argmax(clf.predict_proba(neighbours), axis=1)
            rows.extend(point + [label] for point, label in zip(neighbours, labels))

    # A round adds 1 + sample_by_hypercube rows, so the loop may overshoot
    # total_size; keep only the requested number of rows.
    return pd.DataFrame(data=rows[:total_size],
                        columns=(list(I.columns) + ["label"]))

In [None]:
# Build a 300-row shadow dataset from the model `clf` and the statistics `I`.
D_prim = generate_D_prim(clf=clf, I=I, total_size=300)

# Generation of an attack model training set

In [None]:
# Halve the shadow dataset: the "in" half is used to fit the shadow model,
# the "out" half is kept away from it.
D_prim_in, D_prim_out = train_test_split(D_prim, test_size=0.5)

In [None]:
# In both halves the first four columns are the features and the fifth is the label.
x_prim_in, y_prim_in = D_prim_in.iloc[:, :4], D_prim_in.iloc[:, 4]
x_prim_out, y_prim_out = D_prim_out.iloc[:, :4], D_prim_out.iloc[:, 4]

In [None]:
# Shadow model: a 100-tree random forest fitted on the "in" half of the shadow dataset.
clf_prim = RandomForestClassifier(n_estimators=100)
clf_prim.fit(x_prim_in, y_prim_in)

In [None]:
# Shadow-model probabilities for points it was fitted on, flagged is_in = 1.
D_star_in = pd.DataFrame(
    clf_prim.predict_proba(x_prim_in), columns=("p_0", "p_1", "p_2")
).assign(is_in=1)

# Shadow-model probabilities for points it was not fitted on, flagged is_in = 0.
D_star_out = pd.DataFrame(
    clf_prim.predict_proba(x_prim_out), columns=("p_0", "p_1", "p_2")
).assign(is_in=0)

In [None]:
# Stack the "in" and "out" rows and renumber them from 0.
D_star = pd.concat([D_star_in, D_star_out], ignore_index=True)

# Build the attack model

In [None]:
# Every column except the last is a feature; the last column (is_in) is the label.
nb_col_star = D_star.shape[1]
x_star = D_star.iloc[:, :nb_col_star - 1]
y_star = D_star.iloc[:, -1]

In [None]:
# Attack model: a 100-tree random forest that predicts is_in from the probability columns.
clf_attack = RandomForestClassifier(n_estimators=100)
clf_attack.fit(x_star, y_star)

# Test the attack against the true data set D

In [None]:
# Wrap both halves in DataFrames labelled with the feature names.
x_a, x_b = (pd.DataFrame(half, columns=x_columns) for half in (x_a, x_b))

In [None]:
# The membership labels below describe the training set of the target model
# `clf` (fitted on the `b` half), so the probabilities fed to the attack model
# must come from `clf`, not from the shadow model `clf_prim`.
# `clf` was fitted on plain arrays, hence the conversion with np.asarray.

# `a` half: not used to fit `clf`, so the true membership label is 0.
proba_a   = pd.DataFrame(clf.predict_proba(np.asarray(x_a)), columns = ("p_0", "p_1", "p_2"))
predict_a = clf_attack.predict(proba_a)
result_a  = pd.DataFrame({"y_pred": predict_a, "y": 0})

# `b` half: used to fit `clf`, so the true membership label is 1.
proba_b   = pd.DataFrame(clf.predict_proba(np.asarray(x_b)), columns = ("p_0", "p_1", "p_2"))
predict_b = clf_attack.predict(proba_b)
result_b  = pd.DataFrame({"y_pred": predict_b, "y": 1})

In [None]:
# Stack the results for both halves and renumber the rows from 0.
attack_results = pd.concat([result_a, result_b], ignore_index=True)

In [None]:
# Share of samples whose membership was guessed correctly by the attack model.
attack_accuracy = metrics.accuracy_score(attack_results["y"], attack_results["y_pred"])
print("Accuracy:", attack_accuracy)