# Counterfactuals Training Data Extraction Demonstration

In [27]:
import pandas as pd
import sklearn.ensemble as es
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random
import logging
import sys
import time
import multiprocessing
import dice_ml

This notebook will test whether training data extraction is possible with counterfactuals (CF) that are drawn from the training data. Training data extraction means an attacker can find out the feature values of samples from the training data without prior knowledge of them. The attacker only has access to the model's prediction function and the explanation.

This attack should be trivial because any counterfactual that is shown as an explanation was picked directly from the training data.

First we define the function that will run the experiment for the different variations. The attacker makes repeated queries to the model with random input values. In order to do this, the attacker knows the maximum and minimum value of each feature in the training data. Each counterfactual is recorded and in the end, it is checked what percentage of them is contained in the training data.

In [28]:
# data: pandas dataframe of the training data
# repetitions: number of experiment repetitions = number of queries to the model
# continuous_features: names of the continuous features in the training data
# outcome_name: name of the label in the training data
# classifier: machine learning classifier to train on the training data
# random_state: seed for random decisions
# returns: accuracy of training data extraction with counterfactuals
def experiment(data, repetitions, continuous_features, outcome_name, classifier, random_state=0):
    rng = np.random.default_rng(random_state)
    
    data_np = data.to_numpy().astype(float)
    
    features = data.drop(outcome_name, axis=1)
    labels = data[outcome_name]
    
    features_np = features.to_numpy().astype(float)
    
    num_features =  features.shape[1]
    
    # names of the categorical features
    categorical_features = features.columns.difference(continuous_features)
    
    logging.debug("Categorical features: %s" % categorical_features)
    
    # DiCE needs categorical features to be strings:
    for col in categorical_features:
        data[col]= data[col].astype(str)
        features[col] = features[col].astype(str)
    
    if len(categorical_features) > 0:
        # DiCE did not work without this pipeline for categorical features
        # Define transformer to transform categorical features into one-hot encoding
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        transformations = ColumnTransformer(transformers=[
            ('cat', categorical_transformer, categorical_features)
        ])

        # Train classifier on given data
        clf = Pipeline(steps=[('preprocessor', transformations),
                              ('classifier', classifier)])
        clf = clf.fit(features, labels)
    else:
        # If there are no categorical features, then nothing needs to be transformed
        clf = classifier.fit(features, labels)
    
    # Train explainer on given data and use method "kd-tree" to get counterfactuals drawn from the training data
    d = dice_ml.Data(dataframe=data, continuous_features=continuous_features, outcome_name=outcome_name)
    m = dice_ml.Model(model=clf, backend="sklearn", model_type='classifier')
    exp = dice_ml.Dice(d, m, method="kdtree")
    
    # Get the minimum and maximum value for all features in the training data. The attacker needs this information to 
    # create random samples for queries.
    feature_mins = np.amin(features_np, axis=0)
    feature_maxs = np.amax(features_np, axis=0)
    
    logging.debug("Minimum values of features: %s " % feature_mins)
    logging.debug("Maximum values of features: %s " % feature_maxs)
    
    # Generate random samples:
    samples_float = rng.uniform(feature_mins, feature_maxs, (repetitions, num_features))
    samples = np.rint(samples_float)
    samples = samples.astype(int)
    
    logging.debug("Getting counterfactuals for following random samples: \n %s" % samples)

    # Create dataframe from numpy array of random samples
    samples_df = pd.DataFrame(data=samples, columns=list(features.columns.values))
    
    # Cast categorical features to string again because of DiCE peculiarities
    for col in categorical_features:
        samples_df[col]= samples_df[col].astype(str)

    # Get five counterfactuals for each random sample
    e1 = exp.generate_counterfactuals(samples_df, total_CFs=5, desired_class="opposite")
    
    list_of_counterfactuals = []
    
    for cf_examples in e1.cf_examples_list:
        # Get counterfactuals as dataframe. See https://github.com/interpretml/DiCE/issues/174
        # Use sparse final cfs instead of just final_cfs_df because those are the counterfactuals that are also shown by default
        # with the function visualize_as_dataframe()
        list_of_counterfactuals.append(cf_examples.final_cfs_df_sparse.to_numpy())

    counterfactuals = np.concatenate(list_of_counterfactuals)
    counterfactuals = counterfactuals.astype(float)
        
    number_of_cfs_in_training_data = 0
    number_of_cfs = counterfactuals.shape[0]

    # Check how many counterfactuals can be found in the training data
    for counterfactual in counterfactuals:

        # Get all indices of the counterfactual in the given training data. data_np == row creates a boolean array with
        # True if the cells match and False otherwise. all(axis=1) returns for each row if all elements in the row are True.
        # np.where returns an array of indices where the boolean array contains the value True.
        indices_of_counterfactual = np.where(np.isclose(data_np, counterfactual).all(axis=1))[0]

        if indices_of_counterfactual.shape[0] > 0:
            logging.debug("Counterfactual %s appears in training data at indices %s" % (counterfactual,\
                                                                                        indices_of_counterfactual))
            number_of_cfs_in_training_data += 1
        else:
            logging.debug("Counterfactual %s does not appear in training data" % counterfactual)

    # Calculate percentage of counterfactuals that appear in the training data
    accuracy = number_of_cfs_in_training_data / number_of_cfs
            
    print("Percentage of counterfactuals that appear in training data: %s" % accuracy)
    
    return accuracy

# Dataset 1: Heart Disease

Load dataset one: heart disease

In [29]:
filename = '../data/framingham.csv'

names = ['sex', 'age', 'education', 'smoker', 'cigs_per_day', 'bp_meds', 'prevalent_stroke', 'prevelant_hyp', 'diabetes', \
         'total_chol', 'sys_bp', 'dia_bp', 'bmi', 'heart_rate', 'glucose', 'heart_disease_label']

data = pd.read_csv(filename, names=names)

For this dataset we only look at numerical data so we drop the categorical columns. We also drop the column "education" for which there is no feature description on kaggle: https://www.kaggle.com/dileep070/heart-disease-prediction-using-logistic-regression

In [30]:
data_num = data.drop('sex', axis=1).drop('smoker', axis=1).drop('bp_meds', axis=1).drop('prevalent_stroke', axis=1)\
    .drop('prevelant_hyp', axis=1).drop('diabetes', axis=1).drop('education', axis=1)

data_num.head(5)

Unnamed: 0,age,cigs_per_day,total_chol,sys_bp,dia_bp,bmi,heart_rate,glucose,heart_disease_label
0,39,0.0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,46,0.0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,48,20.0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,61,30.0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,46,23.0,285.0,130.0,84.0,23.1,85.0,85.0,0


Remove any rows that are missing data. Afterwards there should be no more entries with NaN values. We also drop any duplicate rows.

In [31]:
data_num = data_num.dropna()
data_num = data_num.drop_duplicates()

data_num_100 = data_num.sample(n = 100, random_state=13)

continuous_features_num = ['age', 'cigs_per_day', 'total_chol', 'sys_bp', 'dia_bp', 'bmi', 'heart_rate', 'glucose']
outcome_name_num = 'heart_disease_label'

data_num.isnull().sum()

age                    0
cigs_per_day           0
total_chol             0
sys_bp                 0
dia_bp                 0
bmi                    0
heart_rate             0
glucose                0
heart_disease_label    0
dtype: int64

We now generate five counterfactuals for the first sample from the training data to demonstrate counterfactual explanations in general.

In [32]:
features = data_num.drop('heart_disease_label', axis=1)
labels = data_num['heart_disease_label']

# Train a random forest classifier on training data.
clf = es.RandomForestClassifier(random_state=0)
clf = clf.fit(features, labels)

# Train explainer
d = dice_ml.Data(dataframe=data_num, continuous_features=continuous_features_num, outcome_name=outcome_name_num)


m = dice_ml.Model(model=clf, backend="sklearn", model_type='classifier')
# Generating counterfactuals from training data (kd-tree)
exp = dice_ml.Dice(d, m, method="kdtree")

In [33]:
e1 = exp.generate_counterfactuals(features[0:1], total_CFs=5, desired_class="opposite")
e1.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:03<00:00,  3.23s/it]

Query instance (original outcome : 0)





Unnamed: 0,age,cigs_per_day,total_chol,sys_bp,dia_bp,bmi,heart_rate,glucose,heart_disease_label
0,39,0.0,195.0,106.0,70.0,26.97,80.0,77.0,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,cigs_per_day,total_chol,sys_bp,dia_bp,bmi,heart_rate,glucose,heart_disease_label
0,44.0,-,180.0,-,-,23.98,92.0,67.0,1.0
1,64.0,-,210.0,120.0,-,24.77,-,-,1.0
2,43.0,-,-,121.5,86.5,20.82,92.0,-,-
3,49.0,-,211.0,104.2,66.7,24.17,75.0,87.0,1.0
4,50.0,-,-,126.0,88.0,-,-,-,1.0


We can see that the counterfactuals are similar to the query sample and that most of them have a flipped prediction. These are the two general properties of counterfactual explanations.

We will now do a small proof of concept of the experiment with logging enabled to demonstrate how it works.

In [34]:
logging.root.setLevel(logging.DEBUG)

experiment(data_num, repetitions=2, continuous_features=continuous_features_num, outcome_name=outcome_name_num,\
           random_state=0, classifier=es.RandomForestClassifier(random_state=0))

logging.root.setLevel(logging.ERROR)

DEBUG:root:Categorical features: Index([], dtype='object')
DEBUG:root:Minimum values of features: [ 32.           0.         113.          83.5         48.
  15.53999996  44.          40.        ] 
DEBUG:root:Maximum values of features: [ 70.          70.         696.         295.         142.5
  56.79999924 143.         394.        ] 
DEBUG:root:Getting counterfactuals for following random samples: 
 [[ 56  19 137  87 125  53 104 298]
 [ 53  65 589  84 129  17 116 102]]
100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
DEBUG:root:Counterfactual [ 56.     0.   214.   115.    80.    25.09  70.   298.     1.  ] does not appear in training data
DEBUG:root:Counterfactual [ 56.     0.   273.   136.    80.    27.73  90.   210.     1.  ] appears in training data at indices [3667]
DEBUG:root:Counterfactual [ 61.    0.  265.  200.  125.   29.5  68.  256.    1. ] appears in training data at indices [3648]
DEBUG:root:Counterfactual [ 56.    10.   241.   174.    97.    29.22  90.   135.     1.  ] appe

Percentage of counterfactuals that appear in training data: 0.7


We can see that some counterfactuals can be found in the training data, while others cannot. This is due to the sparsity induced by the counterfactual explainer. Some counterfactuals do not retain all their original feature values from the training data. Instead, feature values that lie close to the feature values of the query instance are replaced by those original values. Therefore, some counterfactuals may not appear in the training data.

We can now begin with the actual experiments.

In [35]:
results_ = {'dataset': [], 'training_samples': [], 'model': [], 'accuracy': []}

results = pd.DataFrame(data = results_)

In [36]:
print("features: continuous, training samples: all, model: decision tree.")

start_time = time.time()

accuracy = experiment(data_num, repetitions=100, continuous_features=continuous_features_num, outcome_name=outcome_name_num,\
                      classifier=DecisionTreeClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['continuous', 'all', 'decision tree', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: continuous, training samples: all, model: decision tree.


100%|██████████| 100/100 [00:26<00:00,  3.81it/s]


Percentage of counterfactuals that appear in training data: 0.6
--- 26.528949975967407 seconds ---


In [37]:
print("features: continuous, training samples: all, model: random forest.")

start_time = time.time()

accuracy = experiment(data_num, repetitions=100, continuous_features=continuous_features_num, outcome_name=outcome_name_num,\
                      classifier=es.RandomForestClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['continuous', 'all', 'random forest', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: continuous, training samples: all, model: random forest.


100%|██████████| 100/100 [02:12<00:00,  1.33s/it]


Percentage of counterfactuals that appear in training data: 0.598
--- 133.6598916053772 seconds ---


In [38]:
print("features: continuous, training samples: 100, model: decision tree.")

start_time = time.time()

accuracy = experiment(data_num_100, repetitions=100, continuous_features=continuous_features_num, outcome_name=outcome_name_num,\
                      classifier=DecisionTreeClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['continuous', '100', 'decision tree', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: continuous, training samples: 100, model: decision tree.


100%|██████████| 100/100 [00:14<00:00,  6.77it/s]


Percentage of counterfactuals that appear in training data: 0.648
--- 14.971148252487183 seconds ---


In [39]:
print("features: continuous, training samples: 100, model: random forest.")

start_time = time.time()

accuracy = experiment(data_num_100, repetitions=100, continuous_features=continuous_features_num, outcome_name=outcome_name_num,\
                      classifier=es.RandomForestClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['continuous', '100', 'random forest', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: continuous, training samples: 100, model: random forest.


100%|██████████| 100/100 [01:00<00:00,  1.64it/s]

Percentage of counterfactuals that appear in training data: 0.646
--- 61.261871099472046 seconds ---





# Dataset 2: Census Income (categorical)

Load dataset two: census income

In [40]:
filename = '../data/adult.data.csv'

names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', \
         'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'label']

data_cat = pd.read_csv(filename, names=names)

In [41]:
data_cat.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


There is missing data in the columns workclass and native_country that needs to be removed.

In [42]:
print("Unique values of columns before removal: ")
print(data_cat.workclass.unique())
print(data_cat.native_country.unique())

data_cat = data_cat[data_cat.workclass != ' ?']
data_cat = data_cat[data_cat.native_country != ' ?']

print("Unique values of columns after removal: ")
print(data_cat.workclass.unique())
print(data_cat.native_country.unique())

Unique values of columns before removal: 
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic'
 ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia'
 ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago'
 ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary'
 ' Holand-Netherlands']
Unique values of columns after removal: 
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' United-States' ' Cuba' ' Jamaica' ' India' ' Mexico' ' Puerto-Rico'
 ' Honduras' ' England' ' Canada' ' Germany' ' Iran' ' Philippines'
 ' Poland' ' Colu

We will only use the categorical features of this dataset. Remove continuous columns:

In [43]:
data_cat = data_cat.drop('age', axis=1).drop('fnlwgt', axis=1).drop('education_num', axis=1).drop('capital_gain', axis=1)\
    .drop('capital_loss', axis=1).drop('hours_per_week', axis=1)

data_cat.head(3)

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,label
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K


Drop duplicates and create version with only 100 samples.

In [44]:
data_cat.drop_duplicates()

# This needs to be done before the transformations to label encoding. This smaller dataset will contain fewer categories
# Otherwise, DiCE will later throw an error if random samples with categories are created, that do not exist in this dataset
data_cat_100 = data_cat.sample(n = 100, random_state=0)

Transform workclass, education, marital_status, occupation, relationship, race, sex and native_country into label encoded features:

In [45]:
def transform_dataset(dataset):

    dataset['workclass_encoded'] = LabelEncoder().fit_transform(dataset['workclass'])
    dataset['education_encoded'] = LabelEncoder().fit_transform(dataset['education'])
    dataset['marital_status_encoded'] = LabelEncoder().fit_transform(dataset['marital_status'])
    dataset['occupation_encoded'] = LabelEncoder().fit_transform(dataset['occupation'])
    dataset['relationship_encoded'] = LabelEncoder().fit_transform(dataset['relationship'])
    dataset['race_encoded'] = LabelEncoder().fit_transform(dataset['race'])
    dataset['native_country_encoded'] = LabelEncoder().fit_transform(dataset['native_country'])

    dataset = dataset.drop('workclass', axis=1).drop('education', axis=1).drop('marital_status', axis=1)\
        .drop('occupation', axis=1).drop('relationship', axis=1).drop('race', axis=1).drop('native_country', axis=1)
    
    return dataset

data_cat = transform_dataset(data_cat)
data_cat_100 = transform_dataset(data_cat_100)
    
data_cat.head(3)

Unnamed: 0,sex,label,workclass_encoded,education_encoded,marital_status_encoded,occupation_encoded,relationship_encoded,race_encoded,native_country_encoded
0,Male,<=50K,6,9,4,1,1,4,38
1,Male,<=50K,5,9,2,4,0,4,38
2,Male,<=50K,3,11,0,6,1,4,38


Transform label and sex into binary encoding:

In [46]:
data_cat['female'] = data_cat['sex'].map( {' Male': 0, ' Female': 1} )
data_cat['income'] = data_cat['label'].map( {' <=50K': 0, ' >50K': 1} )

data_cat = data_cat.drop('sex', axis=1).drop('label', axis=1)

data_cat_100['female'] = data_cat_100['sex'].map( {' Male': 0, ' Female': 1} )
data_cat_100['income'] = data_cat_100['label'].map( {' <=50K': 0, ' >50K': 1} )

data_cat_100 = data_cat_100.drop('sex', axis=1).drop('label', axis=1)

data_cat.head(3)

Unnamed: 0,workclass_encoded,education_encoded,marital_status_encoded,occupation_encoded,relationship_encoded,race_encoded,native_country_encoded,female,income
0,6,9,4,1,1,4,38,0,0
1,5,9,2,4,0,4,38,0,0
2,3,11,0,6,1,4,38,0,0


Begin with the experiments:

In [47]:
continuous_features_cat = []

outcome_name_cat = 'income'

In [48]:
print("features: categorical, training samples: all, model: decision tree.")

start_time = time.time()

accuracy = experiment(data_cat, repetitions=100, continuous_features=continuous_features_cat, outcome_name=outcome_name_cat,\
                      classifier=DecisionTreeClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['categorical', 'all', 'decision tree', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: categorical, training samples: all, model: decision tree.


100%|██████████| 100/100 [00:37<00:00,  2.66it/s]


Percentage of counterfactuals that appear in training data: 1.0
--- 39.624513149261475 seconds ---


In [49]:
print("features: categorical, training samples: all, model: random forest.")

start_time = time.time()

accuracy = experiment(data_cat, repetitions=100, continuous_features=continuous_features_cat, outcome_name=outcome_name_cat,\
                      classifier=es.RandomForestClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['categorical', 'all', 'random forest', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: categorical, training samples: all, model: random forest.


100%|██████████| 100/100 [01:39<00:00,  1.01it/s]


Percentage of counterfactuals that appear in training data: 1.0
--- 113.70071315765381 seconds ---


In [50]:
print("features: categorical, training samples: 100, model: decision tree.")

start_time = time.time()

accuracy = experiment(data_cat_100, repetitions=100, continuous_features=continuous_features_cat, outcome_name=outcome_name_cat,\
                      classifier=DecisionTreeClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['categorical', '100', 'decision tree', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: categorical, training samples: 100, model: decision tree.


100%|██████████| 100/100 [00:13<00:00,  7.36it/s]


Percentage of counterfactuals that appear in training data: 1.0
--- 13.892636060714722 seconds ---


In [51]:
print("features: categorical, training samples: all, model: random forest.")

start_time = time.time()

accuracy = experiment(data_cat_100, repetitions=100, continuous_features=continuous_features_cat, outcome_name=outcome_name_cat,\
                      classifier=es.RandomForestClassifier(random_state=0), random_state=0)

results.loc[len(results.index)] = ['categorical', '100', 'random forest', accuracy]

print("--- %s seconds ---" % (time.time() - start_time))

features: categorical, training samples: all, model: random forest.


100%|██████████| 100/100 [00:25<00:00,  3.94it/s]

Percentage of counterfactuals that appear in training data: 1.0
--- 25.81084895133972 seconds ---





# Results

"Accuracy" describes the percentage of counterfactuals that matched a sample in the training data exactly.

In [52]:
results

Unnamed: 0,dataset,training_samples,model,accuracy
0,continuous,all,decision tree,0.6
1,continuous,all,random forest,0.598
2,continuous,100,decision tree,0.648
3,continuous,100,random forest,0.646
4,categorical,all,decision tree,1.0
5,categorical,all,random forest,1.0
6,categorical,100,decision tree,1.0
7,categorical,100,random forest,1.0
