In [90]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# DiCE imports
import dice_ml
from dice_ml.utils import helpers  # helper functions

In [91]:
dataset = helpers.load_adult_income_dataset()
dataset.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [92]:
target = dataset["income"]
train_dataset, test_dataset, y_train, y_test = train_test_split(dataset,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)
x_train = train_dataset.drop('income', axis=1)
x_test = test_dataset.drop('income', axis=1)

In [93]:
# Step 1: dice_ml.Data
d = dice_ml.Data(dataframe=train_dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income')

In [94]:
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier())])
model = clf.fit(x_train, y_train)

In [95]:
# Using sklearn backend
m = dice_ml.Model(model=model, backend="sklearn")
# Using method=random for generating CFs
exp = dice_ml.Dice(d, m, method="random")

In [117]:
# Restricting age to be between [20,30] and Education to be either {'Doctorate', 'Prof-school'}.
e3 = exp.generate_counterfactuals(x_test[0:1],
                                  total_CFs=1,
                                  desired_class="opposite",
                                  permitted_range={'age': [20, 30], 'education': ['Doctorate', 'Prof-school']})
e3.visualize_as_dataframe(show_only_changes=True)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.52it/s]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,HS-grad,Married,Blue-Collar,White,Female,38,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,Doctorate,-,-,-,Male,-,1


In [113]:
query_instance = x_test[0:1]
imp = exp.local_feature_importance(query_instance, total_CFs=10)
print(imp.local_importance)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  6.57it/s]

[{'occupation': 0.9, 'education': 0.7, 'workclass': 0.3, 'gender': 0.1, 'marital_status': 0.0, 'race': 0.0, 'age': 0.0, 'hours_per_week': 0.0}]





In [84]:
query_instances = x_test[0:20]
imp = exp.global_feature_importance(query_instances)
print(imp.summary_importance)

100%|███████████████████████████████████████████| 20/20 [00:02<00:00,  8.53it/s]

{'education': 0.645, 'occupation': 0.295, 'marital_status': 0.27, 'age': 0.27, 'hours_per_week': 0.205, 'workclass': 0.195, 'race': 0.095, 'gender': 0.09}





In [85]:
backend = 'PYT'  # needs pytorch installed
ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)
m = dice_ml.Model(model_path=ML_modelpath, backend=backend,  func="ohe-min-max")

In [86]:
exp = dice_ml.Dice(d, m, method="gradient")

In [87]:
x_test[4:9]

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week
25954,28,Other/Unknown,Assoc,Separated,Other/Unknown,White,Female,40
20513,33,Private,School,Divorced,White-Collar,White,Male,40
12521,33,Private,HS-grad,Married,Blue-Collar,White,Male,40
14299,18,Private,Some-college,Single,Service,White,Male,12
19005,26,Government,Some-college,Single,Service,Other,Female,10


In [88]:
# generate counterfactuals
dice_exp = exp.generate_counterfactuals(x_test[6:7], total_CFs=1, desired_class="opposite",posthoc_sparsity_algorithm='binary',)
# highlight only the changes
dice_exp.visualize_as_dataframe(show_only_changes=True)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.64it/s]

Diverse Counterfactuals found! total time taken: 00 min 00 sec
Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,33.0,Private,HS-grad,Married,Blue-Collar,White,Male,40.0,0.117



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,50.0,-,Prof-school,-,-,-,-,-,1
