In [1]:
from numpy.random import RandomState

# Shared random generator, passed as `random_state` to the train/test splits
# and the random-forest estimators below.
# NOTE(review): this is a stateful RandomState instance rather than an integer,
# so every consumer advances it and the results presumably depend on the order
# (and number of times) cells are executed - confirm this is intended.
seed = RandomState(seed=1994)

In [2]:
from sklearn import datasets, model_selection, ensemble, metrics

In [3]:
import pprint

## Classification
Contrastive explanation for an instance of the [Iris](https://archive.ics.uci.edu/ml/datasets/iris) data set

---

**1. Train a (black-box) model on the Iris data**

In [21]:
# Load Iris and keep 80% of the rows for training; the remainder is held out.
data = datasets.load_iris()
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data.data,
    data.target,
    train_size=0.80,
    random_state=seed,
)

# The random forest plays the role of the black-box model that is explained
# later in the notebook. The split above and the forest both draw from `seed`,
# so the statement order here matters for the random stream.
model = ensemble.RandomForestClassifier(random_state=seed)
model.fit(x_train, y_train)

# Report the weighted F1 score on the held-out split.
print(
    'Classifier performance (F1):',
    metrics.f1_score(y_test, model.predict(x_test), average='weighted'),
)

Classifier performance (F1): 0.9333333333333333


In [22]:
# Show the feature names shipped with the Iris dataset object; the same list
# is handed to the domain mapper as `feature_names` further down.
print(data['feature_names'])

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


**2. Perform contrastive explanation**

In [29]:
# Contrastive explanation package (imported in this cell in the original notebook)
import contrastive_explanation as ce

# The 'questioned data point': the explanation answers why the model predicted
# the fact instead of the foil for this held-out row.
sample = x_test[4]

# Domain mapper: maps the explanation onto meaningful feature and class labels.
dm = ce.domain_mappers.DomainMapperTabular(
    x_train,
    feature_names=data.feature_names,
    contrast_names=data.target_names,
)

# Contrastive explanation object (default is a Foil Tree explanator).
exp = ce.ContrastiveExplanation(dm)

# Explain the sample using the classifier's class-probability function.
# Kept as the last expression so the explanation is shown as the cell output.
exp.explain_instance_domain(model.predict_proba, sample)

"The model predicted 'setosa' instead of 'versicolor' because 'petal width (cm) <= 0.701 and petal length (cm) <= 2.159 and petal width (cm) <= 0.798 and sepal length (cm) <= 5.993'"

## Regression
Explain an instance of the [Diabetes](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes) data set

**1. Train a (black-box) model on the Diabetes data**

In [25]:
# Load the Diabetes data and keep 80% of the rows for training; the remainder
# is held out for evaluation.
data_reg = datasets.load_diabetes()

rx_train, rx_test, ry_train, ry_test = model_selection.train_test_split(
    data_reg.data,
    data_reg.target,
    train_size=0.80,
    random_state=seed,
)

# Black-box regressor: a random forest whose number of trees is chosen by a
# grid search over 50, 100 and 500 estimators.
m_cv = ensemble.RandomForestRegressor(random_state=seed)
r_model = model_selection.GridSearchCV(m_cv, param_grid={'n_estimators': [50, 100, 500]})

r_model.fit(rx_train, ry_train)

# Score the model fitted in this cell (`r_model`) on the held-out split, so the
# reported R-squared belongs to the same model that is explained below and the
# cell runs on a fresh kernel.
print('Regressor performance (R-squared):', metrics.r2_score(ry_test, r_model.predict(rx_test)))

Regressor performance (R-squared): 0.873933146710173


In [26]:
# Print the description text bundled with the Diabetes dataset object.
print(data_reg.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bra

**2. Perform contrastive explanation**

In [27]:
import contrastive_explanation as ce

# Select a sample to explain: a row of the held-out regression split created
# together with `rx_train` (mirrors `x_test[4]` in the classification example).
r_sample = rx_test[1]

# Create a domain mapper (still tabular data, but for regression we do not have
# named labels for the outcome); feature index 1 ('sex') is marked categorical.
r_dm = ce.domain_mappers.DomainMapperTabular(
    rx_train,
    feature_names=data_reg.feature_names,
    categorical_features=[1],
)

# Create the CE object with 'regression' set to True. The Foil Tree explanator
# is used again, but with verbose=True on both objects so that intermediary
# outcomes and steps are printed.
r_exp = ce.ContrastiveExplanation(
    r_dm,
    regression=True,
    explanator=ce.explanators.TreeExplanator(verbose=True),
    verbose=True,
)

# Explain using the fitted regressor; include_factual=True also requests a
# 'factual' (non-contrastive 'why fact?') explanation.
# Kept as the last expression so the result is shown as the cell output.
r_exp.explain_instance_domain(r_model.predict, r_sample, include_factual=True)

[F] Picked foil "more than 134.898" using foil selection strategy "greater"
[D] Obtaining neighborhood data
[E] Explaining with a decision tree...
[E] Fidelity of tree on neighborhood data = 1.0
[E] Found 5 contrastive decision regions, starting from node 2
[E] Found shortest path [13, 12, 14, 16] using strategy "informativeness"
[C] Decision obtained: [(3, 0.08280231058597565, 0.0666296740135272, True, False), (0, 0.04994678683578968, 0.0380759064334241, True, False)]
[C] Combining full rules [Literal(feature=3, operator=<Operator.SEQ: '<='>, value=0.08280231058597565, categorical=False), Literal(feature=0, operator=<Operator.SEQ: '<='>, value=0.04994678683578968, categorical=False)]...
[C] Decision obtained: [(2, 0.005121644586324692, -0.0180618869484982, False, False), (8, -0.004276006133295596, -0.0119006848015081, False, False), (8, -0.030646467581391335, -0.0119006848015081, False, True), (6, -0.028987007215619087, -0.0765355858888105, True, False), (8, -0.01580944936722517, -0.0

("The model predicted '134.898' instead of 'more than 134.898' because 'bp <= 0.083 and age <= 0.05'",
 "The model predicted '134.898' because 'bmi <= 0.005 and s5 <= -0.031 and s3 > -0.029 and s5 <= -0.016 and age > 0.043 and bmi <= -0.022'")