In [26]:
from numpy.random import RandomState
# Single shared pseudo-random generator for the whole notebook. It is a stateful
# object: every split/estimator that receives it advances its state, so results
# are reproducible only when the cells are run top to bottom from a fresh kernel.
seed = RandomState(seed=1994)

In [27]:
from sklearn import datasets, model_selection, ensemble, metrics

## Classification
Contrastive explanation for an instance of the [Iris](https://archive.ics.uci.edu/ml/datasets/iris) data set

---

**1. Train a (black-box) model on the Iris data**

In [28]:
# Load Iris and put 80% of the rows in the training split.
data = datasets.load_iris()
train, test, y_train, y_test = model_selection.train_test_split(
    data.data,
    data.target,
    train_size=0.80,
    random_state=seed,
)

# Fit the black-box classifier that is explained in the next step.
# (The split above and this fit both draw from the shared `seed` generator, in that order.)
model = ensemble.RandomForestClassifier(random_state=seed)
model.fit(train, y_train)

# Report the weighted F1 score on the held-out rows.
test_predictions = model.predict(test)
weighted_f1 = metrics.f1_score(y_test, test_predictions, average='weighted')
print('Classifier performance (F1):', weighted_f1)

Classifier performance (F1): 0.9333333333333333


**2. Perform contrastive explanation**

In [29]:
# Import
import contrastive_explanation as ce

# Pick the 'questioned data point': the instance for which we ask why the model
# predicted the fact instead of the foil
sample = test[0]

# The domain mapper translates the explanation into meaningful feature and class labels
dm = ce.domain_mappers.DomainMapperTabular(
    train,
    feature_names=data.feature_names,
    contrast_names=data.target_names,
)

# Contrastive explanation object (the default explanator is a Foil Tree)
exp = ce.ContrastiveExplanation(dm)

# Explain the sample using the classifier's class probabilities;
# as the last expression of the cell, the explanation is displayed as its output
exp.explain_instance_domain(model.predict_proba, sample)

"The model predicted 'setosa' instead of 'versicolor' because 'petal length (cm) <= 2.529 and sepal width (cm) <= 3.561'"

## Regression
Contrastive explanation for an instance of the [Diabetes](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes) data set

**1. Train a (black-box) model on the Diabetes data**

In [30]:
data_reg = datasets.load_diabetes()

# NOTE: this rebinds train / test / y_train / y_test, which previously held the
# Iris split; from here on they refer to the Diabetes data.
train, test, y_train, y_test = model_selection.train_test_split(
    data_reg.data,
    data_reg.target,
    train_size=0.80,
    random_state=seed,
)

# Random forest regressor whose number of trees is chosen by grid search.
n_estimators_grid = {'n_estimators': [50, 100, 500]}
m_cv = ensemble.RandomForestRegressor(random_state=seed)
model_reg = model_selection.GridSearchCV(m_cv, param_grid=n_estimators_grid)
model_reg.fit(train, y_train)

# Report R-squared on the held-out rows.
test_predictions_reg = model_reg.predict(test)
print('Regressor performance (R-squared):', metrics.r2_score(y_test, test_predictions_reg))

Regressor performance (R-squared): 0.5086602809470543


**2. Perform contrastive explanation**

In [31]:
import contrastive_explanation as ce

# Instance to explain (taken from the Diabetes test split)
sample = test[1]

# Still tabular data, but a regression outcome has no named labels, so no
# contrast names are passed; column 1 ('sex') is declared categorical
dm = ce.domain_mappers.DomainMapperTabular(
    train,
    feature_names=data_reg.feature_names,
    categorical_features=[1],
)

# Same Foil Tree explanator as before, now verbose so that intermediary
# outcomes and steps are printed
foil_tree = ce.explanators.TreeExplanator(verbose=True)

# 'regression' must be set to True for a regression model
exp = ce.ContrastiveExplanation(
    dm,
    regression=True,
    explanator=foil_tree,
    verbose=True,
)

# Explain using the regressor; include_factual=True additionally returns the
# non-contrastive 'why fact?' explanation
exp.explain_instance_domain(model_reg.predict, sample, include_factual=True)

[F] Picked foil "more than 92.414" using foil selection strategy "greater"
[D] Obtaining neighborhood data
[E] Explaining with a decision tree...
[E] Fidelity of tree on neighborhood data = 1.0
[E] Found 9 contrastive decision regions, starting from node 2
[E] Found shortest path [19, 18, 20, 21] using strategy "informativeness"
[C] Decision obtained: [(6, 0.10606995224952698, 0.100183028707369, True, False), (0, -0.014294859021902084, -0.0164121703318693, False, False)]
[C] Combining full rules [Literal(feature=6, operator=<Operator.SEQ: '<='>, value=0.10606995224952698, categorical=False)]...
[C] Decision obtained: [(2, 0.006363794207572937, -0.0353068801305926, False, False), (8, -0.002834510989487171, -0.0702093127286876, False, False), (6, 0.020282596349716187, 0.100183028707369, False, True), (3, 0.05408908426761627, -0.0263278347173518, True, False), (7, 0.024366647005081177, -0.0394933828740919, False, False), (4, -0.057564325630664825, 0.0328298616348169, False, True), (4, 0.0

("The model predicted '92.414' instead of 'more than 92.414' because 's3 <= 0.106'",
 "The model predicted '92.414' because 'bmi <= 0.006 and s5 <= -0.003 and s3 <= 0.02 and bp > 0.054 and s4 <= 0.024 and s1 <= -0.058 and s1 > 0.023 and age > -0.03 and s4 > 0.016 and s2 <= -0.002 and s1 > 0.113 and bp <= -0.018'")