In [1]:
import sys
sys.path.append('../src/')
import decode_mcd
import decode_mcd.multi_objective_problem as MOP
from decode_mcd import data_package


from decode_mcd import design_targets
from decode_mcd import counterfactuals_generator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In this notebook we will cover mixed datatypes and generating counterfactuals with multiple objectives of mixed types.

### Creating Dataset

First, let's create a dataset inspired by some basic arithmetic. This time, we will create a dataset with four variables of different types. `A` will be a random integer from 0 to 10 inclusive. `B` will be a random float from -1 to 1. `C` will be a random choice among "Add", "Subtract", "Multiply", and "Divide". Finally, `D` will be a boolean variable determining the sign of the expression: True will correspond to positive, while False will mean negative. To effectively handle mixed datatypes, we will use pandas.

In [2]:
num_data = 1000
# Seeded local generator so the dataset is identical on every Restart & Run All.
rng = np.random.default_rng(0)
# endpoint=True makes the upper bound inclusive, so A covers 0..10 as described in the text
# and as declared by Integer(bounds=(0, 10)). np.random.randint(0, 10, ...) never produced 10.
A = rng.integers(0, 10, num_data, endpoint=True)
B = rng.uniform(-1, 1, num_data) #Random floats drawn uniformly between -1 and 1.
C = rng.choice(["Add", "Subtract", "Multiply", "Divide"], num_data)
D = rng.choice([True, False], num_data)
# One column per variable: A (int), B (float), C (string category), D (bool).
features_dataset = pd.DataFrame({"A": A, "B": B, "C": C, "D": D})
display(features_dataset)


Unnamed: 0,A,B,C,D
0,4,0.903350,Add,False
1,7,0.163521,Multiply,True
2,3,0.960120,Divide,False
3,7,0.683961,Add,False
4,8,0.338584,Subtract,False
...,...,...,...,...
995,9,-0.199254,Divide,False
996,4,0.080076,Divide,False
997,5,-0.036126,Multiply,False
998,7,0.110110,Add,True


We will create two functions. The first performs the operation C(D(A), B). The second checks whether D(C(B, A)) > 0. For example, if C is "Add" and D is False, function 1 calculates -A+B, while function 2 checks whether -(B+A) is greater than 0. We code up the `evaluate` function, which takes in a dataframe of `A`, `B`, `C`, and `D` values and returns a dataframe of the corresponding `O1` and `O2` values, i.e. the outputs of our two functions.

In [3]:
def apply_operation(C, x, y):
    """Vectorized evaluation of C(x, y), one operation per element.

    C holds an operation label for each element ("Add", "Subtract",
    "Multiply" or "Divide"); x and y hold the left and right operands.
    For example, where C is "Subtract" the result is x - y.

    Returns a float numpy array of length len(C). Elements whose label is
    not one of the four operations are left at 0.
    """
    operations = {
        "Add": lambda left, right: left + right,
        "Subtract": lambda left, right: left - right,
        "Multiply": lambda left, right: left * right,
        "Divide": lambda left, right: left / right,
    }
    out = np.zeros(len(C))
    for label, operate in operations.items():
        # Only the elements carrying this label are evaluated and written.
        selected = (C == label)
        out[selected] = operate(x[selected], y[selected])
    return out

def apply_inverse(D, x):
    """Vectorized sign flip D(x): returns x where D is True and -x where D is False.

    D is a boolean flag (scalar, numpy array or pandas Series) and x the
    value(s) to sign. The result has the same pandas/numpy type that
    ``x * D`` would have, so a Series D still yields a Series.

    The sign is applied by multiplying with +1/-1 rather than combining
    ``x*D - x*~D``: multiplying by a False (zero) mask turns infinite x into
    NaN (inf * 0), and ``~`` is a bitwise NOT for non-boolean D.
    """
    sign = D * 2 - 1  # True -> +1, False -> -1
    return x * sign

def evaluate(x):
    """Compute both objectives for every row of x.

    x is a dataframe with the columns "A", "B", "C" and "D". Returns a
    dataframe with one row per input row and two columns:
    "O1" = C(D(A), B) and "O2" = (D(C(B, A)) > 0).
    """
    operand_a, operand_b, operation = x["A"], x["B"], x["C"]
    keep_sign = x["D"].astype(bool)  # D must be boolean for the sign flip

    # Objective 1: sign A first, then combine it with B.
    signed_a = apply_inverse(keep_sign, operand_a)
    o1 = apply_operation(operation, signed_a, operand_b)

    # Objective 2: combine B with A first, then sign the result and test it against 0.
    signed_result = apply_inverse(keep_sign, apply_operation(operation, operand_b, operand_a))
    o2 = np.greater(signed_result, 0)

    return pd.DataFrame({"O1": o1, "O2": o2})
 

Let's evaluate our dataset.

In [4]:
# Compute the objective values O1 and O2 for every row of the features dataset.
predictions_dataset = evaluate(features_dataset)
display(predictions_dataset)

Unnamed: 0,O1,O2
0,-3.096650,False
1,1.144644,True
2,-3.124610,False
3,-6.316039,False
4,-8.338584,True
...,...,...
995,45.168536,True
996,-49.952499,False
997,0.180631,True
998,7.110110,True


Finally, we create our query. 

In [5]:
# Single-row query design, labelled "Query": A=0, B=0.0, C="Add", D=True.
# For this design evaluate() gives O1 = 0 + 0.0 = 0 and O2 = (0.0 + 0 > 0) = False.
query_x = pd.DataFrame({"A": [0], "B": [0.0], "C": ["Add"], "D": [True]}, index = ["Query"])
display(query_x)

Unnamed: 0,A,B,C,D
Query,0,0.0,Add,True


### Setting up MCD

Now we are ready to set up the DataPackage. We create two design targets: one continuous target and one categorical target. Continuous targets should be used for any objective with ordinal significance (such as floats or ints), while categorical targets should be used for those without (such as classes or bools).

In [6]:
#We first set up a ContinuousTarget for O1, setting a minimum of 10 and no upper bound (i.e. infinity).
target_1 = design_targets.ContinuousTarget(label = "O1", lower_bound=10, upper_bound=np.inf)

#We then set up a CategoricalTarget for O2 specifying only True as the desired class. 
#desired_classes is a list. In problems with multiple classes, this list specifies the acceptable classes.
target_2 = design_targets.CategoricalTarget(label = "O2", desired_classes=[True])

#We then create a DesignTargets object with the two targets.
#The labels "O1" and "O2" are the column names of the dataframe returned by evaluate().
targets = design_targets.DesignTargets(continuous_targets=[target_1], categorical_targets=[target_2])

We also specify the datatypes of our dataset. This time we have one of each type of variable. We specify the boundaries and options as specified earlier in the notebook.

In [7]:
from pymoo.core.variable import Real, Integer, Choice, Binary 
# One pymoo variable definition per feature, listed in the column order A, B, C, D
# (NOTE(review): presumably matched to the feature columns by position - confirm against the DataPackage docs).
datatypes=[Integer(bounds=(0, 10)),  # A: integer between 0 and 10
           Real(bounds=(-1, 1)),  # B: float between -1 and 1
           Choice(options=["Add", "Subtract", "Multiply", "Divide"]),  # C: one of the four operations
           Binary()]  # D: boolean sign flag

Finally, we create the `DataPackage`.

In [8]:
# Bundle everything defined so far: the sampled features, their evaluated objectives,
# the query design, the design targets and the per-feature variable types.
data = data_package.DataPackage(features_dataset=features_dataset,
                           predictions_dataset=predictions_dataset,
                           query_x=query_x,
                           design_targets=targets,
                           datatypes=datatypes,)

In [9]:
# Build the optimization problem from the data package, using evaluate() as the prediction function.
problem = MOP.MultiObjectiveProblem(data_package=data, prediction_function=evaluate)

In [10]:
# Create the generator with a population size of 100; initialize_from_dataset=True
# starts the initial population from the dataset samples.
generator = counterfactuals_generator.CounterfactualsGenerator(problem=problem, pop_size=100, initialize_from_dataset=True)

In [11]:
# Run the search for 10 generations; a progress table is printed, one row per generation.
generator.generate(n_generations=10)

1000 dataset entries found matching problem parameters
Initial population initialized from dataset of 1000 samples!

Compiled modules for significant speedup can not be used!
https://pymoo.org/installation.html#installation

from pymoo.config import Config

Training GA from 0 to 10 generations!
n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |        0 |      9 |  0.000000E+00 |  1.519441E+01 |             - |             -
     2 |      100 |     10 |  0.000000E+00 |  0.4832558283 |  0.0180904765 |             f
     3 |      200 |     10 |  0.000000E+00 |  0.1276585004 |  0.000000E+00 |             f
     4 |      300 |     10 |  0.000000E+00 |  0.0147787263 |  0.0023960604 |             f


  satisfaction = np.maximum(actual - query_ub, query_lb - actual)
  satisfaction = np.maximum(actual - query_ub, query_lb - actual)


     5 |      400 |     10 |  0.000000E+00 |  0.000000E+00 |  0.0023960604 |             f
     6 |      500 |     12 |  0.000000E+00 |  0.000000E+00 |  0.5000000000 |         ideal
     7 |      600 |     15 |  0.000000E+00 |  0.000000E+00 |  0.0134101524 |             f


  satisfaction = np.maximum(actual - query_ub, query_lb - actual)
  satisfaction = np.maximum(actual - query_ub, query_lb - actual)


     8 |      700 |     16 |  0.000000E+00 |  0.000000E+00 |  0.0003908560 |             f
     9 |      800 |     18 |  0.000000E+00 |  0.000000E+00 |  0.3333333333 |         ideal
    10 |      900 |     20 |  0.000000E+00 |  0.000000E+00 |  0.0003095052 |             f


In [12]:
num_samples = 10 
# Sample num_samples counterfactuals from the generated candidates.
# NOTE(review): the four positional 1s are the sampling weights, all set equal here - presumably the
# average-Gower, changed-feature-count, Gower and diversity weights; confirm against the
# sample_with_weights signature. include_dataset=False presumably excludes the original dataset
# rows from the candidate pool - confirm.
counterfactuals = generator.sample_with_weights(num_samples, 1, 1, 1, 1, include_dataset=False)
display(counterfactuals)

Collecting all counterfactual candidates!
Scoring all counterfactual candidates!
Calculating diversity matrix!
Sampling diverse set of counterfactual candidates!
samples_index=[51, 181, 73, 138, 14, 1, 179, 182, 114, 69]
Done! Returning CFs


Unnamed: 0,A,B,C,D
0,1,-0.022828,Divide,False
1,10,0.0,Add,True
2,10,0.773336,Divide,True
3,10,0.751657,Add,True
4,3,0.158954,Divide,True
5,9,-0.897949,Divide,False
6,7,0.094165,Divide,True
7,5,0.424776,Divide,True
8,10,0.388122,Add,True
9,6,-0.105938,Divide,False


In [13]:
# Re-evaluate the sampled counterfactuals to check them against the targets (O1 >= 10 and O2 == True).
evaluate(counterfactuals)

Unnamed: 0,O1,O2
0,43.806543,True
1,10.0,True
2,12.930986,True
3,10.751657,True
4,18.873437,True
5,10.022841,True
6,74.337842,True
7,11.770923,True
8,10.388122,True
9,56.636814,True
