In [1]:
import sys
sys.path.append('../src/')
import decode_mcd
import decode_mcd.multi_objective_problem as MOP
from decode_mcd import data_package


from decode_mcd import design_targets
from decode_mcd import counterfactuals_generator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In this notebook we will cover mixed datatypes and generating counterfactuals with multiple objectives of mixed types.

### Creating Dataset

First, let's create a dataset inspired by some basic arithmetic. This time, we will create a dataset with four variables of different types. `A` will be a random integer from 0 to 10 inclusive. `B` will be a random float from -1 to 1. `C` will be a random choice among "Add", "Subtract", "Multiply", and "Divide". Finally, `D` will be a boolean variable determining the sign of the expression: True corresponds to positive, while False means negative. To effectively handle mixed datatypes, we will use pandas.

In [2]:
num_data = 1000
rng = np.random.default_rng(0) #Seeded generator so that Restart & Run All reproduces the same dataset.
A = rng.integers(0, 10, num_data, endpoint=True) #Random integers from 0 to 10 inclusive (endpoint=True keeps 10 in range, matching the Integer bounds used later).
B = rng.random(num_data) * 2 - 1 #Randomized values which originally range from 0 to 1. Scales to -1 to 1.
C = rng.choice(["Add", "Subtract", "Multiply", "Divide"], num_data) #Random operation name for every row.
D = rng.choice([True, False], num_data) #Random sign flag for every row.
x = pd.DataFrame({"A": A, "B": B, "C": C, "D": D})
display(x)


Unnamed: 0,A,B,C,D
0,3,-0.126518,Multiply,False
1,7,-0.582334,Divide,False
2,6,0.324557,Add,False
3,7,0.968625,Multiply,True
4,8,-0.281043,Divide,False
...,...,...,...,...
995,2,-0.419863,Subtract,False
996,7,-0.540450,Divide,False
997,2,-0.135318,Subtract,True
998,4,-0.157028,Subtract,True


We will create two functions. The first computes C(D(A), B). The second checks whether D(C(B, A)) > 0. For example, if C is "Add" and D is False, function 1 will calculate -A+B, while function 2 will check whether -(B+A) is greater than 0 or not. We code up the `evaluate` function, which takes in a dataframe with a set of `A`, `B`, `C`, and `D` values and returns a dataframe with the corresponding `O1` and `O2` values, i.e. the outputs of our two functions.

In [3]:
def apply_operation(C, x, y):
    """Vectorized evaluation of C(x, y).

    C holds one operation name per element ("Add", "Subtract", "Multiply"
    or "Divide"); x and y hold the operands. For example, where C is
    "Subtract" the result is x - y. Elements whose name matches none of the
    four operations keep the initial value 0.

    Returns a numpy array with len(C) entries.
    """
    handlers = (
        ("Add", lambda left, right: left + right),
        ("Subtract", lambda left, right: left - right),
        ("Multiply", lambda left, right: left * right),
        ("Divide", lambda left, right: left / right),
    )
    out = np.zeros(len(C))
    for label, compute in handlers:
        chosen = (C == label)
        #Only the selected elements are evaluated, so e.g. a zero divisor in a non-"Divide" row is never divided by.
        out[chosen] = compute(x[chosen], y[chosen])
    return out

def apply_inverse(D, x):
    """Vectorized sign flip D(x): returns x where D is True and -x where D is False.

    D may be a boolean scalar, numpy array or pandas Series; x is the value
    (or values) to sign. When either argument is a pandas Series the result
    is a Series as well.
    """
    #Map True -> +1 and False -> -1. A single multiplication by +/-1 keeps infinite
    #values intact (x*D - x*~D evaluates inf*0 = nan) and avoids the bitwise `~`,
    #which yields -2/-1 instead of False/True for plain Python bools.
    sign = D * 2 - 1
    return x * sign

def evaluate(x):
    """Evaluation function computing both objectives for a dataframe of designs.

    x is a dataframe with columns "A", "B", "C" and "D". Returns a dataframe
    with one row per design and two columns:
      "O1": C(D(A), B)
      "O2": whether D(C(B, A)) is strictly greater than 0
    """
    #Pull the individual variables out of the dataframe.
    operand_a, operand_b = x["A"], x["B"]
    operation = x["C"]
    is_positive = x["D"].astype(bool)

    #Objective 1: sign A first, then combine it with B.
    signed_a = apply_inverse(is_positive, operand_a)
    o1_values = apply_operation(operation, signed_a, operand_b)

    #Objective 2: combine B with A first, sign the result, then compare against 0.
    combined = apply_operation(operation, operand_b, operand_a)
    o2_values = np.greater(apply_inverse(is_positive, combined), 0)

    return pd.DataFrame({"O1": o1_values, "O2": o2_values})
 

Let's evaluate our dataset.

In [4]:
#Compute both objectives (columns O1 and O2) for every row of the dataset.
y = evaluate(x)
display(y)

Unnamed: 0,O1,O2
0,0.379553,True
1,12.020590,True
2,-5.675443,False
3,6.780378,True
4,28.465378,True
...,...,...
995,-1.580137,True
996,12.952159,True
997,2.135318,False
998,4.157028,False


Finally, we create our query. 

In [5]:
#Single-row query design (A=0, B=0.0, C="Add", D=True), labelled "Query" in the index.
x_query = pd.DataFrame({"A": [0], "B": [0.0], "C": ["Add"], "D": [True]}, index = ["Query"])
display(x_query)

Unnamed: 0,A,B,C,D
Query,0,0.0,Add,True


### Setting up MCD

Now we are ready to set up the DataPackage. We create two design targets. We have one continuous target and one categorical target. Continuous targets should be used for any objective with ordinal significance (such as floats or ints), while categorical targets should be used for those without (such as classes or bools).

We also specify the datatypes of our dataset. This time we have one of each type of variable. We specify the boundaries and options as specified earlier in the notebook.

In [7]:
from pymoo.core.variable import Real, Integer, Choice, Binary 
#One pymoo variable type per feature; presumably matched to the columns of x by position (A, B, C, D) - TODO confirm.
datatypes=[Integer(bounds=(0, 10)), #A: integer between 0 and 10
           Real(bounds=(-1, 1)), #B: float between -1 and 1
           Choice(options=["Add", "Subtract", "Multiply", "Divide"]), #C: one of the four operation names
           Binary()] #D: boolean sign flag

Finally, we create the `DataPackage`.

In [9]:
#Bundle the dataset features (x), their objective values (y) and the feature datatypes into a DataPackage.
data = data_package.DataPackage(x=x, y=y, x_datatypes=datatypes)

Next, we create the design targets and the `MultiObjectiveProblem`. We want to set a hard lower bound of 10 for O1 with no upper bound, meaning C(D(A), B)>=10. We also set a requirement that O2 must be true, meaning that D(C(B, A))>0.

In [10]:
#We first set up a ContinuousTarget for O1, setting a minimum of 10 and setting no upper bound (i.e. infinity))
#ContinuousTarget for O1: lower bound of 10 and no upper bound (upper_bound is set to infinity).
target_1 = design_targets.ContinuousTarget(label = "O1", lower_bound=10, upper_bound=np.inf)

#CategoricalTarget for O2: only True is specified as a desired class.
#desired_classes is a list. In problems with multiple classes, this list specifies the acceptable classes.
target_2 = design_targets.CategoricalTarget(label = "O2", desired_classes=[True])

#Combine the two targets into a single DesignTargets object.
y_targets = design_targets.DesignTargets(continuous_targets=[target_1], categorical_targets=[target_2])

In [11]:
#Define the problem from the data package, the query design, the design targets, and `evaluate` as the prediction function.
problem = MOP.MultiObjectiveProblem(data_package=data, x_query = x_query, y_targets = y_targets, prediction_function=evaluate)

Finally, we create the `CounterfactualsGenerator`.

In [13]:
#Generator with a population size of 100; with initialize_from_dataset=True the log below reports the initial population being initialized from the dataset.
generator = counterfactuals_generator.CounterfactualsGenerator(problem=problem, pop_size=100, initialize_from_dataset=True)

In [14]:
#Run the search for 10 generations; progress is printed per generation.
generator.generate(n_generations=10)

1000 dataset entries found matching problem parameters
Initial population initialized from dataset of 1000 samples!
Training GA from 0 to 10 generations!
n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |        0 |      4 |  0.000000E+00 |  1.570420E+01 |             - |             -


  satisfaction = np.maximum(actual - query_ub, query_lb - actual)
  satisfaction = np.maximum(actual - query_ub, query_lb - actual)


     2 |      100 |      4 |  0.000000E+00 |  0.5740628280 |  0.000000E+00 |             f
     3 |      200 |      4 |  0.000000E+00 |  0.2661620773 |  0.000000E+00 |             f
     4 |      300 |      4 |  0.000000E+00 |  0.0323266568 |  0.000000E+00 |             f
     5 |      400 |      4 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f
     6 |      500 |      4 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f
     7 |      600 |      4 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f
     8 |      700 |      4 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f


  satisfaction = np.maximum(actual - query_ub, query_lb - actual)
  satisfaction = np.maximum(actual - query_ub, query_lb - actual)
  satisfaction = np.maximum(actual - query_ub, query_lb - actual)


     9 |      800 |      5 |  0.000000E+00 |  0.000000E+00 |  0.0234051693 |             f
    10 |      900 |      5 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f


  satisfaction = np.maximum(actual - query_ub, query_lb - actual)


In [18]:
#Number of counterfactuals to draw from the generated candidates.
num_samples = 10 
#include_dataset=False presumably keeps the original dataset rows out of the candidate pool - TODO confirm against the decode_mcd documentation.
counterfactuals = generator.sample(num_samples, include_dataset=False)
display(counterfactuals)

Collecting all counterfactual candidates!
Scoring all counterfactual candidates!
Calculating diversity matrix!
Sampling diverse set of counterfactual candidates!
samples_index=[38, 90, 42, 16, 145, 182, 154, 148, 17, 191]
Done! Returning CFs


Unnamed: 0,A,B,C,D
0,2,0.182102,Divide,True
1,7,0.017875,Divide,True
2,4,0.296135,Divide,True
3,2,-0.12219,Divide,False
4,5,0.110378,Divide,True
5,4,0.014061,Divide,True
6,6,0.59604,Divide,True
7,9,0.31584,Divide,True
8,2,0.112151,Divide,True
9,6,0.184919,Divide,True


Let's evaluate the counterfactuals we generated. We should see that every O1 value is greater than or equal to 10, while every O2 value is True.

In [19]:
#Re-evaluate the sampled counterfactuals; the bare expression displays the resulting O1/O2 table.
evaluate(counterfactuals)

Unnamed: 0,O1,O2
0,10.982857,True
1,391.602901,True
2,13.507345,True
3,16.367968,True
4,45.298855,True
5,284.472265,True
6,10.066445,True
7,28.495471,True
8,17.833059,True
9,32.446701,True
