In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import matplotlib
from matplotlib import pyplot as plt

import pandas as pd
import random
import numpy as np
import sys

sys.path.append('../')
import decode_mcd.multi_objective_problem as MOP
from decode_mcd.counterfactuals_generator import CounterfactualsGenerator
from decode_mcd import data_package
import load_data
import importlib

from IPython.display import display, HTML

In [2]:
# Load the framed dataset; the loader returns four values and the last two are not used here.
x, y, _, _ = load_data.load_framed_dataset(
    "r",
    onehot=True,
    scaled=False,
    augmented=True,
)

# Restrict the targets to the two columns that are modelled and optimised below.
y = y[["Model Mass Magnitude", "Sim 1 Safety Factor (Inverted)"]]

In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path

class MultilabelPredictor():
    """ Tabular Predictor for predicting multiple columns in table.
        Creates multiple TabularPredictor objects which you can also use individually.
        You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

        Parameters
        ----------
        labels : List[str]
            The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
            Any sized iterable (list, tuple, pandas Index) is accepted; it is stored internally as a list.
        path : str, default = None
            Path to directory where models and intermediate outputs should be saved.
            If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
            Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
            Otherwise files from first `fit()` will be overwritten by second `fit()`.
            Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
        problem_types : List[str], default = None
            The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
        eval_metrics : List[str], default = None
            The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
        consider_labels_correlation : bool, default = True
            Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
            If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
            Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
        kwargs :
            Arguments passed into the initialization of each TabularPredictor.

        Raises
        ------
        ValueError
            If fewer than two labels are given, or if `problem_types` / `eval_metrics` is provided
            with a length different from `labels`.

    """

    # File name (inside `self.path`) under which the pickled MultilabelPredictor is stored.
    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        # Normalise to a plain list so that `data[self.labels]` always selects columns
        # (a tuple would be interpreted by pandas as a single key).
        self.labels = list(labels)
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {label: eval_metrics[i] for i, label in enumerate(self.labels)}
        for i, label in enumerate(self.labels):
            # os.path.join gives the same result whether or not `self.path` already ends
            # with a separator; str() lets non-string column labels be used.
            path_i = os.path.join(self.path, "Predictor_" + str(label))
            problem_type = problem_types[i] if problem_types is not None else None
            eval_metric = eval_metrics[i] if eval_metrics is not None else None
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

            Parameters
            ----------
            train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                See documentation for `TabularPredictor.fit()`.
            kwargs :
                Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        # Keep pristine copies: every iteration derives its own frame from these.
        train_data_og = train_data.copy()
        tuning_data_og = tuning_data.copy() if tuning_data is not None else None
        # Only record the metrics chosen by the fitted predictors when the caller supplied none.
        save_metrics = len(self.eval_metrics) == 0
        for i, label in enumerate(self.labels):
            predictor = self.get_predictor(label)
            if self.consider_labels_correlation:
                # Auto-regressive chain: earlier labels stay in as features, later ones are removed.
                labels_to_drop = list(self.labels[i + 1:])
            else:
                # Independent models: every other label column is removed.
                labels_to_drop = [l for l in self.labels if l != label]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data_og is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)
            # Hold only the on-disk location; get_predictor() reloads the model on demand.
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
            kwargs :
                Arguments passed into the predict() call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
            kwargs :
                Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to evaluate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
            kwargs :
                Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self.consider_labels_correlation:
                # After scoring against the true values, replace them with predictions so
                # that later predictors in the chain see the same inputs as at inference time.
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        # Swap live TabularPredictor objects for their paths so the pickle stays small.
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=os.path.join(self.path, self.multi_predictor_file), object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        # NOTE(security): this unpickles the file found under `path`; only load from directories you trust.
        path = os.path.expanduser(path)
        # os.path.join copes with both a trailing separator and an empty path (current directory).
        return load_pkl.load(path=os.path.join(path, cls.multi_predictor_file))

    def get_predictor(self, label):
        """ Returns TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            # After fit()/save() only the path is stored, so load the model from disk.
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        """ Returns a private working copy of `data`, reading it via TabularDataset first if a path string was given. """
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        """ Shared implementation of `predict()` and `predict_proba()`.

            Labels are processed in `self.labels` order and each prediction is written into the
            working copy of `data`, so later predictors can use earlier predictions as features.
            Returns the label columns of that copy, or (if `as_proba`) a dict of per-label
            `predict_proba()` outputs.
        """
        data = self._get_data(data)
        predproba_dict = {}
        for label in self.labels:
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if as_proba:
            return predproba_dict
        return data[self.labels]

In [4]:
# Put features and targets side by side: MultilabelPredictor.fit() expects the label
# columns to live in the same frame as the features.
data = pd.concat([x, y], axis=1)

# Split the three frames in a single call so their rows stay aligned (80% train / 20% test).
# A fixed random_state makes the split - and everything trained on it - reproducible
# under Restart Kernel -> Run All.
data_train, data_test, x_train, x_test, y_train, y_test = train_test_split(
    data, x, y, test_size=0.2, random_state=0
)

In [5]:
# One TabularPredictor is trained per target column of `y`.
labels = y.columns

# No `path` is given, so the models go to a fresh time-stamped AutogluonModels/ folder.
predictor = MultilabelPredictor(labels=labels)
predictor.fit(train_data=data_train)

Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230521_174408\Predictor_Model Mass Magnitude\"
AutoGluon Version:  0.7.0
Python Version:     3.10.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Train Data Rows:    11880
Train Data Columns: 39
Label Column: Model Mass Magnitude
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (22.83, 0.73, 5.23366, 2.67333)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    26038.76 MB
	Train Data (Original)  Memory Usage: 3.46 MB (0.0% of available me

Fitting TabularPredictor for label: Model Mass Magnitude ...


	-2.466	 = Validation score   (-root_mean_squared_error)
	4.08s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: KNeighborsDist ...
	-2.2882	 = Validation score   (-root_mean_squared_error)
	0.02s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's rmse: 0.449114


	-0.4453	 = Validation score   (-root_mean_squared_error)
	1.55s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 0.464235


	-0.463	 = Validation score   (-root_mean_squared_error)
	1.44s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-0.6415	 = Validation score   (-root_mean_squared_error)
	7.32s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-0.4247	 = Validation score   (-root_mean_squared_error)
	80.38s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.6189	 = Validation score   (-root_mean_squared_error)
	1.51s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.4164	 = Validation score   (-root_mean_squared_error)
	9.99s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	-0.4687	 = Validation score   (-root_mean_squared_error)
	6.83s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.4707	 = Validation score   (-root_mean_squared_error)
	34.13s	 = Training   runtime
	0.02s	 = Validation run

[1000]	valid_set's rmse: 0.477384
[2000]	valid_set's rmse: 0.476957


	-0.4769	 = Validation score   (-root_mean_squared_error)
	7.63s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-0.3866	 = Validation score   (-root_mean_squared_error)
	0.19s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 156.5s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20230521_174408\Predictor_Model Mass Magnitude\")
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230521_174408\Predictor_Sim 1 Safety Factor (Inverted)\"
AutoGluon Version:  0.7.0
Python Version:     3.10.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Train Data Rows:    11880
Train Data Columns: 40
Label Column: Sim 1 Safety Factor (Inverted)
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and

Fitting TabularPredictor for label: Sim 1 Safety Factor (Inverted) ...


	0.01s	 = Validation runtime
Fitting model: LightGBMXT ...
	-2.7822	 = Validation score   (-root_mean_squared_error)
	0.51s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBM ...
	-2.7279	 = Validation score   (-root_mean_squared_error)
	0.51s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-2.8252	 = Validation score   (-root_mean_squared_error)
	10.94s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-2.7787	 = Validation score   (-root_mean_squared_error)
	2.97s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-2.8886	 = Validation score   (-root_mean_squared_error)
	1.63s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-2.955	 = Validation score   (-root_mean_squared_error)
	9.9s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	-3.1682	 = Validation score   (-root_mean_squared_error)
	0.98

MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('AutogluonModels\ag-20230521_174408\')


In [6]:
# Feature column names, captured once so that `fn` can label raw input passed to it.
data_cols = x.columns


def fn(x_f):
    """Predict all target columns for a batch of candidate designs.

    `x_f` is wrapped in a DataFrame whose columns are `data_cols` and handed to
    the fitted `predictor`; the output of `predictor.predict` is returned as is.
    """
    candidates = pd.DataFrame(x_f, columns=data_cols)
    return predictor.predict(candidates)

In [19]:
from pymoo.core.variable import Real, Integer, Binary, Choice
from decode_mcd.design_targets import *

# Ranges requested for each predicted quantity.
# NOTE(review): the two numbers are presumably the lower/upper bound of the target range -
# confirm against the decode_mcd ContinuousTarget signature.
design_targets = DesignTargets([
    ContinuousTarget("Model Mass Magnitude", 0, 5),
    ContinuousTarget("Sim 1 Safety Factor (Inverted)", 0, 1),
])
bonus_objs = ["Model Mass Magnitude", "Sim 1 Safety Factor (Inverted)"]

# Per-feature search bounds: the 1st and 99th percentile of each column of x.
lbs = np.quantile(x.values, 0.01, axis=0)
ubs = np.quantile(x.values, 0.99, axis=0)

# Every feature is declared as a real-valued variable within its percentile bounds.
# NOTE(review): this includes the one-hot columns (e.g. "Material=Steel"), which is why the
# sampled counterfactuals contain fractional values for them - consider Binary/Choice.
datatypes = [Real(bounds=(lb, ub)) for lb, ub in zip(lbs, ubs)]

# The first row of the dataset is the design to generate counterfactuals for.
query_x = x.iloc[0:1]
dp = data_package.DataPackage(x, y, query_x, design_targets, datatypes, x.columns, bonus_objs)
problem = MOP.MultiObjectiveProblem(dp, fn, [])
# NOTE(review): 500 appears to be the population size (n_eval grows by ~500 per generation
# in the recorded output); 20 is the number of generations to run.
generator = CounterfactualsGenerator(problem, 500, initialize_from_dataset=False)
generator.generate(20)

Initial population randomly initialized!
Training GA from 0 to 20 generations!
n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |      499 |      2 |  0.000000E+00 |  1.6040000000 |             - |             -
     2 |      999 |      3 |  0.000000E+00 |  1.1820000000 |  0.4816577589 |         ideal
     3 |     1499 |      9 |  0.000000E+00 |  0.9700000000 |  1.0000000000 |         ideal
     4 |     1999 |     10 |  0.000000E+00 |  0.9500000000 |  0.3166675259 |         ideal
     5 |     2499 |     11 |  0.000000E+00 |  0.9260000000 |  0.3109350892 |         ideal
     6 |     2999 |     14 |  0.000000E+00 |  0.8940000000 |  0.2141366762 |         ideal
     7 |     3499 |     11 |  0.000000E+00 |  0.8400000000 |  0.0414574911 |         ideal
     8 |     3999 |     13 |  0.000000E+00 |  0.7660000000 |  0.2447870140 |         ideal
     9 |     4499 |     20 |  0.000000E+00 |  0.6400000000 |  0.2250360849 |         ideal
    10 |   

In [22]:
# Number of counterfactuals to draw from the candidates produced by the generator.
num_samples = 10

# NOTE(review): the positional numbers are passed through unchanged; their meaning
# (presumably score weights for sample_with_dtai) should be confirmed against decode_mcd.
cfs = generator.sample_with_dtai(
    num_samples,
    0.5,
    0.2,
    0.5,
    0.2,
    np.array([1, 0.5]),
    include_dataset=False,
    num_dpp=10000,
)
print(cfs)

Collecting all counterfactual candidates!
Scoring all counterfactual candidates!
Calculating diversity matrix!
Sampling diverse set of counterfactual candidates!
Done! Returning CFs
   Material=Steel  Material=Aluminum  Material=Titanium  SSB_Include  \
0        1.000000           0.027727           0.021178     0.002650   
1        0.380201           0.877051           0.048397     0.000000   
2        0.066706           0.309734           0.013792     0.754630   
3        0.891239           0.038914           0.010385     0.495558   
4        0.976761           0.030608           0.017376     0.369496   
5        0.651423           0.124726           0.434200     0.015142   
6        0.946584           0.000000           0.004792     0.000000   
7        0.500876           0.048408           0.000415     0.067897   
8        0.009159           0.000000           0.000118     0.000000   
9        0.340405           0.044520           0.000544     0.124289   

   CSB_Include  CS Length