In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [2]:
df =  pd.read_csv("ANTT.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            695 non-null    int64 
 1   Make                  695 non-null    object
 2   Model                 695 non-null    object
 3   Year of manufacture   695 non-null    int64 
 4   Kilometer             695 non-null    int64 
 5   Previous owners       695 non-null    int64 
 6   Body style            695 non-null    object
 7   Horsepower            695 non-null    int64 
 8   Cubic capacity (ccm)  695 non-null    int64 
 9   Cylinders             695 non-null    int64 
 10  Doors                 695 non-null    int64 
 11  Steering              695 non-null    object
 12  Gearbox               695 non-null    object
 13  Gears                 695 non-null    int64 
 14  Transmission          695 non-null    object
 15  Front breaks          695 non-null    ob

In [3]:
sys.path.append('./')

In [4]:
def prepare_ANTT_dataset(filename):
    class_name = 'class'
    df = pd.read_csv(filename, skipinitialspace = True, keep_default_na = True)
    df = df.drop(df.columns[0],axis = 1)
    new_col = []
    for item in df['Price (£)']:
        if 0 < item <= 30000:
            new_col.append('Low Price')
        else:
            new_col.append('High Price')
    df[class_name] = new_col
    return df, class_name

In [5]:
from collections import defaultdict


def prepare_dataset(df, class_name):

    df = remove_missing_values(df)

    numeric_columns = get_numeric_columns(df)

    rdf = df

    df, feature_names, class_values = one_hot_encoding(df, class_name)

    real_feature_names = get_real_feature_names(rdf, numeric_columns, class_name)

    rdf = rdf[real_feature_names + (class_values if isinstance(class_name, list) else [class_name])]

    features_map = get_features_map(feature_names, real_feature_names)

    return df, feature_names, class_values, numeric_columns, rdf, real_feature_names, features_map


def get_features_map(feature_names, real_feature_names):
    features_map = defaultdict(dict)
    i = 0
    j = 0
    while i < len(feature_names) and j < len(real_feature_names):
        #Nếu tên 2 cột thuộc tính giống nhau: 
        if feature_names[i] == real_feature_names[j]:
            # Loại bỏ phần tiền tố được thêm vào khi thực hiện one-hot encoding sau đó lưu vào bản đồ
            # keys = j, values = {'Thuộc tính': i }
            features_map[j][feature_names[i].replace('%s=' % real_feature_names[j], '')] = i
            i += 1
            j += 1
        #Nếu tên cột thuộc tính trong biến feature_names (sau khi OHE) bắt đầu bằng từ của cột thuộc tính ban đầu
        elif feature_names[i].startswith(real_feature_names[j]):
            # Loại bỏ phần tiền tố được thêm vào khi thực hiện one-hot encoding sau đó lưu vào bản đồ
            # keys = j, values = {'Thuộc tính': i }
            features_map[j][feature_names[i].replace('%s=' % real_feature_names[j], '')] = i
            i += 1
        else:
            j += 1
    return features_map


def get_real_feature_names(rdf, numeric_columns, class_name):
    if isinstance(class_name, list):
        real_feature_names = [c for c in rdf.columns if c in numeric_columns and c not in class_name]
        real_feature_names += [c for c in rdf.columns if c not in numeric_columns and c not in class_name]
    else:
        #Thêm toàn bộ tên cột kiểu số và kiểu phân loại vào list rồi lưu vào real_features_names.
        real_feature_names = [c for c in rdf.columns if c in numeric_columns and c != class_name]
        real_feature_names += [c for c in rdf.columns if c not in numeric_columns and c != class_name]
    return real_feature_names


def one_hot_encoding(df, class_name):
    if not isinstance(class_name, list):
        # dfX thực hiện One_hot_encoding (ứng với những sample có giá trị giống với giá trị của cột thì vị trí đó được đánh số 1)
        dfX = pd.get_dummies(df[[c for c in df.columns if c != class_name]], prefix_sep='=')
        # Lấy ra các giá trị khác nhau của biến target và gán số cho từng giá trị đó: (High Price = 0, Low Price = 1)
        class_name_map = {v: k for k, v in enumerate(sorted(df[class_name].unique()))}
        # Thay thế các giá trị đó bằng giá trị số tương ứng
        dfY = df[class_name].map(class_name_map)
        # Nhập dfX và dfY lại tạo thành dataframe hoàn chỉnh sau khi One_hot_encoding.
        df = pd.concat([dfX, dfY], axis=1).reindex(dfX.index)
        feature_names = list(dfX.columns) # Lưu toàn bộ biến độc lập vào kiểu danh sách sau đó lưu vào biến feature_name
        class_values = sorted(class_name_map) # Sắp xếp lại giá trị số trong biến class_name_map
    else: # isinstance(class_name, list)
        dfX = pd.get_dummies(df[[c for c in df.columns if c not in class_name]], prefix_sep='=')
        # class_name_map = {v: k for k, v in enumerate(sorted(class_name))}
        class_values = sorted(class_name)
        dfY = df[class_values]
        df = pd.concat([dfX, dfY], axis=1).reindex(dfX.index)
        feature_names = list(dfX.columns)
    return df, feature_names, class_values


def remove_missing_values(df):
    for column_name, nbr_missing in df.isna().sum().to_dict().items():
        if nbr_missing > 0:
            if column_name in df._get_numeric_data().columns:
                mean = df[column_name].mean()
                df[column_name].fillna(mean, inplace=True)
            else:
                mode = df[column_name].mode().values[0]
                df[column_name].fillna(mode, inplace=True)
    return df


def get_numeric_columns(df):
    numeric_columns = list(df._get_numeric_data().columns)
    return numeric_columns



In [6]:
def neuclidean(x, y):
    return 0.5 * np.var(x - y) / (np.var(x) + np.var(y))


def nmeandev(x, y):  # normalized mean deviation
    return np.mean(np.abs(x-y)/np.max([np.abs(x), np.abs(y)], axis=0))

def record2str(x, feature_names, numeric_columns):
    xd = vector2dict(x, feature_names)
    s = '{ '
    for att, val in xd.items():
        if att not in numeric_columns and val == 0.0:
            continue
        if att in numeric_columns:
            s += '%s = %s, ' % (att, val)
        else:
            att_split = att.split('=')
            s += '%s = %s, ' % (att_split[0], att_split[1])

    s = s[:-2] + ' }'
    return s
def vector2dict(x, feature_names):
    return {k: v for k, v in zip(feature_names, x)}

## Read and clean dataset

In [7]:
class_name = 'class'
df = pd.read_csv('ANTT_1.csv', skipinitialspace = True, keep_default_na = True)
df = df.drop(df.columns[0],axis = 1)
new_col = []
for item in df['Price (£)']:
    if 0 < item <= 30000:
        new_col.append('Low Price')
    else:
        new_col.append('High Price')
df[class_name] = new_col
df = df.drop('Price (£)', axis = 1)

In [8]:
df

Unnamed: 0,Year of manufacture,Kilometer,Previous owners,Horsepower,Cubic capacity (ccm),Cylinders,Doors,Steering,Gearbox,Gears,Transmission,Front breaks,Rear breaks,Fuel type,class
0,2000,99800,3,251,1781,4,2,Left (LHD),Manual,6,4WD,Disc,Disc,Petrol,Low Price
1,1979,82000,6,127,1962,4,2,Left (LHD),Manual,5,Rear,Disc,Disc,Petrol,Low Price
2,1990,201000,3,147,2849,6,2,Left (LHD),Automatic,4,Rear,Disc,Disc,Petrol,Low Price
3,1981,60000,3,205,2906,8,2,Left (LHD),Manual,5,Rear,Disc,Disc,Petrol,High Price
4,1966,161000,5,150,2496,6,2,Left (LHD),Manual,4,Rear,Disc,Disc,Petrol,High Price
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,2000,11,1,485,5935,12,2,Left (LHD),Manual,6,Rear,Disc,Disc,Petrol,High Price
691,1959,115902,3,100,2000,4,2,Left (LHD),Manual,4,Rear,Disc,Drum,Petrol,Low Price
692,1983,119895,6,231,4973,8,2,Left (LHD),Automatic,4,Rear,Disc,Disc,Petrol,Low Price
693,1971,73971,2,130,2341,6,2,Left (LHD),Manual,5,Rear,Disc,Disc,Petrol,High Price


In [9]:
df, feature_names, class_values, numeric_columns, rdf, real_feature_names, features_map = prepare_dataset(df,class_name)
# df: lưu dataframe sau khi thực hiện One_hot_encoding.
# feature_names: Danh sách các cột tạo ra sau khi thực hiện One_hot_encoding với các attribute
# class_values: Lấy ra các giá trị khác nhau của biến target và gán số cho từng giá trị đó: 
# rồi sắp xếp theo thứ tự tăng dần: (High Price = 0, Low Price = 1) class_values = [0,1]
# numeric_columns: Lưu tất cả các tên cột có giá trị là số trong df ban đầu khi chưa encode.
# real_feature_names: lưu trữ toàn bộ cột dữ liệu của df ban đầu trừ cột class
# rdf: lưu trữ lại dataframe ban đầu trước khi thực hiện encode và giữ lại cột class
# features_map: Bản đồ này sẽ chứa thông tin về cách các biến độc lập trong dữ liệu đã được one-hot encoded 
# tương ứng với các biến độc lập trong dữ liệu gốc.

In [10]:
rdf.shape

(695, 15)

In [11]:
df

Unnamed: 0,Year of manufacture,Kilometer,Previous owners,Horsepower,Cubic capacity (ccm),Cylinders,Doors,Gears,Steering=Left (LHD),Steering=Right (RHD),...,Transmission=4WD,Transmission=Front,Transmission=Rear,Front breaks=Disc,Front breaks=Drum,Rear breaks=Disc,Rear breaks=Drum,Fuel type=Diesel,Fuel type=Petrol,class
0,2000,99800,3,251,1781,4,2,6,True,False,...,True,False,False,True,False,True,False,False,True,1
1,1979,82000,6,127,1962,4,2,5,True,False,...,False,False,True,True,False,True,False,False,True,1
2,1990,201000,3,147,2849,6,2,4,True,False,...,False,False,True,True,False,True,False,False,True,1
3,1981,60000,3,205,2906,8,2,5,True,False,...,False,False,True,True,False,True,False,False,True,0
4,1966,161000,5,150,2496,6,2,4,True,False,...,False,False,True,True,False,True,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,2000,11,1,485,5935,12,2,6,True,False,...,False,False,True,True,False,True,False,False,True,0
691,1959,115902,3,100,2000,4,2,4,True,False,...,False,False,True,True,False,False,True,False,True,1
692,1983,119895,6,231,4973,8,2,4,True,False,...,False,False,True,True,False,True,False,False,True,1
693,1971,73971,2,130,2341,6,2,5,True,False,...,False,False,True,True,False,True,False,False,True,0


## Split to train & test

In [12]:
#Lưu trữ toàn bộ giá trị của toàn bộ hàng dữ liệu sau khi thực hiện encode. (Mỗi hàng chứa 504 giá trị nên
# mỗi phần tử trong list sẽ là một list gồm có 504 giá trị kiểu số nguyên)
df[feature_names].values 

array([[2000, 99800, 3, ..., False, False, True],
       [1979, 82000, 6, ..., False, False, True],
       [1990, 201000, 3, ..., False, False, True],
       ...,
       [1983, 119895, 6, ..., False, False, True],
       [1971, 73971, 2, ..., False, False, True],
       [2013, 73971, 2, ..., False, True, False]], dtype=object)

In [13]:
#Lưu trữ toàn bộ giá trị target của các hàng dữ liệu (đã được encode)
df[class_name].values

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,

In [14]:
#Lưu trữ toàn bộ giá trị của toàn bộ hàng dữ liệu trước khi thực hiện encode. (Mỗi hàng chứa 18 giá trị nên
# mỗi phần tử trong list sẽ là một list gồm có 18 giá trị kiểu khác nhau)
rdf[real_feature_names].values

array([[2000, 99800, 3, ..., 'Disc', 'Disc', 'Petrol'],
       [1979, 82000, 6, ..., 'Disc', 'Disc', 'Petrol'],
       [1990, 201000, 3, ..., 'Disc', 'Disc', 'Petrol'],
       ...,
       [1983, 119895, 6, ..., 'Disc', 'Disc', 'Petrol'],
       [1971, 73971, 2, ..., 'Disc', 'Disc', 'Petrol'],
       [2013, 73971, 2, ..., 'Disc', 'Disc', 'Diesel']], dtype=object)

In [15]:
#Lưu trữ toàn bộ giá trị target của các hàng dữ liệu (trước khi được encode)
rdf[class_name].values

array(['Low Price', 'Low Price', 'Low Price', 'High Price', 'High Price',
       'Low Price', 'High Price', 'Low Price', 'Low Price', 'High Price',
       'High Price', 'High Price', 'High Price', 'High Price',
       'High Price', 'High Price', 'Low Price', 'Low Price', 'High Price',
       'High Price', 'High Price', 'High Price', 'High Price',
       'High Price', 'Low Price', 'High Price', 'High Price',
       'High Price', 'High Price', 'High Price', 'Low Price',
       'High Price', 'High Price', 'Low Price', 'Low Price', 'High Price',
       'High Price', 'High Price', 'High Price', 'Low Price',
       'High Price', 'High Price', 'High Price', 'High Price',
       'High Price', 'High Price', 'Low Price', 'High Price',
       'High Price', 'High Price', 'High Price', 'High Price',
       'Low Price', 'Low Price', 'Low Price', 'Low Price', 'Low Price',
       'Low Price', 'Low Price', 'High Price', 'High Price', 'High Price',
       'High Price', 'Low Price', 'High Price', 'High P

In [16]:
test_size = 0.20
random_state = 0
X_train, X_test, Y_train, Y_test = train_test_split(df[feature_names].values, df[class_name].values, 
                                                    test_size=test_size,
                                                    random_state=random_state, 
                                                    stratify=df[class_name].values)

x_train, x_test, y_train, y_test = train_test_split(rdf[real_feature_names].values, rdf[class_name].values, 
                              test_size=test_size,
                              random_state=random_state, 
                              stratify=df[class_name].values)

In [17]:
K = x_test
K

array([[1971, 73971, 2, ..., 'Disc', 'Disc', 'Petrol'],
       [1999, 196051, 6, ..., 'Disc', 'Disc', 'Petrol'],
       [1974, 42000, 3, ..., 'Disc', 'Disc', 'Petrol'],
       ...,
       [1987, 76000, 1, ..., 'Disc', 'Disc', 'Petrol'],
       [1988, 95040, 6, ..., 'Disc', 'Disc', 'Petrol'],
       [1962, 10046, 2, ..., 'Drum', 'Drum', 'Petrol']], dtype=object)

## Train a black box classifier (Random Forest)

In [18]:
bb = RandomForestClassifier(n_estimators=100, random_state=random_state)
bb.fit(X_train, Y_train)

In [19]:
def bb_predict(X):
    return bb.predict(X)

def bb_predict_proba(X):
    return bb.predict_proba(X)

In [20]:
Y_pred = bb_predict(X_test)

print('Accuracy %.3f' % accuracy_score(Y_test, Y_pred))
print('F1-measure %.3f' % f1_score(Y_test, Y_pred))

Accuracy 0.820
F1-measure 0.675


In [21]:
print(y_test[82], Y_test[82], Y_pred[82])

Low Price 1 1


In [22]:
print(y_test[0],Y_test[0], Y_pred[0])

High Price 0 0


## Select an record to explain

In [23]:
import random

In [24]:
i2e = np.random.randint(0,len(X_test))
x = X_test[i2e]

print('x = %s' % record2str(x, feature_names, numeric_columns))
print('')

x = { Year of manufacture = 1969, Kilometer = 55303, Previous owners = 1, Horsepower = 18, Cubic capacity (ccm) = 499, Cylinders = 2, Doors = 2, Gears = 4, Steering = Left (LHD), Gearbox = Manual, Transmission = Rear, Front breaks = Drum, Rear breaks = Drum, Fuel type = Petrol }



In [25]:
i2e

98

In [26]:
X_test[i2e]

array([1969, 55303, 1, 18, 499, 2, 2, 4, True, False, False, True, False,
       False, False, True, False, True, False, True, False, True],
      dtype=object)

In [27]:
Y_pred[i2e]

1

In [28]:
y_test[i2e]

'Low Price'

# LORE

In [29]:
sys.path.append('./lore/')

In [30]:
import numpy as np

import itertools
from functools import partial

from scipy.spatial.distance import cdist

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

from rule import Rule, compact_premises

from explanation import Explanation, MultilabelExplanation
from decision_tree import learn_local_decision_tree
from neighgen import RandomGenerator, GeneticGenerator, RandomGeneticGenerator, ClosestInstancesGenerator
from neighgen import GeneticProbaGenerator, RandomGeneticProbaGenerator
from rule import get_rule, get_counterfactual_rules
from util import calculate_feature_values, neuclidean, multilabel2str, multi_dt_predict


def default_kernel(d, kernel_width):
    return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))


# LOcal Rule-based Explanation Method
class LOREM(object):

    def __init__(self, K, bb_predict, feature_names, class_name, class_values, numeric_columns, features_map,
                 neigh_type='genetic', categorical_use_prob=True, continuous_fun_estimation=False,
                 size=1000, ocr=0.1, multi_label=False, one_vs_rest=False, filter_crules=True, init_ngb_fn=True,
                 kernel_width=None, kernel=None, random_state=None, verbose=False, **kwargs):

        self.random_state = random_state
        self.bb_predict = bb_predict
        self.K = K
        self.class_name = class_name
        self.feature_names = feature_names
        self.class_values = class_values
        self.numeric_columns = numeric_columns
        self.features_map = features_map
        self.neigh_type = neigh_type
        self.multi_label = multi_label
        self.one_vs_rest = one_vs_rest
        self.filter_crules = self.bb_predict if filter_crules else None
        self.verbose = verbose

        self.features_map_inv = None
        if self.features_map:
            self.features_map_inv = dict()
            for idx, idx_dict in self.features_map.items():
                for k, v in idx_dict.items():
                    self.features_map_inv[v] = idx
        #Lấy ra cặp chỉ số của sample trong features_map và lưu đảo ngược lại.
        #Khởi tạo Kernal_width
        kernel_width = np.sqrt(len(self.feature_names)) * .75 if kernel_width is None else kernel_width
        self.kernel_width = float(kernel_width)

        kernel = default_kernel if kernel is None else kernel
        self.kernel = partial(kernel, kernel_width=kernel_width)

        np.random.seed(self.random_state)

        if init_ngb_fn:
            self.__init_neighbor_fn(ocr, categorical_use_prob, continuous_fun_estimation, 
                                    size, kwargs)

    def explain_instance(self, x, samples=1000, use_weights=True, metric=neuclidean):

        if isinstance(samples, int):
            if self.verbose:
                print('generating neighborhood - %s' % self.neigh_type)
            Z = self.neighgen_fn(x, samples)
        else:
            Z = samples

        Yb = self.bb_predict(Z)
        if self.multi_label:
            Z = np.array([z for z, y in zip(Z, Yb) if np.sum(y) > 0])
            Yb = self.bb_predict(Z)

        if self.verbose:
            if not self.multi_label:
                neigh_class, neigh_counts = np.unique(Yb, return_counts=True)
                neigh_class_counts = {self.class_values[k]: v for k, v in zip(neigh_class, neigh_counts)}
            else:
                neigh_counts = np.sum(Yb, axis=0)
                neigh_class_counts = {self.class_values[k]: v for k, v in enumerate(neigh_counts)}

            print('synthetic neighborhood class counts %s' % neigh_class_counts)

        weights = None if not use_weights else self.__calculate_weights__(Z, metric)

        if self.one_vs_rest and self.multi_label:
            exp = self.__explain_tabular_instance_multiple_tree(x, Z, Yb, weights)
        else:  # binary, multiclass, multilabel all together
            exp = self.__explain_tabular_instance_single_tree(x, Z, Yb, weights)

        return exp

    def __calculate_weights__(self, Z, metric):
        if np.max(Z) != 1 and np.min(Z) != 0:
            Zn = (Z - np.min(Z)) / (np.max(Z) - np.min(Z))
            distances = cdist(Zn, Zn[0].reshape(1, -1), metric=metric).ravel()
        else:
            distances = cdist(Z, Z[0].reshape(1, -1), metric=metric).ravel()
        weights = self.kernel(distances)
        return weights

    def __explain_tabular_instance_single_tree(self, x, Z, Yb, weights):

        if self.verbose:
            print('learning local decision tree')

        idx_train = len(Z) - int(len(Z) * 0.05)
        dt = learn_local_decision_tree(Z[:idx_train], Yb[:idx_train], weights[:idx_train], self.class_values, self.multi_label, self.one_vs_rest,
                                       prune_tree=False)
        Yc = dt.predict(Z)

        fidelity = dt.score(Z, Yb, sample_weight=weights)

        if self.verbose:
            print('retrieving explanation')

        rule = get_rule(x, dt, self.feature_names, self.class_name, self.class_values, self.numeric_columns,
                        self.multi_label)
        crules, deltas = get_counterfactual_rules(x, Yc[0], dt, Z, Yc, self.feature_names, self.class_name,
                                                  self.class_values, self.numeric_columns, self.features_map,
                                                  self.features_map_inv, self.filter_crules, self.multi_label)

        exp = Explanation()
        exp.bb_pred = Yb[0]
        exp.dt_pred = Yc[0]
        exp.rule = rule
        exp.crules = crules
        exp.deltas = deltas
        exp.dt = dt
        exp.fidelity = fidelity

        return exp

    def __explain_tabular_instance_multiple_tree(self, x, Z, Yb, weights):

        dt_list = list()
        premises = list()
        rule_list = list()
        crules_list = list()
        deltas_list = list()
        nbr_labels = len(self.class_name)

        if self.verbose:
            print('learning %s local decision trees' % nbr_labels)

        for l in range(nbr_labels):
            if np.sum(Yb[:, l]) == 0 or np.sum(Yb[:, l]) == len(Yb):
                outcome = 0 if np.sum(Yb[:, l]) == 0 else 1
                rule = Rule([], outcome, [0, 1])
                crules, deltas = list(), list()
                dt = DummyClassifier()
                dt.fit(np.zeros(Z.shape[1]).reshape(1, -1), np.array([outcome]))
            else:
                idx_train = len(Z) - int(len(Z) * 0.05)
                dt = learn_local_decision_tree(Z[:idx_train], Yb[:idx_train, l], weights[:idx_train], self.class_values, self.multi_label,
                                               self.one_vs_rest, prune_tree=False)
                Yc = dt.predict(Z)
                class_values = [0, 1]
                rule = get_rule(x, dt, self.feature_names, self.class_name[l], class_values, self.numeric_columns,
                                multi_label=False)
                crules, deltas = get_counterfactual_rules(x, Yc[0], dt, Z, Yc, self.feature_names,
                                                          self.class_name[l], class_values, self.numeric_columns,
                                                          self.features_map, self.features_map_inv,
                                                          self.filter_crules, multi_label=False)

            dt_list.append(dt)
            rule_list.append(rule)
            premises.extend(rule.premises)
            crules_list.append(crules)
            deltas_list.append(deltas)

        if self.verbose:
            print('retrieving explanation')

        Yc = multi_dt_predict(Z, dt_list)
        fidelity = accuracy_score(Yb, Yc, sample_weight=weights)

        premises = compact_premises(premises)
        dt_outcome = multi_dt_predict(x.reshape(1, -1), dt_list)[0]
        cons = multilabel2str(dt_outcome, self.class_values)
        rule = Rule(premises, cons, self.class_name)

        exp = MultilabelExplanation()
        exp.bb_pred = Yb[0]
        exp.dt_pred = Yc[0]
        exp.rule = rule
        exp.crules = list(itertools.chain.from_iterable(crules_list))
        exp.deltas = list(itertools.chain.from_iterable(deltas_list))
        exp.dt = dt_list
        exp.fidelity = fidelity

        exp.rule_list = rule_list
        exp.crules_list = crules_list
        exp.deltas_list = deltas_list

        return exp

    def __init_neighbor_fn(self, ocr, categorical_use_prob, continuous_fun_estimation, size, kwargs):

        neighgen = None
        #Lấy index của những cột có giá trị là numeric trong feature_names
        numeric_columns_index = [i for i, c in enumerate(self.feature_names) if c in self.numeric_columns]

        self.feature_values = None
        if self.neigh_type in ['random', 'genetic', 'rndgen', 'geneticp', 'rndgenp']:
            if self.verbose:
                print('calculating feature values')

            self.feature_values = calculate_feature_values(self.K, numeric_columns_index,
                                                           categorical_use_prob=categorical_use_prob,
                                                           continuous_fun_estimation=continuous_fun_estimation,
                                                           size=size)

        nbr_features = len(self.feature_names)
        nbr_real_features = self.K.shape[1]

        if self.neigh_type in ['genetic', 'rndgen', 'geneticp', 'rndgenp']:
            alpha1 = kwargs.get('alpha1', 0.5)
            alpha2 = kwargs.get('alpha2', 0.5)
            metric = kwargs.get('metric', neuclidean)
            ngen = kwargs.get('ngen', 10)
            mutpb = kwargs.get('mutpb', 0.5)
            cxpb = kwargs.get('cxpb', 0.7)
            tournsize = kwargs.get('tournsize', 3)
            halloffame_ratio = kwargs.get('halloffame_ratio', 0.1)
            random_seed = self.random_state

            if self.neigh_type == 'genetic':
                neighgen = GeneticGenerator(self.bb_predict, self.feature_values, self.features_map, nbr_features,
                                            nbr_real_features, numeric_columns_index, ocr=ocr, alpha1=alpha1,
                                            alpha2=alpha2, metric=metric, ngen=ngen,
                                            mutpb=mutpb, cxpb=cxpb, tournsize=tournsize,
                                            halloffame_ratio=halloffame_ratio, random_seed=random_seed,
                                            verbose=self.verbose)
            elif self.neigh_type == 'rndgen':
                neighgen = RandomGeneticGenerator(self.bb_predict, self.feature_values, self.features_map,
                                                  nbr_features, nbr_real_features, numeric_columns_index,
                                                  ocr=ocr, alpha1=alpha1, alpha2=alpha2,
                                                  metric=metric, ngen=ngen, mutpb=mutpb, cxpb=cxpb,
                                                  tournsize=tournsize, halloffame_ratio=halloffame_ratio,
                                                  random_seed=random_seed, verbose=self.verbose)
            elif self.neigh_type == 'geneticp':
                bb_predict_proba = kwargs.get('bb_predict_proba', None)
                neighgen = GeneticProbaGenerator(self.bb_predict, self.feature_values, self.features_map, nbr_features,
                                                 nbr_real_features, numeric_columns_index, ocr=ocr, alpha1=alpha1,
                                                 alpha2=alpha2, metric=metric, ngen=ngen,
                                                 mutpb=mutpb, cxpb=cxpb, tournsize=tournsize,
                                                 halloffame_ratio=halloffame_ratio,
                                                 bb_predict_proba=bb_predict_proba,
                                                 random_seed=random_seed,
                                                 verbose=self.verbose)

            elif self.neigh_type == 'rndgenp':
                bb_predict_proba = kwargs.get('bb_predict_proba', None)
                neighgen = RandomGeneticProbaGenerator(self.bb_predict, self.feature_values, self.features_map,
                                                       nbr_features, nbr_real_features, numeric_columns_index,
                                                       ocr=ocr, alpha1=alpha1, alpha2=alpha2,
                                                       metric=metric, ngen=ngen, mutpb=mutpb, cxpb=cxpb,
                                                       tournsize=tournsize, halloffame_ratio=halloffame_ratio,
                                                       bb_predict_proba=bb_predict_proba,
                                                       random_seed=random_seed, verbose=self.verbose)

        elif self.neigh_type == 'random':
            neighgen = RandomGenerator(self.bb_predict, self.feature_values, self.features_map, nbr_features,
                                       nbr_real_features, numeric_columns_index, ocr=ocr)
        elif self.neigh_type == 'closest':
            Kc = kwargs.get('Kc', None)
            k = kwargs.get('k', None)
            type = kwargs.get('core_neigh_type', 'simple')
            alphaf = kwargs.get('alphaf', 0.5)
            alphal = kwargs.get('alphal', 0.5)
            metric_features = kwargs.get('metric_features', neuclidean)
            metric_labels = kwargs.get('metric_labels', neuclidean)
            neighgen = ClosestInstancesGenerator(self.bb_predict, self.feature_values, self.features_map, nbr_features,
                                                 nbr_real_features, numeric_columns_index, ocr=ocr,
                                                 K=Kc, rK=self.K, k=k, core_neigh_type=type, alphaf=alphaf,
                                                 alphal=alphal, metric_features=metric_features,
                                                 metric_labels=metric_labels, categorical_use_prob=categorical_use_prob,
                                                 continuous_fun_estimation=continuous_fun_estimation, size=size,
                                                 verbose=self.verbose)
        else:
            print('unknown neighborhood generator')
            raise Exception

        self.neighgen_fn = neighgen.generate

In [31]:
lore_explainer = LOREM(K, bb_predict, feature_names, class_name, class_values, numeric_columns, features_map,
                       neigh_type='geneticp', categorical_use_prob=True, continuous_fun_estimation=False, 
                       size=1000, ocr=0.1, random_state=random_state, ngen=10, bb_predict_proba=bb_predict_proba, 
                       verbose=True)

calculating feature values


In [32]:
exp = lore_explainer.explain_instance(x, samples=500, use_weights=True, metric=nmeandev)

generating neighborhood - geneticp
gen	nevals	avg     	min     	max     
0  	250   	0.496654	0.496654	0.496654
1  	211   	0.66697 	0.0294829	0.993276
2  	215   	0.822577	0.09215  	0.993307
3  	205   	0.87719 	0.0461333	0.993307
4  	208   	0.891243	0.0527394	0.993307
5  	219   	0.900892	0.411892 	0.993307
6  	214   	0.894238	0.247616 	0.993307
7  	210   	0.901462	0.0331902	0.993307
8  	217   	0.882259	0.0536189	0.993307
9  	220   	0.896061	0.120369 	0.993307
10 	220   	0.898801	0.0417277	0.993307
gen	nevals	avg	min	max
0  	250   	0.5	0.5	0.5
1  	230   	0.571183	0.0917111	0.987006
2  	196   	0.706548	0.149846 	0.987006
3  	215   	0.825921	0.27878  	0.989733
4  	203   	0.905684	0.360642 	0.990619
5  	214   	0.928407	0.490313 	0.990911
6  	206   	0.927701	0.480039 	0.990911
7  	225   	0.931023	0.425994 	0.990911
8  	200   	0.934895	0.399679 	0.990911
9  	223   	0.934496	0.479345 	0.990911
10 	207   	0.939436	0.48251  	0.991037
synthetic neighborhood class counts {'High Price': 397, 'Low Pr

In [33]:
print(exp)

r = { Horsepower <= 50.50, Cubic capacity (ccm) <= 1689.00, Cylinders <= 5.50 } --> { class: Low Price }
c = { { Cubic capacity (ccm) > 1689.00, Cylinders > 5.50 } }


In [34]:
Y_pred[i2e]

1

In [35]:
x_test[i2e][11]

'Drum'

In [36]:
x1 = x.copy()


In [37]:
x1 = x1.reshape(1,-1)

In [38]:
x1

array([[1969, 55303, 1, 18, 499, 2, 2, 4, True, False, False, True,
        False, False, False, True, False, True, False, True, False, True]],
      dtype=object)

In [39]:
Y_pred = bb_predict(x1)


In [40]:
Y_pred

array([1], dtype=int64)

## Train a black box classifier (Multi-layer Perceptron)

In [55]:
bb1 = MLPClassifier(random_state=100)
bb1.fit(X_train, Y_train)

In [56]:
def bb1_predict(X):
    return bb1.predict(X)

def bb1_predict_proba(X):
    return bb1.predict_proba(X)

In [57]:
Y_pred_1 = bb1_predict(X_test)

print('Accuracy %.3f' % accuracy_score(Y_test, Y_pred_1))
print('F1-measure %.3f' % f1_score(Y_test, Y_pred_1))

Accuracy 0.683
F1-measure 0.000


In [58]:
print(y_test[82], Y_test[82], Y_pred_1[82])

Low Price 1 0


In [59]:
print(y_test[0],Y_test[0], Y_pred_1[0])

High Price 0 0


## Select an record to explain

In [60]:
lore_explainer_1 = LOREM(K, bb1_predict, feature_names, class_name, class_values, numeric_columns, features_map,
                       neigh_type='geneticp', categorical_use_prob=True, continuous_fun_estimation=False, 
                       size=1000, ocr=0.1, random_state=random_state, ngen=10, bb_predict_proba=bb1_predict_proba, 
                       verbose=True)

calculating feature values


In [47]:
exp_1 = lore_explainer_1.explain_instance(x, samples=500, use_weights=True, metric=nmeandev)

generating neighborhood - geneticp
gen	nevals	avg     	min     	max     
0  	250   	0.496654	0.496654	0.496654
1  	211   	0.742202	0.496654	0.993307
2  	215   	0.958951	0.496654	0.993307
3  	205   	0.979638	0.503506	0.993307
4  	208   	0.982805	0.500363	0.993307
5  	219   	0.983226	0.496654	0.993307
6  	214   	0.978055	0.500049	0.993307
7  	210   	0.986649	0.508514	0.993307
8  	217   	0.970974	0.496654	0.993307
9  	220   	0.97911 	0.500046	0.993307
10 	220   	0.976246	0.496654	0.993307
gen	nevals	avg	min	max
0  	250   	0.5	0.5	0.5
1  	230   	0.492266	0.0100723	0.5
2  	196   	0.486575	0.0102186	0.5
3  	215   	0.48417 	0.00693286	0.5
4  	203   	0.483678	0.00802455	0.5
5  	214   	0.496961	0.248971  	0.5
6  	206   	0.490458	0.008787  	0.5
7  	225   	0.48491 	0.0102119 	0.5
8  	200   	0.491929	0.00674751	0.5
9  	223   	0.491843	0.00995049	0.5
10 	207   	0.487704	0.00779836	0.5
synthetic neighborhood class counts {'High Price': 550, 'Low Price': 1}
learning local decision tree
retrieving exp

In [48]:
print(exp_1)

r = {  } --> { class: High Price }
c =  }


In [49]:
Y_pred_1[i2e]

0

In [50]:
x

array([1969, 55303, 1, 18, 499, 2, 2, 4, True, False, False, True, False,
       False, False, True, False, True, False, True, False, True],
      dtype=object)

In [51]:
x2 = x.copy()


In [52]:
x2 = x2.reshape(1,-1)

In [53]:
x2

array([[1969, 55303, 1, 18, 499, 2, 2, 4, True, False, False, True,
        False, False, False, True, False, True, False, True, False, True]],
      dtype=object)

In [54]:
Y_pred_1 = bb_predict(x2)
Y_pred_1

array([1], dtype=int64)