# AIX360

In [20]:
# import numpy as np
# import pandas as pd
# import os
# import tensorflow as tf
# from pandas.api.types import is_string_dtype
# from sklearn.metrics import confusion_matrix, precision_score
# from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# from sklearn.model_selection import train_test_split
# from aix360.algorithms.contrastive import CEMExplainer, KerasClassifier
# from keras.models import Sequential, Model, model_from_json
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from keras.layers import Dense, Dropout
# from keras.utils.np_utils import to_categorical
# from keras import regularizers
# from keras import optimizers

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from pandas.api.types import is_string_dtype
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from aix360.algorithms.nncontrastive import NearestNeighborContrastiveExplainer
from keras.models import Sequential, Model, model_from_json
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from keras.layers import Dense, Dropout
from keras.utils.np_utils import to_categorical
from keras import regularizers
from keras import optimizers

In [2]:
# Load the raw German credit dataset from the local archive folder.
data = pd.read_csv('./archive/german_credit_data.csv')

# Build the working frame: every missing entry becomes the literal string
# "unknown". `data` itself is not modified.
data_copy = data.fillna(value="unknown")

# Column names as they are right now, before any column is encoded or scaled.
labels = data_copy.columns

# Filled by `normalize`: column name -> (min, max) of the values it was given.
dict_min_max_by_col = {}


def normalize(df, col_name):
    """Min-max scale numeric values into the [0, 1] range.

    The result is ``(df - min) / (max - min)``. As a side effect, the
    ``(min, max)`` pair is stored in ``dict_min_max_by_col[col_name]``.

    Args:
        df: Values to scale. In this notebook it is a single pandas Series,
            but anything supporting ``min()``, ``max()`` and arithmetic works.
        col_name: Key under which the ``(min, max)`` pair is recorded.

    Returns:
        A new object of the same shape as ``df`` holding the scaled values;
        ``df`` itself is not modified. When all values are identical
        (``max == min``) the result is all zeros rather than the NaN that
        a 0/0 division would give.
    """
    min_value = df.min()
    max_value = df.max()
    # Use the parameter, not the caller's global loop variable, as the key.
    dict_min_max_by_col[col_name] = (min_value, max_value)

    value_range = max_value - min_value
    # Scalar check only, so non-scalar ranges (e.g. a DataFrame input) fall
    # through to the plain element-wise formula below.
    if np.ndim(value_range) == 0 and value_range == 0:
        # Constant column: every value equals the minimum, so scale to 0.0.
        return (df - min_value).astype(float)
    return (df - min_value) / value_range

# Encode every original column: categorical columns are one-hot encoded, the
# target is made binary, and the remaining (numeric) columns are min-max scaled.
for col in labels:
    # String-typed columns are categorical; "Job" is always treated as categorical too.
    if is_string_dtype(data_copy[col]) or col == "Job":
        if col == 'Risk':
            # Binary target: 'good' -> 1, any other value -> 0.
            data_copy[col] = data_copy[col].apply(lambda x: 1 if x == 'good' else 0)
            continue

        # Append one indicator column per distinct value of `col` (named
        # "<col>_<value>"), then drop the original column. `dtype=int` keeps the
        # indicators as 0/1 integers whatever the pandas default dtype is.
        data_copy = pd.concat(
            [data_copy, pd.get_dummies(data_copy[col], prefix=col, dtype=int)],
            axis=1,
        )
        data_copy.drop(col, axis=1, inplace=True)
    else:
        # Scale with: normalized = (value - min_value) / (max_value - min_value)
        data_copy[col] = normalize(data_copy[col], col)


# Move the 'Risk' target so that it is the last column.
data_copy = data_copy[[c for c in data_copy if c not in ["Risk"]] + ["Risk"]]
display(data_copy)

Unnamed: 0,Age,Credit amount,Duration,Sex_female,Sex_male,Job_0,Job_1,Job_2,Job_3,Housing_free,...,Checking account_unknown,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk
0,0.857143,0.050567,0.029412,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.053571,0.313690,0.647059,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.535714,0.101574,0.117647,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3,0.464286,0.419941,0.558824,0,1,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
4,0.607143,0.254209,0.294118,0,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.214286,0.081765,0.117647,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
996,0.375000,0.198470,0.382353,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
997,0.339286,0.030483,0.117647,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1
998,0.071429,0.087763,0.602941,0,1,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


In [36]:
# Display names for the two target classes (0 -> 'Bad', 1 -> 'Good').
class_names = ['Bad', 'Good']

# Preparation of data for training
X = data_copy.drop("Risk", axis=1)  # Input features
Y = data_copy['Risk']  # Target

# Separate into x_train, x_test, y_train and y_test (25% held out for testing).
# The split is seeded because later cells pick samples by fixed position in
# x_test; without a seed those positions refer to different rows on each run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Per-feature maximum and minimum over the train and test rows together.
values = np.vstack((x_train, x_test))
values_max = np.max(values, axis=0)
values_min = np.min(values, axis=0)

In [None]:
#     model = Sequential()
#     model.add(Dense(30, input_dim=26, activation='relu')) # Add a 
#     model.add(Dense(1, activation='relu')) #""", activation='relu'"""
#     model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
#     model.summary()

In [4]:
# def create_model():
#     model = Sequential()
#     model.add(Dense(units=18, 
#                     input_dim=x_train.shape[1]))
#     # model.add(Dropout(0.35)) # Util para el overfitting
#     model.add(Dense(units=9, activation="sigmoid"))
#     model.add(Dense(units=1, 
#                     activation="sigmoid")) #Sigmoid porque al tener solo dos clases de salidas, devolderá
#                                            #un valor entre 0 i 1 para clasificar en que clase deben estar
#     model.compile(loss="binary_crossentropy", 
#                   optimizer="sgd", # Combinación de dos tecnicas: descenso del gradiente con momentum 
#                                     # y estimación adaptativa 
#                   metrics=['accuracy'])
#     return model

# model = create_model()

# params = {"epochs": 50, 
#           "batch_size": 128, 
#           "verbose": 3, 
#           "shuffle": False}

# model.fit(x_train, y_train, 
#           epochs=params["epochs"],
#           batch_size=params["batch_size"],
#           shuffle=params["shuffle"],
#           verbose=params["verbose"], 
#           validation_data=(x_test, y_test))

# score_train = model.evaluate(x_train, y_train, verbose=0)
# print("Train accuracy:", score_train[1])

# score_test = model.evaluate(x_test, y_test, verbose=0)
# print("Test accuracy:", score_test[1])

In [37]:
# Train a 100-tree random forest on the training split (seeded for reproducibility).
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

# accuracy_score is the fraction of correct predictions, so it is labelled
# "Accuracy" rather than "Precision", which is a different metric.
print("Accuracy:", accuracy)

Precision: 0.708


In [38]:
# Position (within x_test) of the sample used as a quick sanity check.
index_to_prove = 6
# Slice rather than scalar-index so the sample stays a one-row DataFrame.
value_to_predict = x_test.iloc[index_to_prove:index_to_prove + 1]
# Index label of that row; x_test keeps the labels of data_copy.
index = value_to_predict.index[0]

print("Prediction made by the model:", model.predict(value_to_predict))
# `index` is a label, so look it up with .loc instead of a positional slice.
print("Real value:", data_copy.loc[[index], "Risk"])

# print("Prediction made by the model:", model.predict(value_to_predict.values))
# print("Prediction classes:", model.predict_classes(value_to_predict.values))
# print("Real classification of the client with id ", data_copy[index: index + 1]['Risk'])

Predicion made by the model: [1]
Real value: 723    1
Name: Risk, dtype: int64


In [39]:
# Build the nearest-neighbor contrastive explainer around the trained model's
# `predict` method (this replaces the earlier KerasClassifier / CEMExplainer setup).
explainer = NearestNeighborContrastiveExplainer(
    model=model.predict,
    neighbors=3,
    embedding_dim=4,
    layers_config=[],
)

# Fit the explainer on the training features with a fixed seed.
explainer.fit(
    x_train,
    epochs=50,
    numeric_scaling=None,
    random_seed=1,
)

# my_ae_model = None



<keras.callbacks.History at 0x29f8a8c9300>

In [54]:
# Position (within x_test) of the sample that will be explained.
index_test = 67

# Keep the sample as a one-row DataFrame instead of a Series.
X = x_test.iloc[index_test:].head(1)

# Index label carried by that row.
new_index = X.index[0]

display(X)

Unnamed: 0,Age,Credit amount,Duration,Sex_female,Sex_male,Job_0,Job_1,Job_2,Job_3,Housing_free,...,Checking account_rich,Checking account_unknown,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
527,0.410714,0.068945,0.0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [55]:
# arg_mode = "PN" # Find pertinent negative
# arg_max_iter = 1000 # Maximum number of iterations to search for the optimal PN for given parameter settings
# arg_init_const = 1 # Initial coefficient value for main loss term that encourages class change
# arg_b = 9 # No. of updates to the coefficient of the main loss term
# arg_kappa = 0.2 # Minimum confidence gap between the PNs (changed) class probability and original class' probability
# arg_beta = 1e-1 # Controls sparsity of the solution (L1 loss)
# arg_gamma = 100 # Controls how much to adhere to a (optionally trained) autoencoder
# arg_alpha = 0.01 # Penalizes L2 norm of the solution
# arg_threshold = 0 # Automatically turn off features <= arg_threshold if arg_threshold < 1
# arg_offset = 1 # the model assumes classifier trained on data normalized
#                 # in [-arg_offset, arg_offset] range, where arg_offset is 0 or 0.5
    
# adv_pn, delta_pn, info_pn = cem_explainer.explain_instance(X, arg_mode=arg_mode, 
#                                                            AE_model=my_ae_model, 
#                                                            arg_kappa=arg_kappa, 
#                                                            arg_b = arg_b,
#                                                            arg_max_iter=arg_max_iter, 
#                                                            arg_init_const=arg_init_const, 
#                                                            arg_beta=arg_beta,
#                                                            arg_gamma=arg_gamma, 
#                                                            arg_alpha=arg_alpha, 
#                                                            arg_threshold=arg_threshold,
#                                                            arg_offset=arg_offset)

# Ask the fitted nearest-neighbor contrastive explainer to explain the one-row sample X.
# NOTE(review): the following cell reads the result as [0]["neighbors"][0], so it is
# presumably a list with one entry per explained row, each holding a "neighbors"
# collection — confirm against the AIX360 documentation.
nearest_benign_constrastive = explainer.explain_instance(X)
# display(nearest_benign_constrastive[0]["neighbors"][0])
# display(X)





In [56]:
# The explained sample and the first neighbor returned for it by the explainer.
original_value = X
predicted_value = nearest_benign_constrastive[0]["neighbors"][0]

# Feature-wise gap between the sample and that neighbor.
delta = original_value - predicted_value

# Stack the three rows into one array: sample, neighbor, difference.
X3 = np.vstack((original_value, predicted_value, delta))

# Label the columns with the feature names and the rows with readable names.
dfre = (
    pd.DataFrame.from_records(X3)
    .set_axis(x_test.columns, axis=1)
    .rename(index={0: 'Original value', 1: 'Predicted value', 2: "Difference"})
)

# One line per feature, one column per row label.
dfret = dfre.T

def highlight_ce(s, col, ncols):
    """Return the per-cell CSS for one table row, highlighting changed features.

    Args:
        s: Row being styled; must support ``s[col]``.
        col: Name of the entry that decides whether the row is highlighted.
        ncols: Number of cells in the row, i.e. the length of the returned list.

    Returns:
        A list of ``ncols`` CSS strings: a yellow background when ``s[col]`` is
        not a ``str`` and differs from 0, a white background otherwise.
    """
    cell = s[col]
    is_changed = type(cell) is not str and cell != 0
    if not is_changed:
        return ["background-color:white"] * ncols
    return ["background-color: yellow"] * ncols

# Style the table row by row (axis=1); each row has 3 cells, and the
# "Difference" entry decides whether the row is highlighted.
dfret.style.apply(
    highlight_ce,
    axis=1,
    col="Difference",
    ncols=3,
)

Unnamed: 0,Original value,Predicted value,Difference
Age,0.410714,0.267857,0.142857
Credit amount,0.068945,0.646418,-0.577473
Duration,0.0,0.382353,-0.382353
Sex_female,0.0,0.0,0.0
Sex_male,1.0,1.0,0.0
Job_0,0.0,0.0,0.0
Job_1,1.0,1.0,0.0
Job_2,0.0,0.0,0.0
Job_3,0.0,0.0,0.0
Housing_free,0.0,0.0,0.0


In [39]:
# print("Muestra número", index_test, "del subconjunto x_test")
# print("Predicción: ", model.predict(X))
# print("Predicción de clase:", model.predict_classes(X))

# classes = class_names
# muestra_original = X
# muestra_pn = np.around(adv_pn.astype(np.double), 2)

# delta = muestra_pn - muestra_original
# delta = np.around(delta.astype(np.double), 2)
# delta[np.absolute(delta) < 1e-4] = 0

# X3 = np.vstack((muestra_original, muestra_pn, delta))

# dfre = pd.DataFrame.from_records(X3)
# dfre.columns = x_test.columns

# dfre.rename(index={0: "Muestra original", 1:"Muestra PN", 2:"Muestra PN - muestra original"},
#             inplace=True)

# dfret = dfre.transpose()

# def highlight_ce(s, col, ncols):
#     if (type(s[col]) != str):
#         if (s[col] > 0):
#             return (["background-color: yellow"] * ncols)
#     return(["background-color: blakc"] * ncols)

# dfret.style.apply(highlight_ce, col="Muestra PN - muestra original", ncols=3, axis=1)