# AIX360

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from pandas.api.types import is_string_dtype
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from aix360.algorithms.nncontrastive import NearestNeighborContrastiveExplainer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from keras.layers import Dense, Dropout
from keras.utils.np_utils import to_categorical
from keras import regularizers
from keras import optimizers
from joblib import dump, load

In [2]:
# Variables section
# Path of the joblib file used to cache the fitted classifier between runs.
saved_model_file = "decision_tree_model.joblib"
# Path intended for a persisted explainer; not referenced by the cells visible in this notebook.
saved_explainer_model = "explainer_model.joblib"

# Secció de desenvolupament pràctic

In [3]:
# Load the raw German credit data; `data` itself is left untouched.
data = pd.read_csv('./archive/german_credit_data.csv')

# Working copy: drop the "Id" column and replace every missing value with the literal "unknown".
data_copy = data.drop("Id", axis=1).fillna(value="unknown")
labels = data_copy.columns

# Maps each normalized column name to its (min, max) pair so values can be rescaled later.
dict_min_max_by_col = {}

def normalize(df, col_name):
    """Min-max scale a numeric column to [0, 1] and remember its bounds.

    Args:
        df: numeric pandas Series (anything supporting ``min``/``max`` and arithmetic).
        col_name: key under which the ``(min, max)`` pair is stored in
            ``dict_min_max_by_col``.

    Returns:
        A new object holding ``(df - min) / (max - min)``. A constant column
        (max == min) yields all zeros instead of NaN.
    """
    min_value = df.min()
    max_value = df.max()
    # Store the bounds under the name passed in, not under an outer loop variable.
    dict_min_max_by_col[col_name] = (min_value, max_value)

    # normalized = (value - min_value) / (max_value - min_value)
    value_range = max_value - min_value
    # Avoid 0/0 on constant columns: dividing the all-zero numerator by 1 gives zeros.
    divisor = value_range if value_range != 0 else 1
    return (df - min_value) / divisor

for col in labels:
    # Categorical columns (string-typed, plus the integer-coded "Job") are encoded;
    # every other column is treated as numeric and min-max scaled.
    if is_string_dtype(data_copy[col]) or col == "Job":
        if col == 'Risk':
            # Target column: "good" -> 1, any other value -> 0.
            data_copy[col] = (data_copy[col] == 'good').astype("int64")
            continue

        # One-hot encode: one indicator column per distinct value, prefixed with the column name,
        # then remove the original categorical column.
        data_copy = pd.concat([data_copy, pd.get_dummies(data_copy[col], prefix=col)], axis=1)
        data_copy = data_copy.drop(col, axis=1)
    else:
        # Normalize the numeric values of the column.
        data_copy[col] = normalize(data_copy[col], col)


# Move the "Risk" column to the last position of the dataset.
data_copy = data_copy[[c for c in data_copy if c not in ["Risk"]] + ["Risk"]]
display(data_copy)

Unnamed: 0,Age,Credit amount,Duration,Sex_female,Sex_male,Job_0,Job_1,Job_2,Job_3,Housing_free,...,Checking account_unknown,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk
0,0.857143,0.050567,0.029412,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.053571,0.313690,0.647059,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.535714,0.101574,0.117647,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3,0.464286,0.419941,0.558824,0,1,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
4,0.607143,0.254209,0.294118,0,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.214286,0.081765,0.117647,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
996,0.375000,0.198470,0.382353,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
997,0.339286,0.030483,0.117647,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1
998,0.071429,0.087763,0.602941,0,1,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Split the encoded data into model inputs (X) and the target (Y).
X = data_copy.drop("Risk", axis=1)  # input features
Y = data_copy['Risk']  # target column

# Hold out 25% of the rows for testing. The fixed seed makes the split reproducible,
# so a re-run produces the same train/test rows.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [4]:
if not os.path.exists(saved_model_file):
    # No cached model on disk: fit a small decision tree and persist it.
    model = DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    model.fit(x_train.values, y_train.values)
    dump(model, saved_model_file)
    print("Model guardat en el fitxer", saved_model_file)

else:
    # NOTE: joblib.load unpickles the file; only load model files you created yourself.
    # NOTE: a cached model was fitted on the split that existed when it was saved, so the
    # accuracies below are only meaningful if the current split matches that one.
    print("Cargant model ja existent...")
    model = load(saved_model_file)

# Class probabilities; the columns follow the order of model.classes_.
y_train_pred = model.predict_proba(x_train.values)
y_test_pred = model.predict_proba(x_test.values)

# Map the most probable column back to its class label instead of assuming
# that column index i corresponds to label i.
train_labels_pred = model.classes_[np.argmax(y_train_pred, axis=1)]
test_labels_pred = model.classes_[np.argmax(y_test_pred, axis=1)]

print("Train accuracy:", np.mean(train_labels_pred == y_train))
print("Test accuracy:", np.mean(test_labels_pred == y_test))

Cargant model ja existent...


NameError: name 'x_train' is not defined

In [6]:
# Position (within x_test) of the sample to inspect.
index_to_prove = 9
value_to_predict = x_test.iloc[index_to_prove:index_to_prove + 1]
# Index label of that sample in the full dataset.
index = value_to_predict.index[0]

print("Predicion made by the model:", model.predict(value_to_predict.values))
# Look the true value up by label: `index` is an index label, not a row position.
print("Real value:", data_copy.loc[[index], "Risk"])

Predicion made by the model: [0]
Real value: 714    0
Name: Risk, dtype: int64


In [7]:

# Create the explainer around the classifier's predict function and fit it so it can produce
# explanations (per the original note, explain_instance returns an error if it is not fitted).
explainer = NearestNeighborContrastiveExplainer(
    model=model.predict,
    embedding_dim=4,
    neighbors=3,
)
explainer.fit(x_train, epochs=50, batch_size=150)

<keras.callbacks.History at 0x1a88b879870>

In [8]:
# Keep only the rows whose Risk is 0.
negative_cases = pd.DataFrame(data_copy[data_copy["Risk"].eq(0)])

# Select one of them by position as a one-row frame and separate its features.
# NOTE: this rebinds X (previously the full feature matrix) to that single row.
index_test = 45
original = negative_cases.iloc[slice(index_test, index_test + 1)]
X = original.drop(columns="Risk")
new_index = X.index[0]
display(original)

# index_test = 0
# original = data_copy.iloc[index_test: index_test + 1]
# X = original.drop("Risk", axis=1)
# new_index = X.index[0]
# display(original)

Unnamed: 0,Age,Credit amount,Duration,Sex_female,Sex_male,Job_0,Job_1,Job_2,Job_3,Housing_free,...,Checking account_unknown,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk
181,0.196429,0.231374,0.470588,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [20]:
# Ask the fitted explainer for the contrastive neighbours (neighbors=3) of the selected row X.
# The displayed result is a list of dicts with keys such as 'features', 'categorical_features'
# and 'query'; a later cell reads the 'neighbors' entry of the first element.
nearest_benign_constrastive = explainer.explain_instance(X, neighbors=3)
display(nearest_benign_constrastive)



[{'features': ['Age',
   'Credit amount',
   'Duration',
   'Sex_female',
   'Sex_male',
   'Job_0',
   'Job_1',
   'Job_2',
   'Job_3',
   'Housing_free',
   'Housing_own',
   'Housing_rent',
   'Saving accounts_little',
   'Saving accounts_moderate',
   'Saving accounts_quite rich',
   'Saving accounts_rich',
   'Saving accounts_unknown',
   'Checking account_little',
   'Checking account_moderate',
   'Checking account_rich',
   'Checking account_unknown',
   'Purpose_business',
   'Purpose_car',
   'Purpose_domestic appliances',
   'Purpose_education',
   'Purpose_furniture/equipment',
   'Purpose_radio/TV',
   'Purpose_repairs',
   'Purpose_vacation/others'],
  'categorical_features': [],
  'query': [0.19642857142857142,
   0.23137449103114338,
   0.47058823529411764,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0],
  'neig

In [9]:
def rescale(row, col):
    """Map min-max normalized values of column ``col`` back to their original scale.

    Looks up the ``(min, max)`` pair stored for ``col`` in ``dict_min_max_by_col``.
    When no pair is stored for ``col``, a copy of ``row`` is returned unchanged.
    """
    untouched = row.copy()
    if col not in dict_min_max_by_col:
        return untouched

    low, high = dict_min_max_by_col[col]
    span = high - low
    return np.multiply(row, span) + low

def rescale_prediction(row, columns, top=3):
    """Undo min-max scaling on the leading entries of a flat value vector.

    Args:
        row: mutable, indexable sequence of values; it is modified in place.
        columns: column names aligned position-by-position with ``row``.
        top: number of leading positions to consider (default 3).

    Returns:
        The same ``row`` object, with every considered position whose column has
        stored bounds in ``dict_min_max_by_col`` mapped back to its original scale.
    """
    # Never index past the end of either sequence.
    limit = min(top, len(columns), len(row))
    for i in range(limit):
        bounds = dict_min_max_by_col.get(columns[i])
        if bounds is None:
            # No bounds stored for this column: leave the value as is, like rescale() does.
            continue
        min_value, max_value = bounds
        row[i] = np.multiply(row[i], (max_value - min_value)) + min_value

    return row

In [22]:
# Bring the selected instance (features plus "Risk") back to its original scale.
original_value = original.copy()
for column_name in original_value.columns:
    original_value[column_name] = rescale(original_value[column_name].values, column_name)

# First contrastive neighbour returned by the explainer, with the model's prediction appended
# so that it lines up with the columns of `original_value`.
predicted_value = np.array(nearest_benign_constrastive[0]["neighbors"][0])
predicted_risk = model.predict(predicted_value.reshape(1, predicted_value.shape[0]))
predicted_value_with_risk = np.append(predicted_value, predicted_risk)

predicted_value_with_risk = rescale_prediction(predicted_value_with_risk, original_value.columns)
predicted_value_with_risk = predicted_value_with_risk.reshape(1, predicted_value_with_risk.shape[0])

# Stack original, neighbour and their difference as the three rows of one table.
delta = predicted_value_with_risk - original_value
X3 = np.vstack((original_value, predicted_value_with_risk, delta))
dfre = pd.DataFrame.from_records(X3)
dfre.columns = original_value.columns
dfre = dfre.rename(index={0: 'Original value', 1: 'Predicted value', 2: "Difference between instances"})

# Replace the numeric risk codes with readable labels derived from the actual values
# (1 = good, 0 = bad); the difference row has no meaningful label.
# The whole column is reassigned at once, which avoids chained assignment.
risk_names = {0: "Bad", 1: "Good"}
risk_codes = dfre["Risk"].iloc[:2]
dfre["Risk"] = [risk_names.get(int(code), "NIL") for code in risk_codes] + ["NIL"]

dfret = dfre.transpose()

def highlight_ce(s, col, ncols):
    """Return ``ncols`` CSS background declarations for one table row.

    The row is painted yellow when its ``col`` entry is not a ``str`` and differs
    from 0; otherwise it is painted white.
    """
    cell = s[col]
    if type(cell) is str:
        return ["background-color: white"] * ncols
    if cell != 0:
        return ["background-color: yellow"] * ncols
    return ["background-color: white"] * ncols

# Render the transposed table, passing each row (axis=1) to highlight_ce, which colours it
# according to the row's "Difference between instances" entry.
dfret.style.apply(highlight_ce, col="Difference between instances", ncols=3, axis=1)

Unnamed: 0,Original value,Predicted value,Difference between instances
Age,30.000000,33.000000,3.000000
Credit amount,4455.000000,4439.000000,-16.000000
Duration,36.000000,18.000000,-18.000000
Sex_female,0.000000,0.000000,0.000000
Sex_male,1.000000,1.000000,0.000000
Job_0,0.000000,0.000000,0.000000
Job_1,0.000000,0.000000,0.000000
Job_2,0.000000,0.000000,0.000000
Job_3,1.000000,1.000000,0.000000
Housing_free,0.000000,0.000000,0.000000


# Secció anàlisi de dades i situacions

En aquesta secció es farà una anàlisi de les dades per poder generar certes situacions que depenen de les dades.

In [5]:
# Inclusive upper age limits of the "low" and "mid" buckets.
low_age = 28
mid_age = 38

# Three disjoint age buckets: <= low_age, (low_age, mid_age], > mid_age.
data_age_low = data[data["Age"].le(low_age)]
data_age_mid = data[data["Age"].gt(low_age) & data["Age"].le(mid_age)]
data_age_high = data[data["Age"].gt(mid_age)]

print("Mida set petit:", len(data_age_low["Age"]),
      "\nMida set mitjà:", len(data_age_mid["Age"]),
      "\nMida set gran:", len(data_age_high["Age"]))

Mida set petit: 334 
Mida set mitjà: 346 
Mida set gran: 320


Després de separar els usuaris en les tres franges d'edat, el que es farà serà separar els tres datasets segons la quantitat de diners que se sol·licita.

In [6]:
# Low-age users: summary statistics of the requested credit amount.
median_value_low = data_age_low["Credit amount"].median()
mean_value_low = data_age_low["Credit amount"].mean()
min_value_low = data_age_low["Credit amount"].min()
max_value_low = data_age_low["Credit amount"].max()

# Split the bucket at its median credit amount: strictly below vs. at-or-above.
# NOTE: the printed text says "mitja", but the threshold used here is the median.
low_age_ca_down = data_age_low[data_age_low["Credit amount"].lt(median_value_low)]
low_age_ca_up = data_age_low[data_age_low["Credit amount"].ge(median_value_low)]

print("Numero de clients amb edat entre els", data["Age"].min(), "i els", low_age,
      "amb un credit menor a la mitja:", len(low_age_ca_down),
      "\nNúmero de persones amb un credit superior a la mitja:", len(low_age_ca_up))

Numero de clients amb edat entre els 19 i els 28 amb un credit menor a la mitja: 167 
Número de persones amb un credit superior a la mitja: 167


In [8]:
# Mid-age users: summary statistics of the requested credit amount.
median_value_mid = data_age_mid["Credit amount"].median()
mean_value_mid = data_age_mid["Credit amount"].mean()
min_value_mid = data_age_mid["Credit amount"].min()
max_value_mid = data_age_mid["Credit amount"].max()

# Split the bucket at its median credit amount: strictly below vs. at-or-above.
# NOTE: the printed text says "mitja", but the threshold used here is the median.
mid_age_ca_down = data_age_mid[data_age_mid["Credit amount"].lt(median_value_mid)]
mid_age_ca_up = data_age_mid[data_age_mid["Credit amount"].ge(median_value_mid)]

print("Numero de clients amb edat entre els", low_age, "i els", mid_age,
      "amb un credit menor a la mitja:", len(mid_age_ca_down),
      "\nNúmero de persones amb un credit superior a la mitja:", len(mid_age_ca_up))

Numero de clients amb edat entre els 28 i els 38 amb un credit menor a la mitja: 173 
Número de persones amb un credit superior a la mitja: 173


In [7]:
# High-age users: summary statistics of the requested credit amount.
median_value_high = data_age_high["Credit amount"].median()
mean_value_high = data_age_high["Credit amount"].mean()
min_value_high = data_age_high["Credit amount"].min()
max_value_high = data_age_high["Credit amount"].max()

# Split the bucket at its median credit amount: strictly below vs. at-or-above.
# NOTE: the printed text says "mitja", but the threshold used here is the median.
high_age_ca_down = data_age_high[data_age_high["Credit amount"].lt(median_value_high)]
high_age_ca_up = data_age_high[data_age_high["Credit amount"].ge(median_value_high)]

print("Numero de clients amb edat entre els", mid_age, "i els", data["Age"].max(),
      "amb un credit menor a la mitja:", len(high_age_ca_down),
      "\nNúmero de persones amb un credit superior a la mitja:", len(high_age_ca_up))

Numero de clients amb edat entre els 38 i els 75 amb un credit menor a la mitja: 160 
Número de persones amb un credit superior a la mitja: 160


Ara, a partir dels datasets que hem obtingut anteriorment, crearem una situació més en què mirarem especialment el temps que es triga a retornar el crèdit.

In [9]:
# The six subsets built above: each age bucket split into credit amounts below ("down")
# and at-or-above ("up") the bucket's median.
# NOTE: these are bare expressions, so the notebook renders only the last one (high_age_ca_up).
low_age_ca_down
low_age_ca_up

mid_age_ca_down
mid_age_ca_up

high_age_ca_down
high_age_ca_up

Unnamed: 0,Id,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
6,6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good
8,8,61,male,1,own,rich,,3059,12,radio/TV,good
16,16,53,male,2,own,,,2424,24,radio/TV,good
...,...,...,...,...,...,...,...,...,...,...,...
977,977,42,male,2,own,,moderate,2427,18,business,good
978,978,47,male,1,own,little,,2538,24,car,bad
980,980,49,male,2,own,little,moderate,8386,30,furniture/equipment,bad
994,994,50,male,2,own,,,2390,12,car,good


# Secció brossa

In [11]:
# import numpy as np
# import pandas as pd
# import os
# import tensorflow as tf
# from pandas.api.types import is_string_dtype
# from sklearn.metrics import confusion_matrix, precision_score
# from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# from sklearn.model_selection import train_test_split
# from aix360.algorithms.contrastive import CEMExplainer, KerasClassifier
# from keras.models import Sequential, Model, model_from_json
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from keras.layers import Dense, Dropout
# from keras.utils.np_utils import to_categorical
# from keras import regularizers
# from keras import optimizers

# Creation of the Classifier
# k_classifier = KerasClassifier(model)
# cem_explainer = CEMExplainer(k_classifier)
# my_ae_model = None

In [None]:
#     model = Sequential()
#     model.add(Dense(30, input_dim=26, activation='relu')) # Add a 
#     model.add(Dense(1, activation='relu')) #""", activation='relu'"""
#     model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
#     model.summary()

# def create_model():
#     model = Sequential()
#     model.add(Dense(units=18, 
#                     input_dim=x_train.shape[1]))
#     # model.add(Dropout(0.35)) # Util para el overfitting
#     model.add(Dense(units=9, activation="sigmoid"))
#     model.add(Dense(units=1, 
#                     activation="sigmoid")) #Sigmoid porque al tener solo dos clases de salidas, devolderá
#                                            #un valor entre 0 i 1 para clasificar en que clase deben estar
#     model.compile(loss="binary_crossentropy", 
#                   optimizer="sgd", # Combinación de dos tecnicas: descenso del gradiente con momentum 
#                                     # y estimación adaptativa 
#                   metrics=['accuracy'])
#     return model

# model = create_model()

# params = {"epochs": 50, 
#           "batch_size": 128, 
#           "verbose": 3, 
#           "shuffle": False}

# model.fit(x_train, y_train, 
#           epochs=params["epochs"],
#           batch_size=params["batch_size"],
#           shuffle=params["shuffle"],
#           verbose=params["verbose"], 
#           validation_data=(x_test, y_test))

# score_train = model.evaluate(x_train, y_train, verbose=0)
# print("Train accuracy:", score_train[1])

# score_test = model.evaluate(x_test, y_test, verbose=0)
# print("Test accuracy:", score_test[1])

In [None]:
# arg_mode = "PN" # Find pertinent negative
# arg_max_iter = 1000 # Maximum number of iterations to search for the optimal PN for given parameter settings
# arg_init_const = 1 # Initial coefficient value for main loss term that encourages class change
# arg_b = 9 # No. of updates to the coefficient of the main loss term
# arg_kappa = 0.2 # Minimum confidence gap between the PNs (changed) class probability and original class' probability
# arg_beta = 1e-1 # Controls sparsity of the solution (L1 loss)
# arg_gamma = 100 # Controls how much to adhere to a (optionally trained) autoencoder
# arg_alpha = 0.01 # Penalizes L2 norm of the solution
# arg_threshold = 0 # Automatically turn off features <= arg_threshold if arg_threshold < 1
# arg_offset = 1 # the model assumes classifier trained on data normalized
#                 # in [-arg_offset, arg_offset] range, where arg_offset is 0 or 0.5
    
# adv_pn, delta_pn, info_pn = cem_explainer.explain_instance(X, arg_mode=arg_mode, 
#                                                            AE_model=my_ae_model, 
#                                                            arg_kappa=arg_kappa, 
#                                                            arg_b = arg_b,
#                                                            arg_max_iter=arg_max_iter, 
#                                                            arg_init_const=arg_init_const, 
#                                                            arg_beta=arg_beta,
#                                                            arg_gamma=arg_gamma, 
#                                                            arg_alpha=arg_alpha, 
#                                                            arg_threshold=arg_threshold,
#                                                            arg_offset=arg_offset)

In [None]:
# print("Muestra número", index_test, "del subconjunto x_test")
# print("Predicción: ", model.predict(X))
# print("Predicción de clase:", model.predict_classes(X))

# classes = class_names
# muestra_original = X
# muestra_pn = np.around(adv_pn.astype(np.double), 2)

# delta = muestra_pn - muestra_original
# delta = np.around(delta.astype(np.double), 2)
# delta[np.absolute(delta) < 1e-4] = 0

# X3 = np.vstack((muestra_original, muestra_pn, delta))

# dfre = pd.DataFrame.from_records(X3)
# dfre.columns = x_test.columns

# dfre.rename(index={0: "Muestra original", 1:"Muestra PN", 2:"Muestra PN - muestra original"},
#             inplace=True)

# dfret = dfre.transpose()

# def highlight_ce(s, col, ncols):
#     if (type(s[col]) != str):
#         if (s[col] > 0):
#             return (["background-color: yellow"] * ncols)
#     return(["background-color: blakc"] * ncols)

# dfret.style.apply(highlight_ce, col="Muestra PN - muestra original", ncols=3, axis=1)