# Preparació de classificador binari

En aquest document es mostra la preparació realitzada del classificador binari, també encarregat de generar les explicacions contrafactuals.

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from pandas.api.types import is_string_dtype
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from aix360.algorithms.nncontrastive import NearestNeighborContrastiveExplainer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from keras.layers import Dense, Dropout
from keras.utils.np_utils import to_categorical
from keras import regularizers
from keras import optimizers
from joblib import dump, load

In [2]:
# Secció de variables per emmagatzemar el model
saved_model_file = "decision_tree_model.joblib"
saved_explainer_model = "explainer_model.joblib"

Per a aquesta primera part, prepararem el conjunt de dades. Primer haurem de carregar les dades del fitxer csv. després realitzarem un tractament de les dades. Primer de tot, donat que la llibreria no accepta valors Nan, substituirem els valors Nan per un nou valor "unknown". 

Finalment, codificarem les dades categoriques i normalitzarem aquelles dades númeriques, donat que el model no accepta les variables catègoriques en el seu format original (string). Per fer-ho, es fara servir una codificació one-hot encode, en el que es crea una nova columna per a cada possible valor de cada variable categòrica i en cas de tenir aquesta variable, s'indica com 1 i en cas contrari com un 0, generant un vector binari.

Per als valors númerics, es calcularà de cada columna el valor màxim i minim, i per calcular el nou valor es farà servir la següent formula:

$$
Valor = \frac{valor - valorMínim}{valorMàxim - valorMínim}
$$

In [4]:
data = pd.read_csv('./archive/german_credit_data.csv')
data_copy = data.copy()
# data_copy = data_copy.drop("Id", axis=1)
data_copy.fillna(value="unknown", inplace=True)
labels = data_copy.columns

dict_min_max_by_col = {}

def normalize(df, col_name):
    result = df.copy()

    # Calculem valors minims i maxims de cada columa i els emmagatzem a un diccionari
    max_value = df.max()
    min_value = df.min()
    dict_min_max_by_col[col] = (min_value, max_value)

    # Normalitzem amb la formula: normalized = ((value - min_value) / (max_value - min_value))
    result = ((df - min_value) / (max_value - min_value))
    return result

for col in labels:
    # En cas que la columna sigui categòrica
    if is_string_dtype(data_copy[col]) or col == "Job":
        if col == 'Risk':
            # En cas que sigui la columna Risk, donarem valor 1 si es good i 0 si és false
            values, uniques = pd.factorize(data_copy[col]) #PD Factorize da un valor numerico para cada posible valor de la columna
            # por lo que al encontrar primero un good, good = 0 y bad = 1  
            data_copy[col] = data_copy[col].apply(lambda x: 1 if x == 'good' else 0)
            continue

        # Per a la resta de columnes, es crearà una columna per cada valor de cada atribut i s'omplirà de forma binaria
        data_copy = pd.concat([data_copy, pd.get_dummies(data_copy[col], prefix=col)], axis=1)
        data_copy.drop(col, axis=1, inplace=True)
    else:
        # Normalitzem els valors númerics de la columna 
        data_copy[col] = normalize(data_copy[col], col)


# Afegim al final del dataset la columna risk
data_copy = data_copy[[c for c in data_copy if c not in ["Risk"]] + ["Risk"]]
display(data_copy)

# Variables necesarias:
columns_data = data_copy.columns
print(data_copy.columns.size)

Unnamed: 0,Age,Credit amount,Duration,Sex_female,Sex_male,Job_0,Job_1,Job_2,Job_3,Housing_free,...,Checking account_unknown,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk
0,0.857143,0.050567,0.029412,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.053571,0.313690,0.647059,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.535714,0.101574,0.117647,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3,0.464286,0.419941,0.558824,0,1,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
4,0.607143,0.254209,0.294118,0,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.214286,0.081765,0.117647,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
996,0.375000,0.198470,0.382353,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
997,0.339286,0.030483,0.117647,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1
998,0.071429,0.087763,0.602941,0,1,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


30


Un cop finalitzat el tractament, haurem de crear els conjunts de dades per a entrenament i test. Per fer-ho, haurem de separar les dades d'entrada i dades de sortida (X i Y respectivament).

- Les dades d'entrada són aquells atributs els quals no volem predir, sinó que són aquells que ens ajuden a donar la predicció.
- Les dades de sortida són aquelles dades qeu retornarem en la predicció del model.

Per fer-ho, usarem la funció train_test_split, la qual ens permetra realitzar la separació dels sets d'entrada d'entrenament i test i els sets de sortida d'entrenament i test. Usarem un tamany de 25% de les dades totals per el test, i el restant per l'entrenament.

In [5]:
# Separem les dades per entrada (X) i sortida (y)
X = data_copy.drop("Risk", axis=1) # Datos de entrada
Y = data_copy['Risk'] # Datos de salida 

# Separem el dataset en conjunts de test i entrenament
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

Un cop preparades les dades, comencarem a entrenar el model. En aquest cas, per evitar entrenar els models més d'un cop sense necessitat, salvarem l'entrenament en un fitxer, i en cas que aquest fitxer existeixi, no es tornarà a entrenar el model, sinó que es carregarà el contingut del fitxer en el model. En cas que el fitxer no existeixi, es crearà el model de 0 i s'entrenarà el model amb les dades d'entrenament. Posteriorment, es guardarà en el fitxer que es cercarà en el condicional, per evitar que el model es torni a entrenar.

Després, calcularem un accuracy tant per al train com per al test.

In [7]:
# Comprovem si ja hi ha un model entrenat
if not os.path.exists(saved_model_file):
    model = DecisionTreeClassifier(
        max_depth=10,
        max_leaf_nodes=10
    )
   
    model.fit(x_train.values, y_train.values)

    # Guardem el model entrenat en el fitxer saved_model_file
    dump(model, saved_model_file)
    print("Model guardat en el fitxer", saved_model_file)

else:
    # Carreguem aquell model que ja es trobés preparat.
    print("Cargant model ja existent...")
    model = load(saved_model_file)

scores = cross_val_score(model, x_train, y_train, cv=5)
print("Puntuació de l'entrenament:", scores.mean())

scores = cross_val_score(model, x_test, y_test, cv=5)
print("Puntuació del test:", scores.mean())

Cargant model ja existent...
Puntuació de l'entrenament: 0.7213333333333333
Puntuació del test: 0.6960000000000001


En aquest moment, ja tenim preparada la primera part del classificador, el model que s'encarregarà de generar les prediccions dels crèdits. El següent pas serà preparar l'algorisme encarregat de generar les explicacions contrafactuals.

L'algorisme que s'ha seleccionat per a aquest projecte és **Nearest Neighbor Contrastive Explainer**, un algorisme que genera veins el més proper possible de la instància pasada com a paràmetre, però amb una classificació diferent. En aquest cas, donat que el classificador és un classificador binari, en cas de passar-li una instància de sol·licitud de crèdit classificada com a denegat(Bad), els veins que generi es classificaran com a concedit (Good).

In [8]:
# Creem el explicador i l'entrenem per a que pugui generar l'explicacions (si no s'entrena, retorna error la funció de explain_instance)
explainer = NearestNeighborContrastiveExplainer(model=model.predict, n_classes=2, neighbors=10)
explainer.fit(X, epochs=50, batch_size=100)

<keras.callbacks.History at 0x2344b361090>

Un cop tenim ja preparat el mètode explicatiu, podem començar a generar explicacions contrafactuals de instàncies negatives. Per fer un breu test, a la següent cel·la es prepara un subconjunt amb instàncies de credits classsificats com denegats (Bad) i una variable index, la qual ens permet escollir d'aquest conjunt de dades una instància.

In [20]:
negatives_cases = data_copy.loc[data_copy.Risk == 0] # Cerquem aquells casos negatius

# Index de la instància a provar
index = 0

# Preparem la instància negativa
ins = negatives_cases.iloc[index: index + 1]
ins = ins.drop("Risk", axis=1)

nearest_benign_constrastive = explainer.explain_instance(ins)



In [22]:
# Funció que permet reescalar el valor per a poder mostrar a la taula els valors de la sol·licitud sense processar
def rescale(row, col):
    row_copy = row.copy()
    if col in dict_min_max_by_col:
        min_value, max_value = dict_min_max_by_col[col]
        row_copy = (np.multiply(row, (max_value - min_value)) + min_value)
    return row_copy

def rescale_prediction(row, columns, top=3):
    for i in range(top):
        column = columns[i]
        min_value, max_value = dict_min_max_by_col[column]
        row[i] = (np.multiply(row[i], (max_value - min_value)) + min_value)
    
    return row

En aquest moment, l'explicador ja ha generat un veí semblant a la sol·licitud de crèdit original amb una classificació diferent, o sigui, classificat com concedit.

In [23]:
original_value = ins.copy() # X
original_value['Risk'] = 0.0

for i in original_value.columns:
    original_value[i] = rescale(original_value[i].values, i)

predicted_value = np.array(nearest_benign_constrastive[0]["neighbors"][3])
predicted_value_with_risk = np.append(predicted_value, 
                                      model.predict(predicted_value.reshape(1, predicted_value.shape[0])))

predicted_value_with_risk = rescale_prediction(predicted_value_with_risk, original_value.columns)
predicted_value_with_risk = predicted_value_with_risk.reshape(1, predicted_value_with_risk.shape[0])

delta = predicted_value_with_risk - original_value
X3 = np.vstack((original_value, predicted_value_with_risk, delta))
dfre = pd.DataFrame.from_records(X3)
dfre.columns = original_value.columns

dfre.rename(index={0: 'Original value', 1: 'Predicted value', 2: "Difference between instances"}, inplace=True)

for i in range(len(dfre['Risk']) - 1):
    if dfre["Risk"][i] == 0:
        dfre["Risk"][i] = "Bad"
    else:
        dfre["Risk"][i] = "Good" 

dfret = dfre.transpose()

def highlight_ce(s, col, ncols):
    if (type(s[col]) != str):
        if (s[col] != 0):
            return(["background-color: yellow"] * ncols)
    return (["background-color: white"] * ncols)

dfret.style.apply(highlight_ce, col="Difference between instances", ncols=3, axis=1)

Unnamed: 0,Original value,Predicted value,Difference between instances
Age,22.000000,22.000000,0.0
Credit amount,5951.000000,1567.000000,-4384.0
Duration,48.000000,12.000000,-36.0
Sex_female,1.000000,1.000000,0.0
Sex_male,0.000000,0.000000,0.0
Job_0,0.000000,0.000000,0.0
Job_1,0.000000,0.000000,0.0
Job_2,1.000000,1.000000,0.0
Job_3,0.000000,0.000000,0.0
Housing_free,0.000000,0.000000,0.0


La taula que es mostra a la cel·la superior mostra la següent informació:

- Primera columna: les característiques de una sol·lciitud de crèdit
- Segona columna: sol·licitud de crèdit original
- Tercera columna: Veí generat amb classificació diferent
- Quarta columna: diferencies entre el la sol·licitud inicial i la generada

A més, aquells atributs que han canviat, apareixen marcats en color groc

A les cel·les posteriors, mostrem una implementació que ens permet generar una taula que mostra més de 1 veí.

In [32]:
# Aquesta funció s'encarrega de generar la taula rebent com a paràmetre la sol·licitud original i els veins generats per l'explicador
def show_counterfactual_table(original_data, nb_contrastive):
    original_value = original_data.copy()
    num_neighbors = len(nb_contrastive[0]["neighbors"])
    index = {0:"Original input"}
    predictions = []
    final_result = ["Bad"]
    columns = original_value.columns

    # Invertim els valors de la normalització dels valors
    for col in columns:
        original_value[col] = rescale(original_value[col].values, col)

    ind_val = 0
    for i in range(num_neighbors):
        #Calculem la predicció per a cada vei generat
        predicted_value = np.array(nb_contrastive[0]["neighbors"][i])
        predicted_value_with_risk = np.append(predicted_value, model.predict(predicted_value.reshape(1, predicted_value.shape[0])))
        predicted_value_with_risk = rescale_prediction(predicted_value_with_risk, original_value.columns)
        predicted_value_with_risk = predicted_value_with_risk.reshape(1, predicted_value_with_risk.shape[0])

        # Si la predicció del vei retorna 1, es guarda un good a la llista de resultat, en cas contrari un Bad
        if int(predicted_value_with_risk[0][-1]) == 1:
            final_result.append("Good")
        else:
            final_result.append("Bad")

        # Afegim un NIL per a les diferencies
        final_result.append("NIL")

        # Ajustem les capçaleres de la taula
        index[ind_val + 1] = "Counterfactual " + str(i + 1)
        index[ind_val + 2] = "Difference"
        ind_val = ind_val + 2
        
        predictions.append(predicted_value_with_risk)

    # Ajuntem les columnes per mostrar-ho en un dataframe
    v_stack = create_vstacks(original_value, predictions)
    dfre = pd.DataFrame.from_records(v_stack)
    dfre.columns = original_value.columns

    dfre.rename(index=index, inplace=True)
    len_ = len(dfre["Risk"])
    dfre["Risk"][0] = "Bad"

    # La ultima cel·la del dataframe afegirem la classificació que s'ha calculat anteriorment
    for i in range(1, len_):
        dfre["Risk"][i] = final_result[i]

    return dfre.transpose()

# Aquesta funció s'encarrega d'ajuntar els columnes de la sol·licitud original, el vei generat i les diferencies
def create_vstacks(original_data, predictions):
    values = (original_data, )
    for prediction in predictions:
        delta = prediction - original_data
        values = values + (prediction, )
        values = values + (delta, )

    return np.vstack(values)


# Aquesta funció s'encarrega de preparar la sol·licitud rebuda com a parametre amb el format per al model
def prepare_instance(df: pd.DataFrame):
    list_clients = []

    for col, row in df.iterrows():
        prepared_client = {
            "Age": 0,
            "Credit amount": 0,
            "Duration": 0,
            "Sex_female": 0,
            "Sex_male": 0,
            "Job_0": 0,
            "Job_1": 0,
            "Job_2": 0,
            "Job_3": 0,
            "Housing_free": 0,
            "Housing_rent": 0,
            "Housing_own": 0,
            "Saving accounts_little": 0,
            'Saving accounts_moderate': 0,
            'Saving accounts_quite rich': 0,
            'Saving accounts_rich': 0,
            'Saving accounts_unknown': 0,
            'Checking account_little': 0,
            'Checking account_moderate': 0,
            'Checking account_rich': 0,
            'Checking account_unknown': 0,
            'Purpose_business': 0,
            'Purpose_car': 0,
            'Purpose_domestic appliances': 0,
            'Purpose_education': 0,
            'Purpose_furniture/equipment': 0,
            'Purpose_radio/TV': 0,
            'Purpose_repairs': 0,
            'Purpose_vacation/others': 0,
        }

        # extraiem de la row totes les característiques del crèdit
        age, sex, job, housing, savingAccount, checkingAccount, creditAmount, duration, purpose = row
        
        # Normalitzem els valors numerics
        new_age = normalizeValue(age, dict_min_max_by_col['Age'])
        new_amount = normalizeValue(creditAmount, dict_min_max_by_col['Credit amount'])
        new_duration = normalizeValue(duration, dict_min_max_by_col['Duration'])

        prepared_client["Age"] = new_age
        prepared_client["Credit amount"] = new_amount
        prepared_client["Duration"] = new_duration

        # Assignem el valor 1 per als valors categorics presents a la sol·licitud
        prepared_client["Sex_" + sex] = 1
        prepared_client["Job_" + job] = 1
        prepared_client["Housing_" + housing] = 1
        prepared_client["Saving accounts_" + savingAccount] = 1
        prepared_client["Checking account_" + checkingAccount] = 1
        prepared_client["Purpose_" + purpose] = 1

        list_clients.append(prepared_client)

    # Creem un dataframe amb les dades de les sol·lciituds 
    result = pd.DataFrame(list_clients, columns=columns_data)
    result = result.drop('Risk', axis=1)
    return result

# Funció de normalització dels valors númerics introduits a la variable
def normalizeValue(value: int, min_max: tuple):
    return ((value - min_max[0]) / (min_max[1] - min_max[0]))
        

In [33]:
columns = ["Age", "Sex", "Job", "Housing", "Saving accounts", "Checking account", "Credit amount", "Duration", "Purpose"]
values = [[46, "female", "1", "own", "little", "moderate", 900, 20, "car"]]

instance = pd.DataFrame(values, columns=columns)
instance_prepared = prepare_instance(instance)
_class = model.predict(instance_prepared.values)

new_explain = explainer.explain_instance(instance_prepared, neighbors=10)
instance_prepared['Risk'] = _class

pd.set_option("display.max_columns", None)
dataframe = show_counterfactual_table(instance_prepared, new_explain)
display(dataframe)



Unnamed: 0,Original input,Counterfactual 1,Difference,Counterfactual 2,Difference.1,Counterfactual 3,Difference.2,Counterfactual 4,Difference.3,Counterfactual 5,Difference.4,Counterfactual 6,Difference.5,Counterfactual 7,Difference.6,Counterfactual 8,Difference.7,Counterfactual 9,Difference.8
Age,46.0,23.0,-23.0,74.0,28.0,35.0,-11.0,34.0,-12.0,67.0,21.0,34.0,-12.0,29.0,-17.0,46.0,0.0,47.0,1.0
Credit amount,900.0,1961.0,1061.0,5129.0,4229.0,1393.0,493.0,1501.0,601.0,1199.0,299.0,1860.0,960.0,1412.0,512.0,3594.0,2694.0,1209.0,309.0
Duration,20.0,18.0,-2.0,9.0,-11.0,11.0,-9.0,9.0,-11.0,9.0,-11.0,12.0,-8.0,12.0,-8.0,15.0,-5.0,6.0,-14.0
Sex_female,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,-1.0,1.0,0.0,1.0,0.0,0.0,-1.0
Sex_male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
Job_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Job_1,1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,1.0,0.0,0.0,-1.0
Job_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Job_3,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
Housing_free,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Secció anàlisi de dades i situacions

En aquesta secció es farà un analisis de les dades per poder generar certes situacions depenents de les dades.

In [19]:
low_age = 28
mid_age = 38

data_age_low = data.loc[data["Age"] <= low_age]
data_age_mid = data.loc[(low_age < data["Age"]) & ( data["Age"] <= mid_age)]
data_age_high = data.loc[data["Age"] > mid_age]


print("Mida set petit:", data_age_low["Age"].size, 
"\nMida set mitjà:", data_age_mid["Age"].size,
"\nMida set gran:", data_age_high["Age"].size)

display(data)

Mida set petit: 334 
Mida set mitjà: 346 
Mida set gran: 320


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


Despres de separar els usuaris per les tres franjes d'edat, es el que es farà serà separar els tres datasets segons la quantiat de diners que es sol·licita. Primer, farem un càlcul de la mitja de credit que es sol·licita per posteriorment classificar aquells que es troben per sota o per a sobre

In [17]:
mean_value_ca = data["Credit amount"].mean()

low_age_ca_down = data_age_low.loc[data_age_low["Credit amount"] < mean_value_ca]
low_age_ca_up = data_age_low.loc[data_age_low["Credit amount"] >= mean_value_ca]

mid_age_ca_down = data_age_mid.loc[data_age_mid["Credit amount"] < mean_value_ca]
mid_age_ca_up = data_age_mid.loc[data_age_mid["Credit amount"] >= mean_value_ca]

high_age_ca_down = data_age_high.loc[data_age_high["Credit amount"] < mean_value_ca]
high_age_ca_up = data_age_high.loc[data_age_high["Credit amount"] >= mean_value_ca]

print(mean_value_ca)
print(high_age_ca_down["Age"].size, high_age_ca_up["Age"].size)

3271.258
208 112


Ara, dels datasets que hem obtingut anteriorment, crearem una situació més en la que mirarem especialment el temps que es triga en retornar el credit

In [20]:
mean_duration = data["Duration"].mean()
print(mean_duration)

low_age_ca_down_du_down = low_age_ca_down.loc[low_age_ca_down["Duration"] < mean_duration]
low_age_ca_down_du_up = low_age_ca_down.loc[low_age_ca_down["Duration"] >= mean_duration]

low_age_ca_up_du_down = low_age_ca_up.loc[low_age_ca_up["Duration"] < mean_duration]
low_age_ca_up_du_up = low_age_ca_up.loc[low_age_ca_up["Duration"] >= mean_duration]

mid_age_ca_down_du_down = mid_age_ca_down.loc[mid_age_ca_down["Duration"] < mean_duration]
mid_age_ca_down_du_up = mid_age_ca_down.loc[mid_age_ca_down["Duration"] >= mean_duration]

mid_age_ca_up_du_down = mid_age_ca_up.loc[mid_age_ca_up["Duration"] < mean_duration]
mid_age_ca_up_du_up = mid_age_ca_up.loc[mid_age_ca_up["Duration"] >= mean_duration]

high_age_ca_down_du_down = high_age_ca_down.loc[high_age_ca_down["Duration"] < mean_duration]
high_age_ca_down_du_up = high_age_ca_down.loc[high_age_ca_down["Duration"] >= mean_duration]

high_age_ca_up_du_down = high_age_ca_up.loc[high_age_ca_up["Duration"] < mean_duration]
high_age_ca_up_du_up = high_age_ca_up.loc[high_age_ca_up["Duration"] >= mean_duration]


def print_size(df):
    print(df["Age"].size)

print_size(low_age_ca_down_du_down)
print_size(low_age_ca_down_du_up)
print_size(low_age_ca_up_du_down)
print_size(low_age_ca_up_du_up)
print_size(mid_age_ca_down_du_down)
print_size(mid_age_ca_down_du_up)
print_size(mid_age_ca_up_du_down)
print_size(mid_age_ca_up_du_up)
print_size(high_age_ca_down_du_down)
print_size(high_age_ca_down_du_up)
print_size(high_age_ca_up_du_down)
print_size(high_age_ca_up_du_up)

20.903
169
59
19
87
149
73
32
92
149
59
36
76


# Secció brossa

In [21]:
# # Usuaris de edad mitjana
# mean_value_mid = data_age_mid["Credit amount"].mean()
# max_value_mid = data_age_mid["Credit amount"].max()
# min_value_mid = data_age_mid["Credit amount"].min()
# median_value_mid = data_age_mid["Credit amount"].median()

# #print("Valor mitjà:", mean_value_mid, "\nValor màxim:", max_value_mid, "\nValor mínim:", min_value_mid, "\nValor de la mitjana:", median_value_mid)

# mid_age_ca_down = data_age_mid.loc[data_age_mid["Credit amount"] < median_value_mid]
# mid_age_ca_up = data_age_mid.loc[data_age_mid["Credit amount"] >= median_value_mid]

# print("Numero de clients amb edat entre els", low_age, "i els", mid_age, "amb un credit menor a la mitja:", mid_age_ca_down["Age"].size,
# "\nNúmero de persones amb un credit superior a la mitja:", mid_age_ca_up["Age"].size)

Numero de clients amb edat entre els 28 i els 38 amb un credit menor a la mitja: 173 
Número de persones amb un credit superior a la mitja: 173


In [22]:
# # Usuaris de edad alta
# mean_value_high = data_age_high["Credit amount"].mean()
# max_value_high = data_age_high["Credit amount"].max()
# min_value_high = data_age_high["Credit amount"].min()
# median_value_high = data_age_high["Credit amount"].median()

# high_age_ca_down = data_age_high.loc[data_age_high["Credit amount"] < median_value_high]
# high_age_ca_up = data_age_high.loc[data_age_high["Credit amount"] >= median_value_high]

# print("Numero de clients amb edat entre els", mid_age, "i els", data["Age"].max(), "amb un credit menor a la mitja:", high_age_ca_down["Age"].size,
# "\nNúmero de persones amb un credit superior a la mitja:", high_age_ca_up["Age"].size)

Numero de clients amb edat entre els 38 i els 75 amb un credit menor a la mitja: 160 
Número de persones amb un credit superior a la mitja: 160


In [20]:
# # Usuaris de edad baixa
# mean_value_low = data_age_low["Credit amount"].mean()
# max_value_low = data_age_low["Credit amount"].max()
# min_value_low = data_age_low["Credit amount"].min()
# median_value_low = data_age_low["Credit amount"].median()

# # print("Valor mitjà:", mean_value_low, "\nValor màxim:", max_value_low, "\nValor mínim:", min_value_low, "\nValor de la mitjana:", median_value_low)

# low_age_ca_down = data_age_low.loc[data_age_low["Credit amount"] < median_value_low]
# low_age_ca_up = data_age_low.loc[data_age_low["Credit amount"] >= median_value_low]

# print("Numero de clients amb edat entre els", data["Age"].min(), "i els", low_age, "amb un credit menor a la mitja:", low_age_ca_down["Age"].size,
# "\nNúmero de persones amb un credit superior a la mitja:", low_age_ca_up["Age"].size)

Numero de clients amb edat entre els 19 i els 28 amb un credit menor a la mitja: 167 
Número de persones amb un credit superior a la mitja: 167


In [11]:
# import numpy as np
# import pandas as pd
# import os
# import tensorflow as tf
# from pandas.api.types import is_string_dtype
# from sklearn.metrics import confusion_matrix, precision_score
# from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# from sklearn.model_selection import train_test_split
# from aix360.algorithms.contrastive import CEMExplainer, KerasClassifier
# from keras.models import Sequential, Model, model_from_json
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from keras.layers import Dense, Dropout
# from keras.utils.np_utils import to_categorical
# from keras import regularizers
# from keras import optimizers

# Creation of the Classifier
# k_classifier = KerasClassifier(model)
# cem_explainer = CEMExplainer(k_classifier)
# my_ae_model = None

In [None]:
#     model = Sequential()
#     model.add(Dense(30, input_dim=26, activation='relu')) # Add a 
#     model.add(Dense(1, activation='relu')) #""", activation='relu'"""
#     model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
#     model.summary()

# def create_model():
#     model = Sequential()
#     model.add(Dense(units=18, 
#                     input_dim=x_train.shape[1]))
#     # model.add(Dropout(0.35)) # Util para el overfitting
#     model.add(Dense(units=9, activation="sigmoid"))
#     model.add(Dense(units=1, 
#                     activation="sigmoid")) #Sigmoid porque al tener solo dos clases de salidas, devolderá
#                                            #un valor entre 0 i 1 para clasificar en que clase deben estar
#     model.compile(loss="binary_crossentropy", 
#                   optimizer="sgd", # Combinación de dos tecnicas: descenso del gradiente con momentum 
#                                     # y estimación adaptativa 
#                   metrics=['accuracy'])
#     return model

# model = create_model()

# params = {"epochs": 50, 
#           "batch_size": 128, 
#           "verbose": 3, 
#           "shuffle": False}

# model.fit(x_train, y_train, 
#           epochs=params["epochs"],
#           batch_size=params["batch_size"],
#           shuffle=params["shuffle"],
#           verbose=params["verbose"], 
#           validation_data=(x_test, y_test))

# score_train = model.evaluate(x_train, y_train, verbose=0)
# print("Train accuracy:", score_train[1])

# score_test = model.evaluate(x_test, y_test, verbose=0)
# print("Test accuracy:", score_test[1])

In [None]:
# arg_mode = "PN" # Find pertinent negative
# arg_max_iter = 1000 # Maximum number of iterations to search for the optimal PN for given parameter settings
# arg_init_const = 1 # Initial coefficient value for main loss term that encourages class change
# arg_b = 9 # No. of updates to the coefficient of the main loss term
# arg_kappa = 0.2 # Minimum confidence gap between the PNs (changed) class probability and original class' probability
# arg_beta = 1e-1 # Controls sparsity of the solution (L1 loss)
# arg_gamma = 100 # Controls how much to adhere to a (optionally trained) autoencoder
# arg_alpha = 0.01 # Penalizes L2 norm of the solution
# arg_threshold = 0 # Automatically turn off features <= arg_threshold if arg_threshold < 1
# arg_offset = 1 # the model assumes classifier trained on data normalized
#                 # in [-arg_offset, arg_offset] range, where arg_offset is 0 or 0.5
    
# adv_pn, delta_pn, info_pn = cem_explainer.explain_instance(X, arg_mode=arg_mode, 
#                                                            AE_model=my_ae_model, 
#                                                            arg_kappa=arg_kappa, 
#                                                            arg_b = arg_b,
#                                                            arg_max_iter=arg_max_iter, 
#                                                            arg_init_const=arg_init_const, 
#                                                            arg_beta=arg_beta,
#                                                            arg_gamma=arg_gamma, 
#                                                            arg_alpha=arg_alpha, 
#                                                            arg_threshold=arg_threshold,
#                                                            arg_offset=arg_offset)

In [None]:
# print("Muestra número", index_test, "del subconjunto x_test")
# print("Predicción: ", model.predict(X))
# print("Predicción de clase:", model.predict_classes(X))

# classes = class_names
# muestra_original = X
# muestra_pn = np.around(adv_pn.astype(np.double), 2)

# delta = muestra_pn - muestra_original
# delta = np.around(delta.astype(np.double), 2)
# delta[np.absolute(delta) < 1e-4] = 0

# X3 = np.vstack((muestra_original, muestra_pn, delta))

# dfre = pd.DataFrame.from_records(X3)
# dfre.columns = x_test.columns

# dfre.rename(index={0: "Muestra original", 1:"Muestra PN", 2:"Muestra PN - muestra original"},
#             inplace=True)

# dfret = dfre.transpose()

# def highlight_ce(s, col, ncols):
#     if (type(s[col]) != str):
#         if (s[col] > 0):
#             return (["background-color: yellow"] * ncols)
#     return(["background-color: blakc"] * ncols)

# dfret.style.apply(highlight_ce, col="Muestra PN - muestra original", ncols=3, axis=1)