# HITO 0

En este cuaderno se realizará la optimización del modelo Deep Learning que será usado durante el TFM.
Para la optimización se empleará 5-fold cv y hyperparameter search para el dataset compuesto por los datos de los 21 sujetos (Centralizado).

Se explorará la mejor combinación de hiperparámetros mediante optimización bayesiana (https://www.analyticsvidhya.com/blog/2021/05/bayesian-optimization-bayes_opt-or-hyperopt/)

Una vez encontrado el mejor modelo, se entrenarán los modelos 'baseline' sin federated learning para cada sujeto (individuales), por empresa y centralizado. Se medirán el accuracy y el f1-score de cada modelo.

# Lectura y preparación de los datos

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import re

from numpy.random import seed
from tensorflow.keras.utils import set_random_seed

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

from keras.models import Sequential
import keras.backend as K
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier

from math import floor
from sklearn.metrics import make_scorer, accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold
import pickle
from keras.layers import LeakyReLU
# NOTE: this rebinds the name `LeakyReLU` from the layer class to one layer
# instance created with alpha=0.1; from here on `LeakyReLU` is that instance.
LeakyReLU = LeakyReLU(alpha=0.1)

import warnings
# Silence every warning and make pandas display all columns of a DataFrame.
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

In [17]:
# Wrap accuracy_score as a scikit-learn scorer object
score_acc = make_scorer(accuracy_score)

In [2]:
# Load dataset
def prepare_model_data(client_file):
    """Load one client's CSV and return a standardized train/test split.

    The rows are split 70/30 with a fixed random state, the seven PSD/EOG
    feature columns are selected, and a StandardScaler fitted on the training
    part only is applied to both parts.

    Args:
        client_file: Path of the CSV file to read.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature matrices are
        the scaler outputs; the targets are the ``y_class`` column of each part.
    """
    feature_cols = ['psd_delta', 'psd_theta', 'psd_alpha', 'psd_beta', 'psd_gamma',
                    'eog_blinks', 'eog_var']

    data = pd.read_csv(client_file)
    train_part, test_part = train_test_split(data, test_size=0.30, random_state=42)

    # Fit the scaler on the training features only, then reuse it for the test part.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_part[feature_cols])
    X_test = scaler.transform(test_part[feature_cols])

    return X_train, X_test, train_part['y_class'], test_part['y_class']

Se crea el dataset de los 21 sujetos.

In [21]:
e1 = os.listdir("./data/horizontal/empresa_1/")
e2 = os.listdir("./data/horizontal/empresa_2/")

# The first file of empresa_2 initializes the accumulated arrays.
X_train, X_val, y_train, y_val = prepare_model_data(f'./data/horizontal/empresa_2/{e2[0]}')

# Remaining files, in the same order as before: all of empresa_1, then the rest of empresa_2.
remaining_paths = [f'./data/horizontal/empresa_1/{file}' for file in e1]
remaining_paths += [f'./data/horizontal/empresa_2/{file}' for file in e2[1:]]

for path in remaining_paths:
    X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

    # Stack this client's split below the accumulated data.
    X_train, X_val = np.vstack((X_train, X_train_act)), np.vstack((X_val, X_val_act))
    y_train = np.concatenate((y_train, y_train_act))
    y_val = np.concatenate((y_val, y_val_act))
    
# y_train = y_train.astype(int)
# y_val = y_val.astype(int)

# Búsqueda del mejor modelo

In [2]:
# Fix the seed (123) of every random source used in this notebook
import os
import random

import tensorflow as tf
from numpy.random import seed

# NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up,
# so setting it here may not affect the running kernel -- confirm.
os.environ['PYTHONHASHSEED']=str(123)

seed(123)
random.seed(123)
tf.random.set_seed(123)

In [45]:
# Create function
def nn_cl_bo2(neurons, activation, optimizer, learning_rate, batch_size, epochs,
              layers1, layers2, normalization, dropout, dropout_rate):
    """Objective for the Bayesian search: mean 5-fold CV accuracy of one network.

    All arguments arrive as floats from the optimizer. Integer-like ones are
    rounded, and ``activation`` / ``optimizer`` are rounded and used as indices
    into the lookup lists below.

    Args:
        neurons: Units of every hidden Dense layer (rounded).
        activation: Index into the activation list (rounded).
        optimizer: Index into the optimizer list (rounded).
        learning_rate: Learning rate passed to the selected optimizer.
        batch_size: Training batch size (rounded).
        epochs: Maximum number of epochs (rounded).
        layers1: Number of Dense layers after the optional BatchNormalization (rounded).
        layers2: Number of Dense layers after the optional Dropout (rounded).
        normalization: BatchNormalization is added when this is > 0.5.
        dropout: Dropout is added when this is > 0.5.
        dropout_rate: Rate of the Dropout layer.

    Returns:
        Mean accuracy over a stratified 5-fold cross-validation on the global
        ``X_train`` / ``y_train``.
    """
    optimizerL = ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl', 'SGD']
    # Map names to classes, not instances: only the selected optimizer is
    # created, and a fresh one is created for every model that gets built.
    optimizerD = {'Adam': Adam, 'SGD': SGD, 'RMSprop': RMSprop, 'Adadelta': Adadelta,
                  'Adagrad': Adagrad, 'Adamax': Adamax, 'Nadam': Nadam, 'Ftrl': Ftrl}

    activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu',
                   'elu', 'exponential', LeakyReLU, 'relu']

    neurons = round(neurons)
    activation = activationL[round(activation)]
    optimizer_cls = optimizerD[optimizerL[round(optimizer)]]
    batch_size = round(batch_size)
    epochs = round(epochs)
    layers1 = round(layers1)
    layers2 = round(layers2)

    def nn_cl_fun():
        """Build and compile a new network; called once per fitted estimator."""
        input_shape = (7, )

        nn = Sequential()
        nn.add(Dense(neurons, input_shape=input_shape, activation=activation))
        if normalization > 0.5:
            nn.add(BatchNormalization())
        for _ in range(layers1):
            nn.add(Dense(neurons, activation=activation))
        if dropout > 0.5:
            nn.add(Dropout(dropout_rate, seed=123))
        for _ in range(layers2):
            nn.add(Dense(neurons, activation=activation))
        nn.add(Dense(1, activation='sigmoid'))
        # A new optimizer per model, so no optimizer state leaks between CV folds.
        nn.compile(loss='binary_crossentropy',
                   optimizer=optimizer_cls(learning_rate=learning_rate),
                   metrics=['accuracy'])
        return nn

    # Stop when the training accuracy has not improved for 20 epochs.
    es = EarlyStopping(monitor='accuracy', mode='max', verbose=0, patience=20)
    nn = KerasClassifier(build_fn=nn_cl_fun, epochs=epochs, batch_size=batch_size, verbose=0)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    score = cross_val_score(nn, X_train, y_train, scoring=score_acc, cv=kfold, fit_params={'callbacks':[es]}).mean()

    return score

In [46]:
# Search space: (lower, upper) bound of every hyperparameter explored
params_nn2 = dict(
    neurons=(25, 100),
    activation=(0, 9),
    optimizer=(0, 7),
    learning_rate=(0.001, 1),
    batch_size=(16, 256),
    epochs=(30, 150),
    layers1=(1, 3),
    layers2=(1, 3),
    normalization=(0, 1),
    dropout=(0, 1),
    dropout_rate=(0, 0.3),
)

# Bayesian optimization of nn_cl_bo2: 25 initial points plus 4 further iterations
nn_bo = BayesianOptimization(f=nn_cl_bo2, pbounds=params_nn2, random_state=111)
nn_bo.maximize(init_points=25, n_iter=4)

|   iter    |  target   | activa... | batch_... |  dropout  | dropou... |  epochs   |  layers1  |  layers2  | learni... |  neurons  | normal... | optimizer |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5877  [0m | [0m 5.51    [0m | [0m 56.58   [0m | [0m 0.4361  [0m | [0m 0.2308  [0m | [0m 65.44   [0m | [0m 1.298   [0m | [0m 1.045   [0m | [0m 0.4208  [0m | [0m 42.9    [0m | [0m 0.3377  [0m | [0m 6.935   [0m |
| [0m 2       [0m | [0m 0.5877  [0m | [0m 2.14    [0m | [0m 35.49   [0m | [0m 0.6696  [0m | [0m 0.1864  [0m | [0m 62.91   [0m | [0m 1.932   [0m | [0m 1.237   [0m | [0m 0.07488 [0m | [0m 92.56   [0m | [0m 0.794   [0m | [0m 5.884   [0m |
| [0m 3       [0m | [0m 0.4123  [0m | [0m 7.337   [0m | [0m 253.8   [0m | [0m 0.5773  [0m | [0m 0.2441  [0m | [0m 80.56   [0m | [0m 1.055   [0m 

In [47]:
# Decode the best point found by the search into concrete hyperparameter values
params_nn_ = nn_bo.max['params']

learning_rate = params_nn_['learning_rate']
activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu',
               'elu', 'exponential', LeakyReLU,'relu']
params_nn_['activation'] = activationL[round(params_nn_['activation'])]

# Integer-like hyperparameters were searched as floats; round them.
for key in ('batch_size', 'epochs', 'layers1', 'layers2', 'neurons'):
    params_nn_[key] = round(params_nn_[key])

# The index order must match the list used in nn_cl_bo2 ('SGD' first), otherwise
# indices 0 and 1 are decoded as a different optimizer than the one evaluated.
optimizerL = ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl', 'SGD']
optimizerD = {'Adam': Adam, 'SGD': SGD, 'RMSprop': RMSprop, 'Adadelta': Adadelta,
              'Adagrad': Adagrad, 'Adamax': Adamax, 'Nadam': Nadam, 'Ftrl': Ftrl}
# Only the selected optimizer is instantiated.
params_nn_['optimizer'] = optimizerD[optimizerL[round(params_nn_['optimizer'])]](learning_rate=learning_rate)

params_nn_

{'activation': 'softplus',
 'batch_size': 232,
 'dropout': 0.8182638736220813,
 'dropout_rate': 0.11976995858293202,
 'epochs': 128,
 'layers1': 1,
 'layers2': 2,
 'learning_rate': 0.41306235723522167,
 'neurons': 94,
 'normalization': 0.8253804525121795,
 'optimizer': <keras.optimizer_v2.adagrad.Adagrad at 0x1cd266526a0>}

## Arquitectura
Input -> 1 Dense -> IF normalization > 0.5 BatchNorm -> layers1 Dense -> IF dropout > 0.5 Dropout -> layers2 Dense -> Output

Se almacenan los resultados para poder consultarlos posteriormente

In [52]:
# Display every point evaluated by the search
nn_bo.res

[{'target': 0.5877264325323475,
  'params': {'activation': 5.509531580558568,
   'batch_size': 56.57674104295274,
   'dropout': 0.4360590193711702,
   'dropout_rate': 0.23077874175693686,
   'epochs': 65.43903652834514,
   'layers1': 1.2983259142789796,
   'layers2': 1.0449566490883235,
   'learning_rate': 0.4208042677722932,
   'neurons': 42.90116059348345,
   'normalization': 0.33765619188879237,
   'optimizer': 6.934987252416151}},
 {'target': 0.5877264325323475,
  'params': {'activation': 2.139538085100205,
   'batch_size': 35.486238268290776,
   'dropout': 0.6696002382466298,
   'dropout_rate': 0.1863728758202091,
   'epochs': 62.91042362478221,
   'layers1': 1.9324428197899461,
   'layers2': 1.2367355022465671,
   'learning_rate': 0.074883606579061,
   'neurons': 92.55806343705127,
   'normalization': 0.7939625604796284,
   'optimizer': 5.883987541069969}},
 {'target': 0.4122735674676525,
  'params': {'activation': 7.336867117076466,
   'batch_size': 253.82916431929843,
   'dropo

In [62]:
# Persist the evaluated points, one JSON document per line
with open("./results/optimization-python.txt", "w") as f:
    f.writelines(json.dumps(el) + "\n" for el in nn_bo.res)

## Exploración de las mejores combinaciones

In [3]:
# Read the stored search results back: one JSON document per line
with open("./results/optimization-python.txt", "r") as f:
    results = [json.loads(line) for line in f]

results[0]

{'target': 0.5877264325323475,
 'params': {'activation': 5.509531580558568,
  'batch_size': 56.57674104295274,
  'dropout': 0.4360590193711702,
  'dropout_rate': 0.23077874175693686,
  'epochs': 65.43903652834514,
  'layers1': 1.2983259142789796,
  'layers2': 1.0449566490883235,
  'learning_rate': 0.4208042677722932,
  'neurons': 42.90116059348345,
  'normalization': 0.33765619188879237,
  'optimizer': 6.934987252416151}}

In [4]:
import flatdict

# Flatten each result ({'target': ..., 'params': {...}}) into a single-level
# dict with keys such as 'params.neurons', then build the DataFrame in one go.
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every row.)
flat_rows = [dict(flatdict.FlatDict(r, delimiter='.')) for r in results]
df = pd.DataFrame(flat_rows)

# Keep only the best configurations (accuracy above 0.74), best first.
df = df.sort_values(by=['target'], ascending=False)
df = df[df['target']>0.74]

df

Unnamed: 0,target,params.activation,params.batch_size,params.dropout,params.dropout_rate,params.epochs,params.layers1,params.layers2,params.learning_rate,params.neurons,params.normalization,params.optimizer
14,0.754677,2.387756,232.441207,0.818264,0.11977,128.434251,1.39563,2.04505,0.413062,94.43754,0.82538,3.507208
18,0.753641,4.827667,189.641691,0.66155,0.251634,76.583442,1.852362,2.65626,0.46954,86.502446,0.014177,2.777348
21,0.752902,4.289381,41.089102,0.152525,0.082059,123.786173,1.78622,2.597758,0.433635,31.116512,0.010643,3.016192
10,0.74536,8.126105,97.551347,0.652804,0.277548,74.87802,2.542728,2.792329,0.620558,36.329506,0.37492,4.450884
5,0.744547,0.343637,28.741516,0.127961,0.010015,57.170531,2.087818,1.357252,0.180165,36.220991,0.683013,3.28306


Para mayor interpretabilidad, se parsean los resultados a sus valores reales.

(Aunque para la posterior definición del modelo se usarán los datos originales.)

In [5]:
# Human-readable version of `df`: indices and thresholds decoded to actual values
activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu', 'elu', 'exponential', 'LeakyReLU','relu']
# The index order must match the list used in nn_cl_bo2 / get_model ('SGD' first),
# otherwise indices 0 and 1 are reported as a different optimizer than the one trained.
optimizerL = ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl', 'SGD']

decoded_rows = []
for _, r in df.iterrows():
    decoded_rows.append({
        'accuracy': r['target'],
        'params.activation': activationL[round(r['params.activation'])],
        'params.optimizer': optimizerL[round(r['params.optimizer'])],
        'params.learning_rate': r['params.learning_rate'],
        'params.epochs': round(r['params.epochs']),
        'params.batch_size': round(r['params.batch_size']),
        'params.neurons': round(r['params.neurons']),
        # Optional layers are enabled when the searched value exceeds 0.5.
        'params.normalization': "Si" if r['params.normalization'] > 0.5 else "No",
        'params.layers1': round(r['params.layers1']),
        'params.dropout': "Si" if r['params.dropout'] > 0.5 else "No",
        'params.dropout_rate': r['params.dropout_rate'],
        'params.layers2': round(r['params.layers2']),
    })

# Build the frame once; DataFrame.append was removed in pandas 2.0.
df_res = pd.DataFrame(decoded_rows)

df_res

Unnamed: 0,accuracy,params.activation,params.optimizer,params.learning_rate,params.epochs,params.batch_size,params.neurons,params.normalization,params.layers1,params.dropout,params.dropout_rate,params.layers2
0,0.754677,softplus,Adagrad,0.413062,128.0,232.0,94.0,Si,1.0,Si,0.11977,2.0
1,0.753641,selu,Adadelta,0.46954,77.0,190.0,87.0,No,2.0,Si,0.251634,3.0
2,0.752902,tanh,Adadelta,0.433635,124.0,41.0,31.0,No,2.0,No,0.082059,3.0
3,0.74536,LeakyReLU,Adagrad,0.620558,75.0,98.0,36.0,No,3.0,Si,0.277548,3.0
4,0.744547,relu,Adadelta,0.180165,57.0,29.0,36.0,Si,2.0,No,0.010015,1.0


# Entrenamiento de modelos

In [6]:
# Fix the seed (123) of every random source before training the models
import os
import random

import tensorflow as tf
from numpy.random import seed

# NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up,
# so setting it here may not affect the running kernel -- confirm.
os.environ['PYTHONHASHSEED']=str(123)

seed(123)
random.seed(123)
tf.random.set_seed(123)

### Métricas

Para la medición del rendimiento de los modelos baseline se incorporan Precision y Recall. También se calculará el f1-score a partir de estas dos.

In [7]:
from tensorflow.keras.metrics import Precision, Recall, TrueNegatives, TruePositives, FalsePositives, FalseNegatives

# f1-score: 2*(precision*recall)/(precision+recall)

### Funciones para automatizar el entrenamiento de los modelos

- Se crean las funciones para obtener el modelo compilado en base a la configuración de parámetros indicada
- Se crea la función para entrenar un modelo dado

In [8]:
def get_model(neurons, activation, optimizer, learning_rate, layers1, layers2, normalization, dropout, dropout_rate):
    """Build and compile the binary classifier for one hyperparameter set.

    Values are given in the raw (float) form produced by the search:
    integer-like ones are rounded, and ``activation`` / ``optimizer`` are
    rounded and used as indices into the lookup lists below.

    Architecture: Dense -> [BatchNormalization] -> ``layers1`` x Dense ->
    [Dropout] -> ``layers2`` x Dense -> Dense(1, sigmoid).

    Args:
        neurons: Units of every hidden Dense layer (rounded).
        activation: Index into the activation list (rounded).
        optimizer: Index into the optimizer list (rounded).
        learning_rate: Learning rate passed to the selected optimizer.
        layers1: Number of Dense layers after the optional BatchNormalization (rounded).
        layers2: Number of Dense layers after the optional Dropout (rounded).
        normalization: BatchNormalization is added when this is > 0.5.
        dropout: Dropout is added when this is > 0.5.
        dropout_rate: Rate of the Dropout layer.

    Returns:
        The compiled model. Its metrics are, in order: accuracy, true
        positives, true negatives, false positives, false negatives.
    """
    # --- Parameter decoding ---
    optimizerL = ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl', 'SGD']
    # Map names to classes, not instances, so only the selected optimizer is created.
    optimizerD = {'Adam': Adam, 'SGD': SGD, 'RMSprop': RMSprop, 'Adadelta': Adadelta,
                  'Adagrad': Adagrad, 'Adamax': Adamax, 'Nadam': Nadam, 'Ftrl': Ftrl}

    activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu', 'elu', 'exponential', LeakyReLU, 'relu']

    neurons = round(neurons)
    activation = activationL[round(activation)]
    optimizer = optimizerD[optimizerL[round(optimizer)]](learning_rate=learning_rate)
    layers1 = round(layers1)
    layers2 = round(layers2)

    # --- Architecture definition ---
    input_shape = (7, )

    model = Sequential()

    model.add(Dense(neurons, input_shape=input_shape, activation=activation))

    if normalization > 0.5:
        model.add(BatchNormalization())

    for _ in range(layers1):
        model.add(Dense(neurons, activation=activation))

    if dropout > 0.5:
        model.add(Dropout(dropout_rate, seed=123))

    for _ in range(layers2):
        model.add(Dense(neurons, activation=activation))

    model.add(Dense(1, activation='sigmoid'))
    # The metric order matters: callers read evaluate() results by position.
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', TruePositives(), TrueNegatives(), FalsePositives(), FalseNegatives()])

    return model

In [9]:
# Display the filtered search results (raw, undecoded parameter values)
df

Unnamed: 0,target,params.activation,params.batch_size,params.dropout,params.dropout_rate,params.epochs,params.layers1,params.layers2,params.learning_rate,params.neurons,params.normalization,params.optimizer
14,0.754677,2.387756,232.441207,0.818264,0.11977,128.434251,1.39563,2.04505,0.413062,94.43754,0.82538,3.507208
18,0.753641,4.827667,189.641691,0.66155,0.251634,76.583442,1.852362,2.65626,0.46954,86.502446,0.014177,2.777348
21,0.752902,4.289381,41.089102,0.152525,0.082059,123.786173,1.78622,2.597758,0.433635,31.116512,0.010643,3.016192
10,0.74536,8.126105,97.551347,0.652804,0.277548,74.87802,2.542728,2.792329,0.620558,36.329506,0.37492,4.450884
5,0.744547,0.343637,28.741516,0.127961,0.010015,57.170531,2.087818,1.357252,0.180165,36.220991,0.683013,3.28306


In [10]:
# Display the same results with the parameters decoded to readable values
df_res

Unnamed: 0,accuracy,params.activation,params.optimizer,params.learning_rate,params.epochs,params.batch_size,params.neurons,params.normalization,params.layers1,params.dropout,params.dropout_rate,params.layers2
0,0.754677,softplus,Adagrad,0.413062,128.0,232.0,94.0,Si,1.0,Si,0.11977,2.0
1,0.753641,selu,Adadelta,0.46954,77.0,190.0,87.0,No,2.0,Si,0.251634,3.0
2,0.752902,tanh,Adadelta,0.433635,124.0,41.0,31.0,No,2.0,No,0.082059,3.0
3,0.74536,LeakyReLU,Adagrad,0.620558,75.0,98.0,36.0,No,3.0,Si,0.277548,3.0
4,0.744547,relu,Adadelta,0.180165,57.0,29.0,36.0,Si,2.0,No,0.010015,1.0


**Parámetros entrenables por modelo**
- 0: 27,825
- 1: 39,064
- 2: 5,240
- 3: 8,317
- 4: 4,393

In [11]:
def generar_modelo(df, combinacion):
    """Build the compiled model for one row of the results table.

    Args:
        df: DataFrame with the ``params.*`` columns of the search results.
        combinacion: Positional index (``iloc``) of the row to use.

    Returns:
        The model returned by ``get_model`` for that row's parameters.
    """
    row = df.iloc[combinacion]

    return get_model(
        neurons=row['params.neurons'],
        activation=row['params.activation'],
        optimizer=row['params.optimizer'],
        learning_rate=row['params.learning_rate'],
        layers1=row['params.layers1'],
        layers2=row['params.layers2'],
        normalization=row['params.normalization'],
        dropout=row['params.dropout'],
        dropout_rate=row['params.dropout_rate'],
    )

# Build the model for the configuration at positional row 4 of `df` and print its layers
combinacion = 4
model = generar_modelo(df, combinacion)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 36)                288       
                                                                 
 batch_normalization (BatchN  (None, 36)               144       
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 36)                1332      
                                                                 
 dense_2 (Dense)             (None, 36)                1332      
                                                                 
 dense_3 (Dense)             (None, 36)                1332      
                                                                 
 dense_4 (Dense)             (None, 1)                 37        
                                                        

In [12]:
def train_model(model, X_train, y_train, X_val, y_val, batch_size = 32, epochs = 150, es = True):
    """Fit ``model`` on the training data and return the object given by ``fit``.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train, y_train: Training features and targets.
        X_val, y_val: Data passed to ``fit`` as ``validation_data``.
        batch_size: Batch size passed to ``fit``.
        epochs: Maximum number of epochs.
        es: When truthy, an EarlyStopping callback is used. It monitors the
            training ``accuracy`` with patience 20 and restores the best weights.

    Returns:
        Whatever ``model.fit`` returns.
    """
    stoppers = (
        [EarlyStopping(monitor='accuracy', mode='max', verbose=1, patience=20, restore_best_weights=True)]
        if es
        else []
    )

    return model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        validation_data=(X_val, y_val),
        callbacks=stoppers,
    )

## Individuales

In [18]:
seed(1)
set_random_seed(2)

base_path = "./data/horizontal/"

# Rows are collected in a list and turned into a DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
result_rows = []

clientes = list(range(1,22))
for c in clientes:
    print(f"Entrenando cliente {c}...")
    # The client files are looked up under empresa_{(c % 2) + 1}.
    path = f"{base_path}empresa_{(c%2)+1}/cliente_{c}.csv"
    X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

    # One new model per client, always built from configuration 4.
    combinacion = 4
    model = generar_modelo(df, combinacion)
    hist = train_model(model, X_train_act, y_train_act, X_val_act, y_val_act)

    # evaluate() returns [loss, accuracy, tp, tn, fp, fn] (metric order set in get_model).
    res = hist.model.evaluate(X_val_act, y_val_act, verbose=0)
    tp, tn, fp, fn = res[2:6]

    client_res = {
        "cliente": int(c),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }

    print(client_res)

    result_rows.append(client_res)

results = pd.DataFrame(result_rows, columns=['cliente', 'tp', 'tn', 'fp', 'fn'])
results.to_csv("./results/resultados-modelos-individuales.csv", index=False, header=True)

Entrenando cliente 1...
Restoring model weights from the end of the best epoch: 93.
Epoch 00113: early stopping
{'cliente': 1, 'tp': 219.0, 'tn': 16.0, 'fp': 5.0, 'fn': 13.0}
Entrenando cliente 2...
Restoring model weights from the end of the best epoch: 118.
Epoch 00138: early stopping
{'cliente': 2, 'tp': 23.0, 'tn': 209.0, 'fp': 4.0, 'fn': 19.0}
Entrenando cliente 3...
Restoring model weights from the end of the best epoch: 49.
Epoch 00069: early stopping
{'cliente': 3, 'tp': 144.0, 'tn': 71.0, 'fp': 26.0, 'fn': 20.0}
Entrenando cliente 4...
Restoring model weights from the end of the best epoch: 100.
Epoch 00120: early stopping
{'cliente': 4, 'tp': 273.0, 'tn': 136.0, 'fp': 57.0, 'fn': 35.0}
Entrenando cliente 5...
{'cliente': 5, 'tp': 245.0, 'tn': 172.0, 'fp': 27.0, 'fn': 57.0}
Entrenando cliente 6...
Restoring model weights from the end of the best epoch: 47.
Epoch 00067: early stopping
{'cliente': 6, 'tp': 214.0, 'tn': 6.0, 'fp': 33.0, 'fn': 5.0}
Entrenando cliente 7...
Restorin

## Empresa

In [22]:
seed(1)
set_random_seed(2)

# Rows are collected in a list and turned into a DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
result_rows = []

empresas = ["empresa_1", "empresa_2"]

for empresa in empresas:
    print(f"Entrenando {empresa}...")
    clientes = os.listdir(f"./data/horizontal/{empresa}/")

    # Dataset with all the clients of the company
    X_train, X_val, y_train, y_val = prepare_model_data(f'./data/horizontal/{empresa}/{clientes[0]}')

    for file in clientes[1:]:
        path = f'./data/horizontal/{empresa}/{file}'
        X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

        X_train = np.vstack((X_train, X_train_act))
        X_val = np.vstack((X_val, X_val_act))
        y_train = np.concatenate((y_train, y_train_act))
        y_val = np.concatenate((y_val, y_val_act))

    # Train the company model
    combinacion = 4
    model = generar_modelo(df, combinacion)
    hist = train_model(model, X_train, y_train, X_val, y_val)

    # Evaluate the company model on the validation split of each of its clients
    for file in clientes:
        path = f'./data/horizontal/{empresa}/{file}'
        X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

        # evaluate() returns [loss, accuracy, tp, tn, fp, fn] (metric order set in get_model).
        res = hist.model.evaluate(X_val_act, y_val_act, verbose=0)
        tp, tn, fp, fn = res[2:6]

        cliente_res = {
            "empresa": empresa,
            # Raw string: "\d" in a normal string literal is an invalid escape sequence.
            "cliente": re.search(r"cliente_(\d{1,2}).csv", file).group(1),
            "tp": tp,
            "tn": tn,
            "fp": fp,
            "fn": fn
        }

        print(cliente_res)
        result_rows.append(cliente_res)

results = pd.DataFrame(result_rows, columns=['empresa', 'cliente', 'tp', 'tn', 'fp', 'fn'])
results.to_csv("./results/resultados-modelos-empresa.csv", index=False, header=True)

Entrenando empresa_1...
{'empresa': 'empresa_1', 'cliente': '10', 'tp': 170.0, 'tn': 28.0, 'fp': 10.0, 'fn': 54.0}
{'empresa': 'empresa_1', 'cliente': '12', 'tp': 143.0, 'tn': 87.0, 'fp': 1.0, 'fn': 13.0}
{'empresa': 'empresa_1', 'cliente': '14', 'tp': 126.0, 'tn': 61.0, 'fp': 19.0, 'fn': 34.0}
{'empresa': 'empresa_1', 'cliente': '16', 'tp': 197.0, 'tn': 42.0, 'fp': 12.0, 'fn': 13.0}
{'empresa': 'empresa_1', 'cliente': '18', 'tp': 112.0, 'tn': 98.0, 'fp': 14.0, 'fn': 25.0}
{'empresa': 'empresa_1', 'cliente': '2', 'tp': 38.0, 'tn': 125.0, 'fp': 88.0, 'fn': 4.0}
{'empresa': 'empresa_1', 'cliente': '20', 'tp': 55.0, 'tn': 149.0, 'fp': 39.0, 'fn': 12.0}
{'empresa': 'empresa_1', 'cliente': '4', 'tp': 249.0, 'tn': 149.0, 'fp': 44.0, 'fn': 59.0}
{'empresa': 'empresa_1', 'cliente': '6', 'tp': 206.0, 'tn': 9.0, 'fp': 30.0, 'fn': 13.0}
{'empresa': 'empresa_1', 'cliente': '8', 'tp': 44.0, 'tn': 166.0, 'fp': 22.0, 'fn': 29.0}
Entrenando empresa_2...
{'empresa': 'empresa_2', 'cliente': '1', 'tp': 2

## Empresa v2

In [13]:
seed(1)
set_random_seed(2)

# Rows are collected in a list and turned into a DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
result_rows = []

empresas = ["empresa_1", "empresa_2", "empresa_3"]
base_path = "./data/horizontal_v2"

for empresa in empresas:
    print(f"Entrenando {empresa}...")
    clientes = os.listdir(f"{base_path}/{empresa}/")

    # Dataset with all the clients of the company
    X_train, X_val, y_train, y_val = prepare_model_data(f'{base_path}/{empresa}/{clientes[0]}')

    for file in clientes[1:]:
        path = f'{base_path}/{empresa}/{file}'
        X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

        X_train = np.vstack((X_train, X_train_act))
        X_val = np.vstack((X_val, X_val_act))
        y_train = np.concatenate((y_train, y_train_act))
        y_val = np.concatenate((y_val, y_val_act))

    # Train the company model
    combinacion = 4
    model = generar_modelo(df, combinacion)
    hist = train_model(model, X_train, y_train, X_val, y_val)

    # Evaluate the company model on the validation split of each of its clients
    for file in clientes:
        path = f'{base_path}/{empresa}/{file}'
        X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

        # evaluate() returns [loss, accuracy, tp, tn, fp, fn] (metric order set in get_model).
        res = hist.model.evaluate(X_val_act, y_val_act, verbose=0)
        tp, tn, fp, fn = res[2:6]

        cliente_res = {
            "empresa": empresa,
            # Raw string: "\d" in a normal string literal is an invalid escape sequence.
            "cliente": re.search(r"cliente_(\d{1,2}).csv", file).group(1),
            "tp": tp,
            "tn": tn,
            "fp": fp,
            "fn": fn
        }

        print(cliente_res)
        result_rows.append(cliente_res)

results = pd.DataFrame(result_rows, columns=['empresa', 'cliente', 'tp', 'tn', 'fp', 'fn'])
results.to_csv("./results/resultados-modelos-empresa_v2.csv", index=False, header=True)

Entrenando empresa_1...
{'empresa': 'empresa_1', 'cliente': '12', 'tp': 142.0, 'tn': 87.0, 'fp': 1.0, 'fn': 14.0}
{'empresa': 'empresa_1', 'cliente': '13', 'tp': 141.0, 'tn': 75.0, 'fp': 17.0, 'fn': 9.0}
{'empresa': 'empresa_1', 'cliente': '14', 'tp': 111.0, 'tn': 64.0, 'fp': 16.0, 'fn': 49.0}
{'empresa': 'empresa_1', 'cliente': '15', 'tp': 100.0, 'tn': 110.0, 'fp': 30.0, 'fn': 19.0}
{'empresa': 'empresa_1', 'cliente': '18', 'tp': 101.0, 'tn': 94.0, 'fp': 18.0, 'fn': 36.0}
{'empresa': 'empresa_1', 'cliente': '3', 'tp': 115.0, 'tn': 79.0, 'fp': 18.0, 'fn': 49.0}
{'empresa': 'empresa_1', 'cliente': '4', 'tp': 263.0, 'tn': 143.0, 'fp': 50.0, 'fn': 45.0}
{'empresa': 'empresa_1', 'cliente': '5', 'tp': 232.0, 'tn': 151.0, 'fp': 48.0, 'fn': 70.0}
Entrenando empresa_2...
Restoring model weights from the end of the best epoch: 125.
Epoch 00145: early stopping
{'empresa': 'empresa_2', 'cliente': '17', 'tp': 43.0, 'tn': 190.0, 'fp': 3.0, 'fn': 16.0}
{'empresa': 'empresa_2', 'cliente': '2', 'tp': 

## Empresa v3 con clientes desconocidos

Se entrena el modelo con los clientes de la empresa:

- Se mide el rendimiento para el conjunto test de los clientes

- Se mide el rendimiento para los clientes ajenos a la empresa

In [28]:
# Set of client ids associated with each company in the v3 experiment.
# NOTE(review): the training cell writes `cid in clientes_distr[empresa]` to the
# "UC?" column; presumably these sets are the clients that belong to the company
# (i.e. known to its model) -- confirm what "UC?" is meant to flag.
clientes_distr = {
    "empresa_1": {3,4,5,12,  2,8,  1},
    "empresa_2": {13,14,    6,7,10,11,16},
    "empresa_3": {15,18,  9,17,20,21,  19}
}

In [31]:
seed(1)
set_random_seed(2)

# Rows are collected in a list and turned into a DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
result_rows = []

empresas = ["empresa_1", "empresa_2", "empresa_3"]

clientes_distr = {
    "empresa_1": {3,4,5,12,  2,8,  1},
    "empresa_2": {13,14,    6,7,10,11,16},
    "empresa_3": {15,18,  9,17,20,21,  19}
}

base_path = "./data/horizontal_v3"

for empresa in empresas:
    print(f"Entrenando {empresa}...")
    clientes = os.listdir(f"{base_path}/{empresa}/")

    # Dataset with all the client files found in the company folder
    X_train, X_val, y_train, y_val = prepare_model_data(f'{base_path}/{empresa}/{clientes[0]}')

    for file in clientes[1:]:
        path = f'{base_path}/{empresa}/{file}'
        X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

        X_train = np.vstack((X_train, X_train_act))
        X_val = np.vstack((X_val, X_val_act))
        y_train = np.concatenate((y_train, y_train_act))
        y_val = np.concatenate((y_val, y_val_act))

    # Train the company model
    combinacion = 4
    model = generar_modelo(df, combinacion)
    hist = train_model(model, X_train, y_train, X_val, y_val)

    # Evaluate the company model on the validation split of all 21 clients,
    # whether or not they are listed for this company.
    for cid in range(1,22):
        path = f'./data/centralizado/cliente_{cid}.csv'
        X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

        # evaluate() returns [loss, accuracy, tp, tn, fp, fn] (metric order set in get_model).
        res = hist.model.evaluate(X_val_act, y_val_act, verbose=0)
        tp, tn, fp, fn = res[2:6]

        cliente_res = {
            "empresa": empresa,
            "cliente": cid,
            # True when the client id is listed for this company in clientes_distr.
            "UC?": cid in clientes_distr[empresa],
            "tp": tp,
            "tn": tn,
            "fp": fp,
            "fn": fn
        }

        print(cliente_res)
        result_rows.append(cliente_res)

results = pd.DataFrame(result_rows, columns=['empresa', 'cliente', 'UC?', 'tp', 'tn', 'fp', 'fn'])
results.to_csv("./results/resultados-modelos-empresa_v3.csv", index=False, header=True)

Entrenando empresa_2...
Restoring model weights from the end of the best epoch: 128.
Epoch 00148: early stopping
{'empresa': 'empresa_2', 'cliente': 1, 'UC?': False, 'tp': 179.0, 'tn': 17.0, 'fp': 4.0, 'fn': 53.0}
{'empresa': 'empresa_2', 'cliente': 2, 'UC?': False, 'tp': 41.0, 'tn': 30.0, 'fp': 183.0, 'fn': 1.0}
{'empresa': 'empresa_2', 'cliente': 3, 'UC?': False, 'tp': 141.0, 'tn': 22.0, 'fp': 75.0, 'fn': 23.0}
{'empresa': 'empresa_2', 'cliente': 4, 'UC?': False, 'tp': 290.0, 'tn': 58.0, 'fp': 135.0, 'fn': 18.0}
{'empresa': 'empresa_2', 'cliente': 5, 'UC?': False, 'tp': 278.0, 'tn': 8.0, 'fp': 191.0, 'fn': 24.0}
{'empresa': 'empresa_2', 'cliente': 6, 'UC?': True, 'tp': 218.0, 'tn': 4.0, 'fp': 35.0, 'fn': 1.0}
{'empresa': 'empresa_2', 'cliente': 7, 'UC?': True, 'tp': 175.0, 'tn': 30.0, 'fp': 40.0, 'fn': 8.0}
{'empresa': 'empresa_2', 'cliente': 8, 'UC?': False, 'tp': 73.0, 'tn': 3.0, 'fp': 185.0, 'fn': 0.0}
{'empresa': 'empresa_2', 'cliente': 9, 'UC?': False, 'tp': 70.0, 'tn': 10.0, 'f

## Global/Centralizado

In [14]:
e1 = os.listdir("./data/horizontal/empresa_1/")
e2 = os.listdir("./data/horizontal/empresa_2/")

# Dataset with all the clients
X_train, X_val, y_train, y_val = prepare_model_data(f'./data/horizontal/empresa_2/{e2[0]}')

for file in e1:
    path = f'./data/horizontal/empresa_1/{file}'
    X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

    X_train = np.vstack((X_train, X_train_act))
    X_val = np.vstack((X_val, X_val_act))
    y_train = np.concatenate((y_train, y_train_act))
    y_val = np.concatenate((y_val, y_val_act))

for file in e2[1:]:
    path = f'./data/horizontal/empresa_2/{file}'
    X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

    X_train = np.vstack((X_train, X_train_act))
    X_val = np.vstack((X_val, X_val_act))
    y_train = np.concatenate((y_train, y_train_act))
    y_val = np.concatenate((y_val, y_val_act))

# Train the centralized model
combinacion = 4
model = generar_modelo(df, combinacion)
hist = train_model(model, X_train, y_train, X_val, y_val)

# Evaluate the centralized model on the validation split of every client.
# Rows are collected in a list and turned into a DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
result_rows = []
base_path = "./data/horizontal/"
for cid in range(1,22):
    # The client files are looked up under empresa_{(cid % 2) + 1}.
    path = f"{base_path}empresa_{(int(cid)%2)+1}/cliente_{cid}.csv"

    X_train_act, X_val_act, y_train_act, y_val_act = prepare_model_data(path)

    # evaluate() returns [loss, accuracy, tp, tn, fp, fn] (metric order set in get_model).
    res = hist.model.evaluate(X_val_act, y_val_act, verbose=0)
    tp, tn, fp, fn = res[2:6]

    centralizado_res = {
        "empresa": (int(cid)%2)+1,
        "cliente": cid,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }

    print(centralizado_res)
    result_rows.append(centralizado_res)

results = pd.DataFrame(result_rows, columns=['empresa', 'cliente', 'tp', 'tn', 'fp', 'fn'])
results.to_csv("./results/resultados-modelo-centralizado.csv", index=False, header=True)

{'empresa': 2, 'cliente': 1, 'tp': 176.0, 'tn': 20.0, 'fp': 1.0, 'fn': 56.0}
{'empresa': 1, 'cliente': 2, 'tp': 37.0, 'tn': 108.0, 'fp': 105.0, 'fn': 5.0}
{'empresa': 2, 'cliente': 3, 'tp': 118.0, 'tn': 70.0, 'fp': 27.0, 'fn': 46.0}
{'empresa': 1, 'cliente': 4, 'tp': 258.0, 'tn': 127.0, 'fp': 66.0, 'fn': 50.0}
{'empresa': 2, 'cliente': 5, 'tp': 237.0, 'tn': 136.0, 'fp': 63.0, 'fn': 65.0}
{'empresa': 1, 'cliente': 6, 'tp': 192.0, 'tn': 12.0, 'fp': 27.0, 'fn': 27.0}
{'empresa': 2, 'cliente': 7, 'tp': 113.0, 'tn': 46.0, 'fp': 24.0, 'fn': 70.0}
{'empresa': 1, 'cliente': 8, 'tp': 42.0, 'tn': 154.0, 'fp': 34.0, 'fn': 31.0}
{'empresa': 2, 'cliente': 9, 'tp': 52.0, 'tn': 115.0, 'fp': 59.0, 'fn': 21.0}
{'empresa': 1, 'cliente': 10, 'tp': 175.0, 'tn': 13.0, 'fp': 25.0, 'fn': 49.0}
{'empresa': 2, 'cliente': 11, 'tp': 159.0, 'tn': 26.0, 'fp': 12.0, 'fn': 57.0}
{'empresa': 1, 'cliente': 12, 'tp': 142.0, 'tn': 84.0, 'fp': 4.0, 'fn': 14.0}
{'empresa': 2, 'cliente': 13, 'tp': 144.0, 'tn': 61.0, 'fp': 

## Centralizado con cliente desconocido

In [18]:
def cargar_dataset_varios_clientes(clientes, base_path="./data/centralizado"):
    """Load and stack the train/validation splits of several clients.

    Each client file ``{base_path}/cliente_{cid}.csv`` is passed through
    ``prepare_model_data`` and the resulting splits are concatenated along the
    first axis, in the order given by ``clientes``.

    Args:
        clientes: Iterable of client ids whose data should be combined.
        base_path: Directory that holds the ``cliente_<id>.csv`` files.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)`` of NumPy arrays with the
        data of all requested clients stacked together.

    Raises:
        ValueError: If ``clientes`` is empty.
    """
    clientes = list(clientes)
    if not clientes:
        raise ValueError("clientes must contain at least one client id")

    # Load every client first and concatenate once afterwards; stacking inside
    # the loop re-copies the accumulated arrays on every iteration.
    splits = [prepare_model_data(f'{base_path}/cliente_{cid}.csv') for cid in clientes]
    X_train_parts, X_val_parts, y_train_parts, y_val_parts = zip(*splits)

    X_train = np.vstack(X_train_parts)
    X_val = np.vstack(X_val_parts)
    y_train = np.concatenate(y_train_parts)
    y_val = np.concatenate(y_val_parts)

    return X_train, X_val, y_train, y_val

In [20]:
# Leave-one-client-out experiment: for every client, train the centralized
# model on all the other clients and evaluate it both on the clients it has
# seen and on the held-out client.
result_columns = ['cliente', 'tp', 'tn', 'fp', 'fn', 'tp_c', 'tn_c', 'fp_c', 'fn_c']

CLIENTS_IDS = list(range(1, 22))

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole frame on every iteration.
rows = []

for cid in CLIENTS_IDS:
    # Load the dataset of every client except the current one.
    actual_clients = [other for other in CLIENTS_IDS if other != cid]
    X_train, X_val, y_train, y_val = cargar_dataset_varios_clientes(actual_clients)

    # Re-seed before building each model so every run starts from the same RNG state.
    seed(1)
    set_random_seed(2)

    # Train the centralized model.
    combinacion = 4
    model = generar_modelo(df, combinacion)
    hist = train_model(model, X_train, y_train, X_val, y_val)

    # Evaluate on the known clients.
    # NOTE(review): X_val is also handed to train_model; if it is used there for
    # early stopping or checkpoint selection these figures are optimistic — confirm.
    res = hist.model.evaluate(X_val, y_val, verbose=0)

    # NOTE(review): positions 2..5 are assumed to be tp, tn, fp, fn, i.e. the
    # order of the metrics the model was compiled with — confirm in generar_modelo.
    tp, tn, fp, fn = res[2:6]

    # Evaluate on the new (held-out) client; only its validation split is used.
    X_train_c, X_val_c, y_train_c, y_val_c = prepare_model_data(f'./data/centralizado/cliente_{cid}.csv')
    res_c = hist.model.evaluate(X_val_c, y_val_c, verbose=0)

    tp_c, tn_c, fp_c, fn_c = res_c[2:6]

    centralizado_res = {
        "cliente": cid,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp_c": tp_c,
        "tn_c": tn_c,
        "fp_c": fp_c,
        "fn_c": fn_c,
    }

    print(f'cid: {cid} - {centralizado_res}')
    rows.append(centralizado_res)

# Building the frame in one go also keeps 'cliente' as an integer column.
results = pd.DataFrame(rows, columns=result_columns)
results.to_csv("./results/resultados-modelo-centralizado-new-client.csv", index=False, header=True)

cid: 1 - {'cliente': 1, 'tp': 2410.0, 'tn': 1871.0, 'fp': 517.0, 'fn': 761.0, 'tp_c': 137.0, 'tn_c': 19.0, 'fp_c': 2.0, 'fn_c': 95.0}
cid: 2 - {'cliente': 2, 'tp': 2759.0, 'tn': 1543.0, 'fp': 653.0, 'fn': 602.0, 'tp_c': 37.0, 'tn_c': 102.0, 'fp_c': 111.0, 'fn_c': 5.0}
cid: 3 - {'cliente': 3, 'tp': 2542.0, 'tn': 1691.0, 'fp': 621.0, 'fn': 697.0, 'tp_c': 101.0, 'tn_c': 56.0, 'fp_c': 41.0, 'fn_c': 63.0}
cid: 4 - {'cliente': 4, 'tp': 2374.0, 'tn': 1674.0, 'fp': 542.0, 'fn': 721.0, 'tp_c': 247.0, 'tn_c': 101.0, 'fp_c': 92.0, 'fn_c': 61.0}
cid: 5 - {'cliente': 5, 'tp': 2376.0, 'tn': 1673.0, 'fp': 537.0, 'fn': 725.0, 'tp_c': 185.0, 'tn_c': 118.0, 'fp_c': 81.0, 'fn_c': 117.0}
cid: 6 - {'cliente': 6, 'tp': 2423.0, 'tn': 1776.0, 'fp': 594.0, 'fn': 761.0, 'tp_c': 53.0, 'tn_c': 36.0, 'fp_c': 3.0, 'fn_c': 166.0}
cid: 7 - {'cliente': 7, 'tp': 2479.0, 'tn': 1789.0, 'fp': 550.0, 'fn': 741.0, 'tp_c': 91.0, 'tn_c': 50.0, 'fp_c': 20.0, 'fn_c': 92.0}
cid: 8 - {'cliente': 8, 'tp': 2602.0, 'tn': 1612.0, 'fp