En este Jupyter notebook se agregarán los datos de todos los usuarios pertenecientes a cada compañía y se generarán los respectivos conjuntos de train y test.

empresa_1  
- train   
- test
    
empresa_2
- train
- test

In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [23]:
def prepare_model_data(client_file):
    """Load one client's CSV and split it into train/test features and labels.

    Parameters
    ----------
    client_file : str or path-like
        Path to a CSV file containing the feature columns listed in
        ``feature_columns`` below plus a ``y_class`` label column.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Unscaled feature columns of the 70 % / 30 % split.
    y_train, y_test
        Labels one-hot encoded by ``to_categorical`` with ``num_classes``
        (2) classes.
        NOTE(review): presumably ``y_class`` holds integer labels 0/1 --
        confirm against the data files.
    """
    feature_columns = [
        'psd_delta', 'psd_theta', 'psd_alpha', 'psd_beta', 'psd_gamma',
        'eog_blinks', 'eog_var',
    ]
    num_classes = 2

    df = pd.read_csv(client_file)

    # Fixed seed so that every run produces the same split for a given file.
    train, test = train_test_split(df, test_size=0.30, random_state=42)

    # Features are deliberately returned unscaled: no scaler is fitted here,
    # so the caller decides how (and with which bounds) to normalise them.
    X_train = train[feature_columns]
    X_test = test[feature_columns]

    y_train = to_categorical(train['y_class'], num_classes)
    y_test = to_categorical(test['y_class'], num_classes)

    return X_train, X_test, y_train, y_test

In [31]:
def get_data_empresa(empresa):
    """Build the aggregated, min-max scaled train/test sets of one company.

    Every regular file found in ``./data/horizontal/<empresa>/`` is treated
    as one client and loaded through ``prepare_model_data``. The per-client
    splits are stacked, and the features are scaled with the global minimum
    and maximum of the *training* data.

    Parameters
    ----------
    empresa : str
        Name of the company sub-directory under ``./data/horizontal/``.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Scaled feature matrices. Because the bounds come from the training
        data only, ``X_test`` values may fall outside ``[0, 1]``.
    y_train, y_test : numpy.ndarray
        Vertically stacked label arrays of all clients.

    Raises
    ------
    ValueError
        If the company directory contains no client files.
    """
    data_dir = f"./data/horizontal/{empresa}/"

    # os.listdir returns entries in arbitrary order and also lists
    # sub-directories (e.g. ".ipynb_checkpoints"); keep regular files only
    # and sort them so the row order of the result is reproducible.
    clientes = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, name))
    )
    if not clientes:
        raise ValueError(f"No client files found in {data_dir!r}")

    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []
    client_mins, client_maxs = [], []

    # Load and process the data of every client of the company.
    for file in clientes:
        X_train_act, X_test_act, y_train_act, y_test_act = prepare_model_data(
            f"{data_dir}{file}"
        )

        # Each client only contributes the min and max of every training
        # column; these are aggregated below into the global bounds.
        client_mins.append(X_train_act.min())
        client_maxs.append(X_train_act.max())

        X_train_parts.append(X_train_act)
        X_test_parts.append(X_test_act)
        y_train_parts.append(y_train_act)
        y_test_parts.append(y_test_act)

    # Concatenate once instead of growing the frames inside the loop.
    X_train = pd.concat(X_train_parts, ignore_index=True)
    X_test = pd.concat(X_test_parts, ignore_index=True)
    y_train = np.vstack(y_train_parts)
    y_test = np.vstack(y_test_parts)

    # Global per-column bounds: min of the client minima, max of the maxima.
    global_min = pd.concat(client_mins, axis=1).min(axis=1)
    global_max = pd.concat(client_maxs, axis=1).max(axis=1)

    # A feature that is constant in the training data has a zero range;
    # divide by 1 instead so it maps to 0 rather than NaN/inf.
    value_range = (global_max - global_min).replace(0, 1)

    # The Series are indexed by column name, so the arithmetic is applied
    # column-wise and yields new frames (no in-place column assignment).
    X_train = ((X_train - global_min) / value_range).to_numpy()
    X_test = ((X_test - global_min) / value_range).to_numpy()

    return X_train, X_test, y_train, y_test

In [35]:
# Companies whose client data is aggregated.
empresas = ["empresa_1", "empresa_2"]

# Build the splits of each company and print the number of rows in each one.
for empresa in empresas:
    splits = get_data_empresa(empresa)
    X_train, X_test, y_train, y_test = splits
    print(*map(len, splits))

6485 2789 6485 2789
7040 3023 7040 3023
