PREPROCESAMIENTO DE DATOS

In [132]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Cargar el archivo CSV
file_path = 'Base de datos HACQ.csv'
df = pd.read_csv(file_path)
print(df)
print(df.columns)

        var001       var002      var003  month  year     __sexo  __hgt  \
0       168692  U0004802741   4/11/2023      4  2023  Masculino    NaN   
1       214287  U0005119913   11/5/2023     11  2023  Masculino    NaN   
2       113755  U0004427396   8/13/2022      8  2022  Masculino    NaN   
3       202840  U0005038530   9/16/2023      9  2023  Masculino    NaN   
4        31544  U0003873327    7/6/2021      7  2021  Masculino    NaN   
...        ...          ...         ...    ...   ...        ...    ...   
103156  137998  U0004600580  11/26/2022     11  2022   Femenino    NaN   
103157   50108  U0003998985  10/21/2021     10  2021   Femenino  101.0   
103158   98326  U0004327713    6/3/2022      6  2022  Masculino    NaN   
103159  187747  U0004932755    7/1/2023      7  2023  Masculino    NaN   
103160  108243  U0004392833   7/20/2022      7  2022   Femenino    NaN   

        __temperatura  __pulso  __pas  __pad  __fres  __sat02  __peso  \
0           36.299999     81.0    NaN 

In [133]:
# Paso 1: Eliminar las columnas no útiles
df_cleaned = df.drop(columns=['var001', 'var002', 'var003', 'month', 'year'])
print(df_cleaned)

           __sexo  __hgt  __temperatura  __pulso  __pas  __pad  __fres  \
0       Masculino    NaN      36.299999     81.0    NaN    NaN     NaN   
1       Masculino    NaN      36.400002    105.0  185.0   96.0     NaN   
2       Masculino    NaN      36.200001    101.0    NaN    NaN     NaN   
3       Masculino    NaN      36.400002    106.0  125.0   99.0     NaN   
4       Masculino    NaN      36.500000     57.0  118.0   66.0     NaN   
...           ...    ...            ...      ...    ...    ...     ...   
103156   Femenino    NaN      36.299999     74.0  152.0   97.0     NaN   
103157   Femenino  101.0      36.500000     60.0  143.0   78.0     NaN   
103158  Masculino    NaN      36.299999     77.0  163.0   77.0     NaN   
103159  Masculino    NaN      36.400002    112.0  143.0   83.0     NaN   
103160   Femenino    NaN      36.000000     82.0  167.0   90.0     NaN   

        __sat02  __peso  __outcome_1  __destino  __outcome_2  __outcome_3  \
0          99.0    42.0          0

In [134]:
# Paso 2: Convertir las columnas categóricas a valores numéricos usando Label Encoding
label_encoders = {}
for column in ['__sexo', '__destino', '__categorizacion']:
    le = LabelEncoder()
    df_cleaned[column] = le.fit_transform(df_cleaned[column].astype(str))
    label_encoders[column] = le

# Extrae la columna de categorización
df_cleaned['__categorizacion'] = df_cleaned['__categorizacion'] + 1
df_output = df_cleaned['__categorizacion'].copy()
df_cleaned = df_cleaned.drop(['__categorizacion'],axis=1)
print(df_output)

print(df_cleaned)

0         4
1         4
2         3
3         4
4         5
         ..
103156    3
103157    2
103158    4
103159    4
103160    3
Name: __categorizacion, Length: 103161, dtype: int64
        __sexo  __hgt  __temperatura  __pulso  __pas  __pad  __fres  __sat02  \
0            1    NaN      36.299999     81.0    NaN    NaN     NaN     99.0   
1            1    NaN      36.400002    105.0  185.0   96.0     NaN     99.0   
2            1    NaN      36.200001    101.0    NaN    NaN     NaN     99.0   
3            1    NaN      36.400002    106.0  125.0   99.0     NaN     99.0   
4            1    NaN      36.500000     57.0  118.0   66.0     NaN    100.0   
...        ...    ...            ...      ...    ...    ...     ...      ...   
103156       0    NaN      36.299999     74.0  152.0   97.0     NaN     98.0   
103157       0  101.0      36.500000     60.0  143.0   78.0     NaN     98.0   
103158       1    NaN      36.299999     77.0  163.0   77.0     NaN    100.0   
103159       1 

In [135]:
# Paso 3: Manejar los valores faltantes
# Rellenar valores numéricos faltantes con la media de la columna
for column in df_cleaned.select_dtypes(include=['float64', 'int64']).columns:
    df_cleaned[column].fillna(df_cleaned[column].mean(), inplace=True)

print(df_cleaned)

        __sexo       __hgt  __temperatura  __pulso       __pas     __pad  \
0            1  184.247016      36.299999     81.0  131.586053  77.40956   
1            1  184.247016      36.400002    105.0  185.000000  96.00000   
2            1  184.247016      36.200001    101.0  131.586053  77.40956   
3            1  184.247016      36.400002    106.0  125.000000  99.00000   
4            1  184.247016      36.500000     57.0  118.000000  66.00000   
...        ...         ...            ...      ...         ...       ...   
103156       0  184.247016      36.299999     74.0  152.000000  97.00000   
103157       0  101.000000      36.500000     60.0  143.000000  78.00000   
103158       1  184.247016      36.299999     77.0  163.000000  77.00000   
103159       1  184.247016      36.400002    112.0  143.000000  83.00000   
103160       0  184.247016      36.000000     82.0  167.000000  90.00000   

           __fres  __sat02     __peso  __outcome_1  __destino  __outcome_2  \
0       2

In [50]:
# Paso 4: Escalar los datos para que tengan una media de 0 y una desviación estándar de 1
scaler = StandardScaler()
numeric_columns = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
df_cleaned[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])

print(df_cleaned)

          __sexo         __hgt  __temperatura   __pulso     __pas     __pad  \
0       1.093955  2.172054e-15      -0.159434 -0.539508  0.000000  0.000000   
1       1.093955  2.172054e-15       0.017166  0.510587  2.612652  1.345863   
2       1.093955  2.172054e-15      -0.336025  0.335571  0.000000  0.000000   
3       1.093955  2.172054e-15       0.017166  0.554341 -0.322146  1.563050   
4       1.093955  2.172054e-15       0.193757 -1.589604 -0.664539 -0.826000   
...          ...           ...            ...       ...       ...       ...   
103156 -0.914043  2.172054e-15      -0.159434 -0.845786  0.998513  1.418259   
103157 -0.914043 -2.120645e+00       0.193757 -1.458342  0.558294  0.042745   
103158  1.093955  2.172054e-15      -0.159434 -0.714524  1.536560 -0.029650   
103159  1.093955  2.172054e-15       0.017166  0.816865  0.558294  0.404722   
103160 -0.914043  2.172054e-15      -0.689216 -0.495754  1.732213  0.911491   

        __fres   __sat02        __peso  __outcome_1

In [136]:
# Exportar el DataFrame resultante a un archivo CSV
df_cleaned['__categorizacion'] = df_output
output_file_path = 'Base de datos HACQ (preprocesada).csv'
df_cleaned.to_csv(output_file_path, index=False)

print(f"El archivo preprocesado se ha guardado en {output_file_path}")

El archivo preprocesado se ha guardado en Base de datos HACQ (preprocesada).csv


ENTRENAMIENTO Y EVALUACIÓN

In [137]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import LeakyReLU


In [138]:
#1 Cargar los datos preprocesados
# Cargar los datos preprocesados
file_path = 'Base de datos HACQ (preprocesada).csv'
df = pd.read_csv(file_path)

In [139]:
#2 Separar las características de entrada y de salida (objetivo)
input = df.drop(columns=['__categorizacion'])
output = df['__categorizacion']

print(input)
print(output)

        __sexo       __hgt  __temperatura  __pulso       __pas     __pad  \
0            1  184.247016      36.299999     81.0  131.586053  77.40956   
1            1  184.247016      36.400002    105.0  185.000000  96.00000   
2            1  184.247016      36.200001    101.0  131.586053  77.40956   
3            1  184.247016      36.400002    106.0  125.000000  99.00000   
4            1  184.247016      36.500000     57.0  118.000000  66.00000   
...        ...         ...            ...      ...         ...       ...   
103156       0  184.247016      36.299999     74.0  152.000000  97.00000   
103157       0  101.000000      36.500000     60.0  143.000000  78.00000   
103158       1  184.247016      36.299999     77.0  163.000000  77.00000   
103159       1  184.247016      36.400002    112.0  143.000000  83.00000   
103160       0  184.247016      36.000000     82.0  167.000000  90.00000   

           __fres  __sat02     __peso  __outcome_1  __destino  __outcome_2  \
0       2

In [140]:
# Convertir la salida a categorías
output = to_categorical(output)
print(output)

[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]
 ...
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]]


In [141]:
#3 Dividir los datos en conjuntos de entrenamiento 80% y prueba 20% 
input_train, input_test, output_train, output_test = train_test_split(input, output, test_size=0.2, random_state=42)

In [144]:
#4 Construir el modelo dela red neuronal (Perceptron multicapa)
def MLP_NN():
    NumNeurons = 13
    model = Sequential()
    model.add(Dense(128, input_dim=input_train.shape[1]))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(output.shape[1], activation='softmax'))  # Usar 'softmax' para clasificación multiclase

    #opt =  keras.optimizers.Adam(learning_rate=0.001)

    # Compilar el modelo
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


In [145]:
#5 Entrenar el modelo
n_epochs = 500
network = MLP_NN()
train = network.fit(input_train, output_train, epochs=n_epochs, batch_size=32, validation_split=0.2)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500

KeyboardInterrupt: 

In [78]:
#6 Evaluar el modelo
loss, accuracy = network.evaluate(input_test, output_test)
print(f"Precisión en el conjunto de prueba: {accuracy:.2f}")

Precisión en el conjunto de prueba: 0.54


In [79]:
#7 Predicciones
output_pred = network.predict(input_test)
print(output_pred)
output_pred_classes = np.argmax(output_pred, axis=1)
print(output_pred_classes)
output_test_classes = np.argmax(output_test, axis=1)
print(output_test_classes)

[[6.50842316e-17 3.58082239e-08 5.23350900e-03 4.34265994e-02
  5.33959210e-01 4.17380542e-01]
 [3.41017102e-12 2.12893399e-04 4.56655361e-02 1.34047508e-01
  5.69094241e-01 2.50979841e-01]
 [2.09899567e-12 6.59740108e-05 1.97786056e-02 1.13861255e-01
  3.34936410e-01 5.31357706e-01]
 ...
 [7.40153437e-18 2.93922698e-04 3.55640762e-02 2.12846592e-01
  5.26152909e-01 2.25142509e-01]
 [4.96651548e-12 8.04074007e-05 3.21448296e-02 1.18861236e-01
  4.84874576e-01 3.64038855e-01]
 [5.32948126e-19 2.41415878e-03 5.77372968e-01 2.46052369e-01
  1.66504592e-01 7.65590323e-03]]
[4 4 5 ... 4 4 2]
[4 4 5 ... 5 5 3]


In [80]:
#8 Generar el reporte de clasificación
print(classification_report(output_test_classes, output_pred_classes))

              precision    recall  f1-score   support

           1       0.49      0.35      0.41        48
           2       0.56      0.39      0.46      1217
           3       0.62      0.33      0.43      3852
           4       0.51      0.52      0.52      7947
           5       0.55      0.70      0.62      7569

    accuracy                           0.54     20633
   macro avg       0.55      0.46      0.49     20633
weighted avg       0.55      0.54      0.53     20633



In [81]:
#9 Matriz de confusión
print(confusion_matrix(output_test_classes, output_pred_classes))

[[  17   16    9    5    1]
 [  14  480  256  307  160]
 [   3  231 1259 1453  906]
 [   1  104  437 4146 3259]
 [   0   23   62 2189 5295]]
