In [1]:
# Importamos las bibliotecas necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Cargamos los datos en un DataFrame
data = pd.read_csv('databases/Base de datos HACQ.csv', sep=',')
print(data)

        var001       var002      var003  month  year     __sexo  __hgt  \
0       168692  U0004802741   4/11/2023      4  2023  Masculino    NaN   
1       214287  U0005119913   11/5/2023     11  2023  Masculino    NaN   
2       113755  U0004427396   8/13/2022      8  2022  Masculino    NaN   
3       202840  U0005038530   9/16/2023      9  2023  Masculino    NaN   
4        31544  U0003873327    7/6/2021      7  2021  Masculino    NaN   
...        ...          ...         ...    ...   ...        ...    ...   
103156  137998  U0004600580  11/26/2022     11  2022   Femenino    NaN   
103157   50108  U0003998985  10/21/2021     10  2021   Femenino  101.0   
103158   98326  U0004327713    6/3/2022      6  2022  Masculino    NaN   
103159  187747  U0004932755    7/1/2023      7  2023  Masculino    NaN   
103160  108243  U0004392833   7/20/2022      7  2022   Femenino    NaN   

        __temperatura  __pulso  __pas  __pad  __fres  __sat02  __peso  \
0           36.299999     81.0    NaN 

In [3]:
# Eliminamos las columnas no necesarias
columns_to_drop = ["var001", "var002", "var003", "year", "__outcome_1", "__outcome_2", "__destino", "__outcome_3"]
data = data.drop(columns=columns_to_drop)
print(data)

        month     __sexo  __hgt  __temperatura  __pulso  __pas  __pad  __fres  \
0           4  Masculino    NaN      36.299999     81.0    NaN    NaN     NaN   
1          11  Masculino    NaN      36.400002    105.0  185.0   96.0     NaN   
2           8  Masculino    NaN      36.200001    101.0    NaN    NaN     NaN   
3           9  Masculino    NaN      36.400002    106.0  125.0   99.0     NaN   
4           7  Masculino    NaN      36.500000     57.0  118.0   66.0     NaN   
...       ...        ...    ...            ...      ...    ...    ...     ...   
103156     11   Femenino    NaN      36.299999     74.0  152.0   97.0     NaN   
103157     10   Femenino  101.0      36.500000     60.0  143.0   78.0     NaN   
103158      6  Masculino    NaN      36.299999     77.0  163.0   77.0     NaN   
103159      7  Masculino    NaN      36.400002    112.0  143.0   83.0     NaN   
103160      7   Femenino    NaN      36.000000     82.0  167.0   90.0     NaN   

        __sat02  __peso __c

In [4]:
# Manejamos los datos faltantes
# Podemos optar por eliminar las columnas con muchos datos vacíos
data = data.drop(columns=["__hgt", "__fres", "__peso"])
print(data)

        month     __sexo  __temperatura  __pulso  __pas  __pad  __sat02  \
0           4  Masculino      36.299999     81.0    NaN    NaN     99.0   
1          11  Masculino      36.400002    105.0  185.0   96.0     99.0   
2           8  Masculino      36.200001    101.0    NaN    NaN     99.0   
3           9  Masculino      36.400002    106.0  125.0   99.0     99.0   
4           7  Masculino      36.500000     57.0  118.0   66.0    100.0   
...       ...        ...            ...      ...    ...    ...      ...   
103156     11   Femenino      36.299999     74.0  152.0   97.0     98.0   
103157     10   Femenino      36.500000     60.0  143.0   78.0     98.0   
103158      6  Masculino      36.299999     77.0  163.0   77.0    100.0   
103159      7  Masculino      36.400002    112.0  143.0   83.0     98.0   
103160      7   Femenino      36.000000     82.0  167.0   90.0     98.0   

       __categorizacion  
0                    C4  
1                    C4  
2                    

In [5]:
# Codificamos las variables categóricas
label_encoder = LabelEncoder()
data['__sexo'] = label_encoder.fit_transform(data['__sexo'])
data['__categorizacion'] = label_encoder.fit_transform(data['__categorizacion'])
print(data)

        month  __sexo  __temperatura  __pulso  __pas  __pad  __sat02  \
0           4       1      36.299999     81.0    NaN    NaN     99.0   
1          11       1      36.400002    105.0  185.0   96.0     99.0   
2           8       1      36.200001    101.0    NaN    NaN     99.0   
3           9       1      36.400002    106.0  125.0   99.0     99.0   
4           7       1      36.500000     57.0  118.0   66.0    100.0   
...       ...     ...            ...      ...    ...    ...      ...   
103156     11       0      36.299999     74.0  152.0   97.0     98.0   
103157     10       0      36.500000     60.0  143.0   78.0     98.0   
103158      6       1      36.299999     77.0  163.0   77.0    100.0   
103159      7       1      36.400002    112.0  143.0   83.0     98.0   
103160      7       0      36.000000     82.0  167.0   90.0     98.0   

        __categorizacion  
0                      3  
1                      3  
2                      2  
3                      3  


In [6]:
# Rellenamos los datos faltantes en las demás columnas
data = data.fillna(data.median())
print(data)

        month  __sexo  __temperatura  __pulso  __pas  __pad  __sat02  \
0           4       1      36.299999     81.0  129.0   77.0     99.0   
1          11       1      36.400002    105.0  185.0   96.0     99.0   
2           8       1      36.200001    101.0  129.0   77.0     99.0   
3           9       1      36.400002    106.0  125.0   99.0     99.0   
4           7       1      36.500000     57.0  118.0   66.0    100.0   
...       ...     ...            ...      ...    ...    ...      ...   
103156     11       0      36.299999     74.0  152.0   97.0     98.0   
103157     10       0      36.500000     60.0  143.0   78.0     98.0   
103158      6       1      36.299999     77.0  163.0   77.0    100.0   
103159      7       1      36.400002    112.0  143.0   83.0     98.0   
103160      7       0      36.000000     82.0  167.0   90.0     98.0   

        __categorizacion  
0                      3  
1                      3  
2                      2  
3                      3  


In [7]:
# Definimos las características y la variable objetivo
x = data.drop(columns=['__categorizacion'])
y = data['__categorizacion']

print(x)
print(y)

        month  __sexo  __temperatura  __pulso  __pas  __pad  __sat02
0           4       1      36.299999     81.0  129.0   77.0     99.0
1          11       1      36.400002    105.0  185.0   96.0     99.0
2           8       1      36.200001    101.0  129.0   77.0     99.0
3           9       1      36.400002    106.0  125.0   99.0     99.0
4           7       1      36.500000     57.0  118.0   66.0    100.0
...       ...     ...            ...      ...    ...    ...      ...
103156     11       0      36.299999     74.0  152.0   97.0     98.0
103157     10       0      36.500000     60.0  143.0   78.0     98.0
103158      6       1      36.299999     77.0  163.0   77.0    100.0
103159      7       1      36.400002    112.0  143.0   83.0     98.0
103160      7       0      36.000000     82.0  167.0   90.0     98.0

[103161 rows x 7 columns]
0         3
1         3
2         2
3         3
4         4
         ..
103156    2
103157    1
103158    3
103159    3
103160    2
Name: __categ

In [8]:
# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Entrenamos el modelo de Random Forest
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
print(model)


RandomForestClassifier(n_estimators=200, random_state=42)


In [9]:
# Realizamos predicciones
y_pred = model.predict(X_test)
print(y_pred)

[3 3 2 ... 3 3 3]


In [10]:
# Evaluamos el modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.48030631038159555
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        76
           1       0.30      0.08      0.12      1840
           2       0.55      0.30      0.39      5743
           3       0.46      0.53      0.49     12009
           4       0.49      0.59      0.54     11281

    accuracy                           0.48     30949
   macro avg       0.36      0.30      0.31     30949
weighted avg       0.48      0.48      0.47     30949



In [11]:
# Ajustamos los hiperparámetros en caso de ser necesario para mejorar el rendimiento
from sklearn.model_selection import GridSearchCV

In [12]:
# Definimos los hiperparámetros a probar
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [13]:
# Creamos el modelo de Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(grid_search)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  50.3s[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  48.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  49.5s



PicklingError: Could not pickle the task to send it to the workers.

In [None]:
# Mostramos los mejores parámetros
print(grid_search.best_params_)

NameError: name 'grid_search' is not defined