# Indicadores de enfermedades cardíacas: Utilizando datos clínicos y de estilo de vida para predecir enfermedades cardíacas

Vamos a llevar a cabo una serie de pasos de preparación y ajuste en los datos antes de aplicarlos al modelo de clasificación. Estas acciones incluyen:

Preparación y ajuste de datos:

Aplicar Label Encoder: Si encontramos características en los datos que siguen un orden pero están expresadas en categorías, usaremos una técnica llamada Label Encoder. Esto asigna un número a cada categoría en orden, lo que ayuda al modelo a entender la jerarquía entre ellas.

Transformar columnas a binario: Para las columnas que tienen respuestas tipo "Sí" o "No", las convertiremos a un formato binario, donde "Sí" se convierte en 1 y "No" en 0.

Automatización mediante bucles: Para hacer estas transformaciones más eficientemente y evitar hacer lo mismo muchas veces, utilizaremos bucles. Esto nos ayuda a aplicar los mismos cambios a varias columnas de manera rápida y uniforme.



In [85]:
# Importamos las distintas librerias necesarias para el análisis

# Tratamiento de datos
import numpy as np
import pandas as pd


# Tratamiento de las variables categóricas que se pueden ordenar
from sklearn.preprocessing import LabelEncoder


In [94]:
df= pd.read_csv("heart_2020_cleaned.csv")
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [96]:

df.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [97]:
df.describe(include="all")

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
count,319795,319795.0,319795,319795,319795,319795.0,319795.0,319795,319795,319795,319795,319795,319795,319795,319795.0,319795,319795,319795
unique,2,,2,2,2,,,2,2,13,6,4,2,5,,2,2,2
top,No,,No,No,No,,,No,Female,65-69,White,No,Yes,Very good,,No,No,No
freq,292422,,187887,298018,307726,,,275385,167805,34151,245212,269653,247957,113858,,276923,308016,289976
mean,,28.325399,,,,3.37171,3.898366,,,,,,,,7.097075,,,
std,,6.3561,,,,7.95085,7.955235,,,,,,,,1.436007,,,
min,,12.02,,,,0.0,0.0,,,,,,,,1.0,,,
25%,,24.03,,,,0.0,0.0,,,,,,,,6.0,,,
50%,,27.34,,,,0.0,0.0,,,,,,,,7.0,,,
75%,,31.42,,,,2.0,3.0,,,,,,,,8.0,,,


In [98]:
# Verificamos la distribución de los valores en cada columna
for column in df.columns:
    print(df[column].value_counts())
    print()

No     292422
Yes     27373
Name: HeartDisease, dtype: int64

26.63    3762
27.46    2767
27.44    2723
24.41    2696
27.12    2525
         ... 
59.85       1
50.59       1
92.53       1
62.95       1
46.56       1
Name: BMI, Length: 3604, dtype: int64

No     187887
Yes    131908
Name: Smoking, dtype: int64

No     298018
Yes     21777
Name: AlcoholDrinking, dtype: int64

No     307726
Yes     12069
Name: Stroke, dtype: int64

0.0     226589
30.0     19509
2.0      14880
1.0      10489
3.0       8617
5.0       7606
10.0      5453
15.0      5012
7.0       4629
4.0       4468
20.0      3216
14.0      2893
6.0       1270
25.0      1164
8.0        924
21.0       626
12.0       605
28.0       446
29.0       204
9.0        180
18.0       167
16.0       135
27.0       124
17.0       110
13.0        91
22.0        89
11.0        85
24.0        67
26.0        66
23.0        46
19.0        35
Name: PhysicalHealth, dtype: int64

0.0     205401
30.0     17373
2.0      16495
5.0      14149
10.0  

In [99]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


# TRATAMIENTO DE LAS VARIABLES BINARIAS

In [100]:
# Realizamos un bucle para ver los valores únicos de cada variable

In [101]:
columns = ["HeartDisease", "BMI", "Smoking", "AlcoholDrinking", "Stroke", "PhysicalHealth", "MentalHealth",
           "DiffWalking", "Sex", "AgeCategory", "Race", "Diabetic", "PhysicalActivity", "GenHealth", "SleepTime",
           "Asthma", "KidneyDisease", "SkinCancer"]

for column in columns:
    unique_values = df[column].unique()
    print(column, unique_values)

HeartDisease ['No' 'Yes']
BMI [16.6  20.34 26.58 ... 62.42 51.46 46.56]
Smoking ['Yes' 'No']
AlcoholDrinking ['No' 'Yes']
Stroke ['No' 'Yes']
PhysicalHealth [ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.]
MentalHealth [30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.]
DiffWalking ['No' 'Yes']
Sex ['Female' 'Male']
AgeCategory ['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29']
Race ['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic']
Diabetic ['Yes' 'No' 'No, borderline diabetes' 'Yes (during pregnancy)']
PhysicalActivity ['Yes' 'No']
GenHealth ['Very good' 'Fair' 'Good' 'Poor' 'Excellent']
SleepTime [ 5.  7.  8.  6. 12.  4.  9. 10. 15.  3.  2.  1. 16. 18. 14. 20. 11. 13.
 17. 24. 19. 21. 22. 23.]
Asthma ['Yes' 'No']
KidneyDisease ['No' 'Yes']
Skin

In [102]:
# Realización de bucle para cambiar los "No" por 0 y los "Yes" por 1 de sus respectivas columnas
binary_columns = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
                  "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]

for column in binary_columns:
    df[column] = df[column].map({"No": 0, "Yes": 1})

In [103]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,Female,55-59,White,Yes,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,Female,80 or older,White,No,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,Male,65-69,White,Yes,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,Female,75-79,White,No,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,Female,40-44,White,No,1,Very good,8.0,0,0,0


In [104]:
# De la columna de "Sex"
df["Sex"][df["Sex"] == "Female"] = 0
df["Sex"][df["Sex"] == "Male"] = 1

# Cambio de tipo objet a tipo int
df["Sex"] = df["Sex"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sex"][df["Sex"] == "Female"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sex"][df["Sex"] == "Male"] = 1


In [105]:
# bucle para convertir las columnas a binarias
columns_to_convert = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
                      "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]

for column in columns_to_convert:
    df[column] = df[column].astype(int)

In [106]:
df.dtypes


HeartDisease          int64
BMI                 float64
Smoking               int64
AlcoholDrinking       int64
Stroke                int64
PhysicalHealth      float64
MentalHealth        float64
DiffWalking           int64
Sex                   int64
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity      int64
GenHealth            object
SleepTime           float64
Asthma                int64
KidneyDisease         int64
SkinCancer            int64
dtype: object

In [107]:
# bucle para ver los datos binarios
columns_to_count = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
                    "Sex", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]

for column in columns_to_count:
    value_counts = df[column].value_counts()
    print(column, value_counts)

HeartDisease 0    292422
1     27373
Name: HeartDisease, dtype: int64
Smoking 0    187887
1    131908
Name: Smoking, dtype: int64
AlcoholDrinking 0    298018
1     21777
Name: AlcoholDrinking, dtype: int64
Stroke 0    307726
1     12069
Name: Stroke, dtype: int64
DiffWalking 0    275385
1     44410
Name: DiffWalking, dtype: int64
Sex 0    167805
1    151990
Name: Sex, dtype: int64
PhysicalActivity 1    247957
0     71838
Name: PhysicalActivity, dtype: int64
Asthma 0    276923
1     42872
Name: Asthma, dtype: int64
KidneyDisease 0    308016
1     11779
Name: KidneyDisease, dtype: int64
SkinCancer 0    289976
1     29819
Name: SkinCancer, dtype: int64


# VARIABLES CATEGÓRICAS

In [108]:
# Columnas a las que se les aplicará el LabelEncoder
columns = ['AgeCategory', 'Race', 'Diabetic', 'GenHealth']

# Creamos una instancia de LabelEncoder
label_encoder = LabelEncoder()

# Realización de un bucle y aplicación del LabelEncoder
for col in columns:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])

In [109]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,AgeCategory_encoded,Race_encoded,Diabetic_encoded,GenHealth_encoded
0,0,16.6,1,0,0,3.0,30.0,0,0,55-59,...,1,Very good,5.0,1,0,1,7,5,2,4
1,0,20.34,0,0,1,0.0,0.0,0,0,80 or older,...,1,Very good,7.0,0,0,0,12,5,0,4
2,0,26.58,1,0,0,20.0,30.0,0,1,65-69,...,1,Fair,8.0,1,0,0,9,5,2,1
3,0,24.21,0,0,0,0.0,0.0,0,0,75-79,...,0,Good,6.0,0,0,1,11,5,0,2
4,0,23.71,0,0,0,28.0,0.0,1,0,40-44,...,1,Very good,8.0,0,0,0,4,5,0,4


In [110]:
df.dtypes

HeartDisease             int64
BMI                    float64
Smoking                  int64
AlcoholDrinking          int64
Stroke                   int64
PhysicalHealth         float64
MentalHealth           float64
DiffWalking              int64
Sex                      int64
AgeCategory             object
Race                    object
Diabetic                object
PhysicalActivity         int64
GenHealth               object
SleepTime              float64
Asthma                   int64
KidneyDisease            int64
SkinCancer               int64
AgeCategory_encoded      int64
Race_encoded             int64
Diabetic_encoded         int64
GenHealth_encoded        int64
dtype: object

# VARIABLES TIPO FLOAT


Variables: "BMI", "PhysicalHealt", "MentalHealth" Y "SleepTime".

## Variable BMI:


Bajo peso: IMC menor a 18.5

Peso normal: IMC entre 18.5 y 24.9

Sobrepeso: IMC entre 25 y 29.9

Obesidad clase I: IMC entre 30 y 34.9

Obesidad clase II: IMC entre 35 y 39.9

Obesidad clase III (obesidad mórbida): IMC igual o mayor a 40

 ##  Variable PhysicalHealt:



Buena salud = 1

 Moderada salud = 2

 Mala salud = 3


 ## Variable MentalHealth

 Buena salud = 1

 Moderada salud = 2

 Mala salud = 3

## Variable de SleepTime

Insuficiente = 1

Normal = 2

Excesivo = 3

## TRANSFORMACIÓN DE LA VARIABLE "BMI"


In [111]:
# La variable "BMI", la cual nos muestra el índice de masa corporal
df["BMI"] = df["BMI"].round()

In [112]:
# Definición de las categorías y los límites de los rangos de IMC
categories = {
    'Bajo peso': (0, 18.4),
    'Peso normal': (18.5, 24.9),
    'Sobrepeso': (25, 29.9),
    'Obesidad clase I': (30, 34.9),
    'Obesidad clase II': (35, 39.9),
    'Obesidad clase III (obesidad mórbida)': (40, float('inf'))
}

In [113]:
# función para aplicar un bucle sobre la variable
def assign_category(bmi):
    for category, (lower, upper) in categories.items():
        if lower <= bmi <= upper:
            return category

In [114]:
# Aplicamos la funcion a la variable y creamos una nueva columna
df['BMI_Category'] = df['BMI'].apply(assign_category)

In [115]:
# comprobamos que no quedan valores nulos
df['BMI_Category'].isnull().sum()

0

In [116]:
# Cambiamos los string
category_mapping = {
    'Bajo peso': 1,
    'Peso normal': 2,
    'Sobrepeso': 3,
    'Obesidad clase I': 4,
    'Obesidad clase II': 5,
    'Obesidad clase III (obesidad mórbida)': 6
}
df['BMI_Category_Ordinal'] = df['BMI_Category'].map(category_mapping)

In [117]:
# Comprobacion
df['BMI_Category_Ordinal']

0         1
1         2
2         3
3         2
4         2
         ..
319790    3
319791    4
319792    2
319793    4
319794    6
Name: BMI_Category_Ordinal, Length: 319795, dtype: int64

## TRANSFORMACIÓN DE LA VARIABLE DE "PhysicalHealth"

In [118]:
# los valores únicos de 'PhysicalHealth'
df['PhysicalHealth'].unique()

array([ 3.,  0., 20., 28.,  6., 15.,  5., 30.,  7.,  1.,  2., 21.,  4.,
       10., 14., 18.,  8., 25., 16., 29., 27., 17., 24., 12., 23., 26.,
       22., 19.,  9., 13., 11.])

In [119]:
# Definimos los grupos de salud
umbral_buena_salud = 5
umbral_moderada_salud = 15

# Creamos una nueva columna con los grupos de salud
df['GrupoSalud'] = np.select([df['PhysicalHealth'] < umbral_buena_salud, 
                              df['PhysicalHealth'] <= umbral_moderada_salud],
                             ['Buena salud', 'Salud moderada'], default='Mala salud')


In [120]:
# Comprobamos que se ha convertido correctamente
df['GrupoSalud']

0            Buena salud
1            Buena salud
2             Mala salud
3            Buena salud
4             Mala salud
               ...      
319790    Salud moderada
319791       Buena salud
319792       Buena salud
319793       Buena salud
319794       Buena salud
Name: GrupoSalud, Length: 319795, dtype: object

In [121]:
# Cambiamos los string
category_mapping = {
    'Buena salud': 1,
    'Salud moderada': 2,
    'Mala salud': 3,
}
df['GrupoSalud_Ordinal'] = df['GrupoSalud'].map(category_mapping)


In [122]:
# Comprobamos
df['GrupoSalud_Ordinal']

0         1
1         1
2         3
3         1
4         3
         ..
319790    2
319791    1
319792    1
319793    1
319794    1
Name: GrupoSalud_Ordinal, Length: 319795, dtype: int64

 ## TRANSFORMACIÓN DE LA VARIABLE "MentalHealth"


In [123]:
# Vemos los valores únicos de 'MentalHealth
df['MentalHealth'].unique()

array([30.,  0.,  2.,  5., 15.,  8.,  4.,  3., 10., 14., 20.,  1.,  7.,
       24.,  9., 28., 16., 12.,  6., 25., 17., 18., 21., 29., 22., 13.,
       23., 27., 26., 11., 19.])

In [124]:
# Definimos los grupos de salud mental
umbral_buena_salud_mental = 5
umbral_moderada_salud_mental = 15

# Creamos una nueva columna con los grupos de salud
df['GrupoSalud_Mental'] = np.select([df['MentalHealth'] < umbral_buena_salud_mental, 
                              df['MentalHealth'] <= umbral_moderada_salud_mental],
                             ['Buena salud', 'Salud moderada'], default='Mala salud')

In [125]:
# Comprobamos
df['GrupoSalud_Mental']

0          Mala salud
1         Buena salud
2          Mala salud
3         Buena salud
4         Buena salud
             ...     
319790    Buena salud
319791    Buena salud
319792    Buena salud
319793    Buena salud
319794    Buena salud
Name: GrupoSalud_Mental, Length: 319795, dtype: object

In [126]:
# Cambiamos los string 
category_mapping_mental = {
    'Buena salud': 1,
    'Salud moderada': 2,
    'Mala salud': 3,
}
df['GrupoSalud_Mental_Ordinal'] = df['GrupoSalud_Mental'].map(category_mapping_mental)

In [127]:
# Comprobamos
df['GrupoSalud_Mental_Ordinal']

0         3
1         1
2         3
3         1
4         1
         ..
319790    1
319791    1
319792    1
319793    1
319794    1
Name: GrupoSalud_Mental_Ordinal, Length: 319795, dtype: int64

## TRANSFORMACIÓN DE LA VARIABLE DE "SleepTime"


In [128]:
# Vemos los valores únicos de 'SleepTime´

df['SleepTime'].unique()


array([ 5.,  7.,  8.,  6., 12.,  4.,  9., 10., 15.,  3.,  2.,  1., 16.,
       18., 14., 20., 11., 13., 17., 24., 19., 21., 22., 23.])

In [129]:
# Definimos los grupos de "SleepTime"
insuficiente_limit = 6
normal_lower_limit = 7
normal_upper_limit = 9

# Crear una nueva columna 'SleepGroup'
bins = [0, insuficiente_limit, normal_lower_limit, float('inf')]
labels = ['Insuficiente', 'Normal', 'Excesivo']
df['SleepGroup'] = pd.cut(df['SleepTime'], bins=bins, labels=labels, right=False)

# Imprimir la cuenta de valores en cada grupo
print(df['SleepGroup'].value_counts())


Excesivo        222809
Normal           66721
Insuficiente     30265
Name: SleepGroup, dtype: int64


In [130]:
# Cambiamos los string 
category_mapping_sleep = {
    'Insuficiente': 1,
    'Normal': 2,
    'Excesivo': 3,
}
df['SleepGroup_Ordinal'] = df['SleepGroup'].map(category_mapping_sleep)

In [131]:
df['SleepGroup_Ordinal'].astype(int)

0         1
1         3
2         3
3         2
4         3
         ..
319790    2
319791    1
319792    2
319793    3
319794    3
Name: SleepGroup_Ordinal, Length: 319795, dtype: int64

In [132]:
df['SleepGroup_Ordinal'].isnull().sum()

0

In [133]:
df.dtypes

HeartDisease                    int64
BMI                           float64
Smoking                         int64
AlcoholDrinking                 int64
Stroke                          int64
PhysicalHealth                float64
MentalHealth                  float64
DiffWalking                     int64
Sex                             int64
AgeCategory                    object
Race                           object
Diabetic                       object
PhysicalActivity                int64
GenHealth                      object
SleepTime                     float64
Asthma                          int64
KidneyDisease                   int64
SkinCancer                      int64
AgeCategory_encoded             int64
Race_encoded                    int64
Diabetic_encoded                int64
GenHealth_encoded               int64
BMI_Category                   object
BMI_Category_Ordinal            int64
GrupoSalud                     object
GrupoSalud_Ordinal              int64
GrupoSalud_M

In [134]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer', 'AgeCategory_encoded',
       'Race_encoded', 'Diabetic_encoded', 'GenHealth_encoded', 'BMI_Category',
       'BMI_Category_Ordinal', 'GrupoSalud', 'GrupoSalud_Ordinal',
       'GrupoSalud_Mental', 'GrupoSalud_Mental_Ordinal', 'SleepGroup',
       'SleepGroup_Ordinal'],
      dtype='object')

In [135]:
# Eliminamos las columnas que no vamos a utilizar
columns_drop = ["PhysicalActivity","AlcoholDrinking", "GenHealth_encoded",
                   "Race_encoded","Asthma", "SkinCancer", "KidneyDisease", 
                   "AgeCategory", "SleepTime", "SleepGroup", "GenHealth","BMI_Category","GrupoSalud","GrupoSalud_Mental", "Race", "Diabetic", "PhysicalHealth", "BMI","MentalHealth"]

df = df.drop(columns=columns_drop)

In [136]:
df.columns

Index(['HeartDisease', 'Smoking', 'Stroke', 'DiffWalking', 'Sex',
       'AgeCategory_encoded', 'Diabetic_encoded', 'BMI_Category_Ordinal',
       'GrupoSalud_Ordinal', 'GrupoSalud_Mental_Ordinal',
       'SleepGroup_Ordinal'],
      dtype='object')

In [137]:
df

Unnamed: 0,HeartDisease,Smoking,Stroke,DiffWalking,Sex,AgeCategory_encoded,Diabetic_encoded,BMI_Category_Ordinal,GrupoSalud_Ordinal,GrupoSalud_Mental_Ordinal,SleepGroup_Ordinal
0,0,1,0,0,0,7,2,1,1,3,1
1,0,0,1,0,0,12,0,2,1,1,3
2,0,1,0,0,1,9,2,3,3,3,3
3,0,0,0,0,0,11,0,2,1,1,2
4,0,0,0,1,0,4,0,2,3,1,3
...,...,...,...,...,...,...,...,...,...,...,...
319790,1,1,0,1,1,8,2,3,2,1,2
319791,0,1,0,0,1,3,0,4,1,1,1
319792,0,0,0,0,0,5,0,2,1,1,2
319793,0,0,0,0,0,1,0,4,1,1,3
