# Importar y unificar los datos


## Importar librerías

In [315]:
print('Prueba')

Prueba


In [316]:
import pandas as pd
import numpy as np



### Archivo xlsx

In [317]:
df_xlsx = pd.read_excel('./DataFiles/Hungarian_Switzerland.xlsx')
df_xlsx.head()

Unnamed: 0.1,Unnamed: 0,age,sex,Chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,Resting EEG,maximum heart rate,exercise induced angina,ST depression,slope ST,number of major vessels,thal,diagnosis of heart disease
0,Patient 1,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,Patient 2,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,Patient 3,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,Patient 4,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,Patient 5,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [318]:
df_xlsx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   294 non-null    object 
 1   age                          294 non-null    int64  
 2   sex                          294 non-null    int64  
 3   Chest pain type              294 non-null    int64  
 4   resting blood pressure       294 non-null    object 
 5   serum cholestoral            294 non-null    object 
 6   fasting blood sugar          294 non-null    object 
 7   Resting EEG                  294 non-null    object 
 8   maximum heart rate           294 non-null    object 
 9   exercise induced angina      294 non-null    object 
 10  ST depression                294 non-null    float64
 11  slope ST                     294 non-null    object 
 12  number of major vessels      294 non-null    object 
 13  thal                

Parece que los nombres de columnas tienen espacios al final de la string,
Se corregiran y se tomaran como los nombres para el resto de datasets.

Dejaremos la primera columna *Unnamed: 0*, posteriormente se eliminará

In [319]:
columnas = list(df_xlsx.columns)

columnas = list(map(lambda x: x.strip(), columnas)) 

df_xlsx.columns = columnas



In [320]:
columnas

['Unnamed: 0',
 'age',
 'sex',
 'Chest pain type',
 'resting blood pressure',
 'serum cholestoral',
 'fasting blood sugar',
 'Resting EEG',
 'maximum heart rate',
 'exercise induced angina',
 'ST depression',
 'slope ST',
 'number of major vessels',
 'thal',
 'diagnosis of heart disease']

Renombrarelmos los encabezados a títulos descriptivos separados por guines bajos.

In [321]:
columnas_renombradas = ['ID_patient',
 'age',
 'sex',
 'chest_pain_type',
 'resting_blood_pressure',
 'serum_cholestoral',
 'fasting_blood_sugar',
 'resting_EEG',
 'maximum_heart_rate',
 'exercise_induced_angina',
 'ST_depression',
 'slope_ST',
 'number_major_vessels',
 'thal',
 'diagnosis_heart_disease']

In [322]:
print('Nombres de columna: ')
columnas_renombradas

Nombres de columna: 


['ID_patient',
 'age',
 'sex',
 'chest_pain_type',
 'resting_blood_pressure',
 'serum_cholestoral',
 'fasting_blood_sugar',
 'resting_EEG',
 'maximum_heart_rate',
 'exercise_induced_angina',
 'ST_depression',
 'slope_ST',
 'number_major_vessels',
 'thal',
 'diagnosis_heart_disease']

In [323]:
df_xlsx.columns = columnas_renombradas

Añadiremos una columna llamada 'city' para identificar el origen de los datos.

In [324]:
df_xlsx['city']= 'Budapest'

In [325]:
df_xlsx.head(3)

Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,city
0,Patient 1,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0,Budapest
1,Patient 2,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0,Budapest
2,Patient 3,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0,Budapest


Por último vamos a transformar la columna *ID_Patient*, a un valor numérico para optimizar el dataset.

In [326]:
df_xlsx['ID_patient']= df_xlsx['ID_patient']\
    .str.split().map(lambda x: x[-1])\
    .astype('int')
df_xlsx.head()

Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,city
0,1,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0,Budapest
1,2,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0,Budapest
2,3,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0,Budapest
3,4,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0,Budapest
4,5,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0,Budapest


### Archivo PDF

In [327]:
import pdfplumber

df_pdf = pd.DataFrame(columns=columnas)    



with pdfplumber.open("./DataFiles/LongBeachData.pdf") as pdf:

    primera_tabla = True
    columnas_tabla = []


    for page in pdf.pages:
        # Extrae las tablas de la página actual
        tablas_pagina = page.extract_tables()

        for tabla in tablas_pagina:

            if primera_tabla:
                df = pd.DataFrame(tabla[1:], columns=columnas) # el primer renglon son los registros 
                columnas_tabla = columnas
                # df_pdf.columns = list(columnas_tabla)
                primera_tabla = False
            else:
                df = pd.DataFrame(tabla[0:], columns=columnas_tabla)

            df_pdf = pd.concat([df_pdf, df]) # aqui acumulamos las tablas




In [328]:
df_pdf

Unnamed: 0.1,Unnamed: 0,age,sex,Chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,Resting EEG,maximum heart rate,exercise induced angina,ST depression,slope ST,number of major vessels,thal,diagnosis of heart disease
0,Patient 1,63,1,4,140,260,0,1,112,1,3,2,?,?,2
1,Patient 2,44,1,4,130,209,0,1,127,0,0,?,?,?,0
2,Patient 3,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2
3,Patient 4,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1
4,Patient 5,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,Patient 196,54,0,4,127,333,1,1,154,0,0,?,?,?,1
5,Patient 197,62,1,1,?,139,0,1,?,?,?,?,?,?,0
6,Patient 198,55,1,4,122,223,1,1,100,0,0,?,?,6,2
7,Patient 199,58,1,4,?,385,1,2,?,?,?,?,?,?,0


In [329]:
df_pdf.columns = columnas_renombradas
df_pdf['city'] = 'Long Beach'
df_pdf.head(3)

Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,city
0,Patient 1,63,1,4,140,260,0,1,112,1,3.0,2,?,?,2,Long Beach
1,Patient 2,44,1,4,130,209,0,1,127,0,0.0,?,?,?,0,Long Beach
2,Patient 3,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2,Long Beach


In [330]:
df_pdf['ID_patient']= df_pdf['ID_patient']\
    .str.split().map(lambda x: x[-1])\
    .astype('int')
df_pdf.head()

Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,city
0,1,63,1,4,140,260,0,1,112,1,3.0,2,?,?,2,Long Beach
1,2,44,1,4,130,209,0,1,127,0,0.0,?,?,?,0,Long Beach
2,3,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2,Long Beach
3,4,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1,Long Beach
4,5,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0,Long Beach


### Archivo .data


In [331]:
df_data = pd.read_csv('./DataFiles/processed.cleveland.data', names=columnas_renombradas[1:])
df_data

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


En este dataset vamos a tener que agregar el *ID_patient* manualmente

In [332]:
df_data['ID_patient'] = pd.Series(range(1, len(df_data) + 1))
df_data

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,ID_patient
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,1
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,3
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,4
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1,299
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2,300
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3,301
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1,302


In [333]:
# y tenemos que darle la misma forma de columnas

columnas_temp = list(df_data.columns)

columnas_temp.pop(-1)
columnas_temp.insert(0, 'ID_patient')

df_data = df_data[columnas_temp]
df_data.head(3)

Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease
0,1,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,2,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,3,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1


In [334]:
df_data.columns = columnas_renombradas
df_data['city'] = 'Cleveland'
df_data.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data['city'] = 'Cleveland'


Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,city
0,1,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,Cleveland
1,2,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,Cleveland
2,3,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,Cleveland


## Unificar datasets

In [335]:
print(f'Forma de dataset xlsx: {df_xlsx.shape}')
print(f'Forma de dataset pdf : {df_pdf.shape}')
print(f'Forma de dataset data: {df_data.shape}')

Forma de dataset xlsx: (294, 16)
Forma de dataset pdf : (200, 16)
Forma de dataset data: (303, 16)


In [336]:
df_completo = pd.concat( 
        [
        df_xlsx,
        df_pdf,
        df_data     
        ]
        , ignore_index=True)

df_completo

Unnamed: 0,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease,city
0,1,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0,Budapest
1,2,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0,Budapest
2,3,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0,Budapest
3,4,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0,Budapest
4,5,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0,Budapest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,299,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1,Cleveland
793,300,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2,Cleveland
794,301,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3,Cleveland
795,302,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1,Cleveland


Pondremos la columna *city* al principio del dataset por términos prácticos.

In [337]:
columnas_temp = list(df_completo.columns)

columnas_temp.insert(0, columnas_temp.pop(-1))

df_completo = df_completo[columnas_temp]


In [338]:
df_completo.head(3)

Unnamed: 0,city,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease
0,Budapest,1,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,Budapest,2,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,Budapest,3,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0


In [339]:
df_completo.to_csv('./DataFiles/enfermedad_cardiaca.csv')

## Codificar dataset
Antes de exportar el dataset, vamos a hacer una transformación basica de datos.

* Remplazaremos los valore de '?' por *NaN*.
* Verificaremos el tipo de dato de cada columna.
* Convertiremos a tipo de dato categórico las columnas que lo ncesiten.
* Exportaremos en formato *parquet* para no perder las tranformaciones y codificaciones.

In [340]:
df_completo = df_completo.replace('?', np.nan)

In [341]:
df_completo

Unnamed: 0,city,ID_patient,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_EEG,maximum_heart_rate,exercise_induced_angina,ST_depression,slope_ST,number_major_vessels,thal,diagnosis_heart_disease
0,Budapest,1,28,1,2,130,132,0,2,185,0,0.0,,,,0
1,Budapest,2,29,1,2,120,243,0,0,160,0,0.0,,,,0
2,Budapest,3,29,1,2,140,,0,0,170,0,0.0,,,,0
3,Budapest,4,30,0,1,170,237,0,1,170,0,0.0,,,6,0
4,Budapest,5,31,0,2,100,219,0,1,150,0,0.0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,Cleveland,299,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
793,Cleveland,300,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
794,Cleveland,301,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
795,Cleveland,302,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [342]:
df_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797 entries, 0 to 796
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   city                     797 non-null    object
 1   ID_patient               797 non-null    int64 
 2   age                      797 non-null    object
 3   sex                      797 non-null    object
 4   chest_pain_type          797 non-null    object
 5   resting_blood_pressure   740 non-null    object
 6   serum_cholestoral        767 non-null    object
 7   fasting_blood_sugar      782 non-null    object
 8   resting_EEG              796 non-null    object
 9   maximum_heart_rate       743 non-null    object
 10  exercise_induced_angina  743 non-null    object
 11  ST_depression            741 non-null    object
 12  slope_ST                 505 non-null    object
 13  number_major_vessels     304 non-null    object
 14  thal                     363 non-null    o

In [343]:
df_completo['age'] = df_completo['age'].astype('int')
df_completo['resting_blood_pressure'] = df_completo['resting_blood_pressure'].astype('float')
df_completo['serum_cholestoral'] = df_completo['serum_cholestoral'].astype('float')
df_completo['maximum_heart_rate'] = df_completo['maximum_heart_rate'].astype('float')
# df['exercise induced angina'] = df_completo['exercise_induced_angina'].astype('category')
df_completo['ST_depression'] = df_completo['ST_depression'].astype('float')

In [344]:
df_completo['slope_ST'] = df_completo['slope_ST'].astype('float')

In [345]:
df_completo['number_major_vessels'] = df_completo['number_major_vessels'].astype('float')

In [346]:
df_completo['thal'] =df_completo['thal'].astype('float')

df_completo['diagnosis_heart_disease'].info()

In [347]:
df_completo['diagnosis_heart_disease'] = df_completo['diagnosis_heart_disease'].astype('int')

In [348]:
df_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797 entries, 0 to 796
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city                     797 non-null    object 
 1   ID_patient               797 non-null    int64  
 2   age                      797 non-null    int64  
 3   sex                      797 non-null    object 
 4   chest_pain_type          797 non-null    object 
 5   resting_blood_pressure   740 non-null    float64
 6   serum_cholestoral        767 non-null    float64
 7   fasting_blood_sugar      782 non-null    object 
 8   resting_EEG              796 non-null    object 
 9   maximum_heart_rate       743 non-null    float64
 10  exercise_induced_angina  743 non-null    object 
 11  ST_depression            741 non-null    float64
 12  slope_ST                 505 non-null    float64
 13  number_major_vessels     304 non-null    float64
 14  thal                     3

### Categorizando variables

In [349]:
df_completo['exercise_induced_angina'] = df_completo['exercise_induced_angina'].astype('bool')

In [350]:
df_completo['fasting_blood_sugar'] = df_completo['fasting_blood_sugar'].astype('float').astype('bool')

In [351]:
df_completo['sex'] = df_completo['sex'].map({1: 'male', 0: 'female'}).astype('category')


In [352]:
df_completo['city'] = df_completo['city'].astype('category')

In [353]:
dict_chest_paint_type = {
    1 : 'typical angina',
    2 : 'atypical angina',
    3 : 'non-anginal pain',
    4 : 'asymptomatic',
}

df_completo['chest_pain_type'] = df_completo['chest_pain_type'].astype('int')

In [354]:
df_completo['chest_pain_type'].head()

0    2
1    2
2    2
3    1
4    2
Name: chest_pain_type, dtype: int64

In [355]:
df_completo['resting_EEG'] = df_completo['resting_EEG'].astype('float')

In [356]:
df_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797 entries, 0 to 796
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   city                     797 non-null    category
 1   ID_patient               797 non-null    int64   
 2   age                      797 non-null    int64   
 3   sex                      597 non-null    category
 4   chest_pain_type          797 non-null    int64   
 5   resting_blood_pressure   740 non-null    float64 
 6   serum_cholestoral        767 non-null    float64 
 7   fasting_blood_sugar      797 non-null    bool    
 8   resting_EEG              796 non-null    float64 
 9   maximum_heart_rate       743 non-null    float64 
 10  exercise_induced_angina  797 non-null    bool    
 11  ST_depression            741 non-null    float64 
 12  slope_ST                 505 non-null    float64 
 13  number_major_vessels     304 non-null    float64 
 14  thal      

## Exportar dataset como parquet

Exportaremos el dataset en formato parquet para conservar todas las propiedades que hemos añádido en las columnas durante la codificación básica.

In [357]:
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [358]:
import pyarrow as pa
print(pa.__version__)  # verify it's importable

22.0.0


In [359]:
df_completo.to_parquet('./DataFiles/enfermedad_cardiaca_encoded.parquet', engine='pyarrow')
print(f'Dataset exportado como parquet en: {'./DataFiles/enfermedad_cardiaca_encoded.parquet'}'  )

Dataset exportado como parquet en: ./DataFiles/enfermedad_cardiaca_encoded.parquet
