# Laboratorio 1 - Aprendizaje de Máquina

## Miembros del grupo

| Nombre             | Código    | Correo electrónico           |
|--------------------|-----------|------------------------------|
| Adrian Velasquez   | 202222737 | a.velasquezs@uniandes.edu.co |
| Andres Botero Ruiz | 202223503 | a.boteror@uniandes.edu.co    |

# Preparación del entorno de trabajo

In [1199]:
# Importar las librerías necesarias
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [1200]:
# Paths used by the notebook: input CSV files and the model pickle
DATA_FOLDER = 'data/'
MODELS_FOLDER = 'models/'
DATA_FILE = f'{DATA_FOLDER}data.csv'
TEST_FILE = f'{DATA_FOLDER}test.csv'
MODEL = f'{MODELS_FOLDER}model.pkl'

## Lectura de los datos

In [1201]:
# Load the training-data CSV into a DataFrame and preview its first rows
df = pd.read_csv( DATA_FILE )
df.head()

Unnamed: 0,Patient ID,Date of Service,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),...,Physical Activity Level,Family History of CVD,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,CVD Risk Level
0,isDx5313,"November 08, 2023",M,44.0,114.3,1.72,38.6,100.0,112/83,228.0,...,High,N,172.0,0.581,112.0,83.0,Hypertension Stage 1,121.0,19.88,HIGH
1,LHCK2961,20/03/2024,F,57.0,92.923,1.842,33.116,106.315,101/91,158.0,...,High,Y,184.172,0.577,101.0,91.0,Hypertension Stage 2,57.0,16.833,INTERMEDIARY
2,WjVn1699,2021-05-27,F,,73.4,1.65,27.0,78.1,90/74,135.0,...,High,N,165.0,0.473,90.0,74.0,Normal,45.0,12.6,LOW
3,dCDO1109,"April 18, 2022",F,35.0,113.3,1.78,35.8,79.6,92/89,158.0,...,Moderate,Y,178.0,0.447,92.0,89.0,Hypertension Stage 1,94.0,14.92,HIGH
4,pnpE1080,01/11/2024,F,48.0,102.2,1.75,33.4,106.7,121/68,207.0,...,Low,Y,175.0,0.61,121.0,68.0,Elevated,128.0,18.87,HIGH


# Exploración de los datos

In [1202]:
df.shape

(1639, 24)

In [1203]:
# Show general information about the DataFrame (columns, non-null counts, dtypes)
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1639 entries, 0 to 1638
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Patient ID                    1639 non-null   str    
 1   Date of Service               1639 non-null   str    
 2   Sex                           1639 non-null   str    
 3   Age                           1571 non-null   float64
 4   Weight (kg)                   1566 non-null   float64
 5   Height (m)                    1578 non-null   float64
 6   BMI                           1586 non-null   float64
 7   Abdominal Circumference (cm)  1578 non-null   float64
 8   Blood Pressure (mmHg)         1639 non-null   str    
 9   Total Cholesterol (mg/dL)     1571 non-null   float64
 10  HDL (mg/dL)                   1557 non-null   float64
 11  Fasting Blood Sugar (mg/dL)   1585 non-null   float64
 12  Smoking Status                1639 non-null   str    
 13  Diabetes Statu

In [1204]:
# Descriptive statistics of the numeric columns
df.describe()

Unnamed: 0,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Estimated LDL (mg/dL),CVD Risk Score
count,1571.0,1566.0,1578.0,1586.0,1578.0,1571.0,1557.0,1585.0,1571.0,1563.0,1578.0,1554.0,1582.0,1610.0
mean,46.803186,85.666006,1.757439,28.424744,91.538861,199.043673,56.183558,117.83686,175.770082,0.52244,125.632637,82.887536,113.235896,18.227281
std,13.039479,21.712504,0.118012,7.309275,13.427985,59.38867,16.721702,32.379634,11.69588,0.085692,22.577463,15.503625,61.435291,10.767666
min,6.134,13.261,1.371,4.317,49.542,-1.256,0.008,15.306,136.498,0.25,49.914,31.72,-92.055,-20.057
25%,37.0,67.1,1.6665,22.6,79.7,150.0,42.0,92.0,167.0,0.453,108.0,71.0,62.0,15.15
50%,46.0,86.314,1.76,28.0,91.2,199.0,56.0,115.0,176.0,0.519,125.0,82.0,112.0,16.967
75%,55.0,104.8015,1.85,33.963,102.26725,250.0,70.0,139.0,185.0,0.582,141.0,93.0,159.0,18.9
max,89.42,158.523,2.146,53.028,136.336,385.679,110.315,219.667,214.394,0.804,202.711,134.066,317.314,114.98


In [1205]:
# Count missing values per column
df.isnull().sum()

Patient ID                       0
Date of Service                  0
Sex                              0
Age                             68
Weight (kg)                     73
Height (m)                      61
BMI                             53
Abdominal Circumference (cm)    61
Blood Pressure (mmHg)            0
Total Cholesterol (mg/dL)       68
HDL (mg/dL)                     82
Fasting Blood Sugar (mg/dL)     54
Smoking Status                   0
Diabetes Status                  0
Physical Activity Level          0
Family History of CVD            0
Height (cm)                     68
Waist-to-Height Ratio           76
Systolic BP                     61
Diastolic BP                    85
Blood Pressure Category          0
Estimated LDL (mg/dL)           57
CVD Risk Score                  29
CVD Risk Level                   0
dtype: int64

In [1206]:
# Count fully duplicated rows (rows identical in every column)
dupes = df.duplicated().sum()
print(f'Número de filas totalmente duplicadas: {dupes}')

Número de filas totalmente duplicadas: 151


In [1207]:
# List the non-numeric columns (selected by the 'str' dtype)
non_num_cols = df.select_dtypes(include=['str']).columns
print(f'Columnas categóricas: {list(non_num_cols)}')

Columnas categóricas: ['Patient ID', 'Date of Service', 'Sex', 'Blood Pressure (mmHg)', 'Smoking Status', 'Diabetes Status', 'Physical Activity Level', 'Family History of CVD', 'Blood Pressure Category', 'CVD Risk Level']


In [1208]:
# List the numeric columns (int64 / float64 dtypes)
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(f'Columnas numéricas: {list(num_cols)}')

Columnas numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)', 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP', 'Estimated LDL (mg/dL)', 'CVD Risk Score']


# Preparación de los datos

Nuestro objetivo final es construir un modelo de predicción (regresión lineal) para la variable CVD Risk Score (puntaje de riesgo cardiovascular).

## Remover duplicados y valores nulos en la columna objetivo

In [1209]:
# Remove fully duplicated rows
df_limpieza = df.drop_duplicates()
# Re-check that no fully duplicated rows remain
dupes = df_limpieza.duplicated().sum()
print(f'Número de filas totalmente duplicadas: {dupes}')

Número de filas totalmente duplicadas: 0


In [1210]:
df_limpieza.shape

(1488, 24)

In [1211]:
# Drop rows whose target (CVD Risk Score) is missing.
# NOTE(review): this starts again from the raw `df`, not from `df_limpieza`, so the
# result of the drop_duplicates() call in the previous cell is discarded here and
# fully duplicated rows are still present afterwards. The later per-patient
# resolution keeps the most repeated CVD Risk Score, which relies on repeated rows
# still being present -- confirm the intent before switching this to `df_limpieza`.
df_limpieza = df.dropna(subset=['CVD Risk Score'])
# Re-check that no missing target values remain
null_target = df_limpieza['CVD Risk Score'].isnull().sum()
print(f'Número de filas con valores nulos en CVD Risk Score: {null_target}')

Número de filas con valores nulos en CVD Risk Score: 0


In [1212]:
df_limpieza.shape

(1610, 24)

## Transformar columnas de fecha -> Consistencia de los datos

La columna de fecha (Date of Service) no tiene un formato consistente, por lo que la convertiremos a un formato de fecha estándar (datetime) para facilitar su manejo en el futuro.

In [1213]:
# Parse 'Date of Service' into datetime, coping with the inconsistent date formats

# TODO: account for this step in the pipeline

# Step 1: cast to string, trim both ends and collapse whitespace runs into one space
date_text = df_limpieza['Date of Service'].astype(str).str.strip()
date_text = date_text.str.replace(r'\s+', ' ', regex=True)

# Step 2: let pandas infer the format of each value (format='mixed'), preferring
# day-first for ambiguous day/month orders; unparseable values are coerced to NaT
df_limpieza['Date of Service'] = pd.to_datetime(
    date_text,
    errors='coerce',
    dayfirst=True,
    format='mixed',
)

# Count the values that could not be parsed (NaT after coercion)
null_dates = df_limpieza['Date of Service'].isnull().sum()
print(f'Número de filas con valores "coerced": {null_dates}')

df_limpieza.head()


Número de filas con valores "coerced": 0


Unnamed: 0,Patient ID,Date of Service,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),...,Physical Activity Level,Family History of CVD,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,CVD Risk Level
0,isDx5313,2023-11-08,M,44.0,114.3,1.72,38.6,100.0,112/83,228.0,...,High,N,172.0,0.581,112.0,83.0,Hypertension Stage 1,121.0,19.88,HIGH
1,LHCK2961,2024-03-20,F,57.0,92.923,1.842,33.116,106.315,101/91,158.0,...,High,Y,184.172,0.577,101.0,91.0,Hypertension Stage 2,57.0,16.833,INTERMEDIARY
2,WjVn1699,2021-05-27,F,,73.4,1.65,27.0,78.1,90/74,135.0,...,High,N,165.0,0.473,90.0,74.0,Normal,45.0,12.6,LOW
3,dCDO1109,2022-04-18,F,35.0,113.3,1.78,35.8,79.6,92/89,158.0,...,Moderate,Y,178.0,0.447,92.0,89.0,Hypertension Stage 1,94.0,14.92,HIGH
4,pnpE1080,2024-11-01,F,48.0,102.2,1.75,33.4,106.7,121/68,207.0,...,Low,Y,175.0,0.61,121.0,68.0,Elevated,128.0,18.87,HIGH


## Completitud de los datos

### Rellenar nulos

In [1214]:
# Share of missing values per column, largest first
null_counts = df_limpieza.isnull().sum()
null_share = null_counts / df_limpieza.shape[0]
null_share.sort_values(ascending=False)

Diastolic BP                    0.051553
HDL (mg/dL)                     0.050932
Waist-to-Height Ratio           0.047205
Weight (kg)                     0.044720
Height (cm)                     0.042236
Total Cholesterol (mg/dL)       0.042236
Age                             0.041615
Height (m)                      0.037267
Systolic BP                     0.037267
Abdominal Circumference (cm)    0.036646
Estimated LDL (mg/dL)           0.035404
Fasting Blood Sugar (mg/dL)     0.032919
BMI                             0.031677
Patient ID                      0.000000
CVD Risk Score                  0.000000
Blood Pressure Category         0.000000
Smoking Status                  0.000000
Family History of CVD           0.000000
Physical Activity Level         0.000000
Diabetes Status                 0.000000
Date of Service                 0.000000
Blood Pressure (mmHg)           0.000000
Sex                             0.000000
CVD Risk Level                  0.000000
dtype: float64

In [1215]:
# Collect the columns that contain at least one missing value, with their dtypes
has_nulls = df_limpieza.isnull().any()
cols_with_nulls = df_limpieza.columns[has_nulls]
cols_with_nulls_types = df_limpieza[cols_with_nulls].dtypes

print("Columnas con valores nulos y su tipo de dato:")
for col, dtype in cols_with_nulls_types.items():
    line = f"{dtype}: {col}"
    print(line)

Columnas con valores nulos y su tipo de dato:
float64: Age
float64: Weight (kg)
float64: Height (m)
float64: BMI
float64: Abdominal Circumference (cm)
float64: Total Cholesterol (mg/dL)
float64: HDL (mg/dL)
float64: Fasting Blood Sugar (mg/dL)
float64: Height (cm)
float64: Waist-to-Height Ratio
float64: Systolic BP
float64: Diastolic BP
float64: Estimated LDL (mg/dL)


Dado que todas las columnas con nulos son numéricas, podemos usar un imputer para rellenar los valores nulos con la media de cada columna

In [1216]:
# Fill missing values in the numeric columns with each column's mean.
# NOTE(review): the means are fitted on every row of df_limpieza, i.e. before the
# train/test split and before the range-validity filter that come later in the
# notebook -- confirm this is acceptable for the evaluation.

df_imputer = df_limpieza.copy()
null_block = df_imputer[cols_with_nulls]

# Fit on the columns that contain nulls, then transform those same columns
si = SimpleImputer(strategy='mean', missing_values=np.nan)
filled_cols = si.fit(null_block).transform(null_block)

# Overwrite the original columns with their imputed versions
df_imputer[cols_with_nulls] = filled_cols

df_imputer.isnull().sum()

Patient ID                      0
Date of Service                 0
Sex                             0
Age                             0
Weight (kg)                     0
Height (m)                      0
BMI                             0
Abdominal Circumference (cm)    0
Blood Pressure (mmHg)           0
Total Cholesterol (mg/dL)       0
HDL (mg/dL)                     0
Fasting Blood Sugar (mg/dL)     0
Smoking Status                  0
Diabetes Status                 0
Physical Activity Level         0
Family History of CVD           0
Height (cm)                     0
Waist-to-Height Ratio           0
Systolic BP                     0
Diastolic BP                    0
Blood Pressure Category         0
Estimated LDL (mg/dL)           0
CVD Risk Score                  0
CVD Risk Level                  0
dtype: int64

In [1217]:
# Re-check the share of missing values per column after imputation
((df_imputer.isnull().sum()/df_imputer.shape[0])).sort_values(ascending=False)

Patient ID                      0.0
Date of Service                 0.0
CVD Risk Score                  0.0
Estimated LDL (mg/dL)           0.0
Blood Pressure Category         0.0
Diastolic BP                    0.0
Systolic BP                     0.0
Waist-to-Height Ratio           0.0
Height (cm)                     0.0
Family History of CVD           0.0
Physical Activity Level         0.0
Diabetes Status                 0.0
Smoking Status                  0.0
Fasting Blood Sugar (mg/dL)     0.0
HDL (mg/dL)                     0.0
Total Cholesterol (mg/dL)       0.0
Blood Pressure (mmHg)           0.0
Abdominal Circumference (cm)    0.0
BMI                             0.0
Height (m)                      0.0
Weight (kg)                     0.0
Age                             0.0
Sex                             0.0
CVD Risk Level                  0.0
dtype: float64

In [1218]:
# Carry the imputed DataFrame forward as the working cleaning DataFrame
df_limpieza = df_imputer.copy()
df_limpieza.head()

Unnamed: 0,Patient ID,Date of Service,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),...,Physical Activity Level,Family History of CVD,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,CVD Risk Level
0,isDx5313,2023-11-08,M,44.0,114.3,1.72,38.6,100.0,112/83,228.0,...,High,N,172.0,0.581,112.0,83.0,Hypertension Stage 1,121.0,19.88,HIGH
1,LHCK2961,2024-03-20,F,57.0,92.923,1.842,33.116,106.315,101/91,158.0,...,High,Y,184.172,0.577,101.0,91.0,Hypertension Stage 2,57.0,16.833,INTERMEDIARY
2,WjVn1699,2021-05-27,F,46.769697,73.4,1.65,27.0,78.1,90/74,135.0,...,High,N,165.0,0.473,90.0,74.0,Normal,45.0,12.6,LOW
3,dCDO1109,2022-04-18,F,35.0,113.3,1.78,35.8,79.6,92/89,158.0,...,Moderate,Y,178.0,0.447,92.0,89.0,Hypertension Stage 1,94.0,14.92,HIGH
4,pnpE1080,2024-11-01,F,48.0,102.2,1.75,33.4,106.7,121/68,207.0,...,Low,Y,175.0,0.61,121.0,68.0,Elevated,128.0,18.87,HIGH


## Unicidad

### Duplicados parciales

In [1219]:
# Partial duplicates: count rows whose Patient ID already appeared in an earlier row
dupes_patient_id = df_limpieza.duplicated(subset=['Patient ID']).sum()
print(f'Número de filas con Patient ID duplicado: {dupes_patient_id}')

Número de filas con Patient ID duplicado: 261


In [1220]:
# Check whether rows sharing a Patient ID also share the service date and/or age
key_id_date = ['Patient ID', 'Date of Service']
key_id_age = ['Patient ID', 'Age']
key_id_date_age = ['Patient ID', 'Date of Service', 'Age']

dupes_patient_id_date = df_limpieza.duplicated(subset=key_id_date).sum()
print(f'Número de filas con Patient ID y Date of Service duplicados: {dupes_patient_id_date}')

dupes_patient_id_age = df_limpieza.duplicated(subset=key_id_age).sum()
print(f'Número de filas con Patient ID y Age duplicados: {dupes_patient_id_age}')

dupes_patient_id_date_age = df_limpieza.duplicated(subset=key_id_date_age).sum()
print(f'Número de filas con Patient ID, Date of Service y Age duplicados: {dupes_patient_id_date_age}')

Número de filas con Patient ID y Date of Service duplicados: 261
Número de filas con Patient ID y Age duplicados: 261
Número de filas con Patient ID, Date of Service y Age duplicados: 261


Vemos que las tres combinaciones arrojan el mismo número de duplicados (261), por lo que parece tratarse de un patrón común. Sin embargo, es necesario evaluar estos registros más a fondo.

In [1221]:
# Pull every row whose Patient ID appears more than once, to inspect whether the
# repeated rows are fully identical or differ in some column

# Number of rows per Patient ID, aligned to the rows of df_limpieza
repeat_counts = df_limpieza.groupby('Patient ID')['Patient ID'].transform('size')
# True for every occurrence of a repeated Patient ID (keep=False marks them all)
repeated_mask = df_limpieza['Patient ID'].duplicated(keep=False)

dup_table = (
    df_limpieza[repeated_mask]
    .copy()
    .assign(repeticiones=repeat_counts)
    .sort_values(['repeticiones', 'Patient ID'], ascending=[False, True])
)
# Show the table
dup_table.head(20)

Unnamed: 0,Patient ID,Date of Service,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),...,Family History of CVD,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,CVD Risk Level,repeticiones
17,AhYt1346,2020-09-28,M,41.0,71.3,1.73,23.8,107.9,139/61,253.0,...,Y,173.0,0.522502,139.0,82.878344,Hypertension Stage 1,146.0,16.77,HIGH,3
1227,AhYt1346,2020-09-28,M,41.0,71.3,1.73,23.8,107.9,139/61,253.0,...,Y,173.0,0.522502,139.0,82.878344,Hypertension Stage 1,146.0,-13.09,HIGH,3
1584,AhYt1346,2020-09-28,M,41.0,71.3,1.73,23.8,107.9,139/61,253.0,...,Y,173.0,0.522502,139.0,82.878344,Hypertension Stage 1,146.0,16.77,HIGH,3
130,BQvQ6431,2020-11-09,M,33.0,118.3,1.69,41.4,72.1,116/93,171.0,...,N,210.554,0.427,116.0,93.0,Hypertension Stage 2,97.0,17.5,LOW,3
1469,BQvQ6431,2020-11-09,M,33.0,118.3,1.69,41.4,72.1,116/93,171.0,...,N,210.554,0.427,116.0,93.0,Hypertension Stage 2,97.0,29.833,LOW,3
1544,BQvQ6431,2020-11-09,M,33.0,118.3,1.69,41.4,72.1,116/93,171.0,...,N,210.554,0.427,116.0,93.0,Hypertension Stage 2,97.0,17.5,LOW,3
850,CDsa2651,2025-06-23,M,39.0,73.3,1.74,24.2,95.0,111/84,158.0,...,Y,174.0,0.546,111.0,84.0,Hypertension Stage 1,91.0,26.604,INTERMEDIARY,3
1241,CDsa2651,2025-06-23,M,39.0,73.3,1.74,24.2,95.0,111/84,158.0,...,Y,174.0,0.546,111.0,84.0,Hypertension Stage 1,91.0,15.55,INTERMEDIARY,3
1566,CDsa2651,2025-06-23,M,39.0,73.3,1.74,24.2,95.0,111/84,158.0,...,Y,174.0,0.546,111.0,84.0,Hypertension Stage 1,91.0,15.55,INTERMEDIARY,3
246,CKKa5109,2023-03-18,M,51.0,85.9,1.78,27.1,87.2,144/70,189.0,...,N,178.0,0.49,144.0,70.0,Hypertension Stage 2,300.227,6.55,INTERMEDIARY,3


Al revisar el detalle, podemos darnos cuenta de que los duplicados parciales son repeticiones del mismo paciente, por lo general con 3 apariciones, donde la única diferencia es un valor distinto de CVD Risk Score. Para resolver esto, conservaremos una sola fila por paciente con el valor de CVD Risk Score que más se repite entre sus duplicados o, si ningún valor se repite, con el promedio de los valores de CVD Risk Score de sus duplicados.

In [1222]:
# Resolver duplicados parciales

def resolve_partial_dupes(group):
    """
    Collapse the rows of one Patient ID group into a single representative row.

    A single-row group is returned as is. Otherwise, if some CVD Risk Score value
    appears more than once, the first row carrying the most frequent value is
    returned; if every value appears only once, a copy of the first row is
    returned with CVD Risk Score replaced by the mean of the group's values.

    :param group: DataFrame with the rows that share the same Patient ID
    :return: a Series holding the representative row for the group
    """
    first_row = group.iloc[0]
    if len(group) == 1:
        return first_row

    scores = group['CVD Risk Score']
    frequency = scores.value_counts()

    # No value is repeated: fall back to the average score on a copy of the first row
    if not frequency.max() > 1:
        averaged = first_row.copy()
        averaged['CVD Risk Score'] = scores.mean()
        return averaged

    # Some value is repeated: keep the first row holding the most frequent score
    modal_score = frequency.idxmax()
    return group[scores == modal_score].iloc[0]

print(f'Número de filas antes de resolver duplicados parciales: {df_limpieza.shape[0]}')
print('Número de duplicados parciales a resolver:', dupes_patient_id)

# Reduce every Patient ID group to one row with resolve_partial_dupes, then move
# the index produced by the group-by back into the frame with reset_index()
per_patient = df_limpieza.groupby('Patient ID')
df_resolved = per_patient.apply(resolve_partial_dupes).reset_index()

# Row count after resolving the partial duplicates
print(f'Número de filas después de resolver duplicados parciales: {df_resolved.shape[0]}')
df_resolved.head(5)


df_limpieza = df_resolved.copy()

Número de filas antes de resolver duplicados parciales: 1610
Número de duplicados parciales a resolver: 261
Número de filas después de resolver duplicados parciales: 1349


## Consistencia de los datos

### Verificar valores categóricos

In [1223]:
# Isolate the string-typed columns to check that their values are consistent
cat_cols = df_limpieza.select_dtypes(include=['str']).columns

df_cat = df_limpieza[cat_cols].copy()
df_cat.head()

Unnamed: 0,Patient ID,Sex,Blood Pressure (mmHg),Smoking Status,Diabetes Status,Physical Activity Level,Family History of CVD,Blood Pressure Category,CVD Risk Level
0,AEFC1294,F,103/99,N,Y,High,N,Hypertension Stage 2,HIGH
1,AHTL6366,F,164/90,N,N,Low,Y,Hypertension Stage 2,HIGH
2,AHjK2744,M,113/91,Y,Y,Low,N,Hypertension Stage 2,HIGH
3,ALHn0227,F,111/69,N,N,Low,Y,Normal,HIGH
4,AMxU2442,M,117/116,N,N,Low,Y,Hypertension Stage 2,HIGH


In [1224]:
# Print the distinct values of every categorical column

for col in cat_cols:
    # Patient ID is an identifier, so listing its values is not informative
    if col == 'Patient ID':
        continue
    unique_values = df_cat[col].unique()
    print(f'Columna: {col}')
    print(f'Valores únicos: {unique_values}')
    print('---')

Columna: Sex
Valores únicos: <StringArray>
['F', 'M']
Length: 2, dtype: str
---
Columna: Blood Pressure (mmHg)
Valores únicos: <StringArray>
[ '103/99',  '164/90',  '113/91',  '111/69', '117/116', '113/119',  '144/81',
  '116/69',  '144/77',  '151/90',
 ...
  '162/94',  '168/63',  '174/76',  '129/77', '119/100',  '101/73',  '119/72',
  '129/60',  '140/67', '110/114']
Length: 1131, dtype: str
---
Columna: Smoking Status
Valores únicos: <StringArray>
['N', 'Y']
Length: 2, dtype: str
---
Columna: Diabetes Status
Valores únicos: <StringArray>
['Y', 'N']
Length: 2, dtype: str
---
Columna: Physical Activity Level
Valores únicos: <StringArray>
['High', 'Low', 'Moderate']
Length: 3, dtype: str
---
Columna: Family History of CVD
Valores únicos: <StringArray>
['N', 'Y']
Length: 2, dtype: str
---
Columna: Blood Pressure Category
Valores únicos: <StringArray>
['Hypertension Stage 2', 'Normal', 'Hypertension Stage 1', 'Elevated']
Length: 4, dtype: str
---
Columna: CVD Risk Level
Valores únicos: <St

## Validez de los datos

In [1225]:
# Summary statistics, used to check that numeric values fall within reasonable ranges
df_limpieza.describe()

Unnamed: 0,Date of Service,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Estimated LDL (mg/dL),CVD Risk Score
count,1349,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0,1349.0
mean,2022-12-07 00:06:24.284655,47.06039,86.005373,1.756099,28.554707,91.905445,198.126498,56.09398,117.098785,175.566514,0.525138,125.758911,82.829924,112.198083,18.170198
min,2020-01-05 00:00:00,6.134,13.261,1.371,4.317,49.542,-1.256,0.008,15.306,136.498,0.25,49.914,31.72,-92.055,-20.057
25%,2021-05-26 00:00:00,37.0,67.9,1.67,22.7,80.8,152.0,42.0,92.0,167.0,0.46,109.0,71.0,63.0,15.19
50%,2022-11-08 00:00:00,46.769697,85.880566,1.757666,28.472249,91.557663,198.971213,56.164136,117.0,175.737219,0.522502,125.74471,82.878344,113.0,16.92
75%,2024-07-08 00:00:00,55.0,104.2,1.841,34.0,102.0,247.0,69.0,138.0,184.0,0.581,141.0,93.0,157.0,18.81
max,2025-12-30 00:00:00,89.42,158.523,2.146,53.028,136.336,385.679,110.315,219.667,214.394,0.804,202.711,134.066,317.314,114.98
std,,12.628781,21.232342,0.11588,7.257954,13.140767,58.100997,16.292392,31.432091,11.391824,0.083198,22.366234,15.251254,60.543396,10.776622


In [1226]:
# Count cholesterol readings that are negative or above 1000
cholesterol_cols = ['Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Estimated LDL (mg/dL)']
for col in cholesterol_cols:
    readings = df_limpieza[col]
    out_of_range = (readings < 0) | (readings > 1000)
    invalid_values = df_limpieza[out_of_range]
    print(f'Columna: {col}')
    print(f'Número de valores inválidos: {invalid_values.shape[0]}')
    print('---')

Columna: Total Cholesterol (mg/dL)
Número de valores inválidos: 1
---
Columna: HDL (mg/dL)
Número de valores inválidos: 0
---
Columna: Estimated LDL (mg/dL)
Número de valores inválidos: 15
---


In [1227]:
# Count blood-pressure readings that are negative or above 300
pressure_cols = ['Systolic BP', 'Diastolic BP']
for col in pressure_cols:
    readings = df_limpieza[col]
    out_of_range = (readings < 0) | (readings > 300)
    invalid_values = df_limpieza[out_of_range]
    print(f'Columna: {col}')
    print(f'Número de valores inválidos: {invalid_values.shape[0]}')
    print('---')

Columna: Systolic BP
Número de valores inválidos: 0
---
Columna: Diastolic BP
Número de valores inválidos: 0
---


In [1228]:
# Count ages that are negative or above 120
age_values = df_limpieza['Age']
invalid_age = df_limpieza[(age_values < 0) | (age_values > 120)]
print(f'Número de valores de edad inválidos: {invalid_age.shape[0]}')

Número de valores de edad inválidos: 0


In [1229]:
# Count BMI values that are negative or above 100
bmi_values = df_limpieza['BMI']
invalid_bmi = df_limpieza[(bmi_values < 0) | (bmi_values > 100)]
print(f'Número de valores de BMI inválidos: {invalid_bmi.shape[0]}')

Número de valores de BMI inválidos: 0


In [1230]:
# Count CVD Risk Score values that are negative or above 100
cvd_values = df_limpieza['CVD Risk Score']
invalid_cvd = df_limpieza[(cvd_values < 0) | (cvd_values > 100)]
print(f'Número de valores de CVD Risk Score inválidos: {invalid_cvd.shape[0]}')

Número de valores de CVD Risk Score inválidos: 15


In [1231]:
# Keep only the rows whose numeric values lie inside the accepted [lower, upper]
# range (both ends inclusive) for every checked column
valid_ranges = {
    'Age': (0, 120),
    'BMI': (0, 100),
    'CVD Risk Score': (0, 100),
    'Systolic BP': (0, 300),
    'Diastolic BP': (0, 300),
    'Total Cholesterol (mg/dL)': (0, 1000),
    'HDL (mg/dL)': (0, 1000),
    'Estimated LDL (mg/dL)': (0, 1000),
}

# Start with every row accepted and AND-in one range condition per column
rows_ok = pd.Series(True, index=df_limpieza.index)
for column, (lower, upper) in valid_ranges.items():
    in_range = (df_limpieza[column] >= lower) & (df_limpieza[column] <= upper)
    rows_ok = rows_ok & in_range

df_valid = df_limpieza[rows_ok].copy()

# Row counts before and after dropping the out-of-range rows
print(f'Número de filas antes de remover valores inválidos: {df_limpieza.shape[0]}')
print(f'Número de filas después de remover valores inválidos: {df_valid.shape[0]}')
df_valid.head()

df_limpieza = df_valid.copy()

Número de filas antes de remover valores inválidos: 1349
Número de filas después de remover valores inválidos: 1318


## Resultados de la limpieza de datos

In [1232]:
# Summary of the cleaning process: row and column counts before and after
print("Resumen de la limpieza de datos:")
print(f"- Número de filas originales: {df.shape[0]}")
print(f"- Número de filas después de la limpieza: {df_limpieza.shape[0]}")
print(f"- Número de columnas originales: {df.shape[1]}")
print(f"- Número de columnas después de la limpieza: {df_limpieza.shape[1]}")

Resumen de la limpieza de datos:
- Número de filas originales: 1639
- Número de filas después de la limpieza: 1318
- Número de columnas originales: 24
- Número de columnas después de la limpieza: 24


In [1233]:
# Final cleaned DataFrame used for modelling
df_final = df_limpieza.copy()
df_final.head()

Unnamed: 0,Patient ID,Date of Service,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),...,Physical Activity Level,Family History of CVD,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,CVD Risk Level
0,AEFC1294,2020-03-21,F,52.0,109.7,1.78,34.6,104.4,103/99,197.0,...,High,N,178.0,0.587,103.0,99.0,Hypertension Stage 2,113.190076,18.01,HIGH
1,AHTL6366,2020-03-03,F,46.769697,104.469,1.995,25.39,86.894,164/90,198.971213,...,Low,Y,199.458,0.436,164.0,90.0,Hypertension Stage 2,147.0,18.458,HIGH
2,AHjK2744,2022-03-08,M,58.0,85.880566,1.87,33.8,99.6,113/91,106.0,...,Low,N,187.0,0.533,125.74471,91.0,Hypertension Stage 2,33.0,16.53,HIGH
3,ALHn0227,2025-09-17,F,52.0,107.8,1.78,34.0,103.5,111/69,179.0,...,Low,Y,178.0,0.581,111.0,69.0,Normal,76.0,15.93,HIGH
4,AMxU2442,2023-09-10,M,57.0,116.772,1.569,34.126,90.737,117/116,254.0,...,Low,Y,156.854,0.578,117.0,116.0,Hypertension Stage 2,164.0,17.755,HIGH


# Modelo LR

In [1234]:
# Separate the regression target from the remaining columns; the linear
# regression model is meant to be built with scikit-learn

target = "CVD Risk Score"
y = df_final[target]
X = df_final.drop(columns=[target])

In [1235]:
X.head()

Unnamed: 0,Patient ID,Date of Service,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),...,Diabetes Status,Physical Activity Level,Family History of CVD,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Level
0,AEFC1294,2020-03-21,F,52.0,109.7,1.78,34.6,104.4,103/99,197.0,...,Y,High,N,178.0,0.587,103.0,99.0,Hypertension Stage 2,113.190076,HIGH
1,AHTL6366,2020-03-03,F,46.769697,104.469,1.995,25.39,86.894,164/90,198.971213,...,N,Low,Y,199.458,0.436,164.0,90.0,Hypertension Stage 2,147.0,HIGH
2,AHjK2744,2022-03-08,M,58.0,85.880566,1.87,33.8,99.6,113/91,106.0,...,Y,Low,N,187.0,0.533,125.74471,91.0,Hypertension Stage 2,33.0,HIGH
3,ALHn0227,2025-09-17,F,52.0,107.8,1.78,34.0,103.5,111/69,179.0,...,N,Low,Y,178.0,0.581,111.0,69.0,Normal,76.0,HIGH
4,AMxU2442,2023-09-10,M,57.0,116.772,1.569,34.126,90.737,117/116,254.0,...,N,Low,Y,156.854,0.578,117.0,116.0,Hypertension Stage 2,164.0,HIGH


In [1236]:
y.head()

0    18.010
1    18.458
2    16.530
3    15.930
4    17.755
Name: CVD Risk Score, dtype: float64

In [1237]:
# Split the data into training and test sets (25% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [1238]:
X_train.shape, y_train.shape

((988, 23), (988,))

In [1239]:
X_test.shape, y_test.shape

((330, 23), (330,))

### Pipeline

In [1240]:
# Every feature column present in the training split
all_cols = list(X_train.columns)
print(f'Todas las columnas: {all_cols}')

# Numeric features (int64 / float64)
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
num_cols = list(numeric_frame.columns)
print(f'Columnas numéricas: {num_cols}')

# String-typed features
string_frame = X_train.select_dtypes(include=['str'])
cat_cols = list(string_frame.columns)
print(f'Columnas categóricas: {cat_cols}')

Todas las columnas: ['Patient ID', 'Date of Service', 'Sex', 'Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)', 'Blood Pressure (mmHg)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)', 'Smoking Status', 'Diabetes Status', 'Physical Activity Level', 'Family History of CVD', 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP', 'Blood Pressure Category', 'Estimated LDL (mg/dL)', 'CVD Risk Level']
Columnas numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)', 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP', 'Estimated LDL (mg/dL)']
Columnas categóricas: ['Patient ID', 'Sex', 'Blood Pressure (mmHg)', 'Smoking Status', 'Diabetes Status', 'Physical Activity Level', 'Family History of CVD', 'Blood Pressure Category', 'CVD Risk Level']


In [1241]:
# Column bookkeeping for the preprocessing / modelling pipeline built with
# ColumnTransformer and Pipeline from scikit-learn

# Columns that must not reach the model
cols_to_drop = ['Patient ID', 'Date of Service', 'CVD Risk Level', 'Blood Pressure (mmHg)']
excluded = set(cols_to_drop)

# Numeric columns to scale, minus the dropped ones
num_cols = list(filter(lambda name: name not in excluded, num_cols))
print(f'Columnas numéricas: {num_cols}')

# Categorical columns to encode, minus the dropped ones
cat_cols = list(filter(lambda name: name not in excluded, cat_cols))
print(f'Columnas categóricas: {cat_cols}')


Columnas numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)', 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP', 'Estimated LDL (mg/dL)']
Columnas categóricas: ['Sex', 'Smoking Status', 'Diabetes Status', 'Physical Activity Level', 'Family History of CVD', 'Blood Pressure Category']


In [1242]:
# Pipeline step that removes the non-relevant columns, wrapped in a FunctionTransformer

def drop_columns(df):
    """Return `df` without the columns listed in `cols_to_drop`.

    Columns from `cols_to_drop` that are absent from `df` are ignored.
    """
    trimmed = df.drop(columns=cols_to_drop, errors="ignore")
    return trimmed

dropper = FunctionTransformer(drop_columns)


In [1243]:
# Numeric branch: fill missing values with the column mean, then standardize
# with StandardScaler. No columns are removed here; which columns this branch
# receives is decided by the ColumnTransformer that uses it.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


In [1244]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into dummy variables.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros at transform time instead of raising.
# NOTE(review): no category is dropped (`drop` is left at its default), so each
# group of dummies sums to 1 and is collinear with the intercept of an
# unregularized linear model - confirm this is intended.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [1245]:
# Route each column group to its own branch: `num_cols` goes through the
# numeric pipeline and `cat_cols` through the categorical pipeline, so the
# OneHotEncoder only ever sees categorical columns.
# Columns in neither list are discarded (ColumnTransformer's `remainder`
# defaults to 'drop').
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

In [1246]:
# Assemble the preprocessing pipeline: drop the non-relevant columns, then
# apply the ColumnTransformer. No estimator is attached as a final step; the
# linear regression model is fitted separately on this pipeline's output.
pipeline_regression = Pipeline([
    ('dropper', dropper),
    ('preprocessor', preprocessor),
])


In [1247]:
# Display the pipeline (the bare expression renders the estimator's rich repr).
pipeline_regression

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('dropper', ...), ('preprocessor', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"func  func: callable, default=None The callable to use for the transformation. This will be passed the same arguments as transform, with args and kwargs forwarded. If func is None, then func will be the identity function.",<function dro...t 0x11d116fb0>
,"inverse_func  inverse_func: callable, default=None The callable to use for the inverse transformation. This will be passed the same arguments as inverse transform, with args and kwargs forwarded. If inverse_func is None, then inverse_func will be the identity function.",
,"validate  validate: bool, default=False Indicate that the input X array should be checked before calling ``func``. The possibilities are: - If False, there is no input validation. - If True, then X will be converted to a 2-dimensional NumPy array or  sparse matrix. If the conversion is not possible an exception is  raised. .. versionchanged:: 0.22  The default of ``validate`` changed from True to False.",False
,"accept_sparse  accept_sparse: bool, default=False Indicate that func accepts a sparse matrix as input. If validate is False, this has no effect. Otherwise, if accept_sparse is false, sparse matrix inputs will cause an exception to be raised.",False
,"check_inverse  check_inverse: bool, default=True Whether to check that or ``func`` followed by ``inverse_func`` leads to the original inputs. It can be used for a sanity check, raising a warning when the condition is not fulfilled. .. versionadded:: 0.20",True
,"feature_names_out  feature_names_out: callable, 'one-to-one' or None, default=None Determines the list of feature names that will be returned by the `get_feature_names_out` method. If it is 'one-to-one', then the output feature names will be equal to the input feature names. If it is a callable, then it must take two positional arguments: this `FunctionTransformer` (`self`) and an array-like of input feature names (`input_features`). It must return an array-like of output feature names. The `get_feature_names_out` method is only defined if `feature_names_out` is not None. See ``get_feature_names_out`` for more details. .. versionadded:: 1.1",
,"kw_args  kw_args: dict, default=None Dictionary of additional keyword arguments to pass to func. .. versionadded:: 0.18",
,"inv_kw_args  inv_kw_args: dict, default=None Dictionary of additional keyword arguments to pass to inverse_func. .. versionadded:: 0.18",

0,1,2
,"transformers  transformers: list of tuples List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. name : str  Like in Pipeline and FeatureUnion, this allows the transformer and  its parameters to be set using ``set_params`` and searched in grid  search. transformer : {'drop', 'passthrough'} or estimator  Estimator must support :term:`fit` and :term:`transform`.  Special-cased strings 'drop' and 'passthrough' are accepted as  well, to indicate to drop the columns or to pass them through  untransformed, respectively. columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable  Indexes the data on its second axis. Integers are interpreted as  positional columns, while strings can reference DataFrame columns  by name. A scalar string or int should be used where  ``transformer`` expects X to be a 1d array-like (vector),  otherwise a 2d array will be passed to the transformer.  A callable is passed the input data `X` and can return any of the  above. To select multiple columns by name or dtype, you can use  :obj:`make_column_selector`.","[('num', ...), ('cat', ...)]"
,"remainder  remainder: {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). By specifying ``remainder='passthrough'``, all remaining columns that were not specified in `transformers`, but present in the data passed to `fit` will be automatically passed through. This subset of columns is concatenated with the output of the transformers. For dataframes, extra columns not seen during `fit` will be excluded from the output of `transform`. By setting ``remainder`` to be an estimator, the remaining non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order.",'drop'
,"sparse_threshold  sparse_threshold: float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.",0.3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, str or Callable[[str, str], str], default=True - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix  all feature names with the name of the transformer that generated that  feature. It is equivalent to setting  `verbose_feature_names_out=""{transformer_name}__{feature_name}""`. - If False, :meth:`ColumnTransformer.get_feature_names_out` will not  prefix any feature names and will error if feature names are not  unique. - If ``Callable[[str, str], str]``,  :meth:`ColumnTransformer.get_feature_names_out` will rename all the features  using the name of the transformer. The first argument of the callable is the  transformer name and the second argument is the feature name. The returned  string will be the new feature name. - If ``str``, it must be a string ready for formatting. The given string will  be formatted using two field names: ``transformer_name`` and ``feature_name``.  e.g. ``""{feature_name}__{transformer_name}""``. See :meth:`str.format` method  from the standard library for more info. .. versionadded:: 1.0 .. versionchanged:: 1.6  `verbose_feature_names_out` can be a callable or a string to be formatted.",True
,"force_int_remainder_cols  force_int_remainder_cols: bool, default=False This parameter has no effect. .. note::  If you do not access the list of columns for the remainder columns  in the `transformers_` fitted attribute, you do not need to set  this parameter. .. versionadded:: 1.5 .. versionchanged:: 1.7  The default value for `force_int_remainder_cols` will change from  `True` to `False` in version 1.7. .. deprecated:: 1.7  `force_int_remainder_cols` is deprecated and will be removed in 1.9.",'deprecated'

0,1,2
,"missing_values  missing_values: int, float, str, np.nan, None or pandas.NA, default=np.nan The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` can be set to either `np.nan` or `pd.NA`.",
,"strategy  strategy: str or Callable, default='mean' The imputation strategy. - If ""mean"", then replace missing values using the mean along  each column. Can only be used with numeric data. - If ""median"", then replace missing values using the median along  each column. Can only be used with numeric data. - If ""most_frequent"", then replace missing using the most frequent  value along each column. Can be used with strings or numeric data.  If there is more than one such value, only the smallest is returned. - If ""constant"", then replace missing values with fill_value. Can be  used with strings or numeric data. - If an instance of Callable, then replace missing values using the  scalar statistic returned by running the callable over a dense 1d  array containing non-missing values of each column. .. versionadded:: 0.20  strategy=""constant"" for fixed value imputation. .. versionadded:: 1.5  strategy=callable for custom value imputation.",'mean'
,"fill_value  fill_value: str or numerical value, default=None When strategy == ""constant"", `fill_value` is used to replace all occurrences of missing_values. For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when imputing numerical data and ""missing_value"" for strings or object data types.",
,"copy  copy: bool, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: - If `X` is not an array of floating values; - If `X` is encoded as a CSR matrix; - If `add_indicator=True`.",True
,"add_indicator  add_indicator: bool, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time.",False
,"keep_empty_features  keep_empty_features: bool, default=False If True, features that consist exclusively of missing values when `fit` is called are returned in results when `transform` is called. The imputed value is always `0` except when `strategy=""constant""` in which case `fill_value` will be used instead. .. versionadded:: 1.2",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"missing_values  missing_values: int, float, str, np.nan, None or pandas.NA, default=np.nan The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` can be set to either `np.nan` or `pd.NA`.",
,"strategy  strategy: str or Callable, default='mean' The imputation strategy. - If ""mean"", then replace missing values using the mean along  each column. Can only be used with numeric data. - If ""median"", then replace missing values using the median along  each column. Can only be used with numeric data. - If ""most_frequent"", then replace missing using the most frequent  value along each column. Can be used with strings or numeric data.  If there is more than one such value, only the smallest is returned. - If ""constant"", then replace missing values with fill_value. Can be  used with strings or numeric data. - If an instance of Callable, then replace missing values using the  scalar statistic returned by running the callable over a dense 1d  array containing non-missing values of each column. .. versionadded:: 0.20  strategy=""constant"" for fixed value imputation. .. versionadded:: 1.5  strategy=callable for custom value imputation.",'most_frequent'
,"fill_value  fill_value: str or numerical value, default=None When strategy == ""constant"", `fill_value` is used to replace all occurrences of missing_values. For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when imputing numerical data and ""missing_value"" for strings or object data types.",
,"copy  copy: bool, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: - If `X` is not an array of floating values; - If `X` is encoded as a CSR matrix; - If `add_indicator=True`.",True
,"add_indicator  add_indicator: bool, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time.",False
,"keep_empty_features  keep_empty_features: bool, default=False If True, features that consist exclusively of missing values when `fit` is called are returned in results when `transform` is called. The imputed value is always `0` except when `strategy=""constant""` in which case `fill_value` will be used instead. .. versionadded:: 1.2",False

0,1,2
,"categories  categories: 'auto' or a list of array-like, default='auto' Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith  column. The passed categories should not mix strings and numeric  values within a single feature, and should be sorted in case of  numeric values. The used categories can be found in the ``categories_`` attribute. .. versionadded:: 0.20",'auto'
,"drop  drop: {'first', 'if_binary'} or an array-like of shape (n_features,), default=None Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into an unregularized linear regression model. However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models. - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one  category is present, the feature will be dropped entirely. - 'if_binary' : drop the first category in each feature with two  categories. Features with 1 or more than 2 categories are  left intact. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that  should be dropped. When `max_categories` or `min_frequency` is configured to group infrequent categories, the dropping behavior is handled after the grouping. .. versionadded:: 0.21  The parameter `drop` was added in 0.21. .. versionchanged:: 0.23  The option `drop='if_binary'` was added in 0.23. .. versionchanged:: 1.1  Support for dropping infrequent categories.",
,"sparse_output  sparse_output: bool, default=True When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, i.e. a sparse matrix in ""Compressed Sparse Row"" (CSR) format. .. versionadded:: 1.2  `sparse` was renamed to `sparse_output`",True
,"dtype  dtype: number type, default=np.float64 Desired dtype of output.",<class 'numpy.float64'>
,"handle_unknown  handle_unknown: {'error', 'ignore', 'infrequent_if_exist', 'warn'}, default='error' Specifies the way unknown categories are handled during :meth:`transform`. - 'error' : Raise an error if an unknown category is present during transform. - 'ignore' : When an unknown category is encountered during  transform, the resulting one-hot encoded columns for this feature  will be all zeros. In the inverse transform, an unknown category  will be denoted as None. - 'infrequent_if_exist' : When an unknown category is encountered  during transform, the resulting one-hot encoded columns for this  feature will map to the infrequent category if it exists. The  infrequent category will be mapped to the last position in the  encoding. During inverse transform, an unknown category will be  mapped to the category denoted `'infrequent'` if it exists. If the  `'infrequent'` category does not exist, then :meth:`transform` and  :meth:`inverse_transform` will handle an unknown category as with  `handle_unknown='ignore'`. Infrequent categories exist based on  `min_frequency` and `max_categories`. Read more in the  :ref:`User Guide `. - 'warn' : When an unknown category is encountered during transform  a warning is issued, and the encoding then proceeds as described for  `handle_unknown=""infrequent_if_exist""`. .. versionchanged:: 1.1  `'infrequent_if_exist'` was added to automatically handle unknown  categories and infrequent categories. .. versionadded:: 1.6  The option `""warn""` was added in 1.6.",'ignore'
,"min_frequency  min_frequency: int or float, default=None Specifies the minimum frequency below which a category will be considered infrequent. - If `int`, categories with a smaller cardinality will be considered  infrequent. - If `float`, categories with a smaller cardinality than  `min_frequency * n_samples` will be considered infrequent. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"max_categories  max_categories: int, default=None Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the infrequent categories along with the frequent categories. If `None`, there is no limit to the number of output features. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"feature_name_combiner  feature_name_combiner: ""concat"" or callable, default=""concat"" Callable with signature `def callable(input_feature, category)` that returns a string. This is used to create feature names to be returned by :meth:`get_feature_names_out`. `""concat""` concatenates encoded feature name and category with `feature + ""_"" + str(category)`.E.g. feature X with values 1, 6, 7 create feature names `X_1, X_6, X_7`. .. versionadded:: 1.3",'concat'


In [1248]:
# Fit the preprocessing pipeline on the training set and transform it in one
# step, producing the feature matrix used to train the linear regression model.
Xt_train = pipeline_regression.fit_transform(X_train)

In [1249]:
# Recover the post-preprocessing feature names so the transformed data can be
# wrapped in a labelled DataFrame, which makes the model easier to interpret.
feature_names = pipeline_regression.named_steps["preprocessor"].get_feature_names_out()

# The ColumnTransformer may return a sparse matrix (the one-hot output is
# sparse by default); densify it before building the DataFrame.
if hasattr(Xt_train, "toarray"):
    Xt_train_dense = Xt_train.toarray()
else:
    Xt_train_dense = Xt_train

# Reuse X_train's index so rows stay aligned with the original samples.
Xt_train_df = pd.DataFrame(Xt_train_dense, columns=feature_names, index=X_train.index)

### Modelo

In [1250]:
# Define the model: ordinary least squares linear regression with
# scikit-learn's default settings.

Modelo = LinearRegression()

In [1251]:
# Train the model on the preprocessed training features and their targets.
# fit() returns the estimator itself, so the cell displays the fitted model.
Modelo.fit(X=Xt_train_df, y=y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [1252]:
# Model predictions on the training set (the same preprocessed features the
# model was fitted on).
y_train_pred = Modelo.predict(Xt_train_df)

# Evaluación cuantitativa

# Evaluación cualitativa

# Conclusiones

# Uso del modelo - Dataset de prueba

In [1253]:
# Leer el dataset de prueba
# test_df = pd.read_csv( TEST_FILE )
# test_df.head()