# Integrantes
- Daniel Diab
- Laura Martinez

# 1. Preparación de Datos
## 1.1. Importar Librerías

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

## 1.2. Cargar Datos

In [33]:
# Read the raw CSV (comma-separated, first row is the header). `df` keeps the
# data exactly as loaded; the cleaning steps operate on the copy `datos`.
df = pd.read_csv("./data.csv", header=0, sep=",")
datos = df.copy()

## 1.3. Corregir Fechas y Eliminar Duplicados

In [34]:
import pandas as pd

def corregir_fechas(df, col="Date of Service"):
    """Return a copy of `df` with `col` parsed to midnight-normalized datetimes.

    Values are cast to string and stripped; empty strings and the literal
    tokens "nan", "NaN", "null" and "None" are treated as missing. Parsing is
    day-first, and anything that cannot be parsed becomes NaT.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the date column to fix.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` whose `col` holds datetimes with the time part zeroed.

    Raises
    ------
    KeyError
        If `col` is not a column of `df`.
    """
    df = df.copy()

    s = df[col].astype("string").str.strip()
    s = s.replace({"": pd.NA, "nan": pd.NA, "NaN": pd.NA, "null": pd.NA, "None": pd.NA})

    # Modern pandas: format="mixed" infers the format of each value separately.
    # A pandas that does not know "mixed" does not raise TypeError for a string
    # format; it may raise ValueError or, with errors="coerce", silently return
    # NaT for every row. Both outcomes are treated as "mixed is unsupported".
    try:
        dt = pd.to_datetime(s, errors="coerce", format="mixed", dayfirst=True)
        mixed_ok = not (dt.isna().all() and s.notna().any())
    except (TypeError, ValueError):
        mixed_ok = False

    if not mixed_ok:
        # Two-pass fallback: day-first parse, then retry the leftovers
        # month-first so values in the other convention are still recovered.
        dt = pd.to_datetime(s, errors="coerce", dayfirst=True)
        m = dt.isna() & s.notna()
        if m.any():
            dt.loc[m] = pd.to_datetime(s[m], errors="coerce", dayfirst=False)

    df[col] = dt.dt.normalize()  # keep the date only, drop the time of day
    return df

# Normalize the service-date column, then report how many values ended up missing.
datos = corregir_fechas(datos, col="Date of Service")
fechas_invalidas = datos["Date of Service"].isna().sum()
print("Fechas inválidas:", fechas_invalidas)

Fechas inválidas: 0


In [35]:
# Count rows that repeat an earlier row in every column.
n_duplicados = datos.duplicated().sum()
print("Duplicados exactos:", n_duplicados)

Duplicados exactos: 151


In [36]:
# Keep the first occurrence of each fully identical row; the trailing
# expression displays the resulting (rows, columns) shape.
datos = datos.drop_duplicates(keep="first")
datos.shape

(1488, 24)

In [37]:
# A (patient, service date) pair that still appears on more than one row is
# treated as conflicting, and every row carrying such a pair is discarded.
cols_llave = ["Patient ID", "Date of Service"]

es_conflicto = datos.duplicated(subset=cols_llave, keep=False)
conflict_keys = datos.loc[es_conflicto, cols_llave].drop_duplicates()

print("Llaves conflictivas:", len(conflict_keys))

# Anti-join: mark each row by whether its key is in `conflict_keys`
# and keep only the rows that found no match.
marcados = datos.merge(conflict_keys, on=cols_llave, how="left", indicator=True)
sin_conflicto = marcados["_merge"] == "left_only"
datos = marcados[sin_conflicto].drop(columns="_merge")

print("Filas restantes:", len(datos))

Llaves conflictivas: 112
Filas restantes: 1264


## 1.4. Separar Dataset

In [38]:
from sklearn.model_selection import train_test_split

# "CVD Risk Score" is the target; both the score and the level columns are
# removed from the feature matrix.
columnas_objetivo = ["CVD Risk Score", "CVD Risk Level"]
y = datos["CVD Risk Score"]
X = datos.drop(columns=columnas_objetivo, errors="ignore")

# Hold out 25% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

## 1.5. Pipeline

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

In [None]:
# Columns removed by `drop_columns` before any further processing.
cols_to_drop = ["Patient ID", "Date of Service", "CVD Risk Level"]


def drop_columns(df):
    """Return a new frame without the columns listed in `cols_to_drop`.

    Listed columns that are absent from `df` are skipped silently.
    """
    presentes = [nombre for nombre in cols_to_drop if nombre in df.columns]
    return df.drop(columns=presentes)

# Wrap `drop_columns` as a scikit-learn transformer.
dropper = FunctionTransformer(func=drop_columns)

In [None]:
# =========================
# 0) Helpers opcionales
# =========================
def _to_num(s):
    """Convert `s` to numeric with pandas; values that cannot be parsed become NaN."""
    convertido = pd.to_numeric(s, errors="coerce")
    return convertido


# =========================
# 1) BP: completar Systolic/Diastolic desde "Blood Pressure (mmHg)"
# =========================
def completar_bp_desde_texto(df,
                             col_bp_texto="Blood Pressure (mmHg)",
                             col_sis="Systolic BP",
                             col_dia="Diastolic BP"):
    """Fill missing systolic/diastolic values from a "<number>/<number>" text column.

    Works on a copy of `df`. If `col_bp_texto` is absent the copy is returned
    unchanged. Only cells that are missing in `col_sis` / `col_dia` are filled;
    existing values are never overwritten. A target column that does not
    exist is skipped.
    """
    resultado = df.copy()
    if col_bp_texto not in resultado.columns:
        return resultado

    texto = resultado[col_bp_texto].astype("string").str.strip()
    # Group 0 is the number before the slash, group 1 the number after it;
    # text that does not match the pattern yields missing values in both.
    partes = texto.str.extract(r"^\s*(\d+)\s*/\s*(\d+)\s*$")
    pares = (
        (col_sis, _to_num(partes[0])),
        (col_dia, _to_num(partes[1])),
    )

    for destino, parseado in pares:
        if destino not in resultado.columns:
            continue
        por_llenar = resultado[destino].isna() & parseado.notna()
        resultado.loc[por_llenar, destino] = parseado.loc[por_llenar]

    return resultado

# Wrap `completar_bp_desde_texto` as a scikit-learn transformer.
tr_completar_bp = FunctionTransformer(func=completar_bp_desde_texto)


# =========================
# 2) BP: si Diastolic > Systolic, swap
# =========================
def corregir_swap_bp(df, col_sis="Systolic BP", col_dia="Diastolic BP"):
    """Swap the two pressure values on rows where diastolic exceeds systolic.

    Works on a copy of `df`. If either column is absent the copy is returned
    unchanged. Rows where either value is missing or non-numeric are left
    untouched.
    """
    salida = df.copy()
    columnas = salida.columns
    if not (col_sis in columnas and col_dia in columnas):
        return salida

    sistolica = _to_num(salida[col_sis])
    diastolica = _to_num(salida[col_dia])

    ambas_presentes = sistolica.notna() & diastolica.notna()
    invertidas = ambas_presentes & (diastolica > sistolica)
    if not invertidas.any():
        return salida

    # Capture both replacement arrays before writing so the swap never reads
    # a value it has already overwritten.
    nueva_sis = diastolica.loc[invertidas].values
    nueva_dia = sistolica.loc[invertidas].values
    salida.loc[invertidas, col_sis] = nueva_sis
    salida.loc[invertidas, col_dia] = nueva_dia

    return salida

# Wrap `corregir_swap_bp` as a scikit-learn transformer.
tr_swap_bp = FunctionTransformer(func=corregir_swap_bp)


# =========================
# 3) Altura: completar Height(m) <-> Height(cm)
# =========================
def completar_altura_m_cm(df, col_h_m="Height (m)", col_h_cm="Height (cm)"):
    """Fill whichever height column is missing from the other one.

    Works on a copy of `df`. If either column is absent the copy is returned
    unchanged. Where only `col_h_m` is numeric, `col_h_cm` is set to it times
    100. Where only `col_h_cm` is numeric, `col_h_m` is set to it divided by
    100.
    """
    salida = df.copy()
    columnas = salida.columns
    if not (col_h_m in columnas and col_h_cm in columnas):
        return salida

    metros = _to_num(salida[col_h_m])
    centimetros = _to_num(salida[col_h_cm])

    # Both masks come from the values as they were on entry, so the two fills
    # cannot influence each other.
    solo_metros = metros.notna() & centimetros.isna()
    solo_centimetros = metros.isna() & centimetros.notna()

    salida.loc[solo_metros, col_h_cm] = metros.loc[solo_metros].mul(100)
    salida.loc[solo_centimetros, col_h_m] = centimetros.loc[solo_centimetros].div(100)

    return salida

# Wrap `completar_altura_m_cm` as a scikit-learn transformer.
tr_altura_completar = FunctionTransformer(func=completar_altura_m_cm)


# =========================
# 4) Altura: si ambas existen y difieren, manda Height(m) y recalcula cm
# (política: Height (m) es la fuente de verdad; Height (cm) se recalcula a partir de ella)
# =========================
def resolver_inconsistencia_altura(df, col_h_m="Height (m)", col_h_cm="Height (cm)", tol_cm=1.0):
    """Overwrite `col_h_cm` with `col_h_m` * 100 where the two heights disagree.

    Works on a copy of `df`. If either column is absent the copy is returned
    unchanged. A row is inconsistent when both values are numeric and differ
    by more than `tol_cm` once `col_h_m` is multiplied by 100. `col_h_m` is
    never modified.
    """
    salida = df.copy()
    columnas = salida.columns
    if not (col_h_m in columnas and col_h_cm in columnas):
        return salida

    metros = _to_num(salida[col_h_m])
    centimetros = _to_num(salida[col_h_cm])

    cm_esperados = metros * 100
    desvio = (centimetros - cm_esperados).abs()
    inconsistentes = metros.notna() & centimetros.notna() & (desvio > tol_cm)

    if inconsistentes.any():
        salida.loc[inconsistentes, col_h_cm] = cm_esperados.loc[inconsistentes]

    return salida

# Wrap `resolver_inconsistencia_altura` as a scikit-learn transformer.
tr_altura_resolver = FunctionTransformer(func=resolver_inconsistencia_altura)


# =========================
# 5) Derivada: recalcular BMI = Weight / Height(m)^2
# =========================
def recalcular_bmi(df, col_peso="Weight", col_h_m="Height (m)", col_bmi="BMI"):
    """Recompute `col_bmi` as `col_peso` divided by `col_h_m` squared.

    Works on a copy of `df`. If any of the three columns is absent the copy is
    returned unchanged. Only rows where both inputs are numeric and the height
    is positive are recomputed; the remaining rows keep their value.
    """
    salida = df.copy()
    requeridas = (col_peso, col_h_m, col_bmi)
    if any(nombre not in salida.columns for nombre in requeridas):
        return salida

    peso = _to_num(salida[col_peso])
    altura_m = _to_num(salida[col_h_m])

    # The height must be strictly positive to avoid dividing by zero.
    calculables = peso.notna() & altura_m.notna() & (altura_m > 0)
    altura_valida = altura_m.loc[calculables]
    salida.loc[calculables, col_bmi] = peso.loc[calculables] / altura_valida.pow(2)

    return salida

# Wrap `recalcular_bmi` as a scikit-learn transformer.
tr_bmi = FunctionTransformer(func=recalcular_bmi)


# =========================
# 6) Derivada: Waist-to-Height Ratio = AbdominalCircumference / Height(cm)
# =========================
def recalcular_waist_height_ratio(df,
                                 col_cintura="Abdominal Circumference (cm)",
                                 col_h_cm="Height (cm)",
                                 col_ratio="Waist-to-Height Ratio"):
    """Recompute `col_ratio` as `col_cintura` divided by `col_h_cm`.

    Works on a copy of `df`. If any of the three columns is absent the copy is
    returned unchanged. Only rows where both inputs are numeric and the height
    is positive are recomputed; the remaining rows keep their value.
    """
    salida = df.copy()
    requeridas = (col_cintura, col_h_cm, col_ratio)
    if any(nombre not in salida.columns for nombre in requeridas):
        return salida

    circunferencia = _to_num(salida[col_cintura])
    altura_cm = _to_num(salida[col_h_cm])

    # The height must be strictly positive to avoid dividing by zero.
    calculables = circunferencia.notna() & altura_cm.notna() & (altura_cm > 0)
    salida.loc[calculables, col_ratio] = circunferencia.loc[calculables].div(
        altura_cm.loc[calculables]
    )

    return salida

# Wrap `recalcular_waist_height_ratio` as a scikit-learn transformer.
tr_ratio = FunctionTransformer(func=recalcular_waist_height_ratio)

In [None]:
# Split the training columns by dtype into numeric and categorical name lists.
# NOTE(review): the lists are built from X_train as-is, so they may include
# columns named in `cols_to_drop` — confirm against how the pipeline uses them.
tipos_numericos = ["int64", "float64"]
tipos_categoricos = ["object", "category"]

numeric_features = list(X_train.select_dtypes(include=tipos_numericos).columns)
categorical_features = list(X_train.select_dtypes(include=tipos_categoricos).columns)

