<a href="https://colab.research.google.com/github/Iostream10081/ML_pipeline/blob/main/machine_learning_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# --- Load the CSV (adjust the path if needed) ---
#############################################
#############################################

df = pd.read_csv("datos_preproc_practica.csv")

#############################################
#############################################

# Last expression of the cell: shows the column labels of the loaded frame.
df.columns

Index(['edad', 'ingreso_mensual', 'talla_cm', 'peso_kg', 'temperatura_c',
       'frecuencia_cardiaca', 'ciudad', 'tipo_servicio', 'nivel',
       'codigo_serie', 'referencia', 'sensor_defectuoso'],
      dtype='object')

In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,edad,ingreso_mensual,talla_cm,peso_kg,temperatura_c,frecuencia_cardiaca,ciudad,tipo_servicio,nivel,codigo_serie,referencia,sensor_defectuoso
0,39.0,,174.287097,78.317375,,76.552698,Puebla,basico,alto,Z,REF-964053,
1,,16048.0,168.917333,73.530586,36.332796,,Puebla,estandar,medio,Y,REF-147729,
2,44.0,41790.0,161.139089,94.093905,36.117298,96.188603,,basico,alto,X,REF-867825,
3,,13803.0,160.267146,71.686776,36.251983,89.602824,Puebla,estandar,bajo,,REF-406327,
4,15.0,,,50.069666,,64.701549,Guadalajara,estandar,alto,X,REF-159441,


In [None]:
# Split the full frame into train/test partitions (test_size=0.25);
# random_state is fixed so the split is reproducible.
X_train, X_test = train_test_split(df, test_size=0.25, random_state=0)

In [None]:
###########################
########################### COLUMN DIAGNOSTICS
###########################


# 1) Collect the names of the numeric columns of the training split
num_cols = list(X_train.select_dtypes(include="number").columns)
print("Columnas numéricas:", num_cols)

# 2) Funciones auxiliares
def iqr_outlier_stats(s: pd.Series):
    """Summarise the outliers of a series using the 1.5 * IQR fence rule.

    Values are coerced to numeric and missing entries are discarded first.

    Returns a tuple ``(n_out, prop_out, q1, q3, lim_inf, lim_sup)``: the count
    and fraction of values strictly outside the fences, the first and third
    quartiles, and the lower/upper fences. When no numeric value remains the
    result is ``(0, 0.0, nan, nan, nan, nan)``.
    """
    clean = pd.to_numeric(s, errors="coerce").dropna()
    if clean.empty:
        return 0, 0.0, np.nan, np.nan, np.nan, np.nan

    q1 = clean.quantile(0.25)
    q3 = clean.quantile(0.75)
    spread = q3 - q1
    lim_inf = q1 - 1.5 * spread
    lim_sup = q3 + 1.5 * spread

    # A value is an outlier only when it lies strictly beyond a fence.
    outside = clean.lt(lim_inf) | clean.gt(lim_sup)
    n_out = int(outside.sum())
    return n_out, n_out / len(clean), q1, q3, lim_inf, lim_sup

def bounded_guess(s: pd.Series):
    """Guess whether the values look confined to [0,1] or [0,100].

    Non-numeric entries are coerced to NaN and ignored. Returns the string
    "[0,1]" or "[0,100]" for the tightest matching range, or None when the
    series has no numeric values or falls outside both ranges.
    """
    numeric = pd.to_numeric(s, errors="coerce").dropna()
    if numeric.empty:
        return None

    lowest = float(numeric.min())
    highest = float(numeric.max())
    if lowest < 0.0:
        return None

    # Candidate upper bounds, tightest first.
    for upper, label in ((1.0, "[0,1]"), (100.0, "[0,100]")):
        if highest <= upper:
            return label
    return None

if not num_cols:
    print("No hay columnas numéricas en X_train. No se generan imágenes.")
else:
    # --------- Figure 1: HISTOGRAMS (one subplot per column) ----------
    # Grid with at most three subplots per row.
    total = len(num_cols)
    ncols = min(3, total)
    nrows = math.ceil(total / ncols)

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5*ncols, 3.8*nrows))
    axes = np.atleast_1d(axes).ravel()

    for ax, col in zip(axes, num_cols):
        data = pd.to_numeric(X_train[col], errors="coerce").dropna()
        if data.empty:
            # Nothing to draw: show a placeholder and hide the ticks.
            ax.text(0.5, 0.5, "Sin datos", ha="center", va="center")
            ax.set_xticks([])
            ax.set_yticks([])
        else:
            ax.hist(data, bins=30)
        ax.set_title(col)
        ax.set_xlabel(col)
        ax.set_ylabel("Frecuencia")

    # Switch off the grid cells that did not receive a column
    for unused_ax in axes[total:]:
        unused_ax.axis("off")

    fig.suptitle("Histogramas de columnas numéricas", y=1.02, fontsize=12)
    fig.tight_layout()
    fig.savefig("histogramas_numericas.png", dpi=150, bbox_inches="tight")
    plt.close(fig)

    # --------- Figure 2: BOXPLOTS (all columns on a single axis) ----------
    # Keep only the columns that still have values after dropping NaN.
    numeric_arrays = [
        (name, pd.to_numeric(X_train[name], errors="coerce").dropna().values)
        for name in num_cols
    ]
    series_pairs = [(name, arr) for name, arr in numeric_arrays if arr.size > 0]

    if not series_pairs:
        print("No hay datos numéricos válidos para boxplots. No se genera boxplot.")
        box_path = None
    else:
        labels, values = map(list, zip(*series_pairs))

        # Figure width grows with the number of boxes.
        fig2 = plt.figure(figsize=(1.6*len(labels)+4, 5))
        plt.boxplot(values, vert=True, showmeans=True)
        plt.xticks(ticks=range(1, len(labels)+1), labels=labels, rotation=35, ha="right")
        plt.ylabel("Valor")
        plt.title("Boxplots de columnas numéricas")
        plt.tight_layout()
        fig2.savefig("boxplots_numericas.png", dpi=150, bbox_inches="tight")
        plt.close(fig2)


# ================================
# Numeric diagnostics + heuristics
# ================================
# Report columns, declared up front so that the DataFrame built below still
# has them (and sorting/filtering keeps working) when num_cols is empty.
diag_columns = [
    "columna", "missing_%", "skew", "kurtosis", "outliers_n", "outliers_%",
    "q1", "q3", "IQR", "lim_inf", "lim_sup", "bounded",
    "recom_imputer", "recom_scaler",
]

diagnostico = []
for c in num_cols:
    s = pd.to_numeric(X_train[c], errors="coerce")
    miss_pct = s.isna().mean() * 100
    s_no_na = s.dropna()
    # Skewness and kurtosis are only computed with more than one observation.
    skew = s_no_na.skew() if s_no_na.size > 1 else np.nan
    kurt = s_no_na.kurt() if s_no_na.size > 1 else np.nan
    n_out, prop_out, q1, q3, li, ls = iqr_outlier_stats(s)
    bounds = bounded_guess(s)

    # Imputation heuristic: median when at least 5% of the values are IQR
    # outliers or the distribution is strongly skewed, mean otherwise.
    if (prop_out >= 0.05) or (pd.notna(skew) and abs(skew) >= 1.0):
        imputador = "median"
    else:
        imputador = "mean"

    # Scaling heuristic: robust scaling with many outliers, standard scaling
    # when roughly symmetric, min-max for everything else (including the case
    # where the skewness could not be computed).
    if prop_out >= 0.05:
        escalador = "RobustScaler"
    elif pd.notna(skew) and abs(skew) <= 0.5:
        escalador = "StandardScaler"
    else:
        escalador = "MinMaxScaler"  # default for non-normal or bounded cases

    diagnostico.append({
        "columna": c,
        "missing_%": round(miss_pct, 2),
        "skew": round(skew, 3) if pd.notna(skew) else np.nan,
        "kurtosis": round(kurt, 3) if pd.notna(kurt) else np.nan,
        "outliers_n": n_out,
        "outliers_%": round(prop_out*100, 2),
        "q1": q1, "q3": q3, "IQR": (q3 - q1),
        "lim_inf": li, "lim_sup": ls,
        "bounded": bounds,
        "recom_imputer": imputador,
        "recom_scaler": escalador,
    })

# Passing the columns explicitly avoids a KeyError in sort_values when the
# list of records is empty (no numeric columns in X_train).
diag_df = pd.DataFrame(diagnostico, columns=diag_columns).sort_values(
    ["outliers_%", "missing_%"], ascending=False
)
print("\n=== Diagnóstico numérico (heurística) ===\n")

##########################################################
##########################################################

print(diag_df)

##########################################################
##########################################################


def _suggested_columns(imputer, scaler):
    """Return the column names whose recommendation is (imputer, scaler)."""
    mask = (diag_df["recom_imputer"] == imputer) & (diag_df["recom_scaler"] == scaler)
    return diag_df.loc[mask, "columna"].tolist()


# Suggested numeric blocks, one list per imputer/scaler combination
suggest_mean_rob = _suggested_columns("mean", "RobustScaler")
suggest_med_rob  = _suggested_columns("median", "RobustScaler")
suggest_med_min  = _suggested_columns("median", "MinMaxScaler")
suggest_med_std  = _suggested_columns("median", "StandardScaler")
suggest_mean_min = _suggested_columns("mean", "MinMaxScaler")
suggest_mean_std = _suggested_columns("mean", "StandardScaler")

print("\nSugerencias de bloques numéricos (auto):")
print("median+Robust   :", suggest_med_rob)
print("median+MinMax   :", suggest_med_min)
print("median+Standard :", suggest_med_std)
print("mean+Robust     :", suggest_mean_rob)
print("mean+MinMax     :", suggest_mean_min)
print("mean+Standard   :", suggest_mean_std)

Columnas numéricas: ['edad', 'ingreso_mensual', 'talla_cm', 'peso_kg', 'temperatura_c', 'frecuencia_cardiaca', 'sensor_defectuoso']

=== Diagnóstico numérico (heurística) ===

               columna  missing_%   skew  kurtosis  outliers_n  outliers_%  \
1      ingreso_mensual      18.93  2.509     9.280          24        7.89   
0                 edad      23.20  2.009     6.932           9        3.12   
4        temperatura_c      15.73  3.031    18.198           8        2.53   
3              peso_kg      12.27  0.024    -0.169           2        0.61   
2             talla_cm       9.60 -0.121    -0.066           2        0.59   
6    sensor_defectuoso      90.67 -0.719     0.689           0        0.00   
5  frecuencia_cardiaca      26.13  0.122    -0.278           0        0.00   

             q1            q3           IQR      lim_inf       lim_sup  \
1  12193.250000  26335.750000  14142.500000 -9020.500000  47549.500000   
0     28.000000     42.000000     14.000000     7.0

In [None]:
# --- Column lists ---

# Numeric
# num_med_rob_cols: median imputation with robust scaling
# num_med_min_cols: median imputation with min-max scaling
# num_med_std_cols: median imputation with standard scaling
# num_mean_rob_cols: mean imputation with robust scaling
# num_mean_min_cols: mean imputation with min-max scaling
# num_mean_std_cols: mean imputation with standard scaling

# Categorical
# cat_ohe_cols: mode imputation with one-hot encoding
# cat_ord_cols: mode imputation with ordinal encoding

#########################################################
#########################################################

num_med_rob_cols = ["ingreso_mensual"]   # median + Robust
#num_med_min_cols = []   # median + MinMax
num_med_std_cols = ["edad","temperatura_c"]   # median + Standard
#num_mean_rob_cols = []   # mean + Robust
#num_mean_min_cols  = []    # mean + MinMax
num_mean_std_cols  = ["peso_kg", "talla_cm", "frecuencia_cardiaca"]    # mean + Standard

cat_ohe_cols = ["ciudad","codigo_serie"]                  # mode + OneHot
cat_ord_cols = ["tipo_servicio","nivel"]                  # mode + Ordinal

#########################################################
#########################################################

passthrough_cols = ["referencia"]                      # pass through unprocessed
drop_cols        = ["sensor_defectuoso"]                         # remove

###### order of the ordinal categories
# One inner list per entry of cat_ord_cols, in the same order
# (tipo_servicio first, then nivel), each listed from lowest to highest.
categorias_ordinales = [
    ["basico", "estandar", "premium"],
    ["bajo", "medio", "alto"]
]

#########################################################
#########################################################

In [None]:
# --- NUMERIC pipelines ---
# Each pipeline imputes missing values first and then scales the result.
# The combinations that are not currently used are kept below as commented-out
# templates.

# Median imputation + robust scaling
pipe_med_rob = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  RobustScaler())
])

# Median imputation + min-max scaling (disabled)
# pipe_med_min = Pipeline([
#     ("imputer", SimpleImputer(strategy="median")),
#     ("scaler",  MinMaxScaler())
# ])

# Median imputation + standard scaling.
# The imputer must use strategy="median": with "mean" this pipeline would be
# identical to pipe_mean_std and would contradict its name.
pipe_med_std = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])


# Mean imputation + robust scaling (disabled)
# pipe_mean_rob = Pipeline([
#     ("imputer", SimpleImputer(strategy="mean")),
#     ("scaler",  RobustScaler())
# ])

# Mean imputation + min-max scaling (disabled)
# pipe_mean_min = Pipeline([
#     ("imputer", SimpleImputer(strategy="mean")),
#     ("scaler",  MinMaxScaler())
# ])

# Mean imputation + standard scaling
pipe_mean_std = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler",  StandardScaler())
])

# --- CATEGORICAL pipelines ---
# Both pipelines fill missing values with the most frequent category (mode)
# before encoding.

# Mode imputation + dense one-hot encoding
pipe_cat_ohe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Mode imputation + ordinal encoding with the explicit category order;
# values outside that order are encoded as -1.
pipe_cat_ord = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(
                categories=categorias_ordinales,
                unknown_value=-1,
                handle_unknown="use_encoded_value",
            ),
        ),
    ]
)

In [None]:
# --- Unified ColumnTransformer ---
# Routes each column list to its pipeline; the order of the entries below is
# the order in which the transformers are declared.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_med_rob", pipe_med_rob, num_med_rob_cols),
        #("num_med_min", pipe_med_min, num_med_min_cols),
        ("num_med_std", pipe_med_std, num_med_std_cols),
        #("num_mean_rob",  pipe_mean_rob,  num_mean_rob_cols),
        #("num_mean_min",  pipe_mean_min,  num_mean_min_cols),
        ("num_mean_std",  pipe_mean_std,  num_mean_std_cols),

        # Categorical
        ("cat_ohe",      pipe_cat_ohe,  cat_ohe_cols),
        ("cat_ord",      pipe_cat_ord,  cat_ord_cols),

        # Passthrough (no preprocessing)
        ("passthrough",  "passthrough", passthrough_cols),

        # Explicit drop
        ("drop_high_na", "drop",        drop_cols),
    ],
    remainder="drop",                        # discard any other column that is not listed
    verbose_feature_names_out=False
)


In [None]:
# ---------- Fit and transform ----------
# The preprocessor is fitted on the training split only and then applied to
# both splits.
X_train_proc = preprocessor.fit(X_train).transform(X_train)
X_test_proc = preprocessor.transform(X_test)

print("Shape train ->", X_train_proc.shape)
print("Shape test  ->", X_test_proc.shape)

Shape train -> (375, 17)
Shape test  -> (125, 17)


In [None]:
# Rebuild DataFrames carrying the output feature names of the preprocessor
cols_out = preprocessor.get_feature_names_out()
X_train_proc_df, X_test_proc_df = (
    pd.DataFrame(matrix, columns=cols_out)
    for matrix in (X_train_proc, X_test_proc)
)
# Last expression of the cell: displays the processed test split.
X_test_proc_df

Unnamed: 0,ingreso_mensual,edad,temperatura_c,peso_kg,talla_cm,frecuencia_cardiaca,ciudad_CDMX,ciudad_Guadalajara,ciudad_Monterrey,ciudad_Puebla,ciudad_Toluca,codigo_serie_X,codigo_serie_Y,codigo_serie_Z,tipo_servicio,nivel,referencia
0,-1.094332,0.463013,-0.437344,0.301234,-0.640519,-2.285066,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,REF-745165
1,0.0,-0.342051,1.759915,-0.202711,-1.509423,-0.731724,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,REF-533077
2,1.692537,0.243451,0.228965,0.863335,0.0,-0.879164,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,REF-487548
3,-0.697307,0.0,0.475649,-0.671162,-1.311557,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,REF-667211
4,0.0,-0.854364,0.204949,0.414022,0.351798,0.336683,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,REF-203743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,0.0,0.0,0.08825,0.25731,0.016152,1.733744,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,REF-131286
121,-0.107412,0.0,0.312892,-1.355613,0.037559,-2.023112,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,REF-240107
122,0.0,0.0,0.202398,1.198491,-0.714856,0.966452,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,REF-787152
123,-0.454681,1.19489,-0.419879,0.302708,0.665719,0.791312,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,REF-550545


In [None]:
# Write both processed splits to CSV files, without the DataFrame index.
X_train_proc_df.to_csv("Entrenamiento_procesado.csv",index=False)
X_test_proc_df.to_csv("Prueba_procesado.csv",index=False)

In [None]:
# Scaled columns chosen by the user:

#############################################
#############################################

cols_escaladas_usuario = ["ingreso_mensual","edad","temperatura_c","peso_kg", "talla_cm", "frecuencia_cardiaca"]

#############################################
#############################################

# Keep only the requested columns that actually exist in the processed frame.
cols = [name for name in cols_escaladas_usuario if name in X_train_proc_df.columns]
print(f"Columnas escaladas encontradas en X_train_proc_df: {cols}")

if not cols:
    print("No hay columnas escaladas válidas en X_train_proc_df. No se generan imágenes.")
else:
    # ---------- Figure 1: HISTOGRAMS (one subplot per column) ----------
    # Grid with at most three subplots per row.
    total = len(cols)
    ncols = min(3, total)
    nrows = math.ceil(total / ncols)

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5*ncols, 3.8*nrows))
    axes = np.atleast_1d(axes).ravel()

    for ax, col in zip(axes, cols):
        data = pd.to_numeric(X_train_proc_df[col], errors="coerce").dropna()
        if data.empty:
            # Nothing to draw: show a placeholder and hide the ticks.
            ax.text(0.5, 0.5, "Sin datos", ha="center", va="center")
            ax.set_xticks([])
            ax.set_yticks([])
        else:
            ax.hist(data, bins=30)
        ax.set_title(col)
        ax.set_xlabel(col)
        ax.set_ylabel("Frecuencia")

    # Switch off the grid cells that did not receive a column
    for unused_ax in axes[total:]:
        unused_ax.axis("off")

    fig.suptitle("Histogramas (columnas escaladas)", y=1.02, fontsize=12)
    fig.tight_layout()
    hist_file = "scaled_histogramas.png"
    fig.savefig(hist_file, dpi=150, bbox_inches="tight")
    plt.close(fig)

    # ---------- Figure 2: BOXPLOTS (all columns on a single axis) ----------
    # Keep only the columns that still have values after dropping NaN.
    scaled_arrays = [
        (name, pd.to_numeric(X_train_proc_df[name], errors="coerce").dropna().values)
        for name in cols
    ]
    series_pairs = [(name, arr) for name, arr in scaled_arrays if arr.size > 0]

    if not series_pairs:
        print("No hay datos válidos para boxplots. No se genera boxplot.")
        box_file = None
    else:
        labels, values = map(list, zip(*series_pairs))

        # Figure width grows with the number of boxes.
        fig2 = plt.figure(figsize=(1.6*len(labels)+4, 5))
        plt.boxplot(values, vert=True, showmeans=True)
        plt.xticks(ticks=range(1, len(labels)+1), labels=labels, rotation=35, ha="right")
        plt.ylabel("Valor")
        plt.title("Boxplots (columnas escaladas)")
        plt.tight_layout()
        box_file = "scaled_boxplots.png"
        fig2.savefig(box_file, dpi=150, bbox_inches="tight")
        plt.close(fig2)

    # Report the files that were written; the boxplot may have been skipped.
    print("Imágenes guardadas:")
    print(" -", hist_file)
    if box_file:
        print(" -", box_file)

Columnas escaladas encontradas en X_train_proc_df: ['ingreso_mensual', 'edad', 'temperatura_c', 'peso_kg', 'talla_cm', 'frecuencia_cardiaca']
Imágenes guardadas:
 - scaled_histogramas.png
 - scaled_boxplots.png
