# Pipelines de operaciones

Agrupar múltiples operaciones en un mismo objeto:

* Imputar nulos
* Codificación de categóricos
* Escalado de datos
* Modelado

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor


In [2]:
# Load seaborn's 'penguins' example dataset, then show how many
# missing values each column contains.
df = sns.load_dataset('penguins')
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [3]:
# Fill the missing values of the output column (y), body_mass_g, with its median.
# NOTE(review): the median is fitted on every row of `df`, i.e. before any
# train/test split is made.
target_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df['body_mass_g'] = target_imputer.fit_transform(df[['body_mass_g']])

In [4]:
# Numeric features: fill gaps with the column median, then rescale with MinMaxScaler.
pipeline_numeric = Pipeline([
    ('impute_median', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode (dropping the first level of each feature).
# `sparse_output` is the current name of the former `sparse` argument, which was
# deprecated in scikit-learn 1.2 and later removed; this requires scikit-learn >= 1.2.
pipeline_categorical = Pipeline([
    ('impute_mode', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Split the frame into features (X) and target (y).
X = df.drop(columns='body_mass_g')
y = df['body_mass_g']  # expected to contain no nulls at this point

# Route each column to its pipeline by dtype. Every non-numeric column is treated
# as categorical, so columns stored with the `category` or `string` dtypes are
# picked up too (selecting only 'object' would silently drop them from the model).
numeric_col_names = X.select_dtypes(include=np.number).columns.to_list()
categorical_col_names = X.select_dtypes(exclude=np.number).columns.to_list()

# Assign the pipelines to their columns with a ColumnTransformer.
preprocessor = ColumnTransformer([
    ('numerical', pipeline_numeric, numeric_col_names),
    ('categorical', pipeline_categorical, categorical_col_names)
])

# Full pipeline: preprocessing followed by the estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_regression', LinearRegression())  # the algorithm can be swapped
    # ('knn', KNeighborsRegressor())
])

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Fit preprocessing and model together on the training portion only.
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Root mean squared error on the held-out rows. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and later removed; this form works on any version.
np.sqrt(mean_squared_error(y_test, y_pred))




318.79717144386433