## **Librerías**

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.options.display.max_columns = False

In [3]:
from sklearn import set_config

from sklearn.datasets import load_diabetes

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

In [4]:
set_config(transform_output='pandas')

## **Datos**

In [5]:
X, y = load_diabetes(scaled=False, return_X_y=True, as_frame=True)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13, test_size=0.3)

## **1. Pipeline**

Aplicar nuestra transformación al dataset completo

In [7]:
# Definimos y confuguramos nuestro Pipeline
pipe = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('rescale', MinMaxScaler())
])

In [8]:
# Ajustamos nuestro Pipeline
pipe.fit(X_train)

In [9]:
# Transformamos nuestra data
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

In [None]:
X_train_transformed.head()

Unnamed: 0,categorical_transformer__sex_2.0,numerical_transformer__pca__pca0,numerical_transformer__svd__truncatedsvd0,numerical_transformer__svd__truncatedsvd1,remainder__age
322,1.0,0.820853,1.720242,-0.631405,55.0
124,0.0,-0.206021,1.003551,0.203048,47.0
159,0.0,0.233434,1.36895,-0.007116,47.0
404,0.0,-0.081697,1.129361,0.189352,44.0
99,0.0,0.006219,1.092412,-0.034063,48.0


## **2. Column Transfomer**

Realizar cambios dependiendo del tipo de dato de nuestros features

In [10]:
X_train.sample()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
392,37.0,2.0,23.6,94.0,205.0,138.8,53.0,4.0,4.1897,107.0


In [11]:
# Variables categoricas
categorical_features = ['sex']

categorical_transformer = Pipeline(steps=[
    ('impute_mode', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(
        handle_unknown='ignore',
        sparse_output=False,
        drop='first'
    ))
])

In [12]:
# Variables numericas 
numerical_features = ['bp', 'bmi', 's1', 's2', 's3', 's4', 's5', 's6']

numerical_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('rescale', MinMaxScaler())
])

In [13]:
# Combinamos nuestros transformadores
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_features),
        ('numerical_transformer', numerical_transformer, numerical_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=True
)

preprocessor

In [14]:
# Ajustamos el procesador a nuestra data
preprocessor.fit(X_train)

In [15]:
# Aplicamos la transformación a nuestra data
X_train_transformed = preprocessor.transform(X_test)
X_train_transformed = preprocessor.transform(X_test)

In [None]:
X_train_transformed.head()

Unnamed: 0,categorical_transformer__sex_2.0,numerical_transformer__pca__pca0,numerical_transformer__svd__truncatedsvd0,numerical_transformer__svd__truncatedsvd1,remainder__age
322,1.0,0.820853,1.720242,-0.631405,55.0
124,0.0,-0.206021,1.003551,0.203048,47.0
159,0.0,0.233434,1.36895,-0.007116,47.0
404,0.0,-0.081697,1.129361,0.189352,44.0
99,0.0,0.006219,1.092412,-0.034063,48.0


## **3. Feature Union**

In [16]:
union = FeatureUnion(transformer_list=[
    ('pca', PCA(n_components=1)),
    ('svd', TruncatedSVD(n_components=2))
])

In [17]:
# Variables numericas 
numerical_features = ['bp', 'bmi', 's1', 's2', 's3', 's4', 's5', 's6']

numerical_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('rescale', MinMaxScaler()),
    ('reduce_dimensionality', union)
])

In [18]:
# Variables categoricas
categorical_features = ['sex']

categorical_transformer = Pipeline(steps=[
    ('impute_mode', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(
        handle_unknown='ignore',
        sparse_output=False,
        drop='first'
    ))
])

In [19]:
# Creamos nuestro transformador combinado
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_features),
        ('numerical_transformer', numerical_transformer, numerical_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=True
)

In [20]:
preprocessor

In [22]:
preprocessor.fit(X_train)

In [23]:
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [24]:
X_train_transformed.head()

Unnamed: 0,categorical_transformer__sex_2.0,numerical_transformer__pca__pca0,numerical_transformer__svd__truncatedsvd0,numerical_transformer__svd__truncatedsvd1,remainder__age
322,1.0,0.820853,1.720242,-0.631405,55.0
124,0.0,-0.206021,1.003551,0.203048,47.0
159,0.0,0.233434,1.36895,-0.007116,47.0
404,0.0,-0.081697,1.129361,0.189352,44.0
99,0.0,0.006219,1.092412,-0.034063,48.0


## **4. Function Transformer**

Agregar nuestras funciones propias al Pipeline

In [54]:
def custom_function(x):

    new_variable = (x - np.mean(x, axis=0)) / np.std(x, axis=0)

    return X

In [56]:
feature_engineering = Pipeline(steps=[
    ('custom_transform', FunctionTransformer(func=custom_function))
])

feature_engineering

In [57]:
pipe = Pipeline([
    # ('preprocessing', preprocessor),
    ('feature_engineering', FunctionTransformer(feature_engineering))
])

pipe

In [58]:
pipe.fit(X_train)

InvalidParameterError: The 'func' parameter of FunctionTransformer must be a callable or None. Got Pipeline(steps=[('custom_transform',
                 FunctionTransformer(func=<function custom_function at 0x00000292DAF4A480>))]) instead.

In [59]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_diabetes

# Cargar los datos
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Definir la función de transformación personalizada
def custom_transform(x):
    return (x - np.mean(x, axis=0)) / np.std(x, axis=0)

# Crear el FunctionTransformer
custom_transformer = FunctionTransformer(func=custom_transform)

# Crear el pipeline
pipeline = Pipeline([
    ('custom_transform', custom_transformer)
])

# Aplicar la transformación
X_transformed = pipeline.fit_transform(X)

# Mostrar los resultados
print("Datos originales:\n", X.head())
print("Datos transformados:\n", pd.DataFrame(X_transformed, columns=X.columns).head())


Datos originales:
         age       sex       bmi  ...        s4        s5        s6
0  0.038076  0.050680  0.061696  ... -0.002592  0.019907 -0.017646
1 -0.001882 -0.044642 -0.051474  ... -0.039493 -0.068332 -0.092204
2  0.085299  0.050680  0.044451  ... -0.002592  0.002861 -0.025930
3 -0.089063 -0.044642 -0.011595  ...  0.034309  0.022688 -0.009362
4  0.005383 -0.044642 -0.036385  ... -0.002592 -0.031988 -0.046641

[5 rows x 10 columns]
Datos transformados:
         age       sex       bmi  ...        s4        s5        s6
0  0.800500  1.065488  1.297088  ... -0.054499  0.418531 -0.370989
1 -0.039567 -0.938537 -1.082180  ... -0.830301 -1.436589 -1.938479
2  1.793307  1.065488  0.934533  ... -0.054499  0.060156 -0.545154
3 -1.872441 -0.938537 -0.243771  ...  0.721302  0.476983 -0.196823
4  0.113172 -0.938537 -0.764944  ... -0.054499 -0.672502 -0.980568

[5 rows x 10 columns]
