Приведённый ниже код относится к книге **Орельена Жерона "Прикладное машинное обучение с помощью Scikit-Learn и TensorFlow"**
Код воспроизведен с целью обучения и создания шпаргалки по pipeline.

In [1]:
import pandas as pd
import numpy as np

Данные по недвижимости в Калифорнии

In [2]:
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset bundle provided by scikit-learn.
data = fetch_california_housing()
# Wrap the feature matrix in a DataFrame so columns can be addressed by name.
X = pd.DataFrame(data.data, columns = data.feature_names)
# Target values taken from the same bundle.
y = data.target
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


**Трансформатор из датафрейма в numpy для использования в pipeline**

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Slice columns out of a pandas DataFrame and return them as a NumPy array.

    A transformer must provide both ``fit`` and ``transform`` methods to be
    usable as a pipeline step.
    """

    def __init__(self, attribute_names):
        # Column labels to slice out of the incoming DataFrame.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Do nothing; exists only for uniformity with the sklearn API."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as a NumPy array."""
        selected = X[self.attribute_names]
        return selected.values

**Трансформатор для добавления новых атрибутов**

In [4]:
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): with the column order shown for this dataset (MedInc,
# HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude)
# positions 3, 4, 5, 6 land on AveBedrms, Population, AveOccup and Latitude,
# which does not match the index names below — confirm the intended columns.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from fixed column positions of a 2-D array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third ratio (``bedrooms_ix`` column divided by the
        ``rooms_ix`` column) is appended after the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        extra_columns = [
            X[:, rooms_ix] / X[:, household_ix],       # rooms per household
            X[:, population_ix] / X[:, household_ix],  # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # Indexing np.c_ with a tuple is the same as listing the items inline.
        return np.c_[(X, *extra_columns)]

**Pipeline для численных значений**

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# All feature names reported by the dataset bundle are handled as numeric.
num_attribs = list(data.feature_names)

# Numeric preprocessing, applied in order:
#   selector      - pick the numeric columns and convert them to an ndarray
#   imputer       - replace missing values with the per-column median
#   attribs_adder - append the derived ratio columns
#   std_scaler    - standardize every column
numeric_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

**Pipeline для категорий**

In [6]:
# Name of the categorical column consumed by the categorical pipeline.
cat_attribs = ["ocean_proximity"]
categories = ['<1H ocean', 'island', 'near ocean', 'inland']
# Add a synthetic categorical column drawn uniformly from `categories`, one
# value per row. A seeded generator is used so that re-running the notebook
# produces the same column (the unseeded global RNG gave different data on
# every run).
X["ocean_proximity"] = np.random.default_rng(42).choice(categories, size=X.shape[0])

In [7]:
from sklearn.preprocessing import LabelBinarizer

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """Expose ``LabelBinarizer`` through the ``fit(X, y=None)`` /
    ``transform(X, y=None)`` signature used by the other pipeline steps.

    The encoder is fitted once in ``fit`` and reused in ``transform``, so the
    set of classes (and therefore the output column layout) learned on the
    training data is also applied to any data transformed later.

    Parameters
    ----------
    sparse_output : bool, default False
        Forwarded to ``LabelBinarizer(sparse_output=...)``.
    """

    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Learn the label classes from ``X`` and return ``self``."""
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output)
        self.encoder_.fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` using the classes learned in ``fit``."""
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Backward compatibility: the previous implementation allowed
            # transform() without a prior fit() by fitting on the fly.
            return LabelBinarizer(
                sparse_output=self.sparse_output
            ).fit_transform(X)
        return encoder.transform(X)
    
# Categorical preprocessing: pick the categorical column(s) as an ndarray,
# then binarize the labels into indicator columns.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_encoder', CustomLabelBinarizer())
])

**Объединение pipeline**

In [8]:
from sklearn.pipeline import FeatureUnion
# Run both sub-pipelines on the same input and concatenate their outputs
# side by side: numeric block first, categorical block second.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", numeric_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [9]:
# Fit every step on X and produce the combined feature matrix in one call.
prepared_data = full_pipeline.fit_transform(X)

In [10]:
# Expected 15 columns: 8 original numeric features + 3 ratio columns added by
# CombinedAttributesAdder + 4 indicator columns for the ocean_proximity values.
prepared_data.shape

(20640, 15)

In [11]:
# The source DataFrame has 9 columns: the 8 dataset features plus the
# synthetic "ocean_proximity" column added before building the pipelines.
X.shape

(20640, 9)