In [103]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

In [104]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        #print(type(X[self.attribute_names]))
        return X[self.attribute_names].values

In [105]:
data = fetch_california_housing()

In [106]:
data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [107]:
df = pd.DataFrame(data.data, columns = data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [134]:
df.shape

(20640, 9)

In [108]:
df['cat_feature'] = pd.cut(df['HouseAge'].values, 
                           [min(df['HouseAge'].values), 5, max(df['HouseAge'].values)], 
                           labels = ['new','old'])

In [109]:
df.dtypes

MedInc          float64
HouseAge        float64
AveRooms        float64
AveBedrms       float64
Population      float64
AveOccup        float64
Latitude        float64
Longitude       float64
cat_feature    category
dtype: object

In [110]:
num_featues = df.select_dtypes(include = ['float64']).columns
cat_features = ['cat_feature']

Write function for adding some new features

In [111]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'cat_feature'],
      dtype='object')

In [112]:
# get the right column indices: safer than hard-coding indices 3, 4, 5, 6
rooms_ix, bedrooms_ix, population_ix,  = [
    list(df.columns).index(col)
    for col in ('AveRooms', 'AveBedrms', "Population")]

In [128]:
def add_extra_features(X, add_bedrooms_per_room = True):
    rooms_per_population = X[:, rooms_ix] / X[:, population_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_population, bedrooms_per_room]
    return  np.c_[X, rooms_per_population]

attr_adder = FunctionTransformer(add_extra_features, validate=False,
                                 kw_args={"add_bedrooms_per_room": False})

In [129]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_featues)),
    ('imputer', SimpleImputer(strategy = 'median')), 
    ('features_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler())
])


cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_features)),
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'Nan')),
    ('cat_encoder', OneHotEncoder(sparse = False))
])

In [130]:
full_pipeline = FeatureUnion(transformer_list = [
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])

In [131]:
prepared_data = full_pipeline.fit_transform(df)

In [133]:
print(prepared_data.shape)

(20640, 13)
