# Custom Transformer

In [1]:
# Show scikit-learn estimators as diagrams when they are displayed in a cell output.
from sklearn import set_config; set_config(display='diagram')

👇 Consider the following dataset

In [2]:
import pandas as pd

# Read the deliveries dataset from the CSV file (path relative to the notebook)
data = pd.read_csv(filepath_or_buffer="data.csv")

# Preview the first five rows
data.head(n=5)

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm,days_until_delivery
0,RJ,SP,1825,53,10,40,9
1,RJ,SP,700,65,18,28,9
2,RJ,SP,1825,53,10,40,11
3,RJ,SP,1825,53,10,40,12
4,RJ,SP,1825,53,10,40,14


Each observation of the dataset represents an item being delivered from a `seller_state` to a `customer_state`. The columns describe the size and weight of each item. The target is the number of days between the order and the delivery.

👇 In a pipeline:

- Engineer a 'volume' feature from the dimensions features
- Preserve the original product dimensions features for training
- Scale all numerical features
- Encode the categorical features
- Train a default `Ridge` regression and cross-validate its score on the train set. A low R² score is expected.

Use your pipeline to predict the delivery time of the following order:

<details><summary>Hints</summary>

- There are many ways to create your preprocessed matrix (using `ColumnTransformer` and/or `FeatureUnion`). 
    
- If your transformed feature matrix looks weird, it may be stored as "sparse", which is the default behavior of `OneHotEncoder(sparse=True)`. Use `.todense()` to turn it back into a dense matrix.

</details>

In [3]:
# Read the order whose delivery time has to be predicted
new_data = pd.read_csv(filepath_or_buffer="data_new.csv")
new_data

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


## Solution

### Preprocessing solution 1: Function Transformer

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

from sklearn.pipeline import FeatureUnion

In [5]:
# Separate the target (days until delivery) from the features
y = data['days_until_delivery']
X = data.drop('days_until_delivery', axis=1)

# Preview the feature matrix
X.head(n=5)

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40
1,RJ,SP,700,65,18,28
2,RJ,SP,1825,53,10,40
3,RJ,SP,1825,53,10,40
4,RJ,SP,1825,53,10,40


In [6]:
def multiply(df):
    """Return the product of the three product-dimension columns.

    Multiplies `product_length_cm`, `product_height_cm` and `product_width_cm`
    row by row and returns the result as a single-column DataFrame, so that it
    can be chained with other transformers.
    """
    length = df['product_length_cm']
    height = df['product_height_cm']
    width = df['product_width_cm']
    volume = length * height * width
    return pd.DataFrame(volume)
    
# Sub-pipeline for the engineered feature: compute the volume from the three
# dimension columns, then min-max scale it.
volume_computer = Pipeline([
    ('compute_volume', FunctionTransformer(multiply)),
    ('scale_volume', MinMaxScaler())
])

# Full preprocessing: scaled numerical features, one-hot encoded categorical
# features and the scaled volume, stacked side by side in that order.
final_preprocessor = ColumnTransformer([
    # Select every numeric dtype rather than only "int64": columns not matched by
    # any selector are dropped by default, so a numeric column parsed as float
    # would otherwise disappear silently from the feature matrix.
    ('num_encoder', MinMaxScaler(), make_column_selector(dtype_include="number")),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False), make_column_selector(dtype_include="object")),
    ('volume_computer', volume_computer, ['product_length_cm','product_height_cm','product_width_cm'])
])
final_preprocessor

In [39]:
# Fit the preprocessor on the features and preview the transformed matrix
pd.DataFrame(data=final_preprocessor.fit_transform(X)).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
1,0.020067,0.54878,0.147727,0.158879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.125218
2,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
3,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
4,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177


In [8]:
# Scaling of the numerical columns + one-hot encoding of the categorical ones
preprocessor = ColumnTransformer([
    # Select every numeric dtype rather than only "int64": columns not matched by
    # any selector are dropped by default, so a numeric column parsed as float
    # would otherwise disappear silently from the feature matrix.
    ('num_encoder', MinMaxScaler(), make_column_selector(dtype_include="number")),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False), make_column_selector(dtype_include="object"))
])

# Fresh volume sub-pipeline for this variant. It reuses the `multiply` function
# defined earlier in the notebook instead of redefining an identical copy here.
volume_computer = Pipeline([
    ('compute_volume', FunctionTransformer(multiply)),
    ('scale_volume', MinMaxScaler())
])

# FeatureUnion feeds the whole feature matrix to both branches and concatenates
# their outputs column-wise.
final_preprocessor2 = FeatureUnion([
    ('preprocessess', preprocessor),
    ('compute_volume', volume_computer)
])
final_preprocessor2

In [40]:
# Fit the FeatureUnion variant and preview the transformed matrix
pd.DataFrame(data=final_preprocessor2.fit_transform(X)).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
1,0.020067,0.54878,0.147727,0.158879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.125218
2,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
3,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
4,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177


### Preprocessing Solution 2: Build custom transformer class for engineering (overkill ?)

In [10]:
# Create a class
class ColumnMultiplier(TransformerMixin, BaseEstimator):
    """Stateless transformer that multiplies three DataFrame columns together.

    TransformerMixin generates a ``fit_transform`` method from ``fit`` and
    ``transform``; BaseEstimator generates ``get_params`` and ``set_params``.

    Parameters
    ----------
    column_1, column_2, column_3
        Labels of the columns of the input DataFrame to multiply.
    """

    def __init__(self, column_1, column_2, column_3):
        self.column_1 = column_1
        self.column_2 = column_2
        self.column_3 = column_3

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame named 'volume' holding the row-wise product."""
        # Multiplication
        multiplied_features = X[self.column_1] * X[self.column_2] * X[self.column_3]

        # Return result as dataframe (for integration into ColumnTransformer).
        # `to_frame(name=...)` labels the column explicitly. Building it with
        # `pd.DataFrame(series, columns=['volume'])` would instead select a
        # non-existent 'volume' column (all NaN) whenever the product Series
        # carries a name, e.g. when the same column is passed three times.
        return multiplied_features.to_frame(name='volume')

In [11]:
# Plug custom transformer into a feature engineering ColumnTransformer
# Select and preserve the original features
feature_engineering = ColumnTransformer([
    ('column_multiplier', ColumnMultiplier(column_1="product_length_cm",
                                           column_2="product_height_cm",
                                           column_3="product_width_cm"),
     ['product_length_cm', 'product_height_cm', "product_width_cm"]),

    # 'passthrough' forwards the selected columns unchanged. Unlike an identity
    # lambda wrapped in a FunctionTransformer, it keeps the pipeline picklable.
    ('select_original_features', 'passthrough', ['product_length_cm',
                                                 'product_height_cm',
                                                 'product_width_cm',
                                                 'product_weight_g'])])

# Chain the feature engineering transformer with a scaler
numerical_pipe = Pipeline([
    ('engineering', feature_engineering),
    ('scaling', MinMaxScaler())])

# Create a final preprocessing pipeline that combines the above pipeline with a One hot encoder for categorical features
final_preprocessor3 = ColumnTransformer([
    ('categorical_preprocessing', OneHotEncoder(sparse=False,
                                                handle_unknown='ignore'), make_column_selector(dtype_include="object")),
    # Select every numeric dtype rather than only "int64" so that a numeric column
    # parsed as float is not silently dropped before reaching the numerical pipe.
    ('numerical_preprocessing', numerical_pipe, make_column_selector(dtype_include="number"))])

In [12]:
# Display the third preprocessor (categorical encoding + engineered numerical pipe)
final_preprocessor3

In [38]:
# Fit the custom-transformer variant and preview the transformed matrix
# (categorical columns come first here, numerical ones last)
pd.DataFrame(data=final_preprocessor3.fit_transform(X)).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.079177,0.402439,0.056818,0.271028,0.057692
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.125218,0.54878,0.147727,0.158879,0.020067
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.079177,0.402439,0.056818,0.271028,0.057692
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.079177,0.402439,0.056818,0.271028,0.057692
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.079177,0.402439,0.056818,0.271028,0.057692


In [14]:
# Dimensions (rows, columns) of the preprocessed matrix
final_preprocessor3.fit_transform(X=X).shape

(1000, 32)

### Modelling

In [15]:
from sklearn.linear_model import Ridge

# Chain the first preprocessor (final_preprocessor) with a default Ridge regression.
# The model step is named 'linear_regression' although the estimator is a Ridge.
final_pipe = Pipeline(steps=[
    ('preprocessing', final_preprocessor),
    ('linear_regression', Ridge()),
])

In [16]:
# Train the complete pipeline (preprocessing + Ridge) on all the data
final_pipe.fit(X=X, y=y)

In [17]:
from sklearn.model_selection import cross_val_score

# Mean R² of the whole pipeline over 10 cross-validation folds
cross_val_score(estimator=final_pipe, X=X, y=y, scoring='r2', cv=10).mean()

0.15824755009515182

### Predictions

In [18]:
# Reminder of the order to predict
new_data

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


In [19]:
# Predict the number of days until delivery for the new order with the fitted pipeline
final_pipe.predict(X=new_data)

array([20.67221182])

### Bonus: get feature names

In [20]:
# Make sure estimators are rendered as diagrams, then display the full pipeline
set_config(display='diagram')
final_pipe

In [37]:
# The pipeline is already fitted: only transform here, so that previewing the
# features does not re-fit the preprocessing step of the trained pipeline.
pd.DataFrame(data=final_pipe['preprocessing'].transform(X)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
1,0.020067,0.54878,0.147727,0.158879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.125218
2,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
3,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
4,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177


In [22]:
from sklearn import set_config

# Switch estimator display to plain text for the introspection cells below
set_config(display='text')

In [23]:
# Fitted (name, transformer, columns) triplets of the preprocessing step
final_pipe.named_steps['preprocessing'].transformers_

[('num_encoder',
  MinMaxScaler(),
  ['product_weight_g',
   'product_length_cm',
   'product_height_cm',
   'product_width_cm']),
 ('cat_encoder',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['customer_state', 'seller_state']),
 ('volume_computer',
  Pipeline(steps=[('compute_volume',
                   FunctionTransformer(func=<function multiply at 0x12c018550>)),
                  ('scale_volume', MinMaxScaler())]),
  ['product_length_cm', 'product_height_cm', 'product_width_cm'])]

In [24]:
# Columns handled by the first transformer ('num_encoder'): third item of its triplet
num_encoder_entry = final_pipe.named_steps['preprocessing'].transformers_[0]
col_num = num_encoder_entry[2]
col_num

['product_weight_g',
 'product_length_cm',
 'product_height_cm',
 'product_width_cm']

In [25]:
# Names of the one-hot encoded columns produced by the fitted 'cat_encoder'.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out() - confirm the target version before upgrading.
final_pipe['preprocessing'].named_transformers_['cat_encoder'].get_feature_names()

array(['x0_AL', 'x0_AM', 'x0_AP', 'x0_BA', 'x0_CE', 'x0_DF', 'x0_ES',
       'x0_GO', 'x0_MA', 'x0_MG', 'x0_MS', 'x0_MT', 'x0_PA', 'x0_PB',
       'x0_PE', 'x0_PI', 'x0_PR', 'x0_RJ', 'x0_RN', 'x0_RO', 'x0_RS',
       'x0_SC', 'x0_SE', 'x0_SP', 'x0_TO', 'x1_SC', 'x1_SP'], dtype=object)

In [32]:
# Keep the one-hot encoded column names as a plain Python list
col_cat = final_pipe['preprocessing'].named_transformers_['cat_encoder'].get_feature_names().tolist()

In [28]:
# The third transformer ('volume_computer') outputs a single unnamed column,
# so its name is set by hand.
col_volume = ['volume']
col_volume

['volume']

In [33]:
import itertools

# Column names in the order the ColumnTransformer stacks its outputs:
# numerical columns, one-hot encoded columns, then the volume
col_prepross = list(itertools.chain.from_iterable((col_num, col_cat, col_volume)))

In [34]:
# Number of reconstructed column names (to compare with the width of the preprocessed matrix)
len(col_prepross)

32

In [36]:
# The pipeline is already fitted: only transform here, so that labelling the
# features does not re-fit the preprocessing step of the trained pipeline.
pd.DataFrame(data=final_pipe['preprocessing'].transform(X), columns=col_prepross).head()

Unnamed: 0,product_weight_g,product_length_cm,product_height_cm,product_width_cm,x0_AL,x0_AM,x0_AP,x0_BA,x0_CE,x0_DF,...,x0_RN,x0_RO,x0_RS,x0_SC,x0_SE,x0_SP,x0_TO,x1_SC,x1_SP,volume
0,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
1,0.020067,0.54878,0.147727,0.158879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.125218
2,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
3,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177
4,0.057692,0.402439,0.056818,0.271028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.079177


🏁 Congratulations! Don't forget to add, commit and push your notebook.