# Custom Transformer

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import set_config; set_config(display='diagram')

👇 Consider the following dataset

In [2]:
import pandas as pd
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm,days_until_delivery
0,RJ,SP,1825,53,10,40,9
1,RJ,SP,700,65,18,28,9
2,RJ,SP,1825,53,10,40,11
3,RJ,SP,1825,53,10,40,12
4,RJ,SP,1825,53,10,40,14


In [7]:
X = data.drop(columns='days_until_delivery')
y = data['days_until_delivery']

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

- Each observation of the dataset represents an item being delivered from a  `seller_state` to a `customer_state`. 
- Other columns describe the packaging properties of each item.

🎯 The target is the number of days between the order and the delivery.

In [0]:
# Check target
sns.histplot(data.days_until_delivery)

## 1. Pipeline

👇 Create a scikit-learn pipeline named `pipe`:

- Engineer a `volume` feature from the dimensions features
- Preserve the original product dimensions features for training
- Scale all numerical features
- Encode the categorical features
- Add a default `Ridge` regression estimator

<details><summary>Hints</summary>

- There are many ways to create your preprocessed matrix (using `ColumnTransformer` and/or `FeatureUnion`). 
    
- If your transformed feature matrix look wierd, it may be stored as "sparse" by the default behavior of `OneHotEncoder(sparse=True)`. Use `.todense()` to turn it back to a dense matrix

</details>

In [78]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [79]:
# Create a transformer that multiplies two columns
volume = FunctionTransformer(lambda data: pd.DataFrame \
    (data["product_length_cm"] * data["product_height_cm"]* data["product_width_cm"]))

volume.fit_transform(X_train)

Unnamed: 0,0
105,107136
68,67716
479,82080
399,93744
434,23625
...,...
835,92925
192,15300
629,20000
559,120156


In [80]:
volume_transformer = Pipeline([("volume", volume),  ('scaler', StandardScaler())]) 


In [95]:
# Impute then Scale for numerical variables: 
num_transformer = Pipeline([('imputer', SimpleImputer()),('scaler', StandardScaler())])

# Encode categorical variables
cat_transformer = Pipeline ([
                            ("encode", OneHotEncoder(sparse=False))
                            ])

preprocessor = ColumnTransformer([
('num_transformer', num_transformer, ['product_weight_g','product_length_cm', 'product_height_cm', 'product_width_cm']),
("volume_transformer", volume_transformer, ['product_length_cm', 'product_height_cm', 'product_width_cm']),
('cat_transformer', cat_transformer, ['customer_state', 'seller_state'])])

In [96]:
from sklearn.linear_model import Ridge

In [104]:
final_pipe = Pipeline([('preprocessor', preprocessor),
                       ('linear_regression', Ridge())])

In [114]:
trained_model = final_pipe.fit(X_train, y_train)
trained_model

In [110]:
new_data = pd.read_csv("data_new.csv")
new_data

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


#### 🧪 Test your pipe

In [0]:
from nbresult import ChallengeResult

pipe_test = pipe

# Check that it doesn't crash
assert pipe_test.fit(X,y)

result = ChallengeResult('pipe', 
                         shape = pipe_test[:-1].fit_transform(X).shape
)
result.write()
print(result.check())

## 2. Train and Predict

👇 Let's imagine `data` is your entire training set.

- `cross_validate` your pipeline on this dataset (❗️low r2 score are expected)
- Now, imagine you just received an new order `new_data`: predict it's duration of delivery in a variable `prediction`

In [0]:
new_data = pd.read_csv("data_new.csv")
new_data

In [112]:
final_pipe.predict(new_data)[0]

19.919047693692036