# Custom Transformer

In [21]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import set_config; set_config(display='diagram')

👇 Consider the following dataset

In [22]:
import pandas as pd

data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm,days_until_delivery
0,RJ,SP,1825,53,10,40,9
1,RJ,SP,700,65,18,28,9
2,RJ,SP,1825,53,10,40,11
3,RJ,SP,1825,53,10,40,12
4,RJ,SP,1825,53,10,40,14


- Each observation of the dataset represents an item being delivered from a  `seller_state` to a `customer_state`. 
- Other columns describe the packaging properties of each item.

🎯 The target is the number of days between the order and the delivery.

In [0]:
# Check target
sns.histplot(data.days_until_delivery)

## 1. Pipeline

👇 Create a scikit-learn pipeline named `pipe`:

- Engineer a `volume` feature from the dimensions features
- Preserve the original product dimensions features for training
- Scale all numerical features
- Encode the categorical features
- Add a default `Ridge` regression estimator

<details><summary>Hints</summary>

- There are many ways to create your preprocessed matrix (using `ColumnTransformer` and/or `FeatureUnion`). 
    
- If your transformed feature matrix look wierd, it may be stored as "sparse" by the default behavior of `OneHotEncoder(sparse=True)`. Use `.todense()` to turn it back to a dense matrix

</details>

In [23]:
# YOUR CODE HERE

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


#### 🧪 Test your pipe

In [0]:
from nbresult import ChallengeResult

pipe_test = pipe

# Check that it doesn't crash
assert pipe_test.fit(X,y)

result = ChallengeResult('pipe', 
                         shape = pipe_test[:-1].fit_transform(X).shape
)
result.write()
print(result.check())

## 2. Train and Predict

👇 Let's imagine `data` is your entire training set.

- `cross_validate` your pipeline on this dataset (❗️low r2 score are expected)
- Now, imagine you just received an new order `new_data`: predict it's duration of delivery in a variable `prediction`

In [0]:
new_data = pd.read_csv("data_new.csv")
new_data

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_val_score

X = data.drop(columns='days_until_delivery')
y = data['days_until_delivery']

num_transformer = Pipeline([
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline([
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include=['int64'])),
    ('cat_transformer', cat_transformer, make_column_selector(dtype_include=['object']))],
    remainder='passthrough')

volume = FunctionTransformer(lambda data: pd.DataFrame(data['product_height_cm'] * data['product_length_cm'] * data['product_width_cm']))

union = FeatureUnion([
    ('preprocessing', preprocessor),
    ('volume', volume)
])

final_pipe = Pipeline([
    ('union', union),
    ('linear_regression', Ridge())])

final_pipe.fit(X, y)

In [37]:
cv = cross_val_score(final_pipe, X, y, cv = 10)
score =cv.mean()
score

0.018560887253848402

In [38]:
final_pipe.predict(new_data)

array([16.83841458])