# Custom Transformer

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import set_config; set_config(display='diagram')

👇 Consider the following dataset

In [2]:
import pandas as pd

# Read the challenge dataset from disk and preview its first five rows.
data = pd.read_csv("data.csv")
data.head(5)

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers,target_5y
0,36.0,27.4,7.4,2.6,7.6,,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35.0,26.9,,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


- Each observation of the dataset represents an item being delivered from a  `seller_state` to a `customer_state`. 
- Other columns describe the packaging properties of each item.

🎯 The target is the number of days between the order and the delivery.

In [3]:
# Plot the distribution of the target column.
sns.histplot(data=data.days_until_delivery)

AttributeError: 'DataFrame' object has no attribute 'days_until_delivery'

In [None]:
# Separate the target from the features.
y = data['days_until_delivery']
X = data.drop(columns=['days_until_delivery'])

## 1. Pipeline

👇 Create a scikit-learn pipeline named `pipe`:

- Engineer a `volume` feature from the dimensions features
- Preserve the original product dimensions features for training
- Scale all numerical features
- Encode the categorical features
- Add a default `Ridge` regression estimator

<details><summary>Hints</summary>

- There are many ways to create your preprocessed matrix (using `ColumnTransformer` and/or `FeatureUnion`). 
    
- If your transformed feature matrix looks weird, it may be stored as "sparse" by the default behavior of `OneHotEncoder(sparse=True)`. Use `.todense()` to turn it back to a dense matrix.

</details>

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import FeatureUnion
from sklearn.compose import make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge

In [None]:
# Scratch expression: the lambda is not bound to any name, so running this cell
# only evaluates it. Given a DataFrame, it multiplies the three product
# dimension columns together and wraps the result in a DataFrame.
lambda df: pd.DataFrame(df['product_length_cm']*df['product_height_cm']*df['product_width_cm'])

In [None]:
# Inspect the DataFrame's column summary before choosing which columns to scale or encode.
data.info()

In [None]:
# Columns whose product gives the engineered volume feature.
DIMENSION_COLS = ['product_length_cm', 'product_height_cm', 'product_width_cm']


def compute_volume(df):
    """Return a one-column DataFrame holding each product's volume.

    The volume is the product of the `product_length_cm`, `product_height_cm`
    and `product_width_cm` columns of `df`; the single output column is named
    `volume_cm3` and keeps the index of `df`.
    """
    volume = df['product_length_cm'] * df['product_height_cm'] * df['product_width_cm']
    return volume.to_frame(name='volume_cm3')


def make_dense_one_hot_encoder():
    """Build a OneHotEncoder that outputs dense arrays and ignores unknown categories.

    Recent scikit-learn releases renamed the `sparse` argument to
    `sparse_output` and later removed the old name, so the new keyword is
    tried first and the old one is used as a fallback on older releases.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn: `sparse_output` is not a known keyword yet.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# A named function (unlike a lambda) keeps the fitted pipeline picklable.
vol = FunctionTransformer(compute_volume)

num_col = make_column_selector(dtype_include=['float64','int64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

# Numerical features (including the original dimensions) are scaled as they are.
num_transformer = make_pipeline(RobustScaler())
cat_transformer = make_pipeline(make_dense_one_hot_encoder())
# The engineered volume is scaled separately and appended as an extra column.
vol_transformer = make_pipeline(vol, RobustScaler())

col_trans = ColumnTransformer([
    ("num_t", num_transformer, num_col),
    ("cat_t", cat_transformer, cat_col),
    # Only the three dimension columns are needed to compute the volume.
    ("vol_t", vol_transformer, DIMENSION_COLS),
])

col_trans

In [None]:
# Chain the preprocessing with a default Ridge regressor.
pipe = make_pipeline(
    col_trans,
    Ridge(),
)

#### 🧪 Test your pipe

In [None]:
from nbresult import ChallengeResult

# Alias the pipeline under the name used by the checks below.
pipe_test = pipe

# Check that it doesn't crash
assert pipe_test.fit(X,y)

# Record the shape of the preprocessed matrix: `pipe_test[:-1]` is the pipeline
# without its last step, i.e. everything except the final Ridge estimator.
# NOTE(review): `write()` / `check()` come from nbresult; presumably they persist
# the result and run the course checks — confirm against the nbresult package.
result = ChallengeResult('pipe', 
                         shape = pipe_test[:-1].fit_transform(X).shape
)
result.write()
print(result.check())

## 2. Train and Predict

👇 Let's imagine `data` is your entire training set.

- `cross_validate` your pipeline on this dataset (❗️low r2 scores are expected)
- Now, imagine you just received a new order `new_data`: predict its duration of delivery in a variable `prediction`

In [None]:
# Fit the preprocessing on the features and keep the transformed matrix for inspection.
X_transform = col_trans.fit_transform(X, y=None)

In [None]:
# `shape` is an attribute holding a tuple, not a method, so it must not be called.
X_transform.shape

In [None]:
# Show the first three rows of the transformed matrix as a table.
display(pd.DataFrame(data=X_transform).head(n=3))

In [None]:
# Load the new order to predict and display it.
new_data = pd.read_csv(filepath_or_buffer="data_new.csv")
new_data

In [None]:
from sklearn.model_selection import cross_val_score

# Mean r2 of the full pipeline, using scikit-learn's default cross-validation splitting.
cross_val_score(estimator=pipe, X=X, y=y, scoring='r2').mean()

In [None]:
# Train on the whole dataset, then predict for the new order
# (`fit` returns the fitted pipeline, so the calls can be chained).
prediction = pipe.fit(X, y).predict(new_data)
prediction

In [None]:
from nbresult import ChallengeResult

# Hand the prediction made for `new_data` to the challenge checker.
# NOTE(review): `write()` / `check()` come from nbresult; presumably they persist
# the result and run the course checks — confirm against the nbresult package.
result = ChallengeResult('prediction',
    prediction = prediction
)
result.write()
print(result.check())


🏁 Congratulations! Don't forget to add, commit and push your notebook.