In [1]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Record the scikit-learn version this notebook was run with
print(sklearn.__version__)

1.8.0


### Load the data

In [3]:
# The CSV location is machine-specific: set the CALI_HOUSES_CSV environment
# variable to point at your own copy; otherwise the original local path is used.
# index_col=0 uses the file's first column as the row index.
df = pd.read_csv(
    os.environ.get('CALI_HOUSES_CSV', '/Users/herrakaava/Documents/ML_DATA/cali_houses.csv'),
    index_col=0,
)

In [4]:
# Separate the target column from the predictor columns
y = df['median_house_value']
X = df.drop(columns='median_house_value')

In [5]:
# Preview the first rows of the full frame (features and target)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.42,37.8,52.0,3321.0,1115.0,1576.0,1034.0,2.0987,458300.0,NEAR BAY
1,-118.38,34.14,40.0,1965.0,354.0,666.0,357.0,6.0876,483800.0,<1H OCEAN
2,-121.98,38.36,33.0,1083.0,217.0,562.0,203.0,2.433,101700.0,INLAND
3,-117.11,33.75,17.0,4174.0,851.0,1845.0,780.0,2.2618,96100.0,INLAND
4,-118.15,33.77,36.0,4366.0,1211.0,1912.0,1172.0,3.5292,361800.0,NEAR OCEAN


### Log-transformer

In [6]:
# Wrap np.log in a transformer; np.exp is registered as its inverse
log_trans = FunctionTransformer(np.log, inverse_func=np.exp)

# Apply the transformer to the 'population' column
lop_population = log_trans.transform(df['population'])

In [7]:
# Sanity check: the transformer output must match calling np.log directly, element for element
(lop_population == np.log(df.loc[:, 'population'])).all()

np.True_

When providing an argument for the `inverse_func` parameter, one can perform inverse transformation as well.

In [8]:
# Round-trip check through the transformer's public API: inverse_transform applies
# the `inverse_func` (np.exp) given at construction. np.isclose is used instead of ==
# because exp(log(x)) can differ from x by floating-point rounding.
np.isclose(log_trans.inverse_transform(lop_population), df.loc[:, 'population']).all()

np.True_

- Note that, due to floating-point arithmetic in the transformations, we cannot expect to get the **exact** same values back, which is why the comparison uses `np.isclose` rather than `==`.

### Combining features

In [9]:
# Divide the first column by the second; the indexing expects a 2D NumPy array
# and the result is a 1D array
ratio_trans = FunctionTransformer(func=lambda arr: arr[:, 0] / arr[:, 1])

In [10]:
# Hand the two columns over as a plain NumPy array, since ratio_trans indexes with [:, i]
population_per_household = ratio_trans.transform(
    df[['population', 'households']].to_numpy()
)

In [11]:
# Wrap the 1D ratio array in a single-column frame and preview it
pd.DataFrame({'pop_per_hh': population_per_household}).head()

Unnamed: 0,pop_per_hh
0,1.524178
1,1.865546
2,2.768473
3,2.365385
4,1.631399


Let's be cleaner and add it straight back to the DataFrame.

In [12]:
# Build the ratio transformer. `func` is written for DataFrame input (the columns
# selected by the ColumnTransformer below), hence the positional .iloc indexing.
# `feature_names_out` is called with (transformer, input_features) and must return
# the output column name(s).
ft = FunctionTransformer(
    func=lambda x: (x.iloc[:, 0] / x.iloc[:, 1]).to_frame(),  # Needs to be a 2D output
    feature_names_out=lambda transformer, input_features: ['pop_per_hh']
)
# Apply `ft` to ['population', 'households']; all remaining columns pass through unchanged.
# verbose_feature_names_out=False keeps the transformer name out of the output column names.
ct = ColumnTransformer([
    ('ratio', ft, ['population', 'households'])
], remainder='passthrough', verbose_feature_names_out=False)

# Request pandas DataFrame output from this ColumnTransformer instance only
# (this is not a global scikit-learn setting)
ct.set_output(transform='pandas')

# Fit and transform the data
data = ct.fit_transform(df)

In [13]:
# Preview the result: 'pop_per_hh' comes first, followed by the passed-through columns
data.head()

Unnamed: 0,pop_per_hh,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,median_income,median_house_value,ocean_proximity
0,1.524178,-122.42,37.8,52.0,3321.0,1115.0,2.0987,458300.0,NEAR BAY
1,1.865546,-118.38,34.14,40.0,1965.0,354.0,6.0876,483800.0,<1H OCEAN
2,2.768473,-121.98,38.36,33.0,1083.0,217.0,2.433,101700.0,INLAND
3,2.365385,-117.11,33.75,17.0,4174.0,851.0,2.2618,96100.0,INLAND
4,1.631399,-118.15,33.77,36.0,4366.0,1211.0,3.5292,361800.0,NEAR OCEAN


## About the code above

### Pseudo-code inside Scikit-learn

```python
def get_feature_names_out(self, input_features=None):
    # self is the transformer instance
    # input_features are the column names coming in 
    
    # Scikit-learn calls YOUR lambda here, passing both items:
    return self.feature_names_out(self, input_features)
```
This is why we need to give two *positional parameters* to the lambda function in
```python
lambda transformer, input_features: ['pop_per_hh']
```
We also need to specify `verbose_feature_names_out=False`; otherwise every output column name is prefixed with the name of the transformer that produced it (e.g. `ratio__pop_per_hh`, since 'ratio' is the name given to the transformer inside the `ColumnTransformer`).

Note also that the transformed DataFrame does not include the original columns that were used in the ratio transformation. If we want to keep the original columns, we need to add a `'passthrough'` entry for them to the `ColumnTransformer`.

In [14]:
# Build the ratio transformer. `func` is written for DataFrame input, hence the
# positional .iloc indexing. `feature_names_out` is called with
# (transformer, input_features) and must return the output column name(s).
ft = FunctionTransformer(
    func=lambda x: (x.iloc[:, 0] / x.iloc[:, 1]).to_frame(),  # Needs to be a 2D output
    feature_names_out=lambda transformer, input_features: ['pop_per_hh']
)

# Each entry selects its own columns from the input; the outputs appear in the
# order the entries are listed, followed by the remaining columns.
ct = ColumnTransformer([
    # Entry 1: compute the ratio from the two columns (outputs only the new column)
    ('ratio', ft, ['population', 'households']),
    
    # Entry 2: select the same two columns again and pass them through untouched,
    # so the originals also appear in the output
    ('orig', 'passthrough', ['population', 'households'])
], remainder='passthrough', verbose_feature_names_out=False)

# Request pandas DataFrame output from this ColumnTransformer instance only
# (this is not a global scikit-learn setting)
ct.set_output(transform='pandas')

# Fit and transform the data
data = ct.fit_transform(df)

In [15]:
# Preview the result: the ratio, then the two original columns, then the remaining columns
data.head()

Unnamed: 0,pop_per_hh,population,households,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,median_income,median_house_value,ocean_proximity
0,1.524178,1576.0,1034.0,-122.42,37.8,52.0,3321.0,1115.0,2.0987,458300.0,NEAR BAY
1,1.865546,666.0,357.0,-118.38,34.14,40.0,1965.0,354.0,6.0876,483800.0,<1H OCEAN
2,2.768473,562.0,203.0,-121.98,38.36,33.0,1083.0,217.0,2.433,101700.0,INLAND
3,2.365385,1845.0,780.0,-117.11,33.75,17.0,4174.0,851.0,2.2618,96100.0,INLAND
4,1.631399,1912.0,1172.0,-118.15,33.77,36.0,4366.0,1211.0,3.5292,361800.0,NEAR OCEAN
