In [1]:
import numpy as np
from sklearn.preprocessing import TargetEncoder
import polars as pl


X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
X = [item for sublist in X.tolist() for item in sublist]
y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30

df = pl.DataFrame({'col1': X  , 'col2':y})
print(df)

shape: (88, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ f64  │
╞═══════╪══════╡
│ dog   ┆ 90.3 │
│ dog   ┆ 90.3 │
│ dog   ┆ 90.3 │
│ dog   ┆ 90.3 │
│ dog   ┆ 90.3 │
│ …     ┆ …    │
│ snake ┆ 49.0 │
│ snake ┆ 49.0 │
│ snake ┆ 49.0 │
│ snake ┆ 49.0 │
│ snake ┆ 49.0 │
└───────┴──────┘


In [2]:
enc_auto = TargetEncoder(smooth="auto")
X_trans = enc_auto.fit_transform(df.select(pl.col('col1')).to_numpy(), df.select(pl.col('col2')).to_numpy().ravel())

encoded_features_df = pl.DataFrame(X_trans).rename({'column_0' : 'col1'})

df = df.with_columns(encoded_features_df)

print(df)

shape: (88, 2)
┌───────────┬──────┐
│ col1      ┆ col2 │
│ ---       ┆ ---  │
│ f64       ┆ f64  │
╞═══════════╪══════╡
│ 80.841084 ┆ 90.3 │
│ 80.841084 ┆ 90.3 │
│ 80.841084 ┆ 90.3 │
│ 80.841084 ┆ 90.3 │
│ 82.571349 ┆ 90.3 │
│ …         ┆ …    │
│ 42.932255 ┆ 49.0 │
│ 43.22723  ┆ 49.0 │
│ 42.73582  ┆ 49.0 │
│ 43.22723  ┆ 49.0 │
│ 42.73582  ┆ 49.0 │
└───────────┴──────┘


In [3]:
# A high `smooth` parameter puts more weight on global mean on the categorical
# encodings:
enc_high_smooth = TargetEncoder(smooth=5000.0).fit(df.select(pl.col('col1')).to_numpy(), df.select(pl.col('col2')).to_numpy().ravel())
enc_high_smooth.target_mean_

np.float64(44.28522727272727)

In [4]:
enc_high_smooth.encodings_

[array([44.25153512, 44.28039119, 44.25147521, 44.25147521, 44.24665263,
        44.28720774, 44.2862665 , 44.28532488, 44.2825986 , 44.2761055 ,
        44.34344645, 44.31589456, 44.29954745, 44.30670325, 44.31385619])]

In [5]:
# On the other hand, a low `smooth` parameter puts more weight on target
# conditioned on the value of the categorical:
enc_low_smooth = TargetEncoder(smooth=1.0).fit(df.select(pl.col('col1')).to_numpy(), df.select(pl.col('col2')).to_numpy().ravel())
enc_low_smooth.encodings_

[array([23.19815341, 32.19261364, 23.16065341, 23.16065341, 22.82058081,
        45.38724747, 44.93565341, 44.35503247, 42.96852273, 39.2094697 ,
        80.72315341, 74.97704545, 68.16174242, 71.14630682, 72.93704545])]