In [None]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Mean Encoding (DataFrame)

In [None]:
class MeanEncoderDF(BaseEstimator, TransformerMixin):
    """Mean (target) encoder that works on pandas objects and keeps them as DataFrames.

    Every category of each configured column is replaced by a smoothed mean of
    the target observed for that category during ``fit``::

        encode = (count * category_mean + weight * overall_mean) / (count + weight)

    Parameters
    ----------
    columns : list
        Names of the columns to encode.
    weight : int or float, default=0
        Smoothing weight given to the overall target mean. With ``0`` the
        encoding is the plain per-category mean.
    decimal : int or None, default=None
        When not ``None``, encoded values are rounded to this many decimals.

    Attributes
    ----------
    encoder_dict : dict or None
        ``None`` until ``fit`` is called; afterwards maps each column name to a
        Series named ``'encode'`` (index: category, values: encoded mean).
    """

    # Accepted container types for X and y (subclasses are accepted as well).
    _VALID_TYPES = (pd.Series, pd.DataFrame)

    def __init__(self, columns, weight=0, decimal=None):
        self.columns      = columns
        self.weight       = weight
        self.decimal      = decimal
        self.encoder_dict = None

    @classmethod
    def _as_frame(cls, X):
        """Validate ``X`` and return it as a DataFrame (a Series becomes a one-column frame)."""
        dtypes = list(cls._VALID_TYPES)
        assert isinstance(X, cls._VALID_TYPES), f'X dtype is not in valid list: {dtypes}'
        return X if isinstance(X, pd.DataFrame) else X.to_frame()

    @classmethod
    def _as_target(cls, y):
        """Validate ``y`` and return it as a Series (a one-column DataFrame is unwrapped)."""
        dtypes = list(cls._VALID_TYPES)
        assert isinstance(y, cls._VALID_TYPES), f'y dtype is not in valid list: {dtypes}'
        if isinstance(y, pd.DataFrame):
            if y.shape[1] != 1:
                raise ValueError(f'y must contain exactly one target column, got {y.shape[1]}')
            return y.iloc[:, 0]
        return y

    def _fitted_encoders(self):
        """Return the fitted per-column mappers, or raise ``NotFittedError`` before ``fit``."""
        encoders = getattr(self, 'encoder_dict', None)
        if encoders is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")
        return encoders

    @staticmethod
    def _build_inverse_mapping(mapper):
        """Map each encoded value back to its category (several categories are joined with ', ')."""
        grouped = {}
        for label, value in mapper.items():
            if pd.isna(value):
                # A NaN encoding cannot be looked up again, so it has no inverse.
                continue
            grouped.setdefault(value, []).append(label)

        # A single category is returned unchanged (keeps its original type);
        # ambiguous encodings are reported as one comma-separated string.
        return {
            value: labels[0] if len(labels) == 1 else ', '.join(map(str, labels))
            for value, labels in grouped.items()
        }

    def fit(self, X, y):
        """Learn the smoothed target mean of every category in ``self.columns``.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Features; must contain every column listed in ``self.columns``.
        y : pandas.Series or single-column pandas.DataFrame
            Target values, aligned with ``X`` on the index. ``y`` is required
            (not optional) because the encoding is computed from it.

        Returns
        -------
        self
        """
        features = self._as_frame(X)
        target   = self._as_target(y)

        # The overall mean does not depend on the column, so compute it once.
        overall_mean = target.mean()

        encoder_dict = {}
        for column in self.columns:
            # Fixed internal names keep the groupby independent of the name of
            # ``y`` (which may be missing or clash with a feature column).
            pairs = pd.concat(
                [features[column].rename('category'), target.rename('target')],
                axis=1
            )
            agg_df = pairs.groupby('category')['target'].agg(['count', 'mean'])
            count  = agg_df['count']
            estimate_mean = agg_df['mean']
            smooth_mean   = (count * estimate_mean + self.weight * overall_mean) / (count + self.weight)

            mapper = smooth_mean if self.decimal is None else np.round(smooth_mean, self.decimal)
            mapper.index.name = column
            mapper.name       = 'encode'
            encoder_dict[column] = mapper

        # Publish the result only after every column was processed successfully.
        self.encoder_dict = encoder_dict
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by their encodings.

        Categories that were not seen during ``fit`` have no entry in the
        mapper and therefore come out of ``Series.map`` as NaN.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        KeyError
            If ``X`` lacks one of the fitted columns.
        """
        new_X    = self._as_frame(X).copy()
        encoders = self._fitted_encoders()

        missing = [column for column in encoders if column not in new_X.columns]
        if missing:
            raise KeyError(f'Columns not found in X: {missing}')

        for column, mapper in encoders.items():
            new_X[column] = new_X[column].map(mapper)

        return new_X

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the encoded copy of ``X``."""
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X):
        """Return a copy of ``X`` with encoded values mapped back to category labels.

        Fitted columns that are absent from ``X`` are skipped. When several
        categories share one encoded value, the result is their labels joined
        with ``', '``. Decoding looks values up by exact equality, so it only
        works on values produced by ``transform`` of this fitted instance.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        new_X    = self._as_frame(X).copy()
        encoders = self._fitted_encoders()

        for column, mapper in encoders.items():
            if column not in new_X.columns:
                continue

            new_X[column] = new_X[column].map(self._build_inverse_mapping(mapper))

        return new_X

In [None]:
# Toy dataset shared by the examples below: one categorical feature
# ('temperature') and a 0/1 'target' column, ten rows in total.
df = pd.DataFrame({
    'temperature': ['Hot', 'Cold', 'Very Hot', 'Warm', 'Hot', 'Warm', 'Warm', 'Hot', 'Hot', 'Cold'],
    'target': [1, 1, 1, 0, 1, 0, 1, 0, 1, 1]
})
df

In [None]:
# Mean encoder for the 'temperature' column with the default settings
# (weight=0, decimal=None), i.e. plain per-category target means, unrounded.
encoder = MeanEncoderDF(columns=['temperature'])
encoder

In [None]:
# Split the toy data into a one-column feature frame and the target Series.
X = df[['temperature']]
y = df['target']

# Fit on (X, y) and replace each temperature label with its encoded target mean.
encode_df = encoder.fit_transform(X, y)
encode_df

In [None]:
# Map the encoded values back to labels. Categories that share one encoded
# value (here 'Cold' and 'Very Hot', both with target mean 1.0) cannot be told
# apart and come back as a single ', '-joined string.
inverse_df = encoder.inverse_transform(encode_df)
inverse_df

# One-Hot Encoding (DataFrame)

In [None]:
class OneHotEncoderDF(BaseEstimator, TransformerMixin):
    """One-hot encoder that takes and returns pandas DataFrames.

    Wraps ``sklearn.preprocessing.OneHotEncoder``: the configured columns are
    replaced by indicator columns named ``<column>_<category>``, all other
    columns of the input are passed through, and the row index is preserved.

    Parameters
    ----------
    columns : list
        Names of the columns to one-hot encode.
    **kwargs
        Forwarded unchanged to ``OneHotEncoder``.
        NOTE(review): keyword arguments captured via ``**kwargs`` are presumably
        not reported by ``get_params`` and so would not survive
        ``sklearn.base.clone`` - confirm before using this class in
        cross-validation or grid search.

    Attributes
    ----------
    model : sklearn.preprocessing.OneHotEncoder
        The wrapped encoder instance.
    """

    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.model   = OneHotEncoder(**kwargs)

    def _encoded_columns(self):
        """Return the indicator column names in the order the wrapped encoder emits them."""
        if hasattr(self.model, 'get_feature_names_out'):
            # Asking the encoder keeps names in sync with its real output width
            # (e.g. when options such as ``drop=`` remove categories).
            prefixes = [str(column) for column in self.columns]
            return list(self.model.get_feature_names_out(prefixes))

        # Fallback for scikit-learn versions without ``get_feature_names_out``.
        return [
            f'{self.columns[i]}_{category}'
            for i, categories in enumerate(self.model.categories_)
            for category in categories
        ]

    def fit(self, X, y=None):
        """Fit the wrapped encoder on ``X[self.columns]``; ``y`` is ignored. Returns ``self``."""
        self.model.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return ``X`` with ``self.columns`` replaced by their indicator columns.

        The remaining columns keep their position and the indicator columns are
        appended after them.
        """
        encoded = self.model.transform(X[self.columns])
        if hasattr(encoded, 'toarray'):
            # The encoder may return a scipy sparse matrix or a dense array,
            # depending on how it was configured through ``**kwargs``.
            encoded = encoded.toarray()

        # Reuse X's index so the column-wise concat lines rows up even when X
        # does not have a default RangeIndex (e.g. after filtering or a split).
        encoded_df = pd.DataFrame(encoded, columns=self._encoded_columns(), index=X.index)

        return pd.concat([X.drop(columns=self.columns), encoded_df], axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its one-hot encoded version; ``y`` is ignored."""
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X):
        """Return ``X`` with the indicator columns collapsed back into ``self.columns``.

        Indicator columns are selected by their exact names, in encoder order,
        so unrelated columns that merely share a prefix are left untouched and
        the column order of ``X`` does not matter.
        """
        columns = self._encoded_columns()
        decoded = pd.DataFrame(
            self.model.inverse_transform(X[columns]),
            columns=self.columns,
            index=X.index
        )

        return pd.concat([X.drop(columns=columns), decoded], axis=1)

In [None]:
# Rebinds `encoder` (previously the mean encoder) to a one-hot encoder for
# 'temperature'; dtype=np.int8 is forwarded to the wrapped OneHotEncoder.
encoder = OneHotEncoderDF(columns=['temperature'], dtype=np.int8)
encoder

In [None]:
# Fit on the full frame: only 'temperature' is encoded, 'target' is passed
# through. Note that this overwrites the mean-encoded `encode_df` from above.
encode_df = encoder.fit_transform(df)
encode_df

In [None]:
# Collapse the 'temperature_*' indicator columns back into one 'temperature' column.
encoder.inverse_transform(encode_df)

# Pipeline

In [None]:
# Features and target for the pipeline demo.
X = df[['temperature']]
y = df['target']

# Step 1 replaces 'temperature' with its mean encoding rounded to 3 decimals;
# step 2 then one-hot encodes those rounded values (not the original labels).
steps = [
    ('mean_encoder', MeanEncoderDF(columns=['temperature'], decimal=3)),
    ('onehot_encoder', OneHotEncoderDF(columns=['temperature'], dtype=np.int8))
]
pipeline = Pipeline(steps, verbose=True)
pipeline.fit_transform(X, y)