In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import TypeAlias
from sklearn.utils.validation import check_is_fitted, check_array
from sklearn.utils.estimator_checks import check_estimator

In [2]:
# Input container types used in the type hints of StandardScalerClone's methods.
ArrayLike: TypeAlias = np.ndarray | pd.DataFrame | pd.Series | list

In [3]:
class StandardScalerClone(BaseEstimator, TransformerMixin):
    """
    Minimal re-implementation of sklearn.preprocessing.StandardScaler.

    Each feature (column) is standardised as ``(x - mean_) / scale_`` where
    ``scale_`` is the population standard deviation (``ddof=0``) learned in
    :meth:`fit`.

    Parameters
    ----------
    with_mean : bool, default=True
        If True, subtract the per-feature mean before dividing by the scale.
        If False, the data is only divided by the scale.

    Attributes
    ----------
    mean_ : np.ndarray of shape (n_features,)
        Per-feature mean of the training data.
    scale_ : np.ndarray of shape (n_features,)
        Per-feature standard deviation of the training data; (near-)constant
        features get a scale of 1.0 so that they are left unscaled.
    n_features_in_ : int
        Number of features seen during :meth:`fit`.
    feature_names_in_ : np.ndarray
        Column names of the training data. Only defined when the object
        passed to :meth:`fit` exposes a ``columns`` attribute.
    """
    def __init__(self, with_mean: bool = True) -> None:
        self.with_mean: bool = with_mean

    def fit(self, X: ArrayLike, y: ArrayLike | None = None) -> 'StandardScalerClone':
        """
        Learn the per-feature mean and standard deviation of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; validated with ``check_array``.
        y : ignored
            Present only for API compatibility with pipelines.

        Returns
        -------
        self : StandardScalerClone
            The fitted scaler.
        """
        X_orig = X  # keep the original object: check_array drops column names
        X = check_array(X)

        self.mean_ = X.mean(axis=0)

        scale = X.std(axis=0)
        # A (near-)constant feature has zero spread; dividing by it would yield
        # NaN/inf in transform. Use 1.0 instead so such a feature is left unscaled.
        scale[scale < 10 * np.finfo(scale.dtype).eps] = 1.0
        self.scale_ = scale

        self.n_features_in_ = X.shape[1]

        if hasattr(X_orig, 'columns'):
            self.feature_names_in_ = X_orig.columns.to_numpy()
        elif hasattr(self, 'feature_names_in_'):
            # Refit on input without column names: drop names left over from a
            # previous fit so they cannot be reported for the wrong data.
            del self.feature_names_in_

        return self

    def _check_input(self, X: ArrayLike) -> np.ndarray:
        """Ensure the scaler is fitted, validate ``X`` and check its feature count."""
        check_is_fitted(self)
        X = check_array(X)

        if self.n_features_in_ != X.shape[1]:
            raise ValueError(
                f'X has {X.shape[1]} features, but {self.__class__.__name__} '
                f'is expecting {self.n_features_in_} features as input.'
            )

        return X

    def transform(self, X: ArrayLike) -> np.ndarray:
        """
        Standardise ``X`` with the statistics learned in :meth:`fit`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        np.ndarray of shape (n_samples, n_features)
            ``(X - mean_) / scale_`` (the subtraction is skipped when
            ``with_mean`` is False).

        Raises
        ------
        ValueError
            If ``X`` does not have ``n_features_in_`` columns.
        """
        X = self._check_input(X)

        if self.with_mean:
            X = X - self.mean_

        return X / self.scale_

    def inverse_transform(self, X: ArrayLike) -> np.ndarray:
        """
        Undo :meth:`transform`, mapping standardised data back to the original scale.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        np.ndarray of shape (n_samples, n_features)
            ``X * scale_ + mean_`` (the addition is skipped when
            ``with_mean`` is False).

        Raises
        ------
        ValueError
            If ``X`` does not have ``n_features_in_`` columns.
        """
        X = self._check_input(X)

        X = X * self.scale_
        if self.with_mean:
            X = X + self.mean_

        return X

    def get_feature_names_out(self, input_features: ArrayLike | None = None) -> np.ndarray:
        """
        Return the output feature names (identical to the input feature names).

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            If None, the names recorded during :meth:`fit` are returned, or
            generic ``x0 .. x{n-1}`` names when none were recorded. Otherwise
            the given names are validated and returned.

        Returns
        -------
        np.ndarray of object dtype, shape (n_features,)

        Raises
        ------
        ValueError
            If ``input_features`` has the wrong length or disagrees with
            ``feature_names_in_``.
        """
        check_is_fitted(self)
        fitted_names = getattr(self, 'feature_names_in_', None)

        if input_features is None:
            if fitted_names is not None:
                return fitted_names
            return np.asarray([f'x{i}' for i in range(self.n_features_in_)], dtype=object)

        # Always hand back an ndarray, whatever container the caller passed in.
        names = np.asarray(input_features, dtype=object)

        if names.ndim != 1 or names.shape[0] != self.n_features_in_:
            raise ValueError('Invalid number of features.')

        if fitted_names is not None and not np.all(fitted_names == names):
            raise ValueError('self.feature_names_in_ does not equal to input_features.')

        return names

In [4]:
# Run scikit-learn's estimator conformance checks against a default-constructed instance.
check_estimator(StandardScalerClone())

In [5]:
# Seed NumPy's global RNG so the random data generated below is reproducible.
np.random.seed(42)

In [6]:
# Demo data: 1000 samples x 3 features, drawn uniformly from [0, 1).
X = np.random.rand(1000, 3)

In [7]:
# Preview the generated array (NumPy abbreviates the repr of large arrays).
X

array([[0.37454012, 0.95071431, 0.73199394],
       [0.59865848, 0.15601864, 0.15599452],
       [0.05808361, 0.86617615, 0.60111501],
       ...,
       [0.80000348, 0.55270708, 0.39655368],
       [0.13171503, 0.86529576, 0.15727321],
       [0.30978786, 0.29004553, 0.87141403]])

In [8]:
# Fit on the raw array and standardise it in one call (centring enabled by default).
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)

# Reference result computed directly with NumPy (np.std uses ddof=0 by default).
X_centered = X - X.mean(axis=0)
X_expected = X_centered / X.std(axis=0)
assert np.allclose(X_scaled, X_expected)

In [9]:
# Round trip: undoing the scaling must give back the original data.
inverse_X_scaled = scaler.inverse_transform(X_scaled)

round_trip_matches = np.allclose(inverse_X_scaled, X)
assert round_trip_matches

In [10]:
# Fitted on a plain array, so no column names were recorded: generic x0..x2 names are returned.
scaler.get_feature_names_out()

array(['x0', 'x1', 'x2'], dtype=object)

Without mean centering (`with_mean=False`)

In [11]:
# Scale only: with_mean=False skips the mean subtraction.
scaler = StandardScalerClone(with_mean= False)
X_scaled = scaler.fit_transform(X)

# Reference result: divide by the per-column standard deviation, no centring.
X_expected = X / X.std(axis= 0)
assert np.allclose(X_scaled, X_expected)

In [12]:
# Round trip without centring: multiplying back by the scale must recover X.
inverse_X_scaled = scaler.inverse_transform(X_scaled)

round_trip_matches = np.allclose(inverse_X_scaled, X)
assert round_trip_matches

In [13]:
# Still fitted on a plain array, so the generic x0..x2 names are returned.
scaler.get_feature_names_out()

array(['x0', 'x1', 'x2'], dtype=object)

With a pandas DataFrame as input

In [14]:
# Wrap the same data in a DataFrame so that the scaler gets to see column names.
df = pd.DataFrame(data=X, columns=[f'feature{i}' for i in range(1, 4)])

In [15]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,feature1,feature2,feature3
0,0.37454,0.950714,0.731994
1,0.598658,0.156019,0.155995
2,0.058084,0.866176,0.601115
3,0.708073,0.020584,0.96991
4,0.832443,0.212339,0.181825


In [16]:
# Fit on the DataFrame; the scaler returns a plain NumPy array.
scaler = StandardScalerClone()
df_scaled = scaler.fit_transform(df)

# pandas' .std() defaults to the sample standard deviation (ddof=1), whereas the
# scaler uses NumPy's population standard deviation (ddof=0). Request ddof=0
# explicitly so the reference matches and the default tolerance can be used.
# (The previous rtol=1e05, i.e. 100000, made this assertion pass for almost any values.)
df_expected = ((df - df.mean(axis= 0)) / df.std(axis= 0, ddof= 0)).to_numpy()
assert np.allclose(df_scaled, df_expected)

In [17]:
# Round trip on the DataFrame-fitted scaler: compare against the raw values.
inverse_df_scaled = scaler.inverse_transform(df_scaled)

df_values = df.to_numpy()
assert np.allclose(inverse_df_scaled, df_values)

In [18]:
# Fitted on a DataFrame, so the column names recorded during fit are returned.
scaler.get_feature_names_out()

array(['feature1', 'feature2', 'feature3'], dtype=object)

In [19]:
# Column names captured from the DataFrame that was passed to fit.
scaler.feature_names_in_

array(['feature1', 'feature2', 'feature3'], dtype=object)