# Preprocessing

In [2]:
import numpy as np
from sklearn.datasets import load_iris

In [32]:
# return_X_y has to be passed by keyword: recent scikit-learn releases reject
# positional arguments to load_iris, and the string 'data' only "worked"
# before because any non-empty string is truthy.
X, y = load_iris(return_X_y=True)

# Per-feature statistics kept as row vectors so they broadcast against X;
# -1 lets numpy infer the number of features instead of hard-coding 4.
mean = X.mean(axis=0).reshape(1, -1)
std = X.std(axis=0).reshape(1, -1)

#StandardScaler
X_standard = (X - mean) / std

# Sanity check: every column should have mean ~0 and standard deviation 1.
print(X_standard.mean(axis=0))
print(X_standard.std(axis=0))

[-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
[1. 1. 1. 1.]


In [9]:
#sklearn way - 1
#StandardScaler
from sklearn.preprocessing import StandardScaler

# Two-step API: learn the per-feature statistics, then apply them.
scaler = StandardScaler()
scaler.fit(X)
X_standard_sk = scaler.transform(X)

# Column-wise mean and standard deviation of the scaled data.
print(np.mean(X_standard_sk, axis=0))
print(np.std(X_standard_sk, axis=0))

[-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
[1. 1. 1. 1.]


In [10]:
#sklearn way - 2
#StandardScaler
from sklearn.preprocessing import StandardScaler

# One-step API: fit and transform in a single call.
X_standard_sk = StandardScaler().fit_transform(X)

# Column-wise mean, then column-wise standard deviation, one per line.
print(X_standard_sk.mean(axis=0), X_standard_sk.std(axis=0), sep="\n")

[-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
[1. 1. 1. 1.]


### MyStandardScaler

In [44]:
from sklearn.base import BaseEstimator, TransformerMixin
#BaseEstimator : adds the getters and setters (get_params / set_params)
#TransformerMixin : adds a fit_transform method (!!! do not forget `return self` in fit !!!)

class MyStandardScaler(BaseEstimator, TransformerMixin):
    """Scale each feature by its standard deviation, optionally centring it first.

    ``get_params``/``set_params`` are inherited from ``BaseEstimator`` and
    ``fit_transform`` from ``TransformerMixin``, so neither is defined here.

    Parameters
    ----------
    with_mean : bool, default=True
        Use the mean to center the data.
        If False only the standard deviation is computed and used.

    Attributes
    ----------
    mean_ : per-feature mean computed by ``fit`` (only set when
        ``with_mean`` is True).
    std_ : per-feature standard deviation computed by ``fit``.
    """

    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):
        """Compute the per-feature statistics of ``X``.

        Parameters
        ----------
        X : object exposing ``mean(axis=0)`` and ``std(axis=0)``
            (e.g. a 2-D numpy array of shape (n_samples, n_features)).
        y : ignored, present for API compatibility.

        Returns
        -------
        self
            Returned so that ``fit`` can be chained and so that the
            inherited ``fit_transform`` works.
        """
        if self.with_mean:
            self.mean_ = X.mean(axis=0)
        self.std_ = X.std(axis=0)
        return self

    def transform(self, X):
        """Scale ``X`` with the statistics learned by ``fit``.

        Constant features (standard deviation 0) are divided by 1 instead of
        0, so they come out as finite values rather than NaN/inf.

        Returns
        -------
        ``(X - mean_) / std_`` when ``with_mean`` is True, ``X / std_``
        otherwise.
        """
        # Adding the boolean mask turns every zero std into 1 and leaves all
        # other entries untouched (x + False == x).
        scale = self.std_ + (self.std_ == 0)
        if self.with_mean:
            return (X - self.mean_) / scale
        return X / scale

In [46]:
# return_X_y has to be passed by keyword: recent scikit-learn releases reject
# positional arguments to load_iris.
X, y = load_iris(return_X_y=True)
scaler = MyStandardScaler(with_mean=False)

# fit_transform(X) does the same as scaler.fit(X) followed by
# scaler.transform(X).
X_scaled2 = scaler.fit_transform(X)
print(X_scaled2)

print(scaler.get_params())

[[ 6.17956139  8.05688682  0.79572398  0.26326437]
 [ 5.93722565  6.90590299  0.79572398  0.26326437]
 [ 5.69488991  7.36629652  0.73888655  0.26326437]
 [ 5.57372204  7.13609975  0.8525614   0.26326437]
 [ 6.05839352  8.28708359  0.79572398  0.26326437]
 [ 6.543065    8.97767388  0.96623626  0.52652874]
 [ 5.57372204  7.82669005  0.79572398  0.39489655]
 [ 6.05839352  7.82669005  0.8525614   0.26326437]
 [ 5.3313863   6.67570622  0.79572398  0.26326437]
 [ 5.93722565  7.13609975  0.8525614   0.13163218]
 [ 6.543065    8.51728035  0.8525614   0.26326437]
 [ 5.81605778  7.82669005  0.90939883  0.26326437]
 [ 5.81605778  6.90590299  0.79572398  0.13163218]
 [ 5.21021843  6.90590299  0.6252117   0.13163218]
 [ 7.02773649  9.20787065  0.68204912  0.26326437]
 [ 6.90656862 10.12865772  0.8525614   0.52652874]
 [ 6.543065    8.97767388  0.73888655  0.52652874]
 [ 6.17956139  8.05688682  0.79572398  0.39489655]
 [ 6.90656862  8.74747712  0.96623626  0.39489655]
 [ 6.17956139  8.74747712  0.85



## OrdinalEncoder

In [81]:
from sklearn.datasets import load_iris

data = load_iris(as_frame=True)['frame']

# Integer class label -> species name.
features = {0 : 'setosa',
            1 : 'versicolor',
            2 : 'virginica'}

# Replace the numeric target by its name and keep it as a 1-D array of strings.
data['target'] = data['target'].map(features)
X = data['target'].values

In [110]:
from sklearn.base import BaseEstimator, TransformerMixin


class MyOrdinalEncoder():
    """Encode categorical values as integer codes.

    Each category is replaced by its position among the sorted unique values
    seen during ``fit``. Both 1-D input (a single feature) and 2-D input
    (one feature per column) are supported.

    Attributes
    ----------
    categories_ : after fitting on 1-D data, the array of sorted unique
        values; after fitting on 2-D data, a list with one such array per
        column.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the categories present in ``X``.

        Parameters
        ----------
        X : array-like, 1-D or 2-D.
        y : ignored, present for API compatibility.

        Returns
        -------
        self, so that calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` has more than two dimensions (or none).
        """
        X = np.asarray(X)
        if X.ndim == 1:
            self.categories_ = np.unique(X)
        elif X.ndim == 2:
            self.categories_ = [np.unique(X[:, j]) for j in range(X.shape[1])]
        else:
            raise ValueError("X must be 1-D or 2-D, got %d dimensions" % X.ndim)
        return self

    @staticmethod
    def _encode(values, categories):
        """Map each entry of ``values`` to its index in ``categories``."""
        # A dict lookup replaces rebuilding a list and scanning it per element.
        lookup = {cat: code for code, cat in enumerate(categories)}
        try:
            return np.array([lookup[v] for v in values], dtype=int)
        except KeyError as err:
            # Same exception type as the list.index() based lookup raised.
            raise ValueError("unknown category %r" % (err.args[0],)) from None

    def transform(self, X):
        """Replace every value of ``X`` by its integer code.

        Returns
        -------
        Integer array with the same shape as ``X``.

        Raises
        ------
        ValueError
            If ``X`` contains a value not seen during ``fit`` or if its
            layout (1-D vs 2-D, number of columns) differs from the data
            used in ``fit``.
        """
        X = np.asarray(X)
        # fit() stores a list only for 2-D data, so the type tells us which
        # layout the encoder was fitted on.
        fitted_on_2d = isinstance(self.categories_, list)

        if X.ndim == 1 and not fitted_on_2d:
            return self._encode(X, self.categories_)

        if X.ndim == 2 and fitted_on_2d and X.shape[1] == len(self.categories_):
            codes = np.empty(X.shape, dtype=int)
            for j, categories in enumerate(self.categories_):
                codes[:, j] = self._encode(X[:, j], categories)
            return codes

        raise ValueError("X does not have the layout of the data seen in fit")

In [113]:
# Fit the encoder on the 1-D array of species names, then encode that same
# array; the last expression is displayed as the cell output.
test = MyOrdinalEncoder()
test.fit(X)
test.transform(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Correction

In [162]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyOrdinalEncoderCorr(BaseEstimator, TransformerMixin):
    """Encode each column of a 2-D categorical array as integer codes.

    A value is replaced by its position among the sorted unique values of
    its column, as learned by ``fit``.

    Attributes
    ----------
    categories_ : list with one array of sorted unique values per column.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features).
        y : ignored, present for API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be 2-D, got %d dimensions" % X.ndim)

        self.categories_ = [np.unique(X[:, i]) for i in range(X.shape[1])]
        return self

    def transform(self, X):
        """Return the integer codes of ``X`` without modifying ``X``.

        Returns
        -------
        ``np.uint32`` array with the same shape as ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has a different number of columns than the
            data seen in ``fit``, or contains a value not seen in ``fit``.
        """
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != len(self.categories_):
            raise ValueError("X does not have the layout of the data seen in fit")

        # Codes go into a separate array: writing them back into X would
        # overwrite the caller's data, and a code written early could be
        # mistaken for a category tested later (e.g. categories [-1, 0]).
        codes = np.zeros(X.shape, dtype=np.uint32)
        for i, categories in enumerate(self.categories_):
            column = X[:, i]
            matched = np.zeros(column.shape, dtype=bool)
            for j, cat in enumerate(categories):
                hits = column == cat
                codes[hits, i] = j
                matched |= hits
            if not matched.all():
                raise ValueError("column %d contains a category not seen in fit" % i)

        return codes

In [164]:
from sklearn.datasets import load_iris

data = load_iris(as_frame=True)['frame']

# Integer class label -> species name.
features = {0 : 'setosa',
            1 : 'versicolor',
            2 : 'virginica'}

data['target'] = data['target'].map(features)

# Build a two-column categorical array by duplicating the species column.
X = data['target'].values.reshape(-1, 1)
X = np.concatenate([X, X], axis=1)

In [165]:
# fit returns the encoder itself, so construction and fitting can be chained;
# the learned per-column categories are then displayed.
encoder = MyOrdinalEncoderCorr().fit(X)
encoder.categories_

[array(['setosa', 'versicolor', 'virginica'], dtype=object),
 array(['setosa', 'versicolor', 'virginica'], dtype=object)]

In [166]:
# fit_transform is not defined on the class itself; it is inherited from
# TransformerMixin and fits the encoder on X before encoding it.
encoder = MyOrdinalEncoderCorr()
encoder.fit_transform(X)

array([[0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1,