## Preprocessing

In [4]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [5]:
# Toy 3x3 float matrix reused by every example in this notebook.
data = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)
data

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [6]:
# Same values as `data`, wrapped in a DataFrame with columns named A, B and C.
df = pd.DataFrame(data, columns=["A", "B", "C"])
df

Unnamed: 0,A,B,C
0,1.0,-1.0,2.0
1,2.0,0.0,0.0
2,0.0,1.0,-1.0


In [7]:
# Create a standardizing scaler and learn the per-column statistics from `data`.
# fit() only stores those statistics on the scaler; it does not transform anything.
scaler = preprocessing.StandardScaler()
scaler.fit(data)

StandardScaler()

In [8]:
# Column means computed by hand (sum / number of rows) for A, B and C.
tuple(df[col].sum() / len(df) for col in "ABC")

(1.0, 0.0, 0.3333333333333333)

In [9]:
# Per-column mean learned by fit(); it matches the hand-computed column means.
mean = scaler.mean_
mean

array([1.        , 0.        , 0.33333333])

![](_pic/sample_variance.svg)

sample variance   $$S^2$$	

the value of the one observation   $$x_i$$

the mean value of all observations   $$\bar{x}$$

the number of observations   $$n$$

In [11]:
# Per-column variance learned by fit(). The values shown are population variances
# (divide by n, not n - 1): column A = ((1-1)^2 + (2-1)^2 + (0-1)^2) / 3 = 0.667.
variance = scaler.var_
variance

array([0.66666667, 0.66666667, 1.55555556])

In [12]:
# The square root of the variance gives the per-column standard deviation.
np.sqrt(variance)

array([0.81649658, 0.81649658, 1.24721913])

In [10]:
# scale_ is the divisor applied by transform(); here it equals sqrt(var_).
Standard_deviation = scaler.scale_
Standard_deviation

array([0.81649658, 0.81649658, 1.24721913])

In [22]:
# Standardize with the fitted statistics: (data - mean_) / scale_, column by column.
X_scaled = scaler.transform(data)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [23]:
# X is a second name for the same array object as `data` (no copy is made).
X=data
X

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [24]:
# Minimum of each column (axis=0 reduces over the rows).
X.min(axis=0) 

array([ 0., -1., -1.])

In [25]:
# Shift every column so that its minimum becomes 0.
(X - X.min(axis=0)) 

array([[1., 0., 3.],
       [2., 1., 1.],
       [0., 2., 0.]])

In [26]:
# Manual min-max scaling: shift each column to start at 0, then divide by the
# column's range (np.ptp = max - min) so every column spans [0, 1].
X_std = (X - X.min(axis=0)) / np.ptp(X, axis=0)
X_std

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [27]:
# Standard deviation over all entries at once (no axis given).
X_scaled.std()

1.0

In [14]:
# Column means after standardization: all zero.
X_scaled.mean(axis=0)

array([0., 0., 0.])

In [15]:
# Column standard deviations after standardization: all one.
X_scaled.std(axis=0)

array([1., 1., 1.])

### 1. Scaling:

In [16]:
# help(preprocessing.scale)

In [17]:
# Function form of standardization; the result matches scaler.transform(data) shown earlier.
preprocessing.scale(data)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

- **"fit"** computes the mean and std to be used for later scaling. It is just a computation: the values are stored on the scaler and no transformed data is returned.

- **"transform"** uses a previously computed mean and std to autoscale the data (subtract mean from all values and then divide it by std).
- **"fit_transform"** does both at the same time.

**Scaling features to a range**

- MinMaxScaler 
- MaxAbsScaler

**Load Dataset/Clean**

### 2. MinMaxScaler :
 <center>$\begin{align*}MinRange+\frac{(MaxRange-MinRange)*(x_{i}-x_{min})}{x_{max}-x_{min}}\end{align*}$</center>

In [104]:
# help(preprocessing.MinMaxScaler)

In [124]:
# The same 3x3 array defined at the top, shown again for reference.
data

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [132]:
# MinMaxScaler with its default settings; note this rebinds `scaler`, which
# previously held the StandardScaler.
scaler = preprocessing.MinMaxScaler()
# To target a different range instead:
# scaler = preprocessing.MinMaxScaler(feature_range=(1, 5))

In [133]:
# Learn each column's min and max, then rescale the columns with them.
scaler.fit(data)
scaler.transform(data)

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [134]:
# fit and transform in one call; same result as the two-step version.
scaler.fit_transform(data)

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [135]:
# Largest value seen in each column during fit.
scaler.data_max_

array([2., 1., 2.])

In [136]:
# min_ is the additive offset used by transform (X * scale_ + min_), not the
# column minimum; the raw column minimum is exposed as data_min_.
scaler.min_

array([0.        , 0.5       , 0.33333333])

In [137]:
# Multiplicative factor per column: 1 / (column max - column min) for the default range.
scaler.scale_

array([0.5       , 0.5       , 0.33333333])

**manual**

In [138]:
# Alias the shared array as X for the manual computation.
X=data
X

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [117]:
# Manual equivalent of scaler.fit_transform(data) for the default (0, 1) range:
# subtract each column's minimum, then divide by the column's max - min.
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_std

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [141]:
# Stretch the [0, 1] result onto a target range [lo, hi]. sklearn's formula takes
# lo/hi from `feature_range`; this cell uses the overall min and max of X
# (-1 and 2) as that target range.
range_lo, range_hi = X.min(), X.max()
X_scaled = X_std * (range_hi - range_lo) + range_lo
X_scaled

array([[ 0.5, -1. ,  2. ],
       [ 2. ,  0.5,  0. ],
       [-1. ,  2. , -1. ]])

In [142]:
# Smallest value in the first column (column A).
data[:, 0].min()

0.0

### 2. MaxAbsScaler :
 ## <center>$\begin{align*}\frac{x_i}{max(abs(x))}\end{align*}$</center>

In [None]:
# NOTE: this bare `data` is not the last expression of the cell, so it is evaluated
# but not displayed.
data

# Scale each column by its largest absolute value.
sc=preprocessing.MaxAbsScaler()
sc.fit_transform(data)

### 3. StandardScaler :
 ## <center>$\begin{align*}\frac{x_i-x_{mean}}{std\ of\ feature}\end{align*}$</center>
 
  **For comparison, the full MinMaxScaler formula (where `min, max = feature_range`) is:**
 
 
**X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))**

**X_scaled = X_std * (max - min) + min**

In [None]:
# Redisplay the shared 3x3 array before the StandardScaler example.
data

In [119]:
# Small 1-D example used to work the standardization formula by hand.
A=np.array([1,0,2,2,1])
A

array([1, 0, 2, 2, 1])

In [120]:
# np.std defaults to the population standard deviation (ddof=0).
std=np.std(A)
std

0.7483314773547883

In [121]:
# Standardize the value 1 by hand: (x - mean) / std.
# Note: this rebinds `mean`, which earlier held scaler.mean_.
mean=np.mean(A)
(1-mean)/std

-0.2672612419124243

In [122]:
# Standardize `data` in one step with a fresh StandardScaler (rebinds `sc`).
sc=preprocessing.StandardScaler()
sc.fit_transform(data)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

### 4. Normalizer :

**It works row by row: each sample (row) is rescaled independently.**

## <center>$\begin{align*}\frac{x_i}{\sqrt{sum\ of\ squares\ of\ each\ element\ in\ row}}\end{align*}$</center>


### 5. Binarizer :
All values above the threshold become 1; values less than or equal to the threshold become 0.

**Fit and Transform**

In [6]:
# `sc` was last bound to a StandardScaler and `X_train` is not defined in the cells
# above, so this cell only worked through leftover kernel state. Re-create the
# scaler explicitly and use the array defined at the top so that
# Restart Kernel -> Run All reproduces the min-max output shown below.
sc = preprocessing.MinMaxScaler()
sc.fit_transform(data)

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [8]:
# Multiplicative factor per column; the values shown equal
# 1 / (column max - column min) of `data`.
sc.scale_

array([0.5       , 0.5       , 0.33333333])

In [9]:
# Additive per-column offset applied after scaling.
# NOTE(review): `min_` exists on MinMaxScaler but not on StandardScaler, so `sc`
# must be a MinMaxScaler when this cell runs.
sc.min_ 

array([0.        , 0.5       , 0.33333333])