# Various Data Preprocessing techniques

[Source](https://data-flair.training/blogs/python-ml-data-preprocessing/)

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_wine

df = load_wine()

In [3]:
print(df.feature_names)
print(df.target_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['class_0' 'class_1' 'class_2']


In [4]:
print(df.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [5]:
X = df.data
y = df.target

In [6]:
print(X.shape, y.shape)

(178, 13) (178,)


In [40]:
print(X.min(), X.max())

0.13 1680.0


# MinMaxScaler

- Rescaling using MinMaxScaler

**Decr:** Transform features by scaling each feature to a given range.

This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one.

- `X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))`
- `X_scaled = X_std * (max - min) + min`

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
scaler = MinMaxScaler(feature_range = (0, 1))
mmrescaledX = scaler.fit_transform(X)

# above line is combo of:
# rescaledX = scaler.fit(x)
# rescaledX = scaler.transform(x)

In [10]:
# for viewing only 3 decimals
np.set_printoptions(precision=3)

In [32]:
mmrescaledX[0:5, :]

array([[0.842, 0.192, 0.572, 0.258, 0.62 , 0.628, 0.574, 0.283, 0.593,
        0.372, 0.455, 0.971, 0.561],
       [0.571, 0.206, 0.417, 0.031, 0.326, 0.576, 0.511, 0.245, 0.274,
        0.265, 0.463, 0.78 , 0.551],
       [0.561, 0.32 , 0.701, 0.412, 0.337, 0.628, 0.612, 0.321, 0.757,
        0.375, 0.447, 0.696, 0.647],
       [0.879, 0.239, 0.61 , 0.32 , 0.467, 0.99 , 0.665, 0.208, 0.558,
        0.556, 0.309, 0.799, 0.857],
       [0.582, 0.366, 0.807, 0.536, 0.522, 0.628, 0.496, 0.491, 0.445,
        0.259, 0.455, 0.608, 0.326]])

In [39]:
print(mmrescaledX.min(), mmrescaledX.max())

0.0 1.0


# StandardScaler

**Decr**:
With standardizing, we can take attributes with a Gaussian distribution and different means and standard deviations and transform them into a standard Gaussian distribution with a mean of 0 and a standard deviation of 1.

`z = (x - u) / s`
- where u is the mean of the training samples or zero if with_mean=False, and s is the standard deviation of the training samples or one if with_std=False.

In [12]:
from sklearn.preprocessing import StandardScaler

In [34]:
scaler = StandardScaler()
ss_scaled = scaler.fit_transform(X)

# above line is combo of:
# rescaled = scaler.fit(x)
# rescaled = scaler.transform(x)

In [36]:
ss_scaled[0:5]

array([[ 1.519, -0.562,  0.232, -1.17 ,  1.914,  0.809,  1.035, -0.66 ,
         1.225,  0.252,  0.362,  1.848,  1.013],
       [ 0.246, -0.499, -0.828, -2.491,  0.018,  0.569,  0.734, -0.821,
        -0.545, -0.293,  0.406,  1.113,  0.965],
       [ 0.197,  0.021,  1.109, -0.269,  0.088,  0.809,  1.216, -0.498,
         2.136,  0.269,  0.318,  0.789,  1.395],
       [ 1.692, -0.347,  0.488, -0.809,  0.931,  2.491,  1.467, -0.982,
         1.032,  1.186, -0.428,  1.184,  2.335],
       [ 0.296,  0.228,  1.84 ,  0.452,  1.282,  0.809,  0.663,  0.227,
         0.401, -0.319,  0.362,  0.45 , -0.038]])

In [38]:
print(ss_scaled.min(), ss_scaled.max())

-3.6791622340370145 4.371372139554767


# Normalizer

- Normalize samples individually to unit norm.

In [16]:
from sklearn.preprocessing import Normalizer

In [17]:
norm = Normalizer()
normalizedX = norm.fit_transform(X)

In [44]:
normalizedX[0:5]

array([[1.326e-02, 1.594e-03, 2.265e-03, 1.454e-02, 1.184e-01, 2.610e-03,
        2.852e-03, 2.610e-04, 2.135e-03, 5.257e-03, 9.694e-04, 3.654e-03,
        9.927e-01],
       [1.251e-02, 1.687e-03, 2.029e-03, 1.062e-02, 9.479e-02, 2.512e-03,
        2.616e-03, 2.465e-04, 1.213e-03, 4.152e-03, 9.953e-04, 3.223e-03,
        9.953e-01],
       [1.106e-02, 1.984e-03, 2.245e-03, 1.564e-02, 8.491e-02, 2.354e-03,
        2.724e-03, 2.522e-04, 2.362e-03, 4.775e-03, 8.659e-04, 2.665e-03,
        9.962e-01],
       [9.680e-03, 1.314e-03, 1.684e-03, 1.132e-02, 7.612e-02, 2.593e-03,
        2.351e-03, 1.617e-04, 1.468e-03, 5.254e-03, 5.793e-04, 2.324e-03,
        9.970e-01],
       [1.777e-02, 3.477e-03, 3.853e-03, 2.819e-02, 1.584e-01, 3.759e-03,
        3.611e-03, 5.236e-04, 2.443e-03, 5.800e-03, 1.396e-03, 3.934e-03,
        9.868e-01]])

In [45]:
print(normalizedX.min(), normalizedX.max())

0.00014484038599355936 0.9978349045602742


# Binarizer

Binarize data (set feature values to 0 or 1) according to a threshold.

Values greater than the threshold map to 1, while values less than or equal to the threshold map to 0. With the default threshold of 0, only positive values map to 1.

Binarization is a common operation on text count data where the analyst can decide to only consider the presence or absence of a feature rather than a quantified number of occurrences for instance.

In [46]:
from sklearn.preprocessing import Binarizer

In [47]:
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)

In [48]:
binaryX[0:5, :]

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

# Scaling

Standardize a dataset along any axis.

Center to the mean and component wise scale to unit variance.

In [22]:
from sklearn.preprocessing import scale

In [23]:
data_standardized = scale(X)
data_standardized.mean(axis=0)

array([ 7.841e-15,  2.445e-16, -4.059e-15, -7.110e-17, -2.495e-17,
       -1.955e-16,  9.443e-16, -4.179e-16, -1.541e-15, -4.129e-16,
        1.398e-15,  2.127e-15, -6.986e-17])

# Label Encoding

Encode target labels with value between 0 and n_classes-1.

This transformer should be used to encode target values, i.e. y, and not the input X.

In [24]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [25]:
data = np.array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot'])

In [26]:
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(data)

In [27]:
integer_encoded

array([0, 0, 2, 0, 1, 1, 1, 2, 0, 2, 1])

In [28]:
label_encoder.inverse_transform(integer_encoded)

array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'hot', 'warm',
       'cold', 'warm', 'hot'], dtype='<U4')

# OneHotEncoding

Encode categorical features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the sparse_output parameter)

In [29]:
ohe_encoder = OneHotEncoder(sparse_output=False) # renamed sparse-> sparse_output in v1.2
vals_encoded = data.reshape(len(interger_encoded),1)
ohe_encoded = ohe_encoder.fit_transform(vals_encoded)

NameError: name 'interger_encoded' is not defined

In [None]:
print(ohe_encoded)

- **inverse-transform** -> Transform back to original encoding.

In [None]:
inverted = ohe_encoder.inverse_transform(ohe_encoded)

In [None]:
inverted