## Prepare Your Data For Machine Learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Data Transforms
- Split the dataset into the input and output variables
- Apply a pre-processing transform
- Summarize the transformed data

In [2]:
data = pd.read_csv('dataset/diabetes.csv')
X, y = data.iloc[:, :8].values, data.iloc[:, 8].values  # inputs: first eight columns; output: ninth column
data.head()  # preview the first five rows (the default)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Rescale Data
Rescale the attributes so that they all share the same scale — here, the range 0 to 1 (min-max scaling).

In [3]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)  # learn the per-column minimum and maximum
rescaledX = scaler.transform(X)  # map every column onto the [0, 1] range
np.set_printoptions(precision=3)  # display arrays with three decimals
rescaledX

array([[0.353, 0.744, 0.59 , ..., 0.501, 0.234, 0.483],
       [0.059, 0.427, 0.541, ..., 0.396, 0.117, 0.167],
       [0.471, 0.92 , 0.525, ..., 0.347, 0.254, 0.183],
       ...,
       [0.294, 0.608, 0.59 , ..., 0.39 , 0.071, 0.15 ],
       [0.059, 0.633, 0.492, ..., 0.449, 0.116, 0.433],
       [0.059, 0.467, 0.574, ..., 0.453, 0.101, 0.033]])

### Standardize Data
Transform attributes that have a Gaussian distribution with differing means and standard deviations into a standard Gaussian distribution with a mean of 0 and a standard deviation of 1.

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # note: rebinds `scaler` from the min-max cell
rescaledX = scaler.fit_transform(X)  # centre each column on 0 and scale it to unit standard deviation
np.set_printoptions(precision=3)  # display arrays with three decimals
rescaledX

array([[ 0.64 ,  0.848,  0.15 , ...,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161, ..., -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, ..., -1.103,  0.604, -0.106],
       ...,
       [ 0.343,  0.003,  0.15 , ..., -0.735, -0.685, -0.276],
       [-0.845,  0.16 , -0.471, ..., -0.24 , -0.371,  1.171],
       [-0.845, -0.873,  0.046, ..., -0.202, -0.474, -0.871]])

### Normalize Data
Rescale each observation (row) to have a length of 1 (called a unit norm, or a unit vector in linear algebra). <br>
- It's useful for sparse datasets (lots of zeros)

In [5]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()  # default norm is L2
normalizedX = normalizer.fit_transform(X)  # scale each row to unit length; fitting learns nothing from X
np.set_printoptions(precision=3)  # display arrays with three decimals
normalizedX

array([[0.034, 0.828, 0.403, ..., 0.188, 0.004, 0.28 ],
       [0.008, 0.716, 0.556, ..., 0.224, 0.003, 0.261],
       [0.04 , 0.924, 0.323, ..., 0.118, 0.003, 0.162],
       ...,
       [0.027, 0.651, 0.388, ..., 0.141, 0.001, 0.161],
       [0.007, 0.838, 0.399, ..., 0.2  , 0.002, 0.313],
       [0.008, 0.736, 0.554, ..., 0.241, 0.002, 0.182]])

### Binarize Data
All values above the threshold are marked as 1, and all values equal to or below it are marked as 0.

In [10]:
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)  # values > 0.0 become 1, everything else becomes 0
np.set_printoptions(precision=3)  # display arrays with three decimals
binaryX[:5]  # first five rows only

array([[1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 0., 0., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 1.]])