# Preprocessing

In [8]:
# Imports and definitions
import numpy
from numpy import ndarray, array
from sklearn import preprocessing, linear_model
from matplotlib import pyplot
import ipywidgets as widgets
from IPython.display import display

import sys

sys.path.append("../")

import common

# Sample feature matrix shared by the preprocessing examples in this notebook:
# four rows by three columns of floats.
data = array((
    (5.1, -2.9, 3.3),
    (-1.2, 7.8, -6.1),
    (3.9, 0.4, 2.1),
    (7.3, -9.9, -4.5),
))  # type: ndarray

## Binarization

In [7]:
# Threshold every entry at 2.1: values strictly greater than the threshold
# become 1, everything else (including exactly 2.1) becomes 0.
# `threshold` is passed by keyword because recent scikit-learn releases declare
# Binarizer's parameters keyword-only; the keyword form also works on older
# releases that still accepted it positionally.
binarizer = preprocessing.Binarizer(threshold=2.1)

# Bare last expression so the notebook displays the binarized matrix.
binarizer.transform(data)

array([[ 1.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.]])

## Mean Removal

In [12]:
# Standardize each column of `data`; the per-column statistics printed below
# show the effect (mean ~0, standard deviation 1 after scaling).
mean_removed_data = preprocessing.scale(data)

# Report column-wise mean / standard deviation before and after scaling.
for heading, matrix in (
    ("Unmodified Data:", data),
    ("Mean-Removed Data:", mean_removed_data),
):
    print(heading)
    print("\tMean\t=", matrix.mean(axis=0))
    print("\tStddev.\t=", matrix.std(axis=0))
    print()

print("Full Data:")
print(mean_removed_data)

Unmodified Data:
	Mean	= [ 3.775 -1.15  -1.3  ]
	Stddev.	= [ 3.12039661  6.36651396  4.0620192 ]

Mean-Removed Data:
	Mean	= [  1.11022302e-16   0.00000000e+00   2.77555756e-17]
	Stddev.	= [ 1.  1.  1.]

Full Data:
[[ 0.42462551 -0.2748757   1.13244172]
 [-1.59434861  1.40579288 -1.18167831]
 [ 0.04005901  0.24346134  0.83702214]
 [ 1.12966409 -1.37437851 -0.78778554]]


## Min/Max Scaling

In [13]:
# Rescale every column of `data` so that its values span the range (0, 1).
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)

# Each entry: section heading, label for the mean row, matrix to summarize.
report = (
    ("Unmodified Data:", "\tMean\t=", data),
    ("Scaled Data (0, 1):", "\tMean:\t=", scaled_data),
)
for heading, mean_label, matrix in report:
    print(heading)
    print(mean_label, matrix.mean(axis=0))
    print("\tStddev.\t=", matrix.std(axis=0))
    print()

print("Full Data:")
print(scaled_data)

Unmodified Data:
	Mean	= [ 3.775 -1.15  -1.3  ]
	Stddev.	= [ 3.12039661  6.36651396  4.0620192 ]

Scaled Data (0, 1):
	Mean:	= [ 0.58529412  0.49435028  0.5106383 ]
	Stddev.	= [ 0.36710548  0.35969005  0.4321297 ]

Full Data:
[[ 0.74117647  0.39548023  1.        ]
 [ 0.          1.          0.        ]
 [ 0.6         0.5819209   0.87234043]
 [ 1.          0.          0.17021277]]


## Normalized Data

An $L_p$ space is a function space whose norm is the $p$-norm:

$L_p(\mathbf{x}) = ||\mathbf{x}||_p = \sqrt[p]{\sum_{i=1}^{n}{|x_i|^p}}$, for $p \in \mathbb{R}$ and $p \ge 1$

For Manhattan distance, $p = 1$, implying $L_1(\mathbf{x}) = ||\mathbf{x}||_1 = \sum{|x_i|}$.  This is the $L_1$ norm.

For Euclidean distance, $p = 2$, implying $L_2(\mathbf{x}) = ||\mathbf{x}||_2 = \sqrt{\sum{|x_i|^2}}$.  This is the $L_2$ norm.

For the max norm, take the limit $p \to \infty$, giving $L_\infty(\mathbf{x}) = ||\mathbf{x}||_\infty = \max(\{|x_1|, |x_2|, \dots , |x_n|\})$.  This is the $L_\infty$ norm.

In [11]:
# Manhattan distance (p = 1).
# Better if you want to ignore outliers.
l1_normalized_data = preprocessing.normalize(data, norm="l1")

# Euclidean distance (p = 2).
# Better if you want to consider outliers.
l2_normalized_data = preprocessing.normalize(data, norm="l2")

# Max norm (p = infinity).
# Lnorm(infinity, data) = max(abs(x) for x in data)
# NOTE(review): the recorded output for the last row contains -1.356..., i.e.
# that run divided by the row maximum (7.3) rather than max(|x|) (9.9).
# Presumably an old scikit-learn behaviour -- re-run and confirm.
max_normalized_data = preprocessing.normalize(data, norm="max")

# Print the original matrix followed by each normalized variant, with a blank
# line between consecutive sections (none after the last one).
sections = (
    ("Data:", data),
    ("L1-Normalized Data:", l1_normalized_data),
    ("L2-Normalized Data:", l2_normalized_data),
    ("Max-Normalized Data:", max_normalized_data),
)
for index, (title, matrix) in enumerate(sections):
    if index:
        print()
    print(title)
    print(matrix)

Data:
[[ 5.1 -2.9  3.3]
 [-1.2  7.8 -6.1]
 [ 3.9  0.4  2.1]
 [ 7.3 -9.9 -4.5]]

L1-Normalized Data:
[[ 0.45132743 -0.25663717  0.2920354 ]
 [-0.0794702   0.51655629 -0.40397351]
 [ 0.609375    0.0625      0.328125  ]
 [ 0.33640553 -0.4562212  -0.20737327]]

L2-Normalized Data:
[[ 0.75765788 -0.43082507  0.49024922]
 [-0.12030718  0.78199664 -0.61156148]
 [ 0.87690281  0.08993875  0.47217844]
 [ 0.55734935 -0.75585734 -0.34357152]]

Max-Normalized Data:
[[ 1.         -0.56862745  0.64705882]
 [-0.15384615  1.         -0.78205128]
 [ 1.          0.1025641   0.53846154]
 [ 1.         -1.35616438 -0.61643836]]
