In [32]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()

# Read the training set, then promote the `id` column to the row index.
train = pd.read_csv('train.csv').set_index('id')
# Show the last five rows as a quick sanity check of the layout.
train.tail()

Unnamed: 0_level_0,target,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
245,0.0,-1.199,0.466,-0.908,2.771,1.631,0.931,0.182,-0.652,-0.512,...,0.724,0.177,-0.039,0.759,0.461,-0.243,0.525,0.281,-0.255,-1.136
246,0.0,0.237,0.233,-0.38,-1.748,0.839,-0.721,-0.114,0.005,-1.788,...,0.857,0.147,0.601,-0.21,-0.768,1.004,-0.979,0.007,0.112,-0.558
247,0.0,1.411,-1.465,0.119,0.583,1.634,-0.207,1.173,1.622,-0.071,...,-0.499,-0.455,0.759,0.222,0.105,-0.727,0.461,0.76,0.168,-0.719
248,1.0,0.62,1.04,0.184,-0.57,-0.087,-0.748,-1.559,-0.553,0.552,...,0.557,-1.494,0.977,0.882,-1.512,0.478,-0.91,-0.805,2.029,-0.423
249,0.0,0.489,0.403,0.139,-2.046,1.345,0.122,1.255,0.647,-0.107,...,-0.025,1.305,-1.169,1.413,0.517,0.812,0.269,-1.454,-0.625,1.474


In [2]:
# Label column.
target = train['target']
# Feature columns: label-based slice from column '0' through the last column.
data = train.loc[:, '0':]

## Preprocessing

In [20]:
# Per-feature (column-wise) mean of the raw, untransformed data.
raw_mean = data.mean(axis='index')
raw_mean.head(4)

0    0.023292
1   -0.026872
2    0.167404
3    0.001904
dtype: float64

In [23]:
# Per-feature (column-wise) standard deviation of the raw data.
raw_std = data.std(axis='index')
raw_std.head(4)

0    0.998354
1    1.009314
2    1.021709
3    1.011751
dtype: float64

### Mean removal
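Mean removal subtracts each feature's mean so that the feature is centered on zero.
Our data is already approximately centered around zero (see the raw means above), so this step changes the values only slightly.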

In [7]:
from sklearn import preprocessing

# Standardize every feature column, then verify the result column by column.
data_standardized = preprocessing.scale(data)
d_mean, d_std = data_standardized.mean(axis=0), data_standardized.std(axis=0)
# Means should be ~0 (up to floating-point error) and standard deviations 1.
d_mean[:4], d_std[:4]

(array([-1.42108547e-17,  5.32907052e-18, -1.42108547e-17,  2.66453526e-17]),
 array([1., 1., 1., 1.]))

### Scaling 
The values of each feature can have different variances, so we rescale every feature to a common range.

In [15]:
# Per-feature variance of the raw data; inspect a ten-feature window.
variance = data.var(axis='index')
variance.iloc[10:20]

10    1.177419
11    1.073844
12    1.102586
13    1.049200
14    0.858938
15    0.913773
16    1.051967
17    1.025717
18    0.893212
19    1.114999
dtype: float64

In [18]:
# Min-max scaling: map each feature column linearly onto the [0, 1] range.
data_scalar = preprocessing.MinMaxScaler(feature_range=(0, 1))
# fit() learns the per-column min/max and returns the scaler itself.
data_scaled = data_scalar.fit(data).transform(data)
data_scaled

array([[0.45456406, 0.95252336, 0.53808144, ..., 0.43724546, 0.63032191,
        0.38829533],
       [0.69586574, 0.36598131, 0.35678991, ..., 0.31044058, 0.33748702,
        0.71328337],
       [0.36758084, 0.53121495, 0.36275345, ..., 0.51462421, 0.59501558,
        0.36501355],
       ...,
       [0.76340565, 0.27401869, 0.44232408, ..., 0.65457238, 0.50126094,
        0.44346994],
       [0.60151453, 0.74224299, 0.45339922, ..., 0.36486486, 0.77733274,
        0.49067134],
       [0.57470323, 0.62317757, 0.44573181, ..., 0.24472418, 0.38362261,
        0.79317493]])

### Normalization
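Data normalization is used when you want to adjust the values in the feature vector so that they can be measured on a common scale. Here we use L1 normalization, which rescales each sample (row) so that the absolute values of its features sum to 1.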

In [25]:
# L1-normalize each sample (row); axis=1 is the default, spelled out for clarity.
data_normalized = preprocessing.normalize(data, norm='l1', axis=1)
data_normalized[:4]

array([[-3.65850472e-04,  8.08230889e-03,  2.54228746e-03, ...,
        -1.54553158e-03,  3.87502847e-03, -3.97582400e-03],
       [ 4.55451301e-03, -4.09948304e-03, -1.61367112e-03, ...,
        -4.63035134e-03, -3.94359314e-03,  4.09948304e-03],
       [-2.16145177e-03, -3.67818752e-04, -1.43821265e-03, ...,
         1.65311799e-05,  3.30623597e-03, -5.00481471e-03],
       [ 2.94864516e-04, -9.24202215e-05,  1.72517747e-03, ...,
        -1.11784458e-03, -2.34571324e-03,  1.04742918e-03]])

### Binarization 
Binarization is used when you want to convert your numerical feature into a Boolean vector.

In [31]:
# Values strictly greater than the threshold become 1, all others become 0.
binarizer = preprocessing.Binarizer(threshold=0)
data_binarized = binarizer.transform(data)
data_binarized[:4]

array([[0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [None]:
# NOTE(review): the original cell was left unfinished (`train[]` is a syntax
# error and `y` was never defined). Presumably the intent was a feature/label
# split for model training -- confirm, and swap in one of the preprocessed
# arrays above if a transformed feature matrix is wanted instead.
X_train = train.loc[:, '0':]      # all feature columns, from '0' to the last
y_train = train.loc[:, 'target']  # label column
X_train.shape, y_train.shape