# Chapter 4: Handling Numerical Data
* scaling的方法：min-max、standard
## 4.1 rescaling a feature: rescale the values of a numerical feature to be between two values.（适合神经网络）

In [7]:
# Min-max scaling: rescale a numerical feature into a fixed range.
import numpy as np
from sklearn import preprocessing

# The scaler expects a 2-D array, so the five observations are reshaped
# into a single column (one row per observation).
feature = np.array([-500.5, -100.1, 0.0, 100.1, 900.5]).reshape(-1, 1)

# Scaler configured to produce values in the range [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the feature and rescale it in a single call.
scaled_feature = minmax_scale.fit_transform(feature)

# Show the rescaled values first, then the original input for comparison.
print(scaled_feature)
print(feature)

[[0.        ]
 [0.28579586]
 [0.35724483]
 [0.42869379]
 [1.        ]]
[[-500.5]
 [-100.1]
 [   0. ]
 [ 100.1]
 [ 900.5]]


## 4.2 standardizing a feature: transform a feature to have a mean of 0 and a standard deviation of 1.（适合主成分分析）
* 注意：当存在异常值时，建议 using the median and quantile range（取代均值和标准差）

In [18]:
import numpy as np 
from sklearn import preprocessing
# Five observations of a single feature, shaped as one column.
x = np.array([-1000.1, -200.1, 500.5, 600.6, 1300.3]).reshape(-1, 1)

# Standardizer: rescales a feature to zero mean and unit standard deviation.
scaler = preprocessing.StandardScaler()

# Fit on x and apply the transformation in one step.
standardized = scaler.fit_transform(x)

# Display the standardized column.
standardized

array([[-1.58713949],
       [-0.56345921],
       [ 0.33302878],
       [ 0.46111678],
       [ 1.35645314]])

In [12]:
# Scaler based on the median and quantile range rather than mean/std,
# the variant recommended when the data contains outliers.
robust_scaler = preprocessing.RobustScaler()

# Fit on x, then display the rescaled column.
robust_scaler.fit(x).transform(x)

array([[-1.87411015],
       [-0.87498439],
       [ 0.        ],
       [ 0.12501561],
       [ 0.99887598]])

## 4.3 Normalizing Observations: rescale the feature values of each observation to have unit norm (a total length of 1)

In [17]:
import numpy as np
from sklearn.preprocessing import Normalizer
# Four observations (rows) with two features each.
features = np.array(
    [
        [-0.5, 0.5],
        [1.5, 20.2],
        [1.63, 34.4],
        [10.9, 3.3],
    ]
)

# Row-wise normalizer; "l2" selects the Euclidean norm (exponent 2).
normalizer = Normalizer(norm="l2")

# Rescale every observation (row) so that its norm equals 1.
normalizer.transform(features)

array([[-0.70710678,  0.70710678],
       [ 0.07405353,  0.99725427],
       [ 0.04733062,  0.99887928],
       [ 0.95709822,  0.28976368]])

## 4.4 _Generating_ Polynomial and Interaction Features

In [21]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
# Three observations with two features (x1, x2) each.
features = np.array([
    [2, 3],
    [4, 5],
    [6, 7],
])

# Expand the features up to degree 2 (the highest polynomial power),
# leaving out the constant bias column.
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

# Resulting columns: the two original features, the two squared terms and
# one interaction term, ordered x1, x2, x1^2, x1*x2, x2^2.
polynomial_interaction.fit_transform(features)

array([[ 2.,  3.,  4.,  6.,  9.],
       [ 4.,  5., 16., 20., 25.],
       [ 6.,  7., 36., 42., 49.]])

In [22]:
# Setting interaction_only=True drops the squared terms, so the output keeps
# the original features plus their pairwise product only.
interaction = PolynomialFeatures(
    degree=2, interaction_only=True, include_bias=False
)

# Fit on the feature matrix and display the expanded features.
interaction.fit_transform(features)

array([[ 2.,  3.,  6.],
       [ 4.,  5., 20.],
       [ 6.,  7., 42.]])

## 4.5 Transforming Features: make a custom transformation to one or more features

In [24]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Three observations with two integer features each.
features = np.array(
    [
        [2, 3],
        [4, 5],
        [6, 7],
    ]
)

# Simple element-wise function used as the custom transformation
def add_ten(x):
    """Return ``x`` with 10 added (element-wise for arrays and DataFrames)."""
    shifted = x + 10
    return shifted

# Wrap add_ten in a FunctionTransformer so it can be used like any other
# scikit-learn transformer.
ten_transformer = FunctionTransformer(func=add_ten)

# Apply the wrapped function to the feature matrix and display the result.
ten_transformer.transform(features)

array([[12, 13],
       [14, 15],
       [16, 17]])

In [26]:
# The same transformation can be done in pandas with DataFrame.apply
import pandas as pd

# Put the feature matrix into a DataFrame with named columns.
df = pd.DataFrame(data=features, columns=["feature1", "feature2"])

# Apply add_ten to the frame and display the result.
df.apply(add_ten)

Unnamed: 0,feature1,feature2
0,12,13
1,14,15
2,16,17


## 4.6 Detecting Outliers: identify extreme observations
* 方法一：EllipticEnvelope -> 此方法的一个局限性在于：需要事先设定一个 contamination（污染）参数，即观测值中 outlier 的比例
* 方法二：IQR-based detection

In [31]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Simulated data: 10 observations, 2 features, one cluster.
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Overwrite the first feature of the first two observations with an
# extreme value so that both rows stand out from the cluster.
features[:2, 0] = 10000

# Detector configured with contamination=0.1, i.e. the assumed proportion
# of outliers among the observations.
outlier_detector = EllipticEnvelope(contamination=0.1)

# Fit the detector on the modified data.
outlier_detector.fit(features)

# Show the data, then the label predicted for each observation
# (1 = inlier, -1 = outlier).
print(features)
outlier_detector.predict(features)

[[ 1.00000000e+04  3.52863145e+00]
 [ 1.00000000e+04  5.55121358e+00]
 [-1.61734616e+00  4.98930508e+00]
 [-5.25790464e-01  3.30659860e+00]
 [ 8.52518583e-02  3.64528297e+00]
 [-7.94152277e-01  2.10495117e+00]
 [-1.34052081e+00  4.15711949e+00]
 [-1.98197711e+00  4.02243551e+00]
 [-2.18773166e+00  3.33352125e+00]
 [-1.97451969e-01  2.34634916e+00]]


array([ 1, -1,  1,  1,  1,  1,  1,  1,  1,  1])

In [36]:
# Interquartile-range detection looks at a single feature:
# take the first column of the feature matrix.
feature = features[:, 0]

# Return the indices of values lying outside the IQR fences
def indices_of_outliers(x, iqr_multiplier=1.5):
    """Locate values that fall outside the interquartile-range (IQR) fences.

    A value is flagged when it is below ``q1 - iqr_multiplier * iqr`` or above
    ``q3 + iqr_multiplier * iqr``, where ``q1`` and ``q3`` are the 25th and
    75th percentiles of ``x`` and ``iqr = q3 - q1``.

    Parameters
    ----------
    x : numpy.ndarray
        Numeric values to inspect.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value must lie to be flagged.
        The default reproduces the previously hard-coded factor of 1.5.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where`` on the outlier mask, i.e. one index array
        per dimension of ``x``.
    """
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - iqr * iqr_multiplier
    upper_bound = q3 + iqr * iqr_multiplier
    return np.where((x > upper_bound) | (x < lower_bound))

# Print the values being inspected, then display the indices flagged as outliers.
print(feature)
indices_of_outliers(feature)

[ 1.00000000e+04  1.00000000e+04 -1.61734616e+00 -5.25790464e-01
  8.52518583e-02 -7.94152277e-01 -1.34052081e+00 -1.98197711e+00
 -2.18773166e+00 -1.97451969e-01]


(array([0, 1]),)

## 4.7 Handling Outliers

In [40]:
import numpy as np
import pandas as pd

# Small housing data set; the last row holds extreme values (116 bathrooms).
houses = pd.DataFrame({
    'Price': [524433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

# Observations with at least this many bathrooms are treated as outliers.
max_bathrooms = 20

# Strategy 1: filter the outliers out. The filtered frame is stored under
# its own name so the result is not computed and then thrown away; `houses`
# itself keeps all rows.
houses_filtered = houses[houses['Bathrooms'] < max_bathrooms]

# Strategy 2: mark outliers and include the marker as a feature
# (0 = regular observation, 1 = outlier).
houses["outlier"] = np.where(houses['Bathrooms'] < max_bathrooms, 0, 1)

# Strategy 3: transform the feature to dampen the effect of the outlier by
# taking the logarithm. np.log is applied to the whole column at once
# instead of looping over the values in Python.
houses['log_of_square_feet'] = np.log(houses['Square_Feet'])

# Display the frame including both derived columns.
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,outlier,log_of_square_feet
0,524433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


## 4.8 Discretizing Features