## Normalization

In [72]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# ------ toy ------
def normalize_l1(sample):
    """
    :param sample: DataFrame 的一行(axis=1) , 一个 Series
    :return: 经过 l1 正则的样本 , 一个 Series
    """
    return sample/sum(abs(sample))

print(data.head().apply(normalize_l1, axis=1))  # 对每一个样本操作
print('------------------------------------')

# ------ normalize function ------
print(pd.DataFrame(normalize(data, norm='l1'), columns=iris.feature_names).head())
print('------------------------------------')

# ------ Normalizer ------
scaler = Normalizer(norm='l1').fit(data)
print(pd.DataFrame(scaler.transform(data), columns=iris.feature_names).head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.500000          0.343137           0.137255          0.019608
1           0.515789          0.315789           0.147368          0.021053
2           0.500000          0.340426           0.138298          0.021277
3           0.489362          0.329787           0.159574          0.021277
4           0.490196          0.352941           0.137255          0.019608
------------------------------------
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.500000          0.343137           0.137255          0.019608
1           0.515789          0.315789           0.147368          0.021053
2           0.500000          0.340426           0.138298          0.021277
3           0.489362          0.329787           0.159574          0.021277
4           0.490196          0.352941           0.137255          0.019608
------------------------------------
   sepal lengt

## Binarization

In [73]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import binarize
from sklearn.preprocessing import Binarizer

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# ------ toy ------
def binarize_one(sample):
    """
    :param sample:  DataFrame 的一行(axis=1) , 一个 Series
    :return: Series > 1 元素变为 1、Series <= 1 元素变为 0 , 一个 Series
    """
    # Feature values below or equal to this are replaced by 0, above it by 1.
    sample.loc[sample > 1] = 1
    sample.loc[sample <= 1] = 0
    return sample

print(data.head().apply(binarize_one, axis=1))
print('------------------------------------')

# ------ binarize function ------
print(pd.DataFrame(binarize(data, threshold=1), columns=iris.feature_names).head())

# ------ Binarizer ------
scaler = Binarizer(threshold=1).fit(data)
print(pd.DataFrame(scaler.transform(data), columns=iris.feature_names).head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                0.0               0.0                0.0               0.0
1                0.0               0.0                0.0               0.0
2                0.0               0.0                0.0               0.0
3                0.0               0.0                0.0               0.0
4                0.0               0.0                0.0               0.0
------------------------------------
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                0.0               0.0                0.0               0.0
1                0.0               0.0                0.0               0.0
2                0.0               0.0                0.0               0.0
3                0.0               0.0                0.0               0.0
4                0.0               0.0                0.0               0.0
   sepal length (cm)  sepal width (cm)  petal lengt

## Encoding categorical features


In [74]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# The input to this transformer should be a matrix of integers, denoting the values taken on by categorical (discrete) features
data = pd.DataFrame([['male', 'from Europe', 'uses IE'], 
              ['female', 'from US', 'uses Firefox'], 
              ['male', 'from Asia', 'uses Chrome'],
              ['female', 'from Europe', 'uses Safari']],
             columns=['sex', 'homeland', 'browser'])

def category_to_integer(feature):
    """
    :param feature: DataFrame 的一列(axis=0) , 一个 Series
    :return: 使用 integer 编码后的 feature , 一个 Series
    """
    category_to_integer = {}
   
    for integer, category in enumerate(np.unique(feature)):
        category_to_integer[category] = integer
    
    return feature.replace(category_to_integer)

data = data.apply(category_to_integer, axis=0)
    
# ------ OneHotEncoder ------
enc = OneHotEncoder().fit(data)
# scaler.transform() 返回 scipy.sparse.csr.csr_matrix 对象
print(pd.DataFrame(enc.transform(data).toarray()).head())

     0    1    2    3    4    5    6    7    8
0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0
1  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0
2  0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0
3  1.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0


## Imputation of missing values


In [75]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import Imputer

data = csr_matrix([[1, 2], [0, 3], [7, 6]])

imp = Imputer(missing_values=0, strategy='mean', axis=0).fit(data)
print(imp.transform(data))
print('------------------------------------')
# below 可见 .transform() 不是根据传入的数据集进行特征均值填充 , 而是根据 .fit() 传入数据集得到
print(imp.transform(csr_matrix([[0, 0], [0, 0], [0, 0]])))



[[ 1.  2.]
 [ 4.  3.]
 [ 7.  6.]]
------------------------------------
[[ 4.          3.66666667]
 [ 4.          3.66666667]
 [ 4.          3.66666667]]


## Generating polynomial features

In [76]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import PolynomialFeatures

iris = load_iris()

data = pd.DataFrame(iris.data, columns=iris.feature_names)

# ------ Polynomialfeatures ------
poly = PolynomialFeatures(degree=2).fit(data)
print(poly.transform(data).shape)
print('------------------------------------')
poly = PolynomialFeatures(degree=2, interaction_only=True).fit(data)
print(poly.transform(data).shape)

(150, 15)
------------------------------------
(150, 11)


## Custom transformers

In [77]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import FunctionTransformer

iris = load_iris()

data = pd.DataFrame(iris.data, columns=iris.feature_names)

# ------ FunctionTransformer-1 ------
# 等价 np.log(data.values)
# FunctionTransformer 类的存在就是封装传入的 func 函数使得有 fit()、transform()、fit_transform() 方法 
scaler = FunctionTransformer(func=np.log).fit(data)
print(scaler.transform(data)[0:10])
print('------------------------------------')

# ------ FunctionTransformer-2 ------
def hello_world(X):
    return 'HelloWorld'

# func(data.values) hello_world(data.values)
scaler = FunctionTransformer(func=hello_world).fit(data)
print(scaler.transform(data)[0:10])

[[ 1.62924054  1.25276297  0.33647224 -1.60943791]
 [ 1.58923521  1.09861229  0.33647224 -1.60943791]
 [ 1.54756251  1.16315081  0.26236426 -1.60943791]
 [ 1.5260563   1.13140211  0.40546511 -1.60943791]
 [ 1.60943791  1.28093385  0.33647224 -1.60943791]
 [ 1.68639895  1.36097655  0.53062825 -0.91629073]
 [ 1.5260563   1.22377543  0.33647224 -1.2039728 ]
 [ 1.60943791  1.22377543  0.40546511 -1.60943791]
 [ 1.48160454  1.06471074  0.33647224 -1.60943791]
 [ 1.58923521  1.13140211  0.40546511 -2.30258509]]
------------------------------------
HelloWorld


## And So On