# 7 特征工程

改编自 sklearn样例 https://scikit-learn.org/stable/auto_examples/impute/plot_iterative_imputer_variants_comparison.html#sphx-glr-auto-examples-impute-plot-iterative-imputer-variants-comparison-py

本例使用加州房价数据，人为添加了数据缺失，对比了不同填补方案的差异

请注意代码风格使用了pipeline，不同于我们之前的toy风格

如果感觉阅读有困难，可先学习pipeline相关知识

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer

## 7.1 缺失值填充

In [None]:
# Number of cross-validation folds used by every scoring call below.
N_SPLITS = 5

# Read the housing table from a local CSV file. The eight predictors are
# selected by column name; the regression target is stored in column 'y'.
dataset = pd.read_csv('california_housing.csv')
feature_names = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
X_full = dataset[feature_names].to_numpy()
y_full = dataset['y'].to_numpy()

In [None]:
# Keep only every 10th sample so the demonstration runs quickly.
# Skip this cell to work with the full sample instead.
X_full, y_full = X_full[::10], y_full[::10]


In [None]:
# Baseline: cross-validated score on the complete data (no missing values).
# Unlike the original sklearn example, plain ridge regression is used here
# rather than its Bayesian variant.
n_samples, n_features = X_full.shape
br_estimator = Ridge()
score_full_data = pd.DataFrame(
    {
        "Full Data": cross_val_score(
            br_estimator,
            X_full,
            y_full,
            cv=N_SPLITS,
            scoring="neg_mean_squared_error",
        )
    }
)

In [None]:

# Blank out exactly one feature in every row. The column is drawn
# independently and uniformly per row, so NaNs are spread over all columns;
# missingness this pervasive is fairly rare in practice.
rng = np.random.RandomState(0)
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, size=n_samples, replace=True)

# Work on a copy so that X_full stays intact for the baseline.
X_missing = X_full.copy()
X_missing[missing_samples, missing_features] = np.nan
y_missing = y_full


In [None]:
# Number of NaN entries in each feature column (column-wise reduction).
list(np.isnan(X_missing).sum(axis=0))

In [None]:
# Display the first feature column of the corrupted matrix to inspect the
# injected NaN entries.
X_missing[:,0]

In [None]:
# Score the simple imputation strategies: every NaN is replaced by the
# mean or the median of its column.
score_simple_imputer = pd.DataFrame()
for strategy in ("mean", "median"):
    # Pipeline: SimpleImputer fills the gaps, then ridge regression is fitted.
    estimator = make_pipeline(
        SimpleImputer(strategy=strategy, missing_values=np.nan),
        br_estimator,
    )
    # One column of cross-validation scores per strategy.
    score_simple_imputer[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

In [None]:
# Score k-nearest-neighbour imputation with both neighbour weightings.
score_simple_knn = pd.DataFrame()
for strategy in ("uniform", "distance"):
    # Pipeline: KNNImputer (2 neighbours) fills the gaps, then ridge is fitted.
    estimator = make_pipeline(
        KNNImputer(n_neighbors=2, weights=strategy, missing_values=np.nan),
        br_estimator,
    )
    # One column of cross-validation scores per weighting scheme.
    score_simple_knn[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

In [None]:


# Score iterative imputation of the missing values, once for each of four
# internal regressors. All of them are considerably more complex than the
# simple strategies.
estimators = [
    # Bayesian ridge regression.
    BayesianRidge(),
    # Random forest; hyperparameters were tuned to get good enough predictive
    # performance within a restricted execution time.
    RandomForestRegressor(
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    # Ridge regression on top of a degree-2 polynomial kernel approximation.
    make_pipeline(
        Nystroem(kernel="polynomial", degree=2, random_state=0),
        Ridge(alpha=1e3),
    ),
    # k-nearest-neighbours regression.
    KNeighborsRegressor(n_neighbors=15),
]

# IterativeImputer is sensitive to the tolerance and depends on the estimator
# used internally. The tolerances (one per estimator above) were tuned to keep
# this example cheap to run while not changing the results too much compared
# with the stricter default tolerance.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)

# Collects one column of cross-validation scores per internal estimator.
score_iterative_imputer = pd.DataFrame()
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        # max_iter caps the number of imputation rounds and tol is the
        # stopping tolerance: iteration ends when either limit is reached.
        IterativeImputer(
            estimator=impute_estimator,
            max_iter=25,
            tol=tol,
            random_state=0,
        ),
        # The final regressor is still ridge.
        br_estimator,
    )
    # Columns are keyed by the internal estimator's class name.
    score_iterative_imputer[type(impute_estimator).__name__] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# Put all score tables side by side under a two-level column index
# (imputer family, variant).
scores = pd.concat(
    [score_full_data, score_simple_imputer, score_simple_knn, score_iterative_imputer],
    axis=1,
    keys=["Original", "SimpleImputer", "KNNImputer", "IterativeImputer"],
)

In [None]:
# Horizontal bar chart of the mean MSE per imputation method, with the
# standard deviation across folds as error bars.
means = -scores.mean()  # scores hold negated MSE, so flip the sign
errors = scores.std()

fig, ax = plt.subplots(figsize=(13, 6))
means.plot.barh(xerr=errors, ax=ax)
ax.set_title("California Housing Regression with Different Imputation Methods")
ax.set_xlabel("MSE (smaller is better)")
# One tick per bar, labelled "<family> w/ <variant>" from the column index.
ax.set_yticks(np.arange(len(means)))
ax.set_yticklabels([" w/ ".join(label) for label in means.index])
plt.tight_layout(pad=1)
plt.savefig('缺失_2k_逐行.png')
plt.show()

In [None]:
# Keep an independent copy of the current score table so that later cells can
# rebind `scores` without losing this run's results.
df_2k_perrow = scores.copy()

In [None]:
# Experiment: the full dataset (no subsampling), one missing value per row.
N_SPLITS = 5
rng = np.random.RandomState(0)

# Reload the data so that this cell does not depend on earlier subsampling.
dataset = pd.read_csv('california_housing.csv')
feature_names = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
X_full = dataset[feature_names].to_numpy()
y_full = dataset['y'].to_numpy()
n_samples, n_features = X_full.shape

# Baseline: cross-validated score on the complete data. Unlike the original
# sklearn example, plain ridge regression is used instead of Bayesian ridge.
br_estimator = Ridge()
score_full_data = pd.DataFrame(
    {
        "Full Data": cross_val_score(
            br_estimator,
            X_full,
            y_full,
            cv=N_SPLITS,
            scoring="neg_mean_squared_error",
        )
    }
)

# Blank out exactly one feature in every row; the column is drawn
# independently and uniformly per row, so NaNs are spread over all columns.
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, size=n_samples, replace=True)
X_missing = X_full.copy()
X_missing[missing_samples, missing_features] = np.nan
y_missing = y_full

# Simple imputation: column mean / column median.
score_simple_imputer = pd.DataFrame()
for strategy in ("mean", "median"):
    estimator = make_pipeline(
        SimpleImputer(strategy=strategy, missing_values=np.nan),
        br_estimator,
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# k-nearest-neighbour imputation with both neighbour weightings.
score_simple_knn = pd.DataFrame()
for strategy in ("uniform", "distance"):
    estimator = make_pipeline(
        KNNImputer(n_neighbors=2, weights=strategy, missing_values=np.nan),
        br_estimator,
    )
    score_simple_knn[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# Iterative imputation with four different (and fairly complex) internal
# regressors.
estimators = [
    # Bayesian ridge regression.
    BayesianRidge(),
    # Random forest; hyperparameters were tuned to get good enough predictive
    # performance within a restricted execution time.
    RandomForestRegressor(
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    # Ridge regression on top of a degree-2 polynomial kernel approximation.
    make_pipeline(
        Nystroem(kernel="polynomial", degree=2, random_state=0),
        Ridge(alpha=1e3),
    ),
    # k-nearest-neighbours regression.
    KNeighborsRegressor(n_neighbors=15),
]

# IterativeImputer is sensitive to the tolerance and depends on the estimator
# used internally; these per-estimator tolerances keep the run cheap without
# changing the results too much compared with the stricter default.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)

score_iterative_imputer = pd.DataFrame()
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        # max_iter caps the number of imputation rounds and tol is the
        # stopping tolerance: iteration ends when either limit is reached.
        IterativeImputer(
            estimator=impute_estimator,
            max_iter=25,
            tol=tol,
            random_state=0,
        ),
        # The final regressor is still ridge.
        br_estimator,
    )
    # Columns are keyed by the internal estimator's class name.
    score_iterative_imputer[type(impute_estimator).__name__] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# Put all score tables side by side under a two-level column index.
scores = pd.concat(
    [score_full_data, score_simple_imputer, score_simple_knn, score_iterative_imputer],
    axis=1,
    keys=["Original", "SimpleImputer", "KNNImputer", "IterativeImputer"],
)

# Horizontal bar chart of the mean MSE per method with fold-to-fold standard
# deviation as error bars.
means = -scores.mean()  # scores hold negated MSE, so flip the sign
errors = scores.std()
fig, ax = plt.subplots(figsize=(13, 6))
means.plot.barh(xerr=errors, ax=ax)
ax.set_title("California Housing Regression with Different Imputation Methods")
ax.set_xlabel("MSE (smaller is better)")
ax.set_yticks(np.arange(len(means)))
ax.set_yticklabels([" w/ ".join(label) for label in means.index])
plt.tight_layout(pad=1)
plt.savefig('缺失_20k_逐行.png')
plt.show()

In [None]:
# Keep an independent copy of the current score table so that later cells can
# rebind `scores` without losing this run's results.
df_20k_perrow = scores.copy()

In [None]:
# Experiment: 10% subsample, one missing value per row, but the missing
# entries are confined to the first half of the feature columns.
N_SPLITS = 5

rng = np.random.RandomState(0)

dataset = pd.read_csv('california_housing.csv')
feature_names = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
X_full = dataset[feature_names].values
y_full = dataset['y'].values

# Keep every 10th sample only.
X_full = X_full[::10]
y_full = y_full[::10]


n_samples, n_features = X_full.shape
# Estimate the score on the entire dataset, with no missing values.
# Unlike the original sklearn example, plain ridge regression is used here
# rather than its Bayesian variant.
br_estimator = Ridge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
    ),
    columns=["Full Data"],
)


# Add a single missing value to each row, drawn only from the first half of
# the columns. The bound is derived from n_features instead of a literal 4,
# so it stays correct if the feature list changes; with the eight features
# listed above the affected columns are MedInc, HouseAge, AveRooms and
# AveBedrms, while the remaining columns stay complete.
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features // 2, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan


# Estimate the score after simple imputation (mean and median strategies).
score_simple_imputer = pd.DataFrame()
for strategy in ("mean", "median"):
    estimator = make_pipeline(
        # SimpleImputer fills NaNs with the column mean/median, then ridge.
        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator
    )
    # Cross-validation scores for this strategy.
    score_simple_imputer[strategy] = cross_val_score(
        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
    )

# Estimate the score after k-nearest-neighbour imputation.
score_simple_knn = pd.DataFrame()
for strategy in ("uniform", "distance"):
    estimator = make_pipeline(
        # KNNImputer with 2 neighbours and the given weighting, then ridge.
        KNNImputer(missing_values=np.nan, n_neighbors=2, weights=strategy), br_estimator
    )
    # Cross-validation scores for this weighting scheme.
    score_simple_knn[strategy] = cross_val_score(
        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
    )


# Estimate the score after iterative imputation of the missing values
# with four different (and fairly complex) internal estimators.
estimators = [
    # Bayesian ridge regression.
    BayesianRidge(),
    # Random forest regression.
    RandomForestRegressor(
        # We tuned the hyperparameters of the RandomForestRegressor to get a good
        # enough predictive performance for a restricted execution time.
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    # Ridge regression on a degree-2 polynomial kernel approximation.
    make_pipeline(
        Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)
    ),
    # k-nearest-neighbours regression.
    KNeighborsRegressor(n_neighbors=15),
]
# New DataFrame that collects the scores.
score_iterative_imputer = pd.DataFrame()
# The iterative imputer is sensitive to the tolerance and
# dependent on the estimator used internally.
# The tolerance was tuned to keep this example running with limited
# computational resources while not changing the results too much compared to
# keeping the stricter default value for the tolerance parameter.

tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        IterativeImputer(
            # max_iter caps the number of imputation rounds and tol is the
            # stopping tolerance: iteration ends when either limit is reached.
            random_state=0, estimator=impute_estimator, max_iter=25, tol=tol
        ),
        # The final regressor is still ridge.
        br_estimator,
    )
    score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(
        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
    )

# Put all score tables side by side under a two-level column index.
scores = pd.concat(
    [score_full_data, score_simple_imputer,score_simple_knn, score_iterative_imputer],
    keys=["Original", "SimpleImputer",'KNNImputer', "IterativeImputer"],
    axis=1,
)

# Plot the results: mean MSE per method with fold-to-fold standard deviation.
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()  # scores hold negated MSE, so flip the sign
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_title("California Housing Regression with Different Imputation Methods")
ax.set_xlabel("MSE (smaller is better)")
ax.set_yticks(np.arange(means.shape[0]))
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
plt.savefig('缺失_2k_逐行_half_col.png')
plt.show()

In [None]:
# Keep an independent copy of the current score table so that later cells can
# rebind `scores` without losing this run's results.
df_2k_perrow_halfcol = scores.copy()

In [None]:
# Experiment: 10% subsample, one missing value in every second row only.
N_SPLITS = 5
rng = np.random.RandomState(0)

dataset = pd.read_csv('california_housing.csv')
feature_names = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
# Keep every 10th sample only.
X_full = dataset[feature_names].to_numpy()[::10]
y_full = dataset['y'].to_numpy()[::10]
n_samples, n_features = X_full.shape

# Baseline: cross-validated score on the complete data. Unlike the original
# sklearn example, plain ridge regression is used instead of Bayesian ridge.
br_estimator = Ridge()
score_full_data = pd.DataFrame(
    {
        "Full Data": cross_val_score(
            br_estimator,
            X_full,
            y_full,
            cv=N_SPLITS,
            scoring="neg_mean_squared_error",
        )
    }
)

# Blank out one randomly chosen feature in the even-indexed rows
# (0, 2, 4, ...); the odd-indexed rows stay complete. `missing_samples`
# enumerates the affected rows and is doubled to get their row index.
missing_samples = np.arange(n_samples // 2)
missing_features = rng.choice(n_features, size=n_samples // 2, replace=True)
X_missing = X_full.copy()
X_missing[2 * missing_samples, missing_features] = np.nan
y_missing = y_full

# Simple imputation: column mean / column median.
score_simple_imputer = pd.DataFrame()
for strategy in ("mean", "median"):
    estimator = make_pipeline(
        SimpleImputer(strategy=strategy, missing_values=np.nan),
        br_estimator,
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# k-nearest-neighbour imputation with both neighbour weightings.
score_simple_knn = pd.DataFrame()
for strategy in ("uniform", "distance"):
    estimator = make_pipeline(
        KNNImputer(n_neighbors=2, weights=strategy, missing_values=np.nan),
        br_estimator,
    )
    score_simple_knn[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# Iterative imputation with four different (and fairly complex) internal
# regressors.
estimators = [
    # Bayesian ridge regression.
    BayesianRidge(),
    # Random forest; hyperparameters were tuned to get good enough predictive
    # performance within a restricted execution time.
    RandomForestRegressor(
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    # Ridge regression on top of a degree-2 polynomial kernel approximation.
    make_pipeline(
        Nystroem(kernel="polynomial", degree=2, random_state=0),
        Ridge(alpha=1e3),
    ),
    # k-nearest-neighbours regression.
    KNeighborsRegressor(n_neighbors=15),
]

# IterativeImputer is sensitive to the tolerance and depends on the estimator
# used internally; these per-estimator tolerances keep the run cheap without
# changing the results too much compared with the stricter default.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)

score_iterative_imputer = pd.DataFrame()
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        # max_iter caps the number of imputation rounds and tol is the
        # stopping tolerance: iteration ends when either limit is reached.
        IterativeImputer(
            estimator=impute_estimator,
            max_iter=25,
            tol=tol,
            random_state=0,
        ),
        # The final regressor is still ridge.
        br_estimator,
    )
    # Columns are keyed by the internal estimator's class name.
    score_iterative_imputer[type(impute_estimator).__name__] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring="neg_mean_squared_error",
    )

# Put all score tables side by side under a two-level column index.
scores = pd.concat(
    [score_full_data, score_simple_imputer, score_simple_knn, score_iterative_imputer],
    axis=1,
    keys=["Original", "SimpleImputer", "KNNImputer", "IterativeImputer"],
)

# Horizontal bar chart of the mean MSE per method with fold-to-fold standard
# deviation as error bars.
means = -scores.mean()  # scores hold negated MSE, so flip the sign
errors = scores.std()
fig, ax = plt.subplots(figsize=(13, 6))
means.plot.barh(xerr=errors, ax=ax)
ax.set_title("California Housing Regression with Different Imputation Methods")
ax.set_xlabel("MSE (smaller is better)")
ax.set_yticks(np.arange(len(means)))
ax.set_yticklabels([" w/ ".join(label) for label in means.index])
plt.tight_layout(pad=1)
plt.savefig('缺失_2k_半行.png')
plt.show()

In [None]:
# Keep an independent copy of the current score table so that later cells can
# rebind `scores` without losing this run's results.
df_2k_halfrow = scores.copy()

In [None]:
# Combine the four experiment snapshots side by side. The keys become the
# outermost column level, so they end up in the CSV header and in the tick
# labels of the overview plot (label typo "hal col" corrected to "half col").
full_score = pd.concat(
    [df_2k_perrow, df_20k_perrow, df_2k_perrow_halfcol, df_2k_halfrow],
    keys=["base", "20k", "half col", "half row"],
    axis=1,
)

In [None]:
# Display the combined score table.
full_score

In [None]:
# Write the combined score table to a CSV file in the working directory.
full_score.to_csv('compare_different_missing.csv')

In [None]:
# Overview chart: mean MSE of every (experiment, imputer family, variant)
# combination, with the standard deviation across folds as error bars.
scores = full_score  # note: rebinds the global `scores` to the combined table
means = -scores.mean()  # scores hold negated MSE, so flip the sign
errors = scores.std()

fig, ax = plt.subplots(figsize=(13, 6))
means.plot.barh(xerr=errors, ax=ax)
ax.set_title("California Housing Regression with Different Imputation Methods")
ax.set_xlabel("MSE (smaller is better)")
# One tick per bar; the column-index levels are joined with "--".
ax.set_yticks(np.arange(len(means)))
ax.set_yticklabels(["--".join(label) for label in means.index])
plt.tight_layout(pad=1)
plt.savefig('缺失大全.png')
plt.show()

## 7.2 Encoding

In [None]:
# Toy categorical table with three string-valued columns and four rows.
# The second column of the last row is missing (NaN).
X = [
    ['男', '学士', '程序员'],
    ['女', '硕士', '公务员'],
    ['男', '博士', '外卖员'],
    ['女', np.nan, '交易员'],
]

In [None]:
from sklearn import preprocessing

# Ordinal encoding: fit an encoder on the toy table, then display the
# encoded version of the same table.
enc_ord = preprocessing.OrdinalEncoder().fit(X)
enc_ord.transform(X)

In [None]:
# '教师' does not occur in the third column of X, so the encoder is asked to
# transform a category it was not fitted on.
# NOTE(review): with the encoder's default settings this presumably raises an
# error, which looks like an intentional demonstration -- confirm.
enc_ord.transform([['女','硕士','教师']])

In [None]:
# Refit the ordinal encoder, this time asking for missing values to be
# encoded as -1, and display the encoded table.
enc_ord = preprocessing.OrdinalEncoder(encoded_missing_value=-1).fit(X)
enc_ord.transform(X)

In [None]:
enc_oh = preprocessing.OneHotEncoder()
enc_oh.fit(X)
enc_oh.transform(X)

In [None]:
# Convert the encoded result with .toarray() so the one-hot matrix is shown
# in full.
enc_oh.transform(X).toarray()

In [None]:
# Display the categories_ attribute of the fitted one-hot encoder.
enc_oh.categories_

In [None]:
# '教师' does not occur in the third column of X, so the encoder is asked to
# transform a category it was not fitted on.
# NOTE(review): with the encoder's default settings this presumably raises an
# error, which looks like an intentional demonstration -- confirm.
enc_oh.transform([['女','硕士','教师']])

In [None]:
# Refit the one-hot encoder with handle_unknown='infrequent_if_exist' and
# encode a row whose third value ('教师') was not present during fitting.
# NOTE(review): presumably the unseen value is then tolerated rather than
# rejected -- confirm against the installed scikit-learn version.
enc_oh = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist').fit(X)
enc_oh.transform(X)  # result is discarded; only the last expression is shown
enc_oh.transform([['女', '硕士', '教师']]).toarray()