# 变量选择
在本案例中，我们使用 winsorized（按 AveOccup 的 1% 和 99% 分位数去除极端值）的房价数据，来进行变量选择演示。

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Names of the eight predictor columns; the target column 'y' is read separately.
feature_names = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
# Load the housing table from the CSV in the working directory.
dataset = pd.read_csv('california_housing.csv')


In [None]:
# Human-readable description for each column name (predictors plus target 'y').
feature_mapping = dict(
    MedInc="Median income in block in $1,000",
    HouseAge="Median house age in block",
    AveRooms="Average number of rooms",
    AveBedrms="Average number of bedrooms",
    Population="Block population",
    AveOccup="Average house occupancy",
    Latitude="House block latitude",
    Longitude="House block longitude",
    y="Median House Price in $100,000",
)

In [None]:
# Summary statistics of the raw data exactly as loaded from the CSV.
dataset.describe()

In [None]:
# 1st percentile of AveOccup: the lower cut-off applied when the data is trimmed.
dataset['AveOccup'].quantile(0.01)

In [None]:
# 99th percentile of AveOccup: the upper cut-off applied when the data is trimmed.
dataset['AveOccup'].quantile(0.99)

In [None]:
# Trim extreme blocks: keep only the rows whose AveOccup lies strictly between
# its 1st and 99th percentiles.
# Both conditions are combined into ONE boolean mask built on `dataset`. The
# chained form `dataset[mask_hi][mask_lo]` applied a full-length mask to an
# already-filtered frame, which makes pandas emit
# "Boolean Series key will be reindexed to match DataFrame index".
ave_occup = dataset['AveOccup']
occup_low, occup_high = ave_occup.quantile([0.01, 0.99])
dataset_win = dataset[(ave_occup > occup_low) & (ave_occup < occup_high)]


In [None]:
# Renumber the rows after filtering (drop=True discards the old index instead
# of keeping it as a column), then show summary statistics of the trimmed data.
dataset_win = dataset_win.reset_index(drop=True)
dataset_win.describe()

In [None]:
# Plain NumPy views of the trimmed data: predictor matrix and target vector.
y_full = dataset_win['y'].to_numpy()
X_full = dataset_win[feature_names].to_numpy()

# 1.单变量选择

## 1.1 方差选择

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance filter with cut-off 1, applied to the raw (unscaled) predictors.
# NOTE(review): variances depend on each column's units, so this cut-off is
# only meaningful relative to the raw scales — confirm that is intended.
var_selector = VarianceThreshold(threshold=1)
X_sel_var = var_selector.fit_transform(X_full)


In [None]:
# (rows, columns) of the matrix left after the variance filter.
X_sel_var.shape

## 1.2 评分选择

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import r_regression


def abs_r_regression(X, y):
    """Score each column of X by the magnitude of its Pearson r with y.

    SelectKBest keeps the k features with the HIGHEST scores. Pearson r is
    signed, so scoring on the raw value would rank strongly negatively
    correlated features last even though they are informative. Taking the
    absolute value ranks features by strength of linear association only.
    """
    return np.abs(r_regression(X, y))


# Keep the 4 predictors with the largest |Pearson r| against the target.
# Note: uni_kbest.scores_ therefore holds magnitudes, not signed correlations.
uni_kbest = SelectKBest(abs_r_regression, k=4)
X_sel_uniscore = uni_kbest.fit_transform(X_full, y_full)

In [None]:
# (rows, columns) of the matrix returned by the SelectKBest step (k=4).
X_sel_uniscore.shape

In [None]:
# Show every predictor next to the univariate score SelectKBest computed for it.
for name, score in zip(feature_names, uni_kbest.scores_):
    print(name, score)

## 1.3 标准化数据

In [None]:
# One histogram per predictor, each preceded by the column name and rendered
# as its own figure.
for column in feature_names:
    print(column)
    column_values = dataset_win[column]
    column_values.hist()
    plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every predictor column; X is the scaled counterpart of X_full.
scaler = StandardScaler()
X = scaler.fit_transform(X_full)

In [None]:
# Sanity check of the standardized matrix: each column should show mean ≈ 0 and std ≈ 1.
pd.DataFrame(X).describe()

# 2. 基于模型选择

## 2.1 Ridge 的结果

In [None]:
from sklearn.linear_model import RidgeCV

# Cross-validated Ridge on the standardized predictors, choosing among five
# penalties log-spaced between 1e-6 and 1e6.
alpha_grid = np.logspace(-6, 6, num=5)
ridge = RidgeCV(alphas=alpha_grid).fit(X, y_full)

# Absolute coefficient size is used as the importance of each predictor.
importance = np.abs(ridge.coef_)
# Later cells index feature_names with boolean masks, which needs an ndarray.
feature_names = np.array(feature_names)

plt.figure(figsize=(12, 8))
plt.bar(x=feature_names, height=importance)
plt.title("Feature importances via Ridge")
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel

# Recompute the Ridge importances here instead of reading the shared
# `importance` variable: that name is rebound to the random-forest importances
# in a later cell, so re-running this cell afterwards would otherwise derive
# the threshold from the wrong model.
ridge_importance = np.abs(ridge.coef_)

# Cut-off placed just above the 5th-largest |coefficient|, so the 5th-largest
# and everything below it fall under the threshold (at most 4 features remain).
threshold = np.sort(ridge_importance)[-5] + 0.01

sfm = SelectFromModel(ridge, threshold=threshold).fit(X, y_full)
print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}")

## 2.2 线性删除（序列特征选择：前向 / 后向）

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from time import time

# Forward pass: select 5 features with the Ridge estimator, timing
# construction + fit together.
tic_fwd = time()
sfs_forward = SequentialFeatureSelector(
    ridge, n_features_to_select=5, direction="forward"
)
sfs_forward.fit(X, y_full)
toc_fwd = time()

# Backward pass: same estimator and target size, opposite direction.
tic_bwd = time()
sfs_backward = SequentialFeatureSelector(
    ridge, n_features_to_select=5, direction="backward"
)
sfs_backward.fit(X, y_full)
toc_bwd = time()

# Report the chosen feature names and the wall-clock time of each pass.
forward_selected = feature_names[sfs_forward.get_support()]
print(f"Features selected by forward sequential selection: {forward_selected}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")

backward_selected = feature_names[sfs_backward.get_support()]
print(f"Features selected by backward sequential selection: {backward_selected}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")

## 2.3 使用随机森林回归

In [None]:
from sklearn.ensemble import RandomForestRegressor
plt.figure(figsize=(12,8))

# random_state is fixed so the forest, and therefore the importances and the
# bar chart, are reproducible across "Restart & Run All"; without it every
# run produces slightly different values.
RF = RandomForestRegressor(min_samples_leaf=20, random_state=0).fit(X, y_full)
# NOTE: this rebinds `importance`, which previously held the Ridge |coefficients|.
importance = np.abs(RF.feature_importances_)
# Ensure an ndarray even if this cell is run before the Ridge cell converted it.
feature_names = np.array(feature_names)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via RandomForest")
plt.show()