# СЕМИНАР. Выбор признаков

<hr>

Папулин С.Ю. (papulin.study@yandex.ru)

<a name="0"></a>
<div><span style="font-size:16pt; font-weight:bold">Содержание</span>
    <ol>
        <li><a href="#1">Загрузка исходных данных</a></li>
        <li><a href="#2">Подход с порогом дисперсии</a></li>
        <li><a href="#3">Подход с L1 регуляризацией</a></li>
        <li><a href="#4">Подход с перебором признаков</a></li>
        <li><a href="#5">Подход с постепенным уменьшением количества признаков</a></li>
        <li><a href="#6">Источники</a></li>
    </ol>
</div>

In [None]:
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np

In [None]:
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression, Lasso

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">1. Загрузка исходных данных</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
# Path to the Advertising dataset, relative to the notebook location.
FILE_PATH = "../data/Advertising.csv"

# Read the CSV; its first column is used as the row index.
df = pd.read_csv(
    FILE_PATH,
    sep=",",
    index_col=0,
)
df.head(5)

Набор данных:
- `TV`, `radio` и `newspaper` - бюджеты на рекламу
- `sales` - количество продаж

In [None]:
# Target column name, and the remaining columns used as features.
Y_COLUMN = "sales"
X_COLUMNS = df.columns.drop(Y_COLUMN)

In [None]:
# Pairwise scatter plots (with per-column histograms) of every column in the data set.
scatter_matrix(
    df,
    figsize=[12, 12],
)
plt.show()

In [None]:
# Heat map of the pairwise correlation matrix of all columns (features and target).
labels = df.columns.to_list()
tick_positions = list(range(len(labels)))

fig = plt.figure(figsize=[8, 8])
ax = fig.add_subplot(1, 1, 1)
cax = ax.matshow(df.corr(), vmin=-1, vmax=1)
fig.colorbar(cax)

# Fix the tick positions explicitly before labelling them: setting tick labels
# alone relies on whatever ticks the automatic locator happens to choose, which
# can misalign the labels and triggers a warning on recent matplotlib versions.
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels)

plt.show()

In [None]:
# Separate the feature matrix from the target vector.
df_X, df_y = df[X_COLUMNS], df[Y_COLUMN]

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=1234,
)
X_train.head(5)

In [None]:
# Baseline: ordinary least squares on all features.
lr_model = LinearRegression().fit(X_train, y_train)

# Mean squared error on the training and on the held-out test split.
train_mse = mean_squared_error(y_true=y_train, y_pred=lr_model.predict(X_train))
test_mse = mean_squared_error(y_true=y_test, y_pred=lr_model.predict(X_test))

print("Ошибка обучения: {}".format(train_mse))
print("Ошибка тестирования: {}".format(test_mse))

In [None]:
# Test-set predictions of the all-features model, reused by the plots below.
y_test__pred = lr_model.predict(X_test)

In [None]:
plt.figure("1", figsize=[14, 4])

# One panel per feature: observed (blue) vs predicted (red) sales on the test set.
for position, feature in enumerate(["TV", "radio", "newspaper"], start=1):
    plt.subplot(1, 4, position)
    plt.title("Prediction: {}-sales".format(feature))
    plt.plot(X_test[feature], y_test, "ob", label="True")
    plt.plot(X_test[feature], y_test__pred, "or", label="Predicted")
    plt.xlabel("$X_{%s}$" % feature)
    plt.ylabel("$Y_{sales}$")
    plt.legend()
    plt.grid(True)

# Last panel: observed against predicted values with the identity line as reference.
plt.subplot(1, 4, 4)
plt.title("Observation vs Prediction")
plt.scatter(y_test__pred, y_test, color="slategrey")
xlim = plt.gca().get_xlim()
plt.plot(xlim, xlim, '--', color="grey")
plt.xlim(xlim)  # restore the limits that drawing the diagonal may have widened
plt.xlabel("$\\bar{y}$")
plt.ylabel("$y$")
plt.grid(True)

plt.tight_layout()
plt.show()

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">2. Подход с порогом дисперсии</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Selector that removes features whose training-set variance is below the threshold (300 here).
sel_var = VarianceThreshold(threshold=300)

In [None]:
# Learn the per-feature variances on the training set, then keep only the
# columns that pass the threshold.
X_train_var = sel_var.fit(X_train).transform(X_train)
X_train_var[:5]

In [None]:
# Variance of each input feature as computed by the selector during fitting.
sel_var.variances_

In [None]:
# Boolean mask over the input features: True for the features that were kept.
sel_var.get_support()

In [None]:
# Names of the columns retained by the variance selector.
clmns = [name for name, kept in zip(X_COLUMNS, sel_var.get_support()) if kept]
clmns

In [None]:
# First rows of the reduced training matrix.
X_train_var[:5]

In [None]:
# Linear regression on the features kept by the variance threshold.
lr_model = LinearRegression()
lr_model.fit(X_train_var, y_train)

# Reduce the test set with the already fitted selector so that it has the same
# representation (plain array, same columns) as the data the model was fitted
# on; predicting on a DataFrame slice instead triggers scikit-learn's
# feature-name mismatch warning.
X_test_var = sel_var.transform(X_test)

train_mse = mean_squared_error(y_train, lr_model.predict(X_train_var))
test_mse = mean_squared_error(y_test, lr_model.predict(X_test_var))

print("Ошибка обучения: {}".format(train_mse))
print("Ошибка тестирования: {}".format(test_mse))

In [None]:
# Test-set predictions of the variance-threshold model. The test set is reduced
# with the fitted selector so its representation matches the array the model
# was trained on (avoids the feature-name mismatch warning).
y_test__pred = lr_model.predict(sel_var.transform(X_test))

In [None]:
plt.figure("1", figsize=[12, 4])

# One panel per retained feature: observed (blue) vs predicted (red) sales.
for position, feature in enumerate(["TV", "newspaper"], start=1):
    plt.subplot(1, 3, position)
    plt.title("Prediction: {}-sales".format(feature))
    plt.plot(X_test[feature], y_test, "ob", label="True")
    plt.plot(X_test[feature], y_test__pred, "or", label="Predicted")
    plt.xlabel("$X_{%s}$" % feature)
    plt.ylabel("$Y_{sales}$")
    plt.legend()
    plt.grid(True)

# Last panel: observed against predicted values with the identity line as reference.
plt.subplot(1, 3, 3)
plt.title("Observation vs Prediction")
plt.scatter(y_test__pred, y_test, color="slategrey")
xlim = plt.gca().get_xlim()
plt.plot(xlim, xlim, '--', color="grey")
plt.xlim(xlim)  # restore the limits that drawing the diagonal may have widened
plt.xlabel("$\\bar{y}$")
plt.ylabel("$y$")
plt.grid(True)

plt.tight_layout()
plt.show()

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">3. Подход с L1 регуляризацией</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
from sklearn.linear_model import LinearRegression, Lasso

In [None]:
# Lasso coefficient paths over a logarithmic grid of regularization strengths.
alphas = np.logspace(-5, 2, 100)

# The `normalize` argument of Lasso was removed in scikit-learn 1.2. Its effect
# is reproduced explicitly: every column is centered and divided by the L2 norm
# of the centered column, and the fitted coefficients are mapped back to the
# original feature scale afterwards.
X_values = X_train.to_numpy(dtype=float)
X_centered = X_values - X_values.mean(axis=0)
X_scale = np.linalg.norm(X_centered, axis=0)
X_scale[X_scale == 0] = 1.0  # constant columns: avoid division by zero
X_train_scaled = X_centered / X_scale

coefs = []

for alpha in alphas:
    lasso_model = Lasso(alpha=alpha, fit_intercept=True)
    lasso_model.fit(X_train_scaled, y_train)
    # Coefficients expressed in the units of the original (unscaled) features.
    coefs.append(lasso_model.coef_ / X_scale)

coefs = np.array(coefs)

In [None]:
# Coefficient estimates as a function of the regularization strength.
plt.figure("1", figsize=[12, 4])

plt.subplot(1, 2, 1)
plt.title("Parameter Estimates")

for i in range(coefs.shape[1]):
    plt.plot(alphas, coefs[:, i], label="$\\theta_{%s}$" % X_COLUMNS[i])

# The axis scale only needs to be set once, not on every loop iteration.
plt.xscale("log")
plt.grid(True)
plt.legend()
# Backslashes are escaped: "\l" is an invalid escape sequence in a normal string.
plt.xlabel("$\\lambda$")
plt.ylabel("$\\theta$")
plt.autoscale(enable=True, axis="x", tight=True)

plt.tight_layout()
plt.show()

In [None]:
# Features chosen from the Lasso coefficient paths above.
clmns = ["TV", "radio"]
X_train_lasso = X_train.loc[:, clmns]
X_train_lasso.head(5)

In [None]:
# Ordinary least squares restricted to the Lasso-selected features.
lr_model = LinearRegression().fit(X_train_lasso, y_train)

# Mean squared error on the training and on the held-out test split.
train_mse = mean_squared_error(y_true=y_train, y_pred=lr_model.predict(X_train_lasso))
test_mse = mean_squared_error(y_true=y_test, y_pred=lr_model.predict(X_test[clmns]))

print("Ошибка обучения: {}".format(train_mse))
print("Ошибка тестирования: {}".format(test_mse))

In [None]:
# Test-set predictions of the model fitted on the selected columns `clmns`.
y_test__pred = lr_model.predict(X_test[clmns])

In [None]:
plt.figure("1", figsize=[12, 4])

# One panel per selected feature: observed (blue) vs predicted (red) sales.
for position, feature in enumerate(["TV", "radio"], start=1):
    plt.subplot(1, 3, position)
    plt.title("Prediction: {}-sales".format(feature))
    plt.plot(X_test[feature], y_test, "ob", label="True")
    plt.plot(X_test[feature], lr_model.predict(X_test[clmns]), "or", label="Predicted")
    plt.xlabel("$X_{%s}$" % feature)
    plt.ylabel("$Y_{sales}$")
    plt.legend()
    plt.grid(True)

# Last panel: observed against predicted values with the identity line as reference.
plt.subplot(1, 3, 3)
plt.title("Observation vs Prediction")
plt.scatter(y_test__pred, y_test, color="slategrey")
xlim = plt.gca().get_xlim()
plt.plot(xlim, xlim, '--', color="grey")
plt.xlim(xlim)  # restore the limits that drawing the diagonal may have widened
plt.xlabel("$\\bar{y}$")
plt.ylabel("$y$")
plt.grid(True)

plt.tight_layout()
plt.show()

<a name="4"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">4. Подход с перебором признаков</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
import itertools

Количество комбинаций признаков:

In [None]:
# Number of non-empty subsets of the feature columns: 2^p - 1.
2**len(X_COLUMNS)-1

Вывод всех комбинаций признаков:

In [None]:
# features_by_num[k-1] holds every feature subset of size k, each as a list of column names.
features_by_num = []
for num in range(1, len(X_COLUMNS) + 1):
    features_set = list(map(list, itertools.combinations(X_COLUMNS, num)))
    features_by_num.append(features_set)
    print("#признаков={}: {}".format(num, features_set))

#### Выбор количества признаков с использованием BIC, AIC

In [None]:
# TODO

#### Выбор количества признаков с использованием кросс-валидации

Выбор количества признаков:

In [None]:
# Best-subset selection of the number of features:
#   1. for every subset size, pick the subset with the lowest training error;
#   2. choose the size whose best subset has the lowest cross-validated error.
kf = KFold(n_splits=5)

# Validation error of the best subset for each subset size (index = size - 1).
val_errors = list()

for features_set in features_by_num:

    train_errors_by_num = list()
    val_errors_by_num = list()

    print("Количество признаков: {}\n".format(len(features_set[0])))

    for features in features_set:

        # cross_validate clones the estimator and fits it on every fold itself,
        # so fitting it on the full training set beforehand is unnecessary.
        lr_model = LinearRegression()

        scores = cross_validate(lr_model, X_train[features], y_train, cv=kf, return_train_score=True,
                                scoring=["neg_mean_squared_error"])

        # The scorer returns negated MSE; flip the sign to get the error itself.
        val_mse = -scores["test_neg_mean_squared_error"].mean()
        train_mse = -scores["train_neg_mean_squared_error"].mean()

        train_errors_by_num.append(train_mse)
        val_errors_by_num.append(val_mse)

        print("Признаки: {}".format(features))
        print("Ошибка обучения: {}".format(train_mse))
        print("Ошибка проверочная: {}\n".format(val_mse))

    # Within one subset size, the best subset is the one with the lowest training error.
    indx_min_by_num = np.argmin(train_errors_by_num)
    val_errors.append(val_errors_by_num[indx_min_by_num])
    print("Минимальная ошибка обучения: {}".format(train_errors_by_num[indx_min_by_num]))
    print("Набор признаков с минимальной ошибкой обучения: {}".format(features_set[indx_min_by_num]))
    print("Ошибка проверочная: {}\n".format(val_errors_by_num[indx_min_by_num]))


# Subset sizes start at 1 while list indices start at 0.
feature_number_selected = np.argmin(val_errors) + 1
print("{}".format("="*30))
print("Минимальная проверочная ошибка: {}".format(val_errors[feature_number_selected-1]))
print("Выбранное количество признаков: {}".format(feature_number_selected))

Выбор лучшего набора признаков:

In [None]:
# Among all subsets of the selected size, take the one with the lowest training MSE.
candidate_sets = features_by_num[feature_number_selected - 1]

train_mse = []
for features in candidate_sets:
    lr_model = LinearRegression().fit(X_train[features], y_train)
    fitted_values = lr_model.predict(X_train[features])
    train_mse.append(mean_squared_error(y_train, fitted_values))

best_position = np.argmin(train_mse)
feature_selected = candidate_sets[best_position]
print("Выбранные признаки: {}".format(feature_selected))

Построение модели с выбранными признаками:

In [None]:
# Ordinary least squares on the subset chosen by best-subset selection.
lr_model = LinearRegression().fit(X_train[feature_selected], y_train)

# Mean squared error on the training and on the held-out test split.
train_mse = mean_squared_error(y_true=y_train, y_pred=lr_model.predict(X_train[feature_selected]))
test_mse = mean_squared_error(y_true=y_test, y_pred=lr_model.predict(X_test[feature_selected]))

print("Ошибка обучения: {}".format(train_mse))
print("Ошибка тестирования: {}".format(test_mse))

In [None]:
# Test-set predictions of the model fitted on `feature_selected`.
y_test__pred = lr_model.predict(X_test[feature_selected])

In [None]:
plt.figure("1", figsize=[12, 4])

# One panel per plotted feature: observed (blue) vs predicted (red) sales.
for position, feature in enumerate(["TV", "radio"], start=1):
    plt.subplot(1, 3, position)
    plt.title("Prediction: {}-sales".format(feature))
    plt.plot(X_test[feature], y_test, "ob", label="True")
    plt.plot(X_test[feature], y_test__pred, "or", label="Predicted")
    plt.xlabel("$X_{%s}$" % feature)
    plt.ylabel("$Y_{sales}$")
    plt.legend()
    plt.grid(True)

# Last panel: observed against predicted values with the identity line as reference.
plt.subplot(1, 3, 3)
plt.title("Observation vs Prediction")
plt.scatter(y_test__pred, y_test, color="slategrey")
xlim = plt.gca().get_xlim()
plt.plot(xlim, xlim, '--', color="grey")
plt.xlim(xlim)  # restore the limits that drawing the diagonal may have widened
plt.xlabel("$\\bar{y}$")
plt.ylabel("$y$")
plt.grid(True)

plt.tight_layout()
plt.show()

<a name="5"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">5. Подход с постепенным уменьшением количества признаков</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

#### Уменьшение до заданного количества признаков

In [None]:
# RFE and RFECV are provided by sklearn.feature_selection.
from sklearn.feature_selection import (
    RFE,
    RFECV,
)

In [None]:
# New in version 0.24
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Linear model (with intercept) used as the base estimator for RFE below.
lr_model = LinearRegression(fit_intercept=True)

*Два признака*

In [None]:
# Recursive feature elimination: drop one feature per step until two remain.
rfe = RFE(
    estimator=lr_model,
    n_features_to_select=2,
    step=1,
)
rfe.fit(X_train, y_train)

Ранг (важность) признаков:

In [None]:
# Rank assigned to each feature by RFE; the selected features have rank 1.
rfe.ranking_

Выбранные признаки:

In [None]:
# Column names for which the RFE support mask is True.
feature_selected = [name for name, kept in zip(X_COLUMNS, rfe.support_) if kept]
feature_selected

Обучение и тестирование модели с выбранными признаками:

In [None]:
# Ordinary least squares on the two features selected by RFE.
lr_model = LinearRegression().fit(X_train[feature_selected], y_train)

# Mean squared error on the training and on the held-out test split.
train_mse = mean_squared_error(y_true=y_train, y_pred=lr_model.predict(X_train[feature_selected]))
test_mse = mean_squared_error(y_true=y_test, y_pred=lr_model.predict(X_test[feature_selected]))

print("Ошибка обучения: {}".format(train_mse))
print("Ошибка тестирования: {}".format(test_mse))

In [None]:
# Test-set predictions of the model fitted on the RFE-selected features.
y_test__pred = lr_model.predict(X_test[feature_selected])

In [None]:
plt.figure("1", figsize=[12, 4])

# One panel per plotted feature: observed (blue) vs predicted (red) sales.
for position, feature in enumerate(["TV", "radio"], start=1):
    plt.subplot(1, 3, position)
    plt.title("Prediction: {}-sales".format(feature))
    plt.plot(X_test[feature], y_test, "ob", label="True")
    plt.plot(X_test[feature], lr_model.predict(X_test[feature_selected]), "or", label="Predicted")
    plt.xlabel("$X_{%s}$" % feature)
    plt.ylabel("$Y_{sales}$")
    plt.legend()
    plt.grid(True)

# Last panel: observed against predicted values with the identity line as reference.
plt.subplot(1, 3, 3)
plt.title("Observation vs Prediction")
plt.scatter(y_test__pred, y_test, color="slategrey")
xlim = plt.gca().get_xlim()
plt.plot(xlim, xlim, '--', color="grey")
plt.xlim(xlim)  # restore the limits that drawing the diagonal may have widened
plt.xlabel("$\\bar{y}$")
plt.ylabel("$y$")
plt.grid(True)

plt.tight_layout()
plt.show()

*Один признак*

In [None]:
# Recursive feature elimination down to a single feature.
rfe = RFE(estimator=lr_model, n_features_to_select=1, step=1)
rfe.fit(X_train, y_train)
feature_selected = list(X_COLUMNS[indx] for indx, state in enumerate(rfe.support_) if state)

print("Выбранные признаки: {}".format(feature_selected))

lr_model = LinearRegression()
lr_model.fit(X_train[feature_selected], y_train)

# Recompute the test predictions for THIS one-feature model. Reusing the
# `y_test__pred` left over from the two-feature model would make the
# "Observation vs Prediction" panel show the wrong model.
y_test__pred = lr_model.predict(X_test[feature_selected])

train_mse = mean_squared_error(y_train, lr_model.predict(X_train[feature_selected]))
test_mse = mean_squared_error(y_test, y_test__pred)

print("Ошибка обучения: {}".format(train_mse))
print("Ошибка тестирования: {}".format(test_mse))

# Plot against whichever feature RFE actually kept instead of a hard-coded name.
feature_name = feature_selected[0]

plt.figure("1", figsize=[8, 4])

plt.subplot(1, 2, 1)
plt.title("Prediction: {}-sales".format(feature_name))

plt.plot(X_test[feature_name], y_test, "ob", label="True")
plt.plot(X_test[feature_name], y_test__pred, "or", label="Predicted")
plt.xlabel("$X_{%s}$" % feature_name)
plt.ylabel("$Y_{sales}$")
plt.legend()
plt.grid(True)

# Observed against predicted values with the identity line as reference.
plt.subplot(1, 2, 2)
plt.title("Observation vs Prediction")
plt.scatter(y_test__pred, y_test, color="slategrey")
xlim = plt.gca().get_xlim()
plt.plot(xlim, xlim, '--', color="grey")
plt.xlim(xlim)  # restore the limits that drawing the diagonal may have widened
plt.xlabel("$\\bar{y}$")
plt.ylabel("$y$")
plt.grid(True)

plt.tight_layout()

plt.show()

#### Backward Stepwise Selection

In [None]:
# TODO

<a name="6"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">6. Источники</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

- [Sklearn: Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html)
- Chapter 6. Linear Model Selection and Regularization // An Introduction to Statistical Learning by Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani. pp. 203–264. URL: http://faculty.marshall.usc.edu/gareth-james/ISL/