In [1]:
import numpy as np
from sklearn.datasets import load_boston

In [2]:
# Load the dataset object; later cells read its .data, .target and .feature_names.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version before re-running this notebook.
dataset = load_boston()

In [3]:
# Pull the feature array and the target array out of the loaded dataset object.
x, y = dataset.data, dataset.target

Подсчет коэффициента корреляции Пирсона между каждым признаком и целевой переменной:


In [4]:
# Pearson correlation of every column of x with y.
# Centre both sides first so that the dot product below is the
# (unnormalised) covariance of each column with the target.
Y = y - y.mean()
X = x - x.mean(axis=0)
# Use NumPy reductions instead of the builtin sum(): the builtin iterates in
# Python and only yields per-column sums for X because iterating a 2-D array
# walks its rows. axis=0 states that per-column reduction explicitly.
corr = Y.dot(X) / np.sqrt(np.sum(Y ** 2) * np.sum(X ** 2, axis=0))

In [5]:
# Pair each feature name with its correlation coefficient.
arr = [(name, value) for name, value in zip(dataset.feature_names, corr)]

5 признаков с наибольшей по модулю корреляцией с целевой переменной:


In [6]:
# Order the (name, correlation) pairs by descending absolute correlation,
# then show the first five.
arr.sort(key=lambda pair: -abs(pair[1]))
arr[:5]

[('LSTAT', -0.7376627261740146),
 ('RM', 0.6953599470715388),
 ('PTRATIO', -0.5077866855375608),
 ('INDUS', -0.48372516002837274),
 ('TAX', -0.46853593356776624)]

In [7]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Разделим выборку на тренировочную и тестовую и отмасштабируем признаки:

In [8]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Scaler with default settings; it is fitted and applied in the following cells.
scaler = StandardScaler()

In [10]:
# Fit the scaler on the training split only, so no test-split statistics are used.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Apply the already-fitted scaler to both splits (train first, then test).
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [12]:
# Wrap the scaled arrays in DataFrames so columns can be selected by feature name.
x_train_df, x_test_df = (
    pd.DataFrame(scaled, columns=dataset.feature_names)
    for scaled in (X_train_scaled, X_test_scaled)
)

Forward selection:

In [13]:
# Greedy forward selection: start from an empty set and, in each of 5 rounds,
# add the single remaining feature whose inclusion gives the lowest MSE.
# NOTE(review): candidates are scored on the test split (y_test / x_test_df),
# so the reported error is optimistically biased; scoring on a validation
# split or with cross-validation on the training data would avoid this.
features = list(dataset.feature_names)  # candidates not selected yet
cur_features = []                       # selected features, in selection order
for i in range(5):
    # np.inf rather than a finite sentinel: any finite error wins the round,
    # so a stale best_feature from a previous round can never be re-used.
    min_error = np.inf
    best_feature = None
    for feature in features:
        trial = cur_features + [feature]
        lr = LinearRegression(n_jobs=-1).fit(x_train_df[trial], y_train)
        error = mean_squared_error(y_test, lr.predict(x_test_df[trial]))
        if error < min_error:
            min_error = error
            best_feature = feature
    if best_feature is None:
        # Only reachable when every candidate produced a non-finite error.
        raise ValueError('no candidate feature produced a finite error')
    cur_features.append(best_feature)
    features.remove(best_feature)
print('Лучшие признаки: ', ', '.join(cur_features))
print('Ошибка: ', min_error)

Лучшие признаки:  LSTAT, RM, PTRATIO, DIS, NOX
Ошибка:  21.94172105977485


Backward selection:

In [14]:
# Greedy backward elimination: start from all features and repeatedly drop the
# one whose removal gives the lowest MSE, until 5 features remain.
# NOTE(review): candidates are scored on the test split (y_test / x_test_df),
# so the reported error is optimistically biased; scoring on a validation
# split or with cross-validation on the training data would avoid this.
features = list(dataset.feature_names)  # features still in the model
min_error = np.inf                      # stays inf if nothing has to be dropped
# Loop on the target size instead of a hard-coded round count, so the cell
# keeps working if the number of input features changes.
while len(features) > 5:
    # np.inf rather than a finite sentinel: any finite error wins the round.
    min_error = np.inf
    best_feature = None
    for feature in features:
        # Candidate subset: everything currently kept except `feature`.
        trial = [name for name in features if name != feature]
        lr = LinearRegression(n_jobs=-1).fit(x_train_df[trial], y_train)
        error = mean_squared_error(y_test, lr.predict(x_test_df[trial]))
        if error < min_error:
            min_error = error
            best_feature = feature
    if best_feature is None:
        # Only reachable when every candidate produced a non-finite error.
        raise ValueError('no candidate feature produced a finite error')
    features.remove(best_feature)
# Leave cur_features holding the final selection rather than the last trial subset.
cur_features = list(features)
print('Лучшие признаки: ', ', '.join(features))
print('Ошибка: ', min_error)

Лучшие признаки:  NOX, RM, DIS, PTRATIO, LSTAT
Ошибка:  21.94172105977485


# Lasso

Будем перебирать параметр $\lambda$ от 0.00001 до 1 до тех пор, пока число признаков с ненулевыми коэффициентами больше 5.


In [15]:
from sklearn.linear_model import Lasso

In [16]:
# Scan an increasing grid of regularisation strengths and stop at the first
# one for which the Lasso model keeps exactly 5 non-zero coefficients.
for alpha in np.linspace(0.00001, 1, 10000):
    lasso_model = Lasso(alpha=alpha, max_iter=1000000).fit(x_train_df, y_train)
    selected = lasso_model.coef_ != 0  # boolean mask over the feature columns
    if np.count_nonzero(selected) == 5:
        # Score only the model that is actually reported.
        error = mean_squared_error(y_test, lasso_model.predict(x_test_df))
        print('lambda', alpha)
        print('Лучшие признаки: ', ', '.join(dataset.feature_names[selected]))
        print('Ошибка: ', error)
        break
else:
    # The grid was exhausted without hitting exactly 5 non-zero coefficients;
    # say so instead of finishing silently with no output.
    print('Не найдено значение lambda, при котором остаётся ровно 5 признаков')

lambda 0.7362762646264627
Лучшие признаки:  CHAS, RM, PTRATIO, B, LSTAT
Ошибка:  25.417610745527323
