# Feature selection

Воспользуемся набором данных `Adult`, в котором нужно предсказать, превышает ли доход человека 50 тыс. долларов в год (бинарная классификация).

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

TARGET = "income"
# Columns that get one-hot encoded below.
CATEGORICAL_COLUMNS = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native-country",
]

# "?" is parsed as a missing value (NaN) while reading the file.
data = pd.read_csv("data/adult.csv", na_values=["?"])
# Impute without in-place mutation so the cell is safe to re-run:
# numeric gaps get the column mean, whatever is still missing afterwards
# gets the most frequent value of its column.
# NOTE(review): both statistics are computed on the full table, i.e. before the
# train/test split — confirm that this is acceptable for the demo.
data = data.fillna(data.mean(numeric_only=True))
data = data.fillna(data.mode().iloc[0])
# Binary target: 1 where the raw label equals ">50K", 0 otherwise.
data[TARGET] = (data[TARGET] == ">50K").astype(np.int64)
data = pd.get_dummies(data, columns=CATEGORICAL_COLUMNS)
# A bare `data.head()` in the middle of a cell renders nothing (only the last
# expression of a cell is shown), so display the preview explicitly.
display(data.head())

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=0)

# Separate the feature columns from the target column in each part.
X_train, y_train = data_train.drop(columns=[TARGET]), data_train[TARGET]
X_test, y_test = data_test.drop(columns=[TARGET]), data_test[TARGET]

print("Train size", X_train.shape[0])
print("Test size", X_test.shape[0])

Обучим линейную модель (логистическую регрессию) на всех признаках.

In [None]:
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LogisticRegression


def run(X_train, y_train, X_test, y_test, max_iter=10000, verbose=True):
    """Fit a logistic regression on the numeric columns and measure accuracy.

    Columns of ``X_train`` that are not of a numeric dtype are dropped, and the
    same column selection is applied to ``X_test``.

    Args:
        X_train: DataFrame with the training features.
        y_train: Training labels (anything exposing ``to_numpy()``).
        X_test: DataFrame with the test features; must contain the columns
            kept from ``X_train``.
        y_test: Test labels (anything exposing ``to_numpy()``).
        max_iter: Passed through to ``LogisticRegression``.
        verbose: When true, print dataset sizes, the number of features and
            both accuracies.

    Returns:
        A ``(classifier, train_accuracy)`` tuple. The test accuracy is computed
        and (when ``verbose``) printed, but it is not returned.
    """
    numeric_columns = [name for name in X_train.columns if is_numeric_dtype(X_train[name])]
    num_dropped = len(X_train.columns) - len(numeric_columns)
    if verbose and num_dropped > 0:
        print("Drop", num_dropped, "categorical features")

    train_frame = X_train[numeric_columns]
    test_frame = X_test[numeric_columns]
    if verbose:
        print("Train size", len(train_frame))
        print("Test size", len(test_frame))
        print("Num features", len(train_frame.columns))

    # The classifier is fed plain arrays; labels are flattened to 1-D.
    train_features, train_labels = train_frame.to_numpy(), y_train.to_numpy().flatten()
    test_features, test_labels = test_frame.to_numpy(), y_test.to_numpy().flatten()

    cls = LogisticRegression(max_iter=max_iter)
    cls.fit(train_features, train_labels)

    # Accuracy = share of predictions that match the labels.
    train_accuracy = (cls.predict(train_features) == train_labels).mean()
    test_accuracy = (cls.predict(test_features) == test_labels).mean()
    if verbose:
        print("Train accuracy: {}".format(train_accuracy))
        print("Test accuracy: {}".format(test_accuracy))
    return cls, train_accuracy
    
# Baseline: fit on every feature. `model` is reused later by the SHAP cell;
# `accuracy` is the train accuracy that run() returns.
model, accuracy = run(X_train, y_train, X_test, y_test)

# Жадный алгоритм

In [None]:
from tqdm import tqdm

K = 5  # number of features the greedy forward search should select

features = []
# Cap the number of rounds at the number of available columns: a round with no
# remaining candidates would otherwise append None to `features` and break
# X_train[features] later on.
for _ in range(min(K, len(X_train.columns))):
    best_feature = None
    # Start below any reachable accuracy so the first candidate is always accepted.
    max_accuracy = float("-inf")
    for column in tqdm(X_train.columns):
        if column in features:
            continue
        new_features = features + [column]
        # run() returns the train accuracy, so the selection is driven by the
        # training data only. A dedicated name keeps the baseline `accuracy`
        # from the full-feature run intact.
        _, candidate_accuracy = run(
            X_train[new_features], y_train, X_test[new_features], y_test, verbose=False
        )
        if candidate_accuracy > max_accuracy:
            max_accuracy = candidate_accuracy
            best_feature = column
    print("Add", best_feature, "Accuracy:", max_accuracy)
    features.append(best_feature)

In [None]:
# List the selected features in the order the greedy search added them (1-based).
for rank, feature_name in enumerate(features, start=1):
    print("{}\t{}".format(rank, feature_name))

In [None]:
# Refit on the greedily selected features only; with the default verbose=True
# run() prints both the train and the test accuracy.
run(X_train[features], y_train, X_test[features], y_test)

**Качество выше, чем на полном наборе признаков.**

# Интерпретация с SHAP

In [None]:
import shap

# Positional index (used with .iloc below) of the training row to explain.
sample_id = 3

# Background data for the explainer — presumably 100 rows sampled from X_train;
# confirm the sampling behaviour against the shap documentation.
X100 = shap.utils.sample(X_train, 100)
# Explain the hard class predictions of `model` (predict, not predict_proba).
explainer = shap.Explainer(model.predict, X100)
# Double brackets keep the selection a one-row DataFrame instead of a Series.
shap_values = explainer(X_train.iloc[[sample_id]])
# Waterfall plot for that single explained row, with max_display set to 10.
shap.plots.waterfall(shap_values[0], max_display=10)