In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import (VarianceThreshold, SelectKBest, f_classif, SelectFromModel, SequentialFeatureSelector)
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
x_data_generated, y_data_generated = make_classification(
    n_samples=1000,
    n_informative=10,
    n_features=20,
    scale=1,
    random_state=42,
)

In [3]:
scaler = StandardScaler()
scaler.fit(x_data_generated)
x_data_generated = scaler.transform(x_data_generated)

In [16]:
def model(x, y):
    crossVal = cross_val_score(LogisticRegression(), x, y, scoring='accuracy').mean()
    print('Accuracy: %.3f' % crossVal)
    return crossVal

In [18]:
run = model(x_data_generated, y_data_generated)

Accuracy: 0.850


In [19]:
df = pd.DataFrame(x_data_generated, columns=[f'Feature_{i}' for i in range(x_data_generated.shape[1])])
df['target'] = y_data_generated

correlations = df.corr()['target'].abs().sort_values(ascending=False)
correlations = correlations[1:]

corr_threshold = 0.1
selected_corr = correlations[correlations > corr_threshold].index.tolist()
selected_corr

['Feature_5',
 'Feature_12',
 'Feature_7',
 'Feature_14',
 'Feature_15',
 'Feature_16',
 'Feature_2',
 'Feature_18']

In [20]:
df1 = df[selected_corr]
run1 = model(df1, y_data_generated)


Accuracy: 0.822


In [21]:
varianceThreshold = VarianceThreshold(threshold=0.8)
x_var = varianceThreshold.fit_transform(x_data_generated)
run2 = model(x_var, y_data_generated)


Accuracy: 0.850


In [22]:
k_best = SelectKBest(score_func=f_classif, k=5)
x_k_best = k_best.fit_transform(x_data_generated, y_data_generated)

run3 = model(x_k_best, y_data_generated)

Accuracy: 0.822


In [24]:
l1_model = LogisticRegression(penalty='l1', random_state=42, max_iter=1000, solver='liblinear')
l1_model.fit(x_data_generated, y_data_generated)

l1_selector = SelectFromModel(l1_model, prefit=True, threshold='mean')
x_l1_selector = l1_selector.transform(x_data_generated)
run4 = model(x_l1_selector, y_data_generated)

Accuracy: 0.848


In [25]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_data_generated, y_data_generated)

rf_selector = SelectFromModel(rf_model, prefit=True, threshold='mean')
x_rf_selector = rf_selector.transform(x_data_generated)
run5 = model(x_rf_selector, y_data_generated)

Accuracy: 0.849


In [26]:
sfs_selector = SequentialFeatureSelector(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    n_features_to_select='auto',
    direction='forward',
    scoring='accuracy',
    cv=3,
    n_jobs=-1
)

x_sfs = sfs_selector.fit_transform(x_data_generated, y_data_generated)

run6 = model(x_sfs, y_data_generated)

Accuracy: 0.851


In [28]:
result = {
    'Метод отбора признаков': [
        'Без отбора (все признаки)',
        'Корреляция с целевой переменной',
        'Отсечение низковариативных',
        'f_classif (5 лучших)',
        'L1-регуляризация (LogisticRegression)',
        'RandomForest (feature_importance)',
        'SequentialFeatureSelector'
    ],
    'Количество признаков': [
        x_data_generated.shape[1],
        len(selected_corr),
        x_var.shape[1],
        x_k_best.shape[1],
        x_l1_selector.shape[1],
        x_rf_selector.shape[1],
        x_data_generated.shape[1]
    ],
    'Средняя точность модели': [
        run,
        run1,
        run2,
        run3,
        run4,
        run5,
        run6,
    ]
}

df_result = pd.DataFrame(result)
df_result

Unnamed: 0,Метод отбора признаков,Количество признаков,Средняя точность модели
0,Без отбора (все признаки),20,0.85
1,Корреляция с целевой переменной,8,0.822
2,Отсечение низковариативных,20,0.85
3,f_classif (5 лучших),5,0.822
4,L1-регуляризация (LogisticRegression),7,0.848
5,RandomForest (feature_importance),9,0.849
6,SequentialFeatureSelector,20,0.851
