# Подключение необходимых библиотек

In [62]:
import itertools
import numpy as np
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score
)
from sklearn.feature_selection import chi2
from sklearn.base import clone
import sys
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_wine
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as split
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector



# Инициализация датасета

In [63]:
# Load the wine dataset as one DataFrame that includes the "target" column.
data = load_wine(as_frame=True).frame
y = data["target"]
# Every column except the last is used as a feature
# (assumes "target" is the last column of the frame — TODO confirm).
X = data.iloc[:,:-1]
# Stratified, shuffled 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.3, shuffle=True, random_state=0, stratify=y)


# Исчерпывающий отбор признаков (перебор всех комбинаций)

In [64]:
class ESF:
    """Exhaustive feature selector that cross-validates every feature subset.

    Every combination of columns whose size lies between ``min_features`` and
    ``max_features`` (inclusive) is scored with ``cross_val_score`` using
    accuracy; the first combination that reaches the highest mean score wins.

    Args:
        model: Estimator prototype; a fresh ``clone`` is scored for each subset.
        min_features: Smallest subset size to evaluate.
        max_features: Largest subset size to evaluate.
    """

    def __init__(self, model, min_features = 1, max_features = 1):
        self.model = model
        self.min_features = min_features
        self.max_features = max_features

    def _prepareCombinations(self, features, n):
        """Store every ``n``-element combination of ``features`` in ``self.current_combination``."""
        self.current_combination = [list(x) for x in itertools.combinations(features, n)]

    def fit(self, to_predict, to_evaluate):
        """Search all subsets and return the best-scoring columns.

        Args:
            to_predict: DataFrame with the candidate feature columns.
            to_evaluate: Target values passed to ``cross_val_score``.

        Returns:
            A DataFrame restricted to the best combination. When the frame has
            fewer columns than ``min_features`` nothing can be evaluated and a
            copy of the whole frame is returned.

        Raises:
            ValueError: If ``min_features`` exceeds ``max_features`` or is
                smaller than 1.
        """
        if self.min_features > self.max_features:
            raise ValueError("Минимум больше максимума")
        if self.min_features < 1:
            raise ValueError("Минимальное число признаков должно быть не меньше 1")

        n_columns = len(to_predict.columns)
        if n_columns < self.min_features:
            # Always hand back a DataFrame so the result type does not depend
            # on which branch was taken.
            return to_predict.copy()

        # Clamp locally: the configured max_features must survive the call so
        # the selector can be reused on a wider frame.
        upper = min(self.max_features, n_columns)

        self.best_score = -sys.float_info.max
        self.best_combination = []
        for size in range(self.min_features, upper + 1):
            self._prepareCombinations(to_predict.columns, size)
            print(len(self.current_combination))
            print("Оценка комбинаций признаков длины =", size)
            for combination in self.current_combination:
                model = clone(self.model)
                current_score = cross_val_score(
                    model, to_predict[combination], to_evaluate, scoring="accuracy"
                ).mean()
                # Strict comparison: on ties the earliest combination is kept.
                if current_score > self.best_score:
                    self.best_score = current_score
                    self.best_combination = combination
        print("Лучшие признаки:", self.best_combination)
        print("Лучшая достигнутая оценка:", self.best_score)
        return to_predict[self.best_combination]


# Тест

In [65]:

# Run the hand-written exhaustive search over subsets of 1..8 features on the
# full X, y (min_features keeps its default of 1).
exhaustiveFeatureSelector = ESF(model=SVC(), max_features=8)
exhaustiveFeatureSelector.fit(X, y)

13
Оценка комбинаций признаков длины = 1
78
Оценка комбинаций признаков длины = 2
286
Оценка комбинаций признаков длины = 3
715
Оценка комбинаций признаков длины = 4
1287
Оценка комбинаций признаков длины = 5
1716
Оценка комбинаций признаков длины = 6
1716
Оценка комбинаций признаков длины = 7
1287
Оценка комбинаций признаков длины = 8
Лучшие признаки: ['alcohol', 'malic_acid', 'ash', 'flavanoids', 'color_intensity']
Лучшая достигнутая оценка: 0.9328571428571429


Unnamed: 0,alcohol,malic_acid,ash,flavanoids,color_intensity
0,14.23,1.71,2.43,3.06,5.64
1,13.20,1.78,2.14,2.76,4.38
2,13.16,2.36,2.67,3.24,5.68
3,14.37,1.95,2.50,3.49,7.80
4,13.24,2.59,2.87,2.69,4.32
...,...,...,...,...,...
173,13.71,5.65,2.45,0.61,7.70
174,13.40,3.91,2.48,0.75,7.30
175,13.27,4.28,2.26,0.69,10.20
176,13.17,2.59,2.37,0.68,9.30


In [66]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
# Reference run with the mlxtend exhaustive selector, same subset sizes (1..8).
# NOTE(review): this is fitted on the training split, while the hand-written
# ESF above is fitted on the full X, y — the two best scores are therefore not
# directly comparable.
efs1 = EFS(SVC(), 
           min_features=1,
           max_features=8,
           scoring='accuracy',
           n_jobs=-1)
efs1.fit(X=X_train, y=y_train)

Features: 7098/7098

In [89]:
# Show the feature names of the best subset reported by the mlxtend selector.
efs1.best_feature_names_

('alcohol', 'total_phenols', 'flavanoids')

In [90]:
# Show the score the mlxtend selector reports for its best subset.
efs1.best_score_

0.9359999999999999

# Последовательный прямой отбор признаков

In [67]:
class SFS:
    """Sequential forward feature selection with an optional floating step.

    Starting from single features, each regular step extends the current best
    subset by one column and keeps the extension with the highest mean
    cross-validated accuracy. With ``floating=True`` every regular step is
    followed by an attempt to drop one feature again if that beats the best
    score recorded for the smaller size.

    Args:
        estimator: Estimator prototype; a fresh ``clone`` is scored per subset.
        n_features: Number of features to select.
        floating: Whether to run the floating (conditional removal) step.
    """

    def __init__(self, estimator, n_features, floating=False) -> None:
        self.model = estimator
        self.n_features = n_features
        self.floating=floating

    def _cv_accuracy(self, features, to_predict, columns):
        """Return the mean cross-validated accuracy of a model clone on ``columns``."""
        return cross_val_score(
            clone(self.model), features[columns], to_predict, scoring="accuracy"
        ).mean()

    def floating_step(self, features, to_predict):
        """Drop one feature at a time while that improves on the stored best.

        The size guard is re-checked on every pass, so the search never goes
        below two features (the smaller bookkeeping slots would not exist).
        """
        while self.n_chosen_features > 2:
            smaller = self.n_chosen_features - 1
            print(f"Floating step: Evaluating {smaller} features")
            candidates = [
                list(x)
                for x in itertools.combinations(self.best_i[self.n_chosen_features], smaller)
            ]
            scores = [[j, self._cv_accuracy(features, to_predict, j)] for j in candidates]
            better = [entry for entry in scores if entry[1] > self.best_i_scores[smaller]]
            if not better:
                print(f"Floating step: No improvement for subset of features with length {smaller} ")
                return
            # max() keeps the first of equally good candidates, like a stable
            # descending sort followed by [0].
            winner = max(better, key=lambda entry: entry[1])
            print(f"Floating step: Re-evaluating for subset of features with length {smaller}")
            self.best_i[smaller] = winner[0]
            self.best_i_scores[smaller] = winner[1]
            self.n_chosen_features = smaller

    def regular_step(self, features, to_predict):
        """Score all pending one-feature extensions and record the best one."""
        if self.n_chosen_features == 0:
            self.current_combinations = [[x] for x in features.columns]
        self.n_chosen_features = self.n_chosen_features + 1
        print(f"Regular step: Evaluating subsets of features with length {self.n_chosen_features}")
        scores = [
            [j, self._cv_accuracy(features, to_predict, j)]
            for j in self.current_combinations
        ]
        winner = max(scores, key=lambda entry: entry[1])
        self.best_i[self.n_chosen_features] = winner[0]
        self.best_i_scores[self.n_chosen_features] = winner[1]

    def fit(self, features, to_predict):
        """Select ``n_features`` columns and return them as a DataFrame.

        Args:
            features: DataFrame with the candidate feature columns.
            to_predict: Target values passed to ``cross_val_score``.

        Returns:
            ``features`` restricted to the selected columns.

        Raises:
            ValueError: If ``n_features`` is not between 1 and the number of
                columns in ``features``.
        """
        n_total = len(features.columns)
        if not 1 <= self.n_features <= n_total:
            raise ValueError(f"Число признаков должно быть от 1 до {n_total}")

        self.n_chosen_features = 0
        # Separate list per key; dict.fromkeys would share a single list object.
        self.best_i = {size: [] for size in range(1, self.n_features + 1)}
        self.best_i_scores = {size: 0 for size in range(1, self.n_features + 1)}
        while self.n_features > self.n_chosen_features:
            self.regular_step(features, to_predict)
            if self.floating:
                self.floating_step(features, to_predict)
            chosen = self.best_i[self.n_chosen_features]
            # Walk the columns in frame order so ties are broken reproducibly
            # (a set difference has no stable order between runs).
            remaining = [x for x in features.columns if x not in chosen]
            self.current_combinations = [chosen + [x] for x in remaining]
        print("Лучшие признаки по количеству:", self.best_i)
        print("Оценки лучших наборов:", self.best_i_scores)
        print(f"Лучшие {self.n_features} признаков:", self.best_i[self.n_features])
        print(f"Лучшая оценка для {self.n_features}:", self.best_i_scores[self.n_features])
        # Slice the frame that was passed in, not a module-level variable.
        return features[self.best_i[self.n_features]]
        

# Тест

In [68]:
# Hand-written forward selection of 8 features, floating step disabled (default).
sfs = SFS(SVC(), 8)
sfs.fit(X, y)

Regular step: Evaluating subsets of features with length 1
Regular step: Evaluating subsets of features with length 2
Regular step: Evaluating subsets of features with length 3
Regular step: Evaluating subsets of features with length 4
Regular step: Evaluating subsets of features with length 5
Regular step: Evaluating subsets of features with length 6
Regular step: Evaluating subsets of features with length 7
Regular step: Evaluating subsets of features with length 8
Лучшие признаки по количеству: {1: ['flavanoids'], 2: ['flavanoids', 'alcohol'], 3: ['flavanoids', 'alcohol', 'hue'], 4: ['flavanoids', 'alcohol', 'hue', 'nonflavanoid_phenols'], 5: ['flavanoids', 'alcohol', 'hue', 'nonflavanoid_phenols', 'total_phenols'], 6: ['flavanoids', 'alcohol', 'hue', 'nonflavanoid_phenols', 'total_phenols', 'ash'], 7: ['flavanoids', 'alcohol', 'hue', 'nonflavanoid_phenols', 'total_phenols', 'ash', 'proanthocyanins'], 8: ['flavanoids', 'alcohol', 'hue', 'nonflavanoid_phenols', 'total_phenols', 'ash'

Unnamed: 0,flavanoids,alcohol,hue,nonflavanoid_phenols,total_phenols,ash,proanthocyanins,color_intensity
0,3.06,14.23,1.04,0.28,2.80,2.43,2.29,5.64
1,2.76,13.20,1.05,0.26,2.65,2.14,1.28,4.38
2,3.24,13.16,1.03,0.30,2.80,2.67,2.81,5.68
3,3.49,14.37,0.86,0.24,3.85,2.50,2.18,7.80
4,2.69,13.24,1.04,0.39,2.80,2.87,1.82,4.32
...,...,...,...,...,...,...,...,...
173,0.61,13.71,0.64,0.52,1.68,2.45,1.06,7.70
174,0.75,13.40,0.70,0.43,1.80,2.48,1.41,7.30
175,0.69,13.27,0.59,0.43,1.59,2.26,1.35,10.20
176,0.68,13.17,0.60,0.53,1.65,2.37,1.46,9.30


In [69]:
# Reference run: mlxtend forward selection of 8 features without floating,
# for comparison with the hand-written SFS result.
test_sequence = SequentialFeatureSelector(SVC(), k_features=8, forward=True, floating=False)
test_sequence.fit(X,y)
test_sequence.k_feature_names_



('alcohol',
 'ash',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue')

In [70]:
# Hand-written forward selection of 8 features with the floating step enabled.
sfs = SFS(SVC(), n_features=8, floating=True)
sfs.fit(X, y)

Regular step: Evaluating subsets of features with length 1
Regular step: Evaluating subsets of features with length 2
Regular step: Evaluating subsets of features with length 3
Floating step: Evaluating 2 features
Floating step: No improvement for subset of features with length 2 
Regular step: Evaluating subsets of features with length 4
Floating step: Evaluating 3 features
Floating step: No improvement for subset of features with length 3 
Regular step: Evaluating subsets of features with length 5
Floating step: Evaluating 4 features
Floating step: No improvement for subset of features with length 4 
Regular step: Evaluating subsets of features with length 6
Floating step: Evaluating 5 features
Floating step: Re-evaluating for subset of features with length 5
Floating step: Evaluating 4 features
Floating step: No improvement for subset of features with length 4 
Regular step: Evaluating subsets of features with length 6
Floating step: Evaluating 5 features
Floating step: No improveme

Unnamed: 0,flavanoids,alcohol,hue,ash,nonflavanoid_phenols,color_intensity,malic_acid,proanthocyanins
0,3.06,14.23,1.04,2.43,0.28,5.64,1.71,2.29
1,2.76,13.20,1.05,2.14,0.26,4.38,1.78,1.28
2,3.24,13.16,1.03,2.67,0.30,5.68,2.36,2.81
3,3.49,14.37,0.86,2.50,0.24,7.80,1.95,2.18
4,2.69,13.24,1.04,2.87,0.39,4.32,2.59,1.82
...,...,...,...,...,...,...,...,...
173,0.61,13.71,0.64,2.45,0.52,7.70,5.65,1.06
174,0.75,13.40,0.70,2.48,0.43,7.30,3.91,1.41
175,0.69,13.27,0.59,2.26,0.43,10.20,4.28,1.35
176,0.68,13.17,0.60,2.37,0.53,9.30,2.59,1.46


In [71]:
# Reference run: mlxtend forward selection of 8 features with floating enabled.
test_sequence = SequentialFeatureSelector(SVC(), k_features=8, forward=True, floating=True)
test_sequence.fit(X,y)
test_sequence.k_feature_names_



('alcohol',
 'malic_acid',
 'ash',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue')

# Последовательный отбор от большего к меньшему числу признаков 

In [72]:
class SBS:
    """Sequential backward feature selection with an optional floating step.

    Starting from all columns, each regular step removes one feature from the
    current best subset and keeps the removal with the highest mean
    cross-validated accuracy, until ``n_features`` columns remain.

    Args:
        estimator: Estimator prototype; a fresh ``clone`` is scored per subset.
        n_features: Number of features to keep.
        floating: Whether to run the floating step after each regular step.
    """

    def __init__(self, estimator, n_features, floating=True) -> None:
        self.model = estimator
        self.n_features = n_features
        self.floating=floating

    def _cv_accuracy(self, features, to_predict, columns):
        """Return the mean cross-validated accuracy of a model clone on ``columns``."""
        return cross_val_score(
            clone(self.model), features[columns], to_predict, scoring="accuracy"
        ).mean()

    def floating_step(self, features, to_predict):
        """Try to step back to a larger subset that beats the stored best.

        The size guard is re-checked on every pass so the lookup of the
        next-larger best subset never goes past the recorded sizes.
        """
        n_total = len(features.columns)
        while self.n_chosen_features < n_total - 2:
            prev_count = self.n_chosen_features + 1
            print(f"Floating step: Evaluating subsets of features with length {prev_count}")
            # NOTE(review): these candidates are subsets of the stored best set
            # one size up, i.e. the same subsets regular_step scored when it
            # first reached this size; with a deterministic scorer the strict
            # '>' below can then hardly ever succeed. A textbook floating
            # backward step would try re-adding excluded features instead —
            # confirm the intended algorithm.
            candidates = [
                list(x)
                for x in itertools.combinations(self.best_i[prev_count + 1], prev_count)
            ]
            scores = [[j, self._cv_accuracy(features, to_predict, j)] for j in candidates]
            better = [entry for entry in scores if entry[1] > self.best_i_scores[prev_count]]
            if not better:
                print(f"Floating step: No improvement for subset of features with length {prev_count} ")
                return
            # max() keeps the first of equally good candidates, like a stable
            # descending sort followed by [0].
            winner = max(better, key=lambda entry: entry[1])
            print(f"Floating step: Re-evaluating for subset of features with length {prev_count}")
            self.best_i[prev_count] = winner[0]
            self.best_i_scores[prev_count] = winner[1]
            self.n_chosen_features = prev_count

    def regular_step(self, features, to_predict):
        """Score all pending one-feature removals and record the best one."""
        n_total = len(features.columns)
        if self.n_chosen_features == n_total:
            self.current_combinations = [
                list(x) for x in itertools.combinations(features.columns, n_total - 1)
            ]
        self.n_chosen_features = self.n_chosen_features - 1
        print(f"Regular step: Evaluating subsets of features with length {self.n_chosen_features}")
        scores = [
            [j, self._cv_accuracy(features, to_predict, j)]
            for j in self.current_combinations
        ]
        winner = max(scores, key=lambda entry: entry[1])
        self.best_i[self.n_chosen_features] = winner[0]
        self.best_i_scores[self.n_chosen_features] = winner[1]

    def fit(self, features, to_predict):
        """Reduce ``features`` to ``n_features`` columns and return them.

        Args:
            features: DataFrame with the candidate feature columns.
            to_predict: Target values passed to ``cross_val_score``.

        Returns:
            ``features`` restricted to the selected columns.

        Raises:
            ValueError: If ``n_features`` is smaller than 1 or not smaller than
                the number of columns (nothing would be removed).
        """
        n_total = len(features.columns)
        if not 1 <= self.n_features < n_total:
            raise ValueError(f"Число признаков должно быть от 1 до {n_total - 1}")

        self.n_chosen_features = n_total
        # Separate list per key; dict.fromkeys would share a single list object.
        self.best_i = {size: [] for size in range(self.n_features, n_total)}
        self.best_i_scores = {size: 0 for size in range(self.n_features, n_total)}
        while self.n_features < self.n_chosen_features:
            self.regular_step(features, to_predict)
            if self.floating:
                self.floating_step(features, to_predict)
            self.current_combinations = [
                list(x)
                for x in itertools.combinations(
                    self.best_i[self.n_chosen_features], self.n_chosen_features - 1
                )
            ]
        print("Лучшие признаки по количеству:", self.best_i)
        print("Оценки лучших наборов:", self.best_i_scores)
        print(f"Лучшие {self.n_features} признаков:", self.best_i[self.n_features])
        print(f"Лучшая оценка для {self.n_features}:", self.best_i_scores[self.n_features])
        # Slice the frame that was passed in, not a module-level variable.
        return features[self.best_i[self.n_features]]

In [73]:
# Hand-written backward selection down to 8 features, floating step disabled.
sbs = SBS(SVC(), 8, floating=False)
sbs.fit(X, y)

Regular step: Evaluating subsets of features with length 12
Regular step: Evaluating subsets of features with length 11
Regular step: Evaluating subsets of features with length 10
Regular step: Evaluating subsets of features with length 9
Regular step: Evaluating subsets of features with length 8
Лучшие признаки по количеству: {8: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'proline'], 9: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proline'], 10: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'color_intensity', 'proline'], 11: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'proline'], 12: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols',

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,735.0
...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,840.0


('alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'proline')

In [74]:
# Reference run: mlxtend backward selection down to 8 features without floating.
test_sequence = SequentialFeatureSelector(SVC(), k_features=8, forward=False, floating=False)
test_sequence.fit(X,y)
test_sequence.k_feature_names_



('alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'proline')

In [75]:
# Hand-written backward selection down to 8 features with the floating step enabled.
sbs = SBS(SVC(), 8, floating=True)
sbs.fit(X, y)

Regular step: Evaluating subsets of features with length 12
Regular step: Evaluating subsets of features with length 11
Regular step: Evaluating subsets of features with length 10
Floating step: Evaluating subsets of features with length 11
Floating step: No improvement for subset of features with length 11 
Regular step: Evaluating subsets of features with length 9
Floating step: Evaluating subsets of features with length 10
Floating step: No improvement for subset of features with length 10 
Regular step: Evaluating subsets of features with length 8
Floating step: Evaluating subsets of features with length 9
Floating step: No improvement for subset of features with length 9 
Лучшие признаки по количеству: {8: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'proline'], 9: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proline'], 10: ['alcohol', 'malic_acid', 'a

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,735.0
...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,840.0


In [76]:
# Reference run: mlxtend backward selection down to 8 features with floating enabled.
test_sequence = SequentialFeatureSelector(SVC(), k_features=8, forward=False, floating=True)
test_sequence.fit(X,y)
test_sequence.k_feature_names_



('alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'proline')

# Рекурсивный отбор признаков

In [77]:
class RFE:
    def __init__(self, model, n_features, getter='coeff', step=1) -> None:
        self.model = model
        self.getter = getter
        self.step = step
        self.n_features = n_features
    
    def _get_important_features(self, model):
        match self.getter:
            case 'importance':
                return model.feature_importances_
            case 'coeff':
                return model.coef_
            case _:
                raise ValueError("Неизвестный способ извлечения важнеости признаков")
                

    def fit(self, X, y):
        current_X = X
        while len(current_X.columns) > self.n_features:
            estimator = clone(self.model)
            estimator.fit(current_X, y)
            important_features = self._get_important_features(estimator)
            ranks = list(zip(current_X.columns, important_features))
            sorted_ranks = sorted(ranks, key=lambda x: x[1], reverse=True)[:-self.step]
            current_X = current_X[[x[0] for x in sorted_ranks]]
        return current_X 

# Тест

In [78]:
from sklearn.feature_selection import RFE as RFESklearn
from sklearn.tree import DecisionTreeClassifier
# Hand-written RFE and the scikit-learn reference, both keeping 4 features
# and dropping one per round.
# NOTE(review): DecisionTreeClassifier is created without random_state, so the
# two runs may not be reproducible or comparable — confirm.
rfe = RFE(DecisionTreeClassifier(), 4, 'importance', 1)
rfe_test = RFESklearn(DecisionTreeClassifier(), n_features_to_select=4, step=1, importance_getter='auto')

rfe.fit(X, y)

Unnamed: 0,proline,od280/od315_of_diluted_wines,flavanoids,hue
0,1065.0,3.92,3.06,1.04
1,1050.0,3.40,2.76,1.05
2,1185.0,3.17,3.24,1.03
3,1480.0,3.45,3.49,0.86
4,735.0,2.93,2.69,1.04
...,...,...,...,...
173,740.0,1.74,0.61,0.64
174,750.0,1.56,0.75,0.70
175,835.0,1.56,0.69,0.59
176,840.0,1.62,0.68,0.60


In [79]:
# Fit the scikit-learn RFE and tabulate its per-column `support_` values.
# NOTE(review): the column is labelled 'Rank' but holds `support_`, not a
# ranking — presumably a selected/not-selected mask; verify the label.
rfe_test.fit(X, y)
pd.DataFrame(rfe_test.support_,index=X.columns,columns=['Rank'])

Unnamed: 0,Rank
alcohol,False
malic_acid,False
ash,False
alcalinity_of_ash,False
magnesium,False
total_phenols,False
flavanoids,True
nonflavanoid_phenols,False
proanthocyanins,False
color_intensity,False


# Отбор признаков по вариативности

In [81]:
from sklearn.feature_selection import VarianceThreshold
class VT:
    """Variance-threshold selector: keeps columns whose variance reaches a threshold.

    Args:
        threshold: Minimum population variance a column needs to be kept.
    """

    def __init__(self, threshold) -> None:
        self.threshold = threshold

    def _variance(self, data):
        """Return the population variance (divisor ``n``) of ``data`` as a float.

        Raises:
            ValueError: If ``data`` is empty, where the variance is undefined.
        """
        values = np.asarray(data, dtype=float)
        if values.size == 0:
            raise ValueError("Нельзя вычислить дисперсию пустой выборки")
        return float(np.var(values))

    def fit(self, X):
        """Compute per-column variances and return the columns that pass.

        Args:
            X: DataFrame with numeric feature columns.

        Returns:
            ``X`` restricted to columns whose variance is ``>= threshold``.
            The variances are stored as plain floats in ``self.candidates``.
        """
        self.candidates = {
            name: self._variance(X[name].to_numpy()) for name in X.columns
        }
        best = [
            name for name, variance in self.candidates.items()
            if variance >= self.threshold
        ]
        print("Лучшие признаки с вариативностью: ", best)
        return X[best]


# Тест

In [82]:
# Run the hand-written selector and the scikit-learn reference with the same
# threshold so their selections can be compared.
vt = VT(threshold=(0.8*(1-0.8)))
vt_test = VarianceThreshold(threshold=(0.8*(1-0.8)))
vt.fit(X)
vt_test.fit(X)

# Index the columns with the support mask directly; there is no need to
# re-test every column for membership in the already-filtered index.
concol = list(X.columns[vt_test.get_support()])

for features in concol:
    print(features)

Лучшие признаки с вариативностью:  ['alcohol', 'malic_acid', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'proanthocyanins', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']
alcohol
malic_acid
alcalinity_of_ash
magnesium
total_phenols
flavanoids
proanthocyanins
color_intensity
od280/od315_of_diluted_wines
proline


# Выбор K лучших признаков согласно переданной метрике

In [83]:
class KBest:
    """Select the ``k_features`` columns with the highest score.

    Args:
        scoring_func: Callable ``(X, y)`` returning either an array of scores
            or a ``(scores, pvalues)`` list/tuple, one entry per column.
        k_features: Number of columns to keep.
    """

    def __init__(self, scoring_func, k_features) -> None:
        self.scoring_func = scoring_func
        self.k_features = k_features

    def sort(self):
        """Rank ``self.candidates`` by score, highest first, and return the top names.

        The selected names are also stored in ``self.best_features``. Columns
        with equal scores keep their original relative order.
        """
        ranked = sorted(
            self.candidates.items(), key=lambda item: item[1][0], reverse=True
        )
        self.best_features = [name for name, _ in ranked[:self.k_features]]
        return list(self.best_features)

    def fit(self, X, y):
        """Score every column of ``X`` and return the best ``k_features`` of them.

        Args:
            X: DataFrame with the candidate feature columns.
            y: Target values passed to ``scoring_func``.

        Returns:
            ``X`` restricted to the selected columns, best first.
        """
        score_func_res = self.scoring_func(X, y)
        if isinstance(score_func_res, (list, tuple)):
            scores, pvalues = score_func_res
            pvalues = np.asarray(pvalues)
        else:
            scores, pvalues = score_func_res, None
        # Both spellings are populated because the original code used the
        # bare and the underscore-suffixed names in different branches.
        self.scores = self.scores_ = np.asarray(scores)
        self.pvalues = self.pvalues_ = pvalues
        self.candidates = {
            name: [self.scores[i], None if pvalues is None else pvalues[i]]
            for i, name in enumerate(X.columns)
        }
        return X[self.sort()]

# Тест

In [84]:
from sklearn.feature_selection import SelectKBest
# Hand-written selector and the scikit-learn reference, both picking 4
# features with the chi2 scoring function.
kbest = KBest(scoring_func=chi2, k_features=4)
kbest_test = SelectKBest(score_func=chi2, k=4)
kbest.fit(X, y)

Unnamed: 0,proline,color_intensity,flavanoids,magnesium
0,1065.0,5.64,3.06,127.0
1,1050.0,4.38,2.76,100.0
2,1185.0,5.68,3.24,101.0
3,1480.0,7.80,3.49,113.0
4,735.0,4.32,2.69,118.0
...,...,...,...,...
173,740.0,7.70,0.61,95.0
174,750.0,7.30,0.75,102.0
175,835.0,10.20,0.69,120.0
176,840.0,9.30,0.68,120.0


In [87]:
# Fit the scikit-learn selector and list the column names picked out by its
# support mask.
kbest_test.fit(X, y)
X.columns.values[kbest_test.get_support()]


array(['magnesium', 'flavanoids', 'color_intensity', 'proline'],
      dtype=object)

# Гибридный метод - пайплайн, в котором каждый этап выполняет отбор признаков

In [None]:
class FSPipeline:
    """Chain of feature selectors applied one after another.

    Args:
        pipeline: List of selector objects. Each must expose ``fit`` that
            takes either ``(X)`` or ``(X, y)`` and returns the reduced data.
    """

    def __init__(self, pipeline: list) -> None:
        self.tasks = pipeline

    @staticmethod
    def _accepts_target(fit) -> bool:
        """Return True when ``fit`` can be called with two positional arguments."""
        import inspect

        # Binding against the real signature also handles extra optional
        # parameters and callables that expose no __code__ attribute.
        try:
            inspect.signature(fit).bind(None, None)
        except TypeError:
            return False
        return True

    def select_features(self, X, y):
        """Run every selector in order, feeding each the previous output.

        Args:
            X: Initial feature data.
            y: Target values, passed only to selectors whose ``fit`` accepts them.

        Returns:
            The data returned by the last selector (``X`` itself when the
            pipeline is empty).
        """
        current_X = X
        for task in self.tasks:
            if self._accepts_target(task.fit):
                current_X = task.fit(current_X, y)
            else:
                current_X = task.fit(current_X)
        return current_X

# Тест

In [None]:
# Two-stage pipeline: variance filter first, then an exhaustive search over
# subsets of 4..7 of the surviving features.
tasks = [
    VT(threshold=(0.8*(1-0.8))),
    ESF(SVC(), min_features=4, max_features=7)
]
pipeline = FSPipeline(tasks)
pipeline.select_features(X, y)

Best features by variance:  ['alcohol', 'malic_acid', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'proanthocyanins', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']
210
Оценка комбинаций признаков длины = 4
252
Оценка комбинаций признаков длины = 5
210
Оценка комбинаций признаков длины = 6
120
Оценка комбинаций признаков длины = 7
Лучшие признаки: ['alcohol', 'malic_acid', 'flavanoids', 'color_intensity']
Лучшая достигнутая оценка: 0.9273015873015874


Unnamed: 0,alcohol,malic_acid,flavanoids,color_intensity
0,14.23,1.71,3.06,5.64
1,13.20,1.78,2.76,4.38
2,13.16,2.36,3.24,5.68
3,14.37,1.95,3.49,7.80
4,13.24,2.59,2.69,4.32
...,...,...,...,...
173,13.71,5.65,0.61,7.70
174,13.40,3.91,0.75,7.30
175,13.27,4.28,0.69,10.20
176,13.17,2.59,0.68,9.30


# Фабрика - паттерн программирования

In [88]:
class SequentialFS:
    """Factory that dispatches to forward (SFS) or backward (SBS) selection.

    Args:
        model: Estimator prototype handed to the chosen selector.
        n_features: Number of features to select.
        forward: ``True`` for forward selection (SFS), ``False`` for backward
            selection (SBS).
        floating: Whether the chosen selector runs its floating step.
    """

    def __init__(self, model, n_features, forward=False, floating=False) -> None:
        self.model = model
        self.forward = forward
        self.floating = floating
        self.n_features = n_features

    def fit(self, X, y):
        """Build the selector matching ``forward`` and return its ``fit`` result.

        Raises:
            TypeError: If ``forward`` is not exactly ``True`` or ``False``.
        """
        # Identity checks keep the original strictness: 0/1 or other truthy
        # values are rejected rather than coerced.
        if self.forward is True:
            selector = SFS(self.model, self.n_features, self.floating)
        elif self.forward is False:
            selector = SBS(self.model, self.n_features, self.floating)
        else:
            raise TypeError("Неизвестный последовательный отбор признаков")
        return selector.fit(X, y)