# Тема семинара: отбор признаков

- Фильтрационные методы
- Оберточные методы
- Встроенные методы
- Метод главных компонент или PCA

In [25]:
import pandas as pd

In [54]:
data = pd.read_csv('Pokemon.csv')

In [55]:
data

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


Columns description (it's crucial!)


- #: ID for each pokemon
- Name: Name of each pokemon
- Type 1: Each pokemon has a type, this determines weakness/resistance to attacks
- Type 2: Some pokemon are dual type and have 2
- Total: sum of all stats that come after this, a general guide to how strong a pokemon is
- HP: hit points, or health, defines how much damage a pokemon can withstand before fainting
- Attack: the base modifier for normal attacks (eg. Scratch, Punch)
- Defense: the base damage resistance against normal attacks
- SP Atk: special attack, the base modifier for special attacks (e.g. fire blast, bubble beam)
- SP Def: the base damage resistance against special attacks
- Speed: determines which pokemon attacks first each round

In [56]:
# fillna and drop useless cols

display(data.isnull().sum())
data['Type 2'] = data['Type 2'].fillna('No 2nd type')

data.drop(columns=['#', 'Name'], inplace=True)

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [57]:
X = data.drop(columns='Legendary')
y = data['Legendary'].astype('int')

In [58]:
y.value_counts(normalize=True)

0    0.91875
1    0.08125
Name: Legendary, dtype: float64

# Make some default pipeline

In [64]:
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
import sklearn

In [65]:
# define cat_cols

cat_cols = ['Type 1', 'Type 2']

default_pipeline = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('scaler_', StandardScaler()),
    ('model_', SVC(kernel='linear'))]
)

In [70]:
cv_res1 = cross_validate(default_pipeline,
                        X,
                        y,
                        cv=5,
                        scoring='f1',
                        n_jobs=-1,
                        return_train_score=True
                       )

In [71]:
cv_res1

{'fit_time': array([0.02865791, 0.02987289, 0.02858281, 0.02825904, 0.02561617]),
 'score_time': array([0.00753808, 0.00767303, 0.00701499, 0.0068512 , 0.00666904]),
 'test_score': array([0.5       , 0.72727273, 0.47619048, 0.38095238, 0.64864865]),
 'train_score': array([0.71287129, 0.56097561, 0.6744186 , 0.7311828 , 0.72727273])}

# Make pipeline more complicated

In [69]:
# difficult pipeline

pipe_dif = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=5)),
    ('scaler_', StandardScaler()),
    ('model_', SVC(kernel='linear'))]
)

In [72]:
cv_res2 = cross_validate(pipe_dif,
                        X,
                        y,
                        cv=5,
                        scoring='f1',
                        n_jobs=-1,
                        return_train_score=True
                       )

cv_res2

{'fit_time': array([0.49763989, 0.53973198, 0.56674814, 0.53095603, 0.55187798]),
 'score_time': array([0.06022096, 0.03807998, 0.02559686, 0.03413606, 0.02689505]),
 'test_score': array([0.47058824, 0.85714286, 0.625     , 0.66666667, 0.44444444]),
 'train_score': array([0.96153846, 0.875     , 0.98113208, 0.95145631, 0.98076923])}

train_score - просто класс ! модель получилась сложная, только очевидно переобученная ...

согласны, узнали ?


# Introduce feature selectors

In [73]:
data_tr = pipe_dif[:-1]

In [74]:
data_tr

Pipeline(steps=[('cat_encoder_', LeaveOneOutEncoder(cols=['Type 1', 'Type 2'])),
                ('poly_featurizer_', PolynomialFeatures(degree=5)),
                ('scaler_', StandardScaler())])

In [77]:
X_tr = data_tr.fit_transform(X, y)
print(f'data shape after transformation is {X_tr.shape}')

data shape after transformation is (800, 3003)


3k признаков - многовато, добавим в пайплайн селектор

## Фильтрационные методы

Суть таких методов в том, чтобы для каждого признака посчитать некоторую метрику "связи" с целевым признаком. И в результате оставить топ-K признаков согласно выбранной метрике.

В том числе на лекции обсуждались:

 - статистика хи-квадрат
 - метрика mutual information

In [80]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif

In [82]:
k_best = 30

pipe1 = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=5)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func=f_classif, k=k_best)), 
    ('model_', SVC(kernel='linear'))]
)

pipe2 = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=5)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func=chi2, k=k_best)), 
    ('model_', SVC(kernel='linear'))]
)

pipe3 = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=5)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func=mutual_info_classif, k=k_best)), 
    ('model_', SVC(kernel='linear'))]
)



In [84]:
for idx, pipe in enumerate([pipe1, pipe2, pipe3]):
    cv_res = cross_validate(pipe, X, y, cv=5, scoring='f1')
    print(f'cv res for the pipeline {idx + 1} is {cv_res}')

cv res for the pipeline 1 is {'fit_time': array([6.72808385, 6.67671204, 6.67168808, 6.71307993, 6.71903896]), 'score_time': array([0.00718021, 0.00735903, 0.00734186, 0.00752997, 0.0078311 ]), 'test_score': array([0.        , 0.5       , 0.72      , 0.4       , 0.53061224])}
cv res for the pipeline 2 is {'fit_time': array([6.75841308, 6.6808598 , 6.69931722, 6.69226027, 6.69651985]), 'score_time': array([0.00756884, 0.00900006, 0.00741601, 0.00752091, 0.00734591]), 'test_score': array([0.        , 0.52173913, 0.72      , 0.4       , 0.55319149])}
cv res for the pipeline 3 is {'fit_time': array([6.69892192, 6.693398  , 6.75637007, 6.72445798, 6.71143579]), 'score_time': array([0.00768805, 0.00750804, 0.00777006, 0.00749898, 0.00771093]), 'test_score': array([0.26666667, 0.58333333, 0.64285714, 0.4       , 0.55319149])}
