# Тема семинара: отбор признаков

- Фильтрационные методы
- Оберточные методы
- Встроенные методы
- Метод главных компонент или PCA

In [1]:
import pandas as pd

In [62]:
data = pd.read_csv('Pokemon.csv')

In [63]:
data

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


Columns description (it's crucial!)


- #: ID for each pokemon
- Name: Name of each pokemon
- Type 1: Each pokemon has a type, this determines weakness/resistance to attacks
- Type 2: Some pokemon are dual type and have 2
- Total: sum of all stats that come after this, a general guide to how strong a pokemon is
- HP: hit points, or health, defines how much damage a pokemon can withstand before fainting
- Attack: the base modifier for normal attacks (eg. Scratch, Punch)
- Defense: the base damage resistance against normal attacks
- Sp. Atk: special attack, the base modifier for special attacks (e.g. fire blast, bubble beam)
- Sp. Def: special defense, the base damage resistance against special attacks
- Speed: determines which pokemon attacks first each round

In [64]:
# Inspect missing values, fill the gaps in 'Type 2', and drop the identifier columns.

display(data.isna().sum())

# Rows with no second type get an explicit placeholder label instead of NaN.
data['Type 2'] = data['Type 2'].fillna('No 2nd type')

# '#' and 'Name' are removed so that they are not passed on as features.
data = data.drop(columns=['#', 'Name'])

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [65]:
# Target: the 'Legendary' flag cast to integers; features: every remaining column.
y = data['Legendary'].astype('int')
X = data.drop(columns=['Legendary'])

In [66]:
y.value_counts(normalize=True)

0    0.91875
1    0.08125
Name: Legendary, dtype: float64

# Make some default pipeline

In [7]:
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
import sklearn

In [8]:
# Categorical columns; they are handled by LeaveOneOutEncoder inside the pipeline.
cat_cols = ['Type 1', 'Type 2']

# Baseline pipeline: encode categoricals -> standardise -> linear-kernel SVC.
default_pipeline = Pipeline(steps=[
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('scaler_', StandardScaler()),
    ('model_', SVC(kernel='linear')),
])

In [9]:
# 5-fold cross-validation of the baseline pipeline, scored with F1;
# train scores are kept as well so they can be compared with test scores.
cv_res1 = cross_validate(
    estimator=default_pipeline,
    X=X,
    y=y,
    scoring='f1',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [10]:
cv_res1

{'fit_time': array([0.05099964, 0.05500221, 0.04900098, 0.05799985, 0.04699564]),
 'score_time': array([0.01300311, 0.01400089, 0.01200008, 0.01400065, 0.01300478]),
 'test_score': array([0.5       , 0.72727273, 0.47619048, 0.38095238, 0.64864865]),
 'train_score': array([0.71287129, 0.56097561, 0.6744186 , 0.7311828 , 0.72727273])}

In [11]:
cv_res1['test_score'].mean()

0.5466128466128467

# Make pipeline more complicated

In [12]:
# More complex pipeline: degree-4 polynomial features are generated
# between the categorical encoder and the scaler.

pipe_dif = Pipeline(steps=[
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('model_', SVC(kernel='linear')),
])

In [13]:
# 5-fold cross-validation of the polynomial pipeline (F1, train scores included).
cv_res2 = cross_validate(
    estimator=pipe_dif,
    X=X,
    y=y,
    scoring='f1',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

cv_res2

{'fit_time': array([0.16700172, 0.17900038, 0.17099929, 0.1769979 , 0.18400025]),
 'score_time': array([0.02399874, 0.01899815, 0.02300191, 0.02300096, 0.02399755]),
 'test_score': array([0.375     , 0.88888889, 0.5       , 0.66666667, 0.53658537]),
 'train_score': array([0.95145631, 0.89583333, 0.97142857, 0.96153846, 0.98076923])}

In [15]:
cv_res2['test_score'].mean()

0.5934281842818427

train_score — просто класс! Модель получилась сложная, но очевидно переобученная: F1 на обучающих фолдах около 0.95, а на тестовых в среднем лишь 0.59.

согласны, узнали ?


# Introduce feature selectors

In [16]:
data_tr = pipe_dif[:-1]

In [17]:
data_tr

Pipeline(steps=[('cat_encoder_', LeaveOneOutEncoder(cols=['Type 1', 'Type 2'])),
                ('poly_featurizer_', PolynomialFeatures(degree=4)),
                ('scaler_', StandardScaler())])

In [18]:
# Fit the preprocessing steps (encoder, polynomial features, scaler) and
# transform X in a single call.
# NOTE(review): keep this as one fit_transform(X, y) call — presumably the
# LeaveOneOutEncoder uses y during fit_transform and would encode differently
# under fit(...).transform(...); confirm against the category_encoders docs.
X_tr = data_tr.fit_transform(X, y)
# Report the shape of the transformed feature matrix.
print(f'data shape after transformation is {X_tr.shape}')

data shape after transformation is (800, 1001)


  elif pd.api.types.is_categorical(cols):


1k признаков - многовато, добавим в пайплайн селектор

## Фильтрационные методы

Суть таких методов в том, чтобы для каждого признака посчитать некоторую метрику "связи" с целевым признаком. И в результате оставить топ-K признаков согласно выбранной метрике.

В том числе на лекции обсуждались:

 - статистика хи-квадрат
 - метрика mutual information

In [19]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif

In [23]:
# Number of features kept by the RFE selector (`rfe`) created later in the notebook.
k_best = 30

# Number of features kept by the mutual-information filter in this pipeline.
# The value used to be a bare literal (50) sitting right below `k_best = 30`,
# which made it look as if 30 features were selected here. It is now named
# explicitly; the value is unchanged, so the recorded results remain valid.
# NOTE(review): if the filter was meant to use k_best, pass k=k_best instead
# and re-run the cross-validation cell.
k_filter = 50

# Encode -> polynomial features -> scale -> keep the top-k features by
# mutual information with the target -> linear-kernel SVC.
pipe = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func=mutual_info_classif, k=k_filter)),
    ('model_', SVC(kernel='linear')),
])

In [24]:
cv_res = cross_validate(pipe, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


{'fit_time': array([3.45266271, 3.42299962, 3.37600183, 3.33699465, 3.77500916]),
 'score_time': array([0.00999951, 0.00903654, 0.01003337, 0.01096606, 0.00999093]),
 'test_score': array([0.14285714, 0.64      , 0.71428571, 0.42105263, 0.60465116]),
 'train_score': array([0.78787879, 0.70103093, 0.76767677, 0.76      , 0.77894737])}

In [25]:
# The number of selected features (k) has to be tuned.

cv_res['test_score'].mean()

0.5045693303025004

## Жадный метод отбора

In [26]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [27]:
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=k_best, step=30)

In [28]:
X_tr.shape

(800, 1001)

In [29]:
res = rfe.fit_transform(X_tr, y)
display(res.shape)
res

(800, 30)

array([[-0.98555744, -0.53089061, -0.76693052, ..., -0.40535388,
        -0.52135831, -0.81966779],
       [-0.48479877, -0.53089061, -0.49241971, ..., -0.16466941,
        -0.18003271, -0.81698899],
       [ 0.42451538, -0.53089061,  0.16799303, ...,  0.45023305,
         0.73602435, -0.81341726],
       ...,
       [-0.16049792, -0.53089061, -0.38721114, ...,  3.97831918,
         2.1076934 ,  4.18343684],
       [-0.16049792, -0.53089061, -0.38721114, ...,  6.06069716,
         2.51608978,  4.18343684],
       [ 1.36562373, -0.53089061,  1.4358977 , ...,  1.4680924 ,
         0.19750623,  2.6404483 ]])

In [30]:
# Pipeline with recursive feature elimination (RFE) as the selection step;
# RFE ranks features with a logistic regression and keeps 30 of them.
pipe_rfe = Pipeline(steps=[
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', RFE(
        estimator=LogisticRegression(max_iter=1000),
        n_features_to_select=30,
        step=30,
    )),
    ('model_', SVC(kernel='linear')),
])

In [31]:
cv_res3 = cross_validate(pipe_rfe, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res3

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


{'fit_time': array([3.78596306, 4.41500068, 3.86500096, 3.73300171, 3.49599814]),
 'score_time': array([0.0129993 , 0.01699758, 0.01000214, 0.01299548, 0.01699877]),
 'test_score': array([0.375     , 0.84615385, 0.64285714, 0.66666667, 0.60606061]),
 'train_score': array([0.90384615, 0.84536082, 0.96226415, 0.89583333, 0.94117647])}

In [32]:
cv_res3['test_score'].mean()

0.6273476523476524

## С помощью L1 регуляризации

In [33]:
from sklearn.feature_selection import SelectFromModel

In [47]:
sel = SelectFromModel(LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'), threshold=1e-5)

In [48]:
# Example: apply the L1-based selector to the pre-transformed matrix X_tr
# and inspect the shape and values of the result.

res = sel.fit_transform(X_tr, y)
display(res.shape)
res

(800, 48)

array([[-1.20562657, -0.94218651, -0.7732015 , ..., -0.52135831,
        -0.72668962, -0.81966779],
       [-1.20562657, -0.94218651, -0.33767384, ..., -0.18003271,
        -0.70651232, -0.81698899],
       [-1.20562657, -0.94218651,  0.6523422 , ...,  0.73602435,
        -0.6731154 , -0.81341726],
       ...,
       [-1.20562657, -0.94218651,  0.80566326, ...,  2.1076934 ,
         4.87819432,  4.18343684],
       [-1.20562657, -0.94218651,  1.06849938, ...,  2.51608978,
         4.87819432,  4.18343684],
       [-1.20562657, -0.94218651,  2.2512619 , ...,  0.19750623,
         1.93926564,  2.6404483 ]])

In [36]:
# Pipeline with embedded selection: SelectFromModel wraps an L1-penalised
# logistic regression and applies a threshold of 1e-5.
pipe_lasso = Pipeline(steps=[
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectFromModel(
        estimator=LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'),
        threshold=1e-5,
    )),
    ('model_', SVC(kernel='linear')),
])

In [37]:
cv_res4 = cross_validate(pipe_lasso, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res4

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


{'fit_time': array([0.13402843, 0.12497544, 0.11903644, 0.1200316 , 0.1219728 ]),
 'score_time': array([0.01000619, 0.0099988 , 0.01098561, 0.00999856, 0.00903368]),
 'test_score': array([0.44444444, 0.84615385, 0.66666667, 0.60869565, 0.66666667]),
 'train_score': array([0.92307692, 0.89795918, 0.94117647, 0.93877551, 0.92929293])}

In [38]:
cv_res4['test_score'].mean()

0.6465254552211073

# PCA или метод главных компонент

Цель: создать k новых признаков из какого-либо количества старых признаков, так чтобы 
- каждый из новых признаков был линейной комбинацией старых

$z_i = u_1 x_{1i} + \dots + u_l x_{li}$

- и дисперсия $z_i$, то есть каждого нового признака, была максимальной (то есть новый признак был наиболее информативным)

С точки зрения линейной алгебры, процесс нахождения новых признаков из старых — это процесс проекции старых признаков на некоторую гиперплоскость (линейное пространство). Как было показано на лекции, базисом этого пространства являются собственные векторы матрицы $X^TX$, где $X$ — это центрированная матрица признаков

Тогда чтобы найти новые признаки (главные компоненты) нужно сначала
- найти собственные вектора V матрицы $X^TX$ (вектора должны быть приведены к длине 1)
- произвести матричное умножение Z = XV (то есть сделать проекцию матрицы X на линейное пространство с базисом V)

## Задание

a) Есть два признака, x1 = (1, 0, 0, 3), x2 = (3, 2, 0, 3). Найдите первую и вторую главные
компоненты.

б) Сколько дисперсии объясняется первой компонентой ?

Взято из задачника Б.Б. Демешева

In [49]:
from sklearn.decomposition import PCA

In [56]:
pca = PCA(n_components=2)

# Toy data for the exercise above. It is stored under its own name so that
# the Pokemon feature matrix `X` is NOT overwritten: the cross-validation of
# the PCA pipeline later in the notebook still needs the original `X`, and
# rebinding it here breaks a top-to-bottom run.
# Raw observations as (x1, x2) rows: [[1, 3], [0, 2], [0, 0], [3, 3]].
# Below is the same data with the column means (1 and 2) subtracted.
X_toy = [[0, 1], [-1, 0], [-1, -2], [2, 1]]
pca.fit_transform(X_toy)

array([[-0.70710678,  0.70710678],
       [ 0.70710678,  0.70710678],
       [ 2.12132034, -0.70710678],
       [-2.12132034, -0.70710678]])

In [52]:
pca.explained_variance_ratio_

array([0.83333333, 0.16666667])

In [57]:
# Example: PCA configured to keep 15 components.

pca = PCA(n_components = 15)

In [58]:
res = pca.fit_transform(X_tr, y)
display(res.shape)
res

(800, 15)

array([[-1.98007291e+01,  4.46072632e+00, -4.82046552e-01, ...,
         2.72352942e-02,  4.35136790e-01, -1.29265633e+00],
       [-1.58785523e+01,  3.10134435e+00, -3.17353060e+00, ...,
        -8.35892258e-01,  2.34626674e-01, -1.23753516e+00],
       [-6.24072086e+00, -4.63280109e-01, -9.93003245e+00, ...,
        -3.27946647e+00, -4.32351321e-01, -1.25616602e+00],
       ...,
       [ 3.26767745e+01,  1.50340452e+01, -2.46534010e+01, ...,
         5.00567038e+00, -1.35742447e-02,  3.50894742e+00],
       [ 5.53322261e+01,  1.48563726e+01, -3.86325556e+01, ...,
         5.42283351e+00, -2.40413117e+00,  4.43465518e-01],
       [ 1.99828053e+01, -4.60646431e+00, -2.56386391e+01, ...,
         1.78416376e+00, -1.85425976e+00,  1.45714777e+00]])

In [60]:
# Total share of the original features' variance explained by the kept components.

pca.explained_variance_ratio_.sum()

0.9438803626963046

In [59]:
# Each subsequent component is less informative than the previous one.
pca.explained_variance_ratio_

array([0.45670112, 0.1176569 , 0.08837368, 0.07259347, 0.05161779,
       0.03907593, 0.03147126, 0.02632701, 0.01579202, 0.01454208,
       0.00758009, 0.00692028, 0.00568022, 0.0051811 , 0.00436742])

In [69]:
# Number of principal components kept by the PCA step.
n_components = 5

# Same preprocessing as in the other pipelines, but with PCA in the slot
# where the feature selectors were used (the step keeps the name 'selector_').
pipe_pca = Pipeline(steps=[
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', PCA(n_components=n_components)),
    ('model_', SVC(kernel='linear')),
])

# X is expected to be the Pokemon feature DataFrame: the encoder needs
# the 'Type 1' / 'Type 2' columns listed in cat_cols.
cv_res5 = cross_validate(
    estimator=pipe_pca,
    X=X,
    y=y,
    scoring='f1',
    cv=5,
    return_train_score=True,
)
cv_res5

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


{'fit_time': array([0.13322496, 0.14600039, 0.12500024, 0.2340014 , 0.17100048]),
 'score_time': array([0.01000166, 0.00999999, 0.01100183, 0.0150001 , 0.01100063]),
 'test_score': array([0.55555556, 0.69565217, 0.47619048, 0.36363636, 0.8       ]),
 'train_score': array([0.66666667, 0.57777778, 0.61728395, 0.68131868, 0.5952381 ])}

In [70]:
cv_res5['test_score'].mean()

0.5782069138590878