## 1. Одномерный отбор признаков

Используется критерий хи-квадрат (chi-squared test) для неотрицательных признаков, чтобы отобрать 4 лучших признака.

In [98]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [99]:
phone_df = pd.read_csv("../Smartphone_chosse_preprocessed.csv")
phone_df

Unnamed: 0.1,Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0,842,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1,1021,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,2,563,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,3,615,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,4,1821,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,794,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1996,1965,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,1997,1911,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,1998,1512,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [100]:
phone_df = phone_df.drop('Unnamed: 0', 1)
phone_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [101]:
array = phone_df.values
X = array[:,0:20]
Y = array[:,20]

In [102]:
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

In [103]:
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

[1.413e+04 7.232e-01 6.484e-01 6.310e-01 1.014e+01 1.522e+00 8.984e+01
 7.458e-01 9.597e+01 1.168e+01 9.186e+00 1.736e+04 9.811e+03 9.313e+05
 9.615e+00 1.648e+01 1.324e+01 3.276e-01 1.928e+00 4.221e-01]


In [104]:
# summarize selected features
print(features[0:5,:])

[[ 842.   20.  756. 2549.]
 [1021.  905. 1988. 2631.]
 [ 563. 1263. 1716. 2603.]
 [ 615. 1216. 1786. 2769.]
 [1821. 1208. 1212. 1411.]]


Мы видим оценки для каждого признака и 4 отобранных признака (с наивысшими оценками): battery_power, px_height,	px_width,	ram.	

## 2. Рекурсивное исключение признаков

Метод RFE применяется в сочетании с логистической регрессией для отбора 3-х лучших признаков. Для совместного использования с RFE можно выбирать различные модели, важно лишь, чтобы они были достаточно эффективны и совместимы с RFE.

In [105]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [106]:
phone_df = pd.read_csv("../Smartphone_chosse_preprocessed.csv")
phone_df = phone_df.drop('Unnamed: 0', 1)
phone_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [121]:
array = phone_df.values
X = array[:,0:20]
Y = array[:,20]

In [122]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)
X

array([[-0.903, -0.99 ,  0.831, ..., -1.787, -1.006,  0.986],
       [-0.495,  1.01 , -1.253, ...,  0.56 ,  0.994, -1.014],
       [-1.538,  1.01 , -1.253, ...,  0.56 ,  0.994, -1.014],
       ...,
       [ 1.531, -0.99 , -0.763, ...,  0.56 ,  0.994, -1.014],
       [ 0.623, -0.99 , -0.763, ...,  0.56 ,  0.994,  0.986],
       [-1.658,  1.01 ,  0.586, ...,  0.56 ,  0.994,  0.986]])

In [125]:
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)

In [126]:
print("Num Features: %d" % fit.n_features_) 
print("Selected Features: %s" % fit.support_) 
print("Feature Ranking: %s" % fit.ranking_) 

Num Features: 3
Selected Features: [ True False False False False False False False False False False  True
 False  True False False False False False False]
Feature Ranking: [ 1 18 11 10 12 13  4  9  3  7 17  1  2  1  5 16 15 14  8  6]


Мы видим, что в результате были отобраны 3 лучших признака:  battery_power, px_height, ram.

## 3. Метод главных компонент

Мы выделяем 3 главных компоненты с помощью PCA.

In [88]:
from sklearn.decomposition import PCA

In [89]:
phone_df = pd.read_csv("../Smartphone_chosse_preprocessed.csv")
phone_df = phone_df.drop('Unnamed: 0', 1)
phone_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [90]:
array = phone_df.values
X = array[:,0:20]
Y = array[:,20]

In [91]:
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
features = fit.transform(X)

In [92]:
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(features[0:5,:])

Explained Variance: [0.67  0.165 0.11 ]
[[ 430.597 -795.788 -390.07 ]
 [ 504.985  696.622 -235.629]
 [ 473.33   763.942 -680.059]
 [ 639.822  779.691 -630.784]
 [-718.985  382.305  591.04 ]]


Как видим, результат преобразования (3 главных компоненты) совсем не похож на исходные данные.

## 4. Отбор на основе важности признаков

Мы обучаем классификатор ExtraTreesClassifier, чтобы с его помощью определить важность признаков.

In [93]:
from sklearn.ensemble import ExtraTreesClassifier 

In [94]:
phone_df = pd.read_csv("../Smartphone_chosse_preprocessed.csv")
phone_df = phone_df.drop('Unnamed: 0', 1)
phone_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [95]:
array = phone_df.values
X = array[:,0:20]
Y = array[:,20]

In [96]:
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [97]:
print(model.feature_importances_)

[0.06  0.02  0.034 0.02  0.033 0.018 0.036 0.035 0.037 0.033 0.035 0.046
 0.049 0.383 0.036 0.035 0.035 0.015 0.018 0.02 ]


Мы получили оценки для каждого признака. Чем больше значение оценки, тем важнее признак. Таким образом, согласно данному методу отбора, тремя наиболее важными признаками являются: battery_power, px_height и	px_width.	