# Feature Selection Techniques

In [None]:
# Load the dataset from a CSV file in the working directory and preview the first rows.
import pandas as pd
df=pd.read_csv('mobile_dataset.csv')
df.head()


Out[2]:
battery_power	blue	clock_speed	dual_sim	fc	four_g	int_memory	m_dep	mobile_wt	n_cores	...	px_height	px_width	ram	sc_h	sc_w	talk_time	three_g	touch_screen	wifi	price_range
0	842	0	2.2	0	1	0	7	0.6	188	2	...	20	756	2549	9	7	19	0	0	1	1
1	1021	1	0.5	1	0	1	53	0.7	136	3	...	905	1988	2631	17	3	7	1	1	0	2
2	563	1	0.5	1	2	1	41	0.9	145	5	...	1263	1716	2603	11	2	9	1	1	0	2
3	615	1	2.5	0	0	0	10	0.8	131	6	...	1216	1786	2769	16	8	11	1	0	0	2
4	1821	1	1.2	0	13	1	44	0.6	141	2	...	1208	1212	1411	8	2	15	1	1	0	1
5 rows × 21 columns

# Univariate Selection

In [None]:
# Features are every column except the target; dropping the target by name keeps
# the split correct even if 'price_range' is not the last column of the frame.
X=df.drop(columns='price_range')
y=df['price_range']

In [None]:
# Preview the first rows of the feature matrix.
X.head()

Out[4]:
battery_power	blue	clock_speed	dual_sim	fc	four_g	int_memory	m_dep	mobile_wt	n_cores	pc	px_height	px_width	ram	sc_h	sc_w	talk_time	three_g	touch_screen	wifi
0	842	0	2.2	0	1	0	7	0.6	188	2	2	20	756	2549	9	7	19	0	0	1
1	1021	1	0.5	1	0	1	53	0.7	136	3	6	905	1988	2631	17	3	7	1	1	0
2	563	1	0.5	1	2	1	41	0.9	145	5	6	1263	1716	2603	11	2	9	1	1	0
3	615	1	2.5	0	0	0	10	0.8	131	6	9	1216	1786	2769	16	8	11	1	0	0
4	1821	1	1.2	0	13	1	44	0.6	141	2	14	1208	1212	1411	8	2	15	1	1	0

In [None]:
# Preview the first values of the target series.
y.head()


Out[5]:
0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# (rows, columns) of the full frame, target column included.
df.shape

Out[9]:
(2000, 21)

In [None]:
### Apply SelectKBest Algorithm
# k='all' scores and keeps every feature, so the ranking does not depend on a
# hard-coded column count matching the width of X.
ordered_rank_features=SelectKBest(score_func=chi2,k='all')
ordered_feature=ordered_rank_features.fit(X,y)

In [None]:
# Wrap the fitted scores and the feature names in two single-column frames.
dfscores = pd.DataFrame({"Score": ordered_feature.scores_})
dfcolumns = pd.DataFrame(X.columns)

In [None]:
# Place the feature names and their scores side by side.
features_rank = pd.concat(objs=[dfcolumns, dfscores], axis='columns')

In [None]:
# Name the two columns, then display the full score table.
features_rank.columns=['Features','Score']
features_rank


Out[19]:
Features	Score
0	battery_power	14129.866576
1	blue	0.723232
2	clock_speed	0.648366
3	dual_sim	0.631011
4	fc	10.135166
5	four_g	1.521572
6	int_memory	89.839124
7	m_dep	0.745820
8	mobile_wt	95.972863
9	n_cores	9.097556
10	pc	9.186054
11	px_height	17363.569536
12	px_width	9810.586750
13	ram	931267.519053
14	sc_h	9.614878
15	sc_w	16.480319
16	talk_time	13.236400
17	three_g	0.327643
18	touch_screen	1.928429
19	wifi	0.422091

In [None]:
# Show the ten features with the highest scores.
features_rank.nlargest(10,'Score')


Out[20]:
Features	Score
13	ram	931267.519053
11	px_height	17363.569536
0	battery_power	14129.866576
12	px_width	9810.586750
8	mobile_wt	95.972863
6	int_memory	89.839124
15	sc_w	16.480319
16	talk_time	13.236400
4	fc	10.135166
14	sc_h	9.614878

# Feature Importance
This technique gives you a score for each feature of your data; the higher the score, the more relevant the feature is.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Fix the seed: without it the randomized trees give different importances
# (and a different ranking plot) on every run of the notebook.
model=ExtraTreesClassifier(random_state=42)
model.fit(X,y)

Out[21]:
ExtraTreesClassifier()

In [None]:
# Raw importance values from the fitted model (paired with X.columns in a later cell).
print(model.feature_importances_)


[0.06153371 0.01935284 0.0327604  0.02021518 0.03206452 0.01764694
 0.03493519 0.03307269 0.03599477 0.03262977 0.03336096 0.04747963
 0.04880065 0.39576103 0.03355555 0.03306568 0.03457466 0.01388524
 0.01851953 0.02079106]

In [None]:
# Label each importance with its feature name and chart the ten largest
# as a horizontal bar plot.
ranked_features = pd.Series(data=model.feature_importances_, index=X.columns)
ranked_features.nlargest(n=10).plot.barh()
plt.show()


# Correlation

In [None]:
# Pairwise correlation between every pair of columns, target included.
df.corr()


Out[26]:
battery_power	blue	clock_speed	dual_sim	fc	four_g	int_memory	m_dep	mobile_wt	n_cores	...	px_height	px_width	ram	sc_h	sc_w	talk_time	three_g	touch_screen	wifi	price_range
battery_power	1.000000	0.011252	0.011482	-0.041847	0.033334	0.015665	-0.004004	0.034085	0.001844	-0.029727	...	0.014901	-0.008402	-0.000653	-0.029959	-0.021421	0.052510	0.011522	-0.010516	-0.008343	0.200723
blue	0.011252	1.000000	0.021419	0.035198	0.003593	0.013443	0.041177	0.004049	-0.008605	0.036161	...	-0.006872	-0.041533	0.026351	-0.002952	0.000613	0.013934	-0.030236	0.010061	-0.021863	0.020573
clock_speed	0.011482	0.021419	1.000000	-0.001315	-0.000434	-0.043073	0.006545	-0.014364	0.012350	-0.005724	...	-0.014523	-0.009476	0.003443	-0.029078	-0.007378	-0.011432	-0.046433	0.019756	-0.024471	-0.006606
dual_sim	-0.041847	0.035198	-0.001315	1.000000	-0.029123	0.003187	-0.015679	-0.022142	-0.008979	-0.024658	...	-0.020875	0.014291	0.041072	-0.011949	-0.016666	-0.039404	-0.014008	-0.017117	0.022740	0.017444
fc	0.033334	0.003593	-0.000434	-0.029123	1.000000	-0.016560	-0.029133	-0.001791	0.023618	-0.013356	...	-0.009990	-0.005176	0.015099	-0.011014	-0.012373	-0.006829	0.001793	-0.014828	0.020085	0.021998
four_g	0.015665	0.013443	-0.043073	0.003187	-0.016560	1.000000	0.008690	-0.001823	-0.016537	-0.029706	...	-0.019236	0.007448	0.007313	0.027166	0.037005	-0.046628	0.584246	0.016758	-0.017620	0.014772
int_memory	-0.004004	0.041177	0.006545	-0.015679	-0.029133	0.008690	1.000000	0.006886	-0.034214	-0.028310	...	0.010441	-0.008335	0.032813	0.037771	0.011731	-0.002790	-0.009366	-0.026999	0.006993	0.044435
m_dep	0.034085	0.004049	-0.014364	-0.022142	-0.001791	-0.001823	0.006886	1.000000	0.021756	-0.003504	...	0.025263	0.023566	-0.009434	-0.025348	-0.018388	0.017003	-0.012065	-0.002638	-0.028353	0.000853
mobile_wt	0.001844	-0.008605	0.012350	-0.008979	0.023618	-0.016537	-0.034214	0.021756	1.000000	-0.018989	...	0.000939	0.000090	-0.002581	-0.033855	-0.020761	0.006209	0.001551	-0.014368	-0.000409	-0.030302
n_cores	-0.029727	0.036161	-0.005724	-0.024658	-0.013356	-0.029706	-0.028310	-0.003504	-0.018989	1.000000	...	-0.006872	0.024480	0.004868	-0.000315	0.025826	0.013148	-0.014733	0.023774	-0.009964	0.004399
pc	0.031441	-0.009952	-0.005245	-0.017143	0.644595	-0.005598	-0.033273	0.026282	0.018844	-0.001193	...	-0.018465	0.004196	0.028984	0.004938	-0.023819	0.014657	-0.001322	-0.008742	0.005389	0.033599
px_height	0.014901	-0.006872	-0.014523	-0.020875	-0.009990	-0.019236	0.010441	0.025263	0.000939	-0.006872	...	1.000000	0.510664	-0.020352	0.059615	0.043038	-0.010645	-0.031174	0.021891	0.051824	0.148858
px_width	-0.008402	-0.041533	-0.009476	0.014291	-0.005176	0.007448	-0.008335	0.023566	0.000090	0.024480	...	0.510664	1.000000	0.004105	0.021599	0.034699	0.006720	0.000350	-0.001628	0.030319	0.165818
ram	-0.000653	0.026351	0.003443	0.041072	0.015099	0.007313	0.032813	-0.009434	-0.002581	0.004868	...	-0.020352	0.004105	1.000000	0.015996	0.035576	0.010820	0.015795	-0.030455	0.022669	0.917046
sc_h	-0.029959	-0.002952	-0.029078	-0.011949	-0.011014	0.027166	0.037771	-0.025348	-0.033855	-0.000315	...	0.059615	0.021599	0.015996	1.000000	0.506144	-0.017335	0.012033	-0.020023	0.025929	0.022986
sc_w	-0.021421	0.000613	-0.007378	-0.016666	-0.012373	0.037005	0.011731	-0.018388	-0.020761	0.025826	...	0.043038	0.034699	0.035576	0.506144	1.000000	-0.022821	0.030941	0.012720	0.035423	0.038711
talk_time	0.052510	0.013934	-0.011432	-0.039404	-0.006829	-0.046628	-0.002790	0.017003	0.006209	0.013148	...	-0.010645	0.006720	0.010820	-0.017335	-0.022821	1.000000	-0.042688	0.017196	-0.029504	0.021859
three_g	0.011522	-0.030236	-0.046433	-0.014008	0.001793	0.584246	-0.009366	-0.012065	0.001551	-0.014733	...	-0.031174	0.000350	0.015795	0.012033	0.030941	-0.042688	1.000000	0.013917	0.004316	0.023611
touch_screen	-0.010516	0.010061	0.019756	-0.017117	-0.014828	0.016758	-0.026999	-0.002638	-0.014368	0.023774	...	0.021891	-0.001628	-0.030455	-0.020023	0.012720	0.017196	0.013917	1.000000	0.011917	-0.030411
wifi	-0.008343	-0.021863	-0.024471	0.022740	0.020085	-0.017620	0.006993	-0.028353	-0.000409	-0.009964	...	0.051824	0.030319	0.022669	0.025929	0.035423	-0.029504	0.004316	0.011917	1.000000	0.018785
price_range	0.200723	0.020573	-0.006606	0.017444	0.021998	0.014772	0.044435	0.000853	-0.030302	0.004399	...	0.148858	0.165818	0.917046	0.022986	0.038711	0.021859	0.023611	-0.030411	0.018785	1.000000
21 rows × 21 columns

In [None]:
import seaborn as sns
# Correlation among the feature columns only (the last column of df is left out).
corr=df.iloc[:,:-1].corr()
top_features=corr.index
plt.figure(figsize=(20,20))
# Reuse the matrix computed above rather than recomputing it from df[top_features].
sns.heatmap(corr,annot=True)
# Render the figure explicitly so the cell does not echo the Axes repr.
plt.show()


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c5a0bbbf60>


#### Remove the correlated features

In [None]:
# Absolute-correlation cutoff passed to correlation() below.
threshold=0.8

In [None]:
# Identify features that are strongly correlated with an earlier column
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with a preceding column.

    Parameters
    ----------
    dataset : object exposing ``.corr()`` (e.g. a pandas DataFrame)
        Data whose pairwise correlation matrix is inspected.
    threshold : float
        A column is flagged when the absolute value of its correlation with
        any column that comes before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. Nothing is removed from ``dataset``;
        the caller decides what to do with the returned names.
    """
    pairwise = dataset.corr()
    flagged = set()
    for row, name in enumerate(pairwise.columns):
        # Only the strictly-lower triangle is scanned, so each pair is tested
        # once and the later column of a correlated pair is the one reported.
        if any(abs(pairwise.iloc[row, col]) > threshold for col in range(row)):
            flagged.add(name)
    return flagged

In [None]:
# Run the check on every column of df except the last one.
correlation(df.iloc[:,:-1],threshold)

Out[35]:
set()

# Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Compute the mutual-information score of each feature with respect to the target.
mutual_info=mutual_info_classif(X,y)

In [None]:
# Label each value in mutual_info with its feature name, then list them from highest to lowest.
mutual_data=pd.Series(mutual_info,index=X.columns)
mutual_data.sort_values(ascending=False)

Out[39]:
ram              0.848254
sc_w             0.032774
int_memory       0.029968
battery_power    0.027203
px_width         0.024848
fc               0.023923
dual_sim         0.022537
px_height        0.022146
mobile_wt        0.013782
touch_screen     0.012663
clock_speed      0.010944
n_cores          0.009517
four_g           0.008748
m_dep            0.000000
pc               0.000000
sc_h             0.000000
talk_time        0.000000
three_g          0.000000
blue             0.000000
wifi             0.000000
dtype: float64