In [1]:
import pandas as pd
import numpy as np
import gc
# import warnings
# warnings.filterwarnings("ignore")

### Reading the datasets

In [2]:
df = pd.read_csv('data/word2vec/tfidf_stem_2.csv', index_col=0)
df_class = pd.read_csv('data/target.csv', index_col=0)
#add target column
df['Class'] = df_class['Class']

##### Do sampling if needed

In [3]:
#we dont do the sampling, but you might want to change this cell later with some sampling
application  = df.copy()

##### Feature matrix and target

In [4]:
X = df.drop(['Class'], axis=1)
y = application.Class
feature_name = X.columns.tolist()

In [5]:
X.shape

(50, 1068)

### Feature Selection
- select ___100___ features from ___226___
- ***xxx_support***: list to represent select this feature or not
- ***xxx_feature***: the name of selected features

### 1 Filter

#### 1.1 Pearson Correlation

___Note___
- Normalization: no
- Impute missing values: yes

In [6]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [7]:
cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

100 selected features


#### 1.2 Chi-2
___Note___
- Normalization: MinMaxScaler (values should be bigger than 0)
- Impute missing values: yes

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)

SelectKBest(k=100, score_func=<function chi2 at 0x7f5f1e75a158>)

In [9]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features


### 2 Wrapper

___Note___
- Normalization: depend on the used model; yes for LR
- Impute missing values: depend on the used model; yes for LR

In [10]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

Fitting estimator with 1068 features.
Fitting estimator with 1058 features.
Fitting estimator with 1048 features.
Fitting estimator with 1038 features.
Fitting estimator with 1028 features.
Fitting estimator with 1018 features.
Fitting estimator with 1008 features.
Fitting estimator with 998 features.
Fitting estimator with 988 features.
Fitting estimator with 978 features.
Fitting estimator with 968 features.
Fitting estimator with 958 features.
Fitting estimator with 948 features.
Fitting estimator with 938 features.
Fitting estimator with 928 features.
Fitting estimator with 918 features.
Fitting estimator with 908 features.
Fitting estimator with 898 features.
Fitting estimator with 888 features.
Fitting estimator with 878 features.
Fitting estimator with 868 features.
Fitting estimator with 858 features.
Fitting estimator with 848 features.
Fitting estimator with 838 features.
Fitting estimator with 828 features.
Fitting estimator with 818 features.
Fitting estimator with 808 feat

RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10,
    verbose=5)

In [11]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features


### 3. Embeded

#### 3.1 Logistics Regression L1
___Note___
- Normalization: Yes
- Impute missing values: Yes

In [12]:
#change l1 to l2
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), '1.25*median')
embeded_lr_selector.fit(X_norm, y)



SelectFromModel(estimator=LogisticRegression(), threshold='1.25*median')

In [13]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

371 selected features


#### 3.2 Random Forest
___Note___
- Normalization: No
- Impute missing values: Yes

In [14]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(), threshold='1.25*median')

In [15]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

1068 selected features


#### 3.3 LightGBM
___Note___
- Normalization: No
- Impute missing values: No

In [16]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(colsample_bytree=0.2,
                                         learning_rate=0.05,
                                         min_child_weight=40,
                                         min_split_gain=0.01, n_estimators=500,
                                         num_leaves=32, reg_alpha=3,
                                         reg_lambda=1),
                threshold='1.25*median')

In [17]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

1068 selected features


### Summary

In [18]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,young,True,True,True,True,True,True,6
2,ye,True,True,True,True,True,True,6
3,won,True,True,True,True,True,True,6
4,whatev,True,True,True,True,True,True,6
5,unit,True,True,True,True,True,True,6
6,two,True,True,True,True,True,True,6
7,twic,True,True,True,True,True,True,6
8,strong,True,True,True,True,True,True,6
9,stand,True,True,True,True,True,True,6
10,spee,True,True,True,True,True,True,6


### Embeded Random Forest Features

In [19]:
embeded_rf_feature

['a',
 'against',
 'also',
 'am',
 'an',
 'and',
 'anthony',
 'be',
 'box',
 'but',
 'camp',
 'everyth',
 'excel',
 'fan',
 'fought',
 'giv',
 'good',
 'had',
 'happy',
 'hav',
 'heavyweight',
 'him',
 'i',
 'in',
 'is',
 'joshu',
 'klitschko',
 'm',
 'match',
 'meet',
 'nic',
 'now',
 'of',
 'on',
 'opportun',
 'ring',
 's',
 'shap',
 'show',
 'strong',
 'strongest',
 'than',
 'that',
 'the',
 'thi',
 'to',
 'train',
 'very',
 'want',
 'was',
 'weak',
 'when',
 'why',
 'wil',
 'with',
 'world',
 'wors',
 'about',
 'acc',
 'adv',
 'battl',
 'bean',
 'can',
 'cas',
 'clos',
 'do',
 'everyon',
 'fac',
 'focus',
 'for',
 'gam',
 'get',
 'going',
 'gong',
 'he',
 'her',
 'his',
 'hop',
 'import',
 'improv',
 'it',
 'job',
 'just',
 'know',
 'll',
 'lot',
 'many',
 'me',
 'mik',
 'most',
 'mov',
 'much',
 'my',
 'not',
 'nt',
 'plan',
 'prep',
 're',
 'ready',
 'see',
 'seen',
 'shock',
 'so',
 'som',
 'spee',
 'stay',
 'stick',
 'strikes',
 'surpr',
 'tak',
 'talk',
 'terr',
 'them',
 'thi

### Top Features

In [20]:
top_1_features_df = feature_selection_df.loc[feature_selection_df['Total']==6]
top_1_features = top_1_features_df['Feature']

In [21]:
top_2_features_df = feature_selection_df.loc[feature_selection_df['Total']==5]
top_2_features = top_2_features_df['Feature']

In [22]:
top_1_features

1       young
2          ye
3         won
4      whatev
5        unit
6         two
7        twic
8      strong
9       stand
10       spee
11       spec
12       solv
13        sof
14       shap
15     second
16        sam
17        riv
18    respect
19        put
20      pulev
21       prov
22      press
23         ov
24        out
25         or
26       noth
27        nee
28        mot
29       many
30        lot
31       look
32      least
33       kubr
34      knock
35       keep
36       judg
37        hit
38        get
39        gam
40     friend
41        fan
42       fact
43    everyth
44        due
45       driv
46      cours
47      coach
48       clay
49      chain
50        cas
51       bulg
52      break
53        big
54    between
55     almost
56        aft
Name: Feature, dtype: object

In [23]:
top_2_features

57          whol
58           who
59       unpleas
60    underestim
61           tri
62           top
63         thing
64           thi
65          slow
66         round
67           rom
68            re
69       problem
70         partn
71      particip
72        otherw
73            op
74           now
75            my
76           mor
77           may
78          loss
79         learn
80           ind
81        import
82        honest
83       healthy
84           has
85         guard
86           god
87       forward
88           fac
89           did
90           day
91      competit
92          club
93        childr
94           cag
95          brok
96          been
97       apprecy
98        achiev
Name: Feature, dtype: object

### Filters

In [24]:
top_1_features = list(top_1_features)

In [25]:
top_1_features_class = top_1_features + ['Class']

In [26]:
top_2_features = list(top_2_features)

In [27]:
top_2_features_class = top_2_features + ['Class']

In [28]:
top_1_and_2_features = top_1_features + top_2_features

In [29]:
top_1_and_2_features_class = top_1_and_2_features + ['Class']

### New Dataframes

In [30]:
df_top_1 = df.loc[:, top_1_features_class]
df_top_1_and_2 = df.loc[:, top_1_and_2_features_class]

### Saving Dataframes

In [49]:
!mkdir -p data/w2v_fs

In [31]:
file_name = 'data/w2v_fs/df_top_1_stem_2.csv'
df_top_1.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

In [32]:
file_name = 'data/w2v_fs/df_top_1_and_2_stem_2.csv'
df_top_1_and_2.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)