In [1]:
import pandas as pd
import numpy as np
import gc
# import warnings
# warnings.filterwarnings("ignore")

### Reading the datasets

In [2]:
# Load the target labels and the TF-IDF feature table; the first CSV column of each file is the index.
df_class = pd.read_csv('data/target.csv', index_col=0)
df = pd.read_csv('data/word2vec/tfidf_stem_1.csv', index_col=0)
# Attach the target as a 'Class' column (values are aligned on the index).
df = df.assign(Class=df_class['Class'])

##### Do sampling if needed

In [3]:
# No sampling is applied for now: `application` is a full copy of `df`.
# Replace this cell if a sampled subset is wanted later.
application = df.copy(deep=True)

##### Feature matrix and target

In [4]:
# Build the feature matrix and the target from the same frame (`application`),
# so X and y keep matching rows even if the sampling cell above is changed
# to keep only a subset of `df`.
X = application.drop(columns=['Class'])
y = application['Class']
# Column names of X, in order; later cells use this list to label the selector masks.
feature_name = X.columns.tolist()

In [22]:
# Display the dimensions of the feature matrix as (rows, columns).
X.shape

(50, 1153)

### Feature Selection
- select ___100___ features from ___1153___
- ***xxx_support***: list to represent select this feature or not
- ***xxx_feature***: the name of selected features

In [9]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit a PCA with its default number of components.
scaler = StandardScaler()
scaler.fit(X)
X_sc = scaler.transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_sc)
# Shape of the projected data.
X_pca.shape

(50, 50)

#### Let's keep 90% of variance

In [18]:
# Share of the total variance captured by the first 38 principal components.
explained_variance = pca.explained_variance_ratio_
covered_variance = sum(explained_variance[k] for k in range(38))
covered_variance

0.911763715504207

In [19]:
# Refit the PCA, this time keeping 38 components.
pca = PCA(n_components=38)
X_pca = pca.fit_transform(X_sc)
X_pca.shape

(50, 38)

In [23]:
def cor_selector(X, y, num_feats=100):
    """Select the features whose Pearson correlation with ``y`` is strongest.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one column per feature.
    y : array-like
        Target values, one per row of ``X``.
    num_feats : int, default 100
        Maximum number of features to keep. Values <= 0 select nothing.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X`` (in column order); True if selected.
    cor_feature : list
        Names of the selected columns, ordered by increasing absolute
        correlation with ``y``.
    """
    columns = X.columns.tolist()
    # A constant column has zero variance, so np.corrcoef returns NaN for it
    # (and emits a RuntimeWarning); silence the warning and map NaN to 0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        cor_list = [np.corrcoef(X[col], y)[0, 1] for col in columns]
    # replace NaN with 0
    cor_list = [0 if np.isnan(cor) else cor for cor in cor_list]
    # Clamp the count: a bare [-0:] slice would select every feature.
    keep = min(max(int(num_feats), 0), len(columns))
    ranked = np.argsort(np.abs(cor_list))
    cor_feature = [columns[pos] for pos in ranked[len(columns) - keep:]]
    # Build the mask from X's own columns (not a notebook-level global) so the
    # function also works on frames other than the one `feature_name` came from.
    selected = set(cor_feature)
    cor_support = [col in selected for col in columns]
    return cor_support, cor_feature

In [24]:
# Run the Pearson-correlation filter and report how many features it kept.
cor_support, cor_feature = cor_selector(X, y)
print(f'{len(cor_feature)} selected features')

100 selected features


#### 1.2 Chi-2
___Note___
- Normalization: MinMaxScaler (chi2 requires non-negative feature values)
- Impute missing values: yes

In [25]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature with MinMaxScaler (default range) before scoring with chi2,
# then keep the 100 best-scoring features.
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(score_func=chi2, k=100)
chi_selector.fit(X_norm, y)

SelectKBest(k=100, score_func=<function chi2 at 0x7fd4ae65a048>)

In [26]:
# Boolean mask of the chi-squared picks, and the matching column names.
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(f'{len(chi_feature)} selected features')

100 selected features


### 2 Wrapper

___Note___
- Normalization: depend on the used model; yes for LR
- Impute missing values: depend on the used model; yes for LR

In [27]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination with a logistic-regression estimator:
# step=10 and n_features_to_select=100, with progress logged via verbose=5.
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=100,
    step=10,
    verbose=5,
)
rfe_selector.fit(X_norm, y)

Fitting estimator with 1153 features.
Fitting estimator with 1143 features.
Fitting estimator with 1133 features.
Fitting estimator with 1123 features.
Fitting estimator with 1113 features.
Fitting estimator with 1103 features.
Fitting estimator with 1093 features.
Fitting estimator with 1083 features.
Fitting estimator with 1073 features.
Fitting estimator with 1063 features.
Fitting estimator with 1053 features.
Fitting estimator with 1043 features.
Fitting estimator with 1033 features.
Fitting estimator with 1023 features.
Fitting estimator with 1013 features.
Fitting estimator with 1003 features.
Fitting estimator with 993 features.
Fitting estimator with 983 features.
Fitting estimator with 973 features.
Fitting estimator with 963 features.
Fitting estimator with 953 features.
Fitting estimator with 943 features.
Fitting estimator with 933 features.
Fitting estimator with 923 features.
Fitting estimator with 913 features.
Fitting estimator with 903 features.
Fitting estimator with

RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10,
    verbose=5)

In [28]:
# Boolean mask of the RFE picks, and the matching column names.
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(f'{len(rfe_feature)} selected features')

100 selected features


### 3. Embedded

#### 3.1 Logistic Regression (L2 penalty)
___Note___
- Normalization: Yes
- Impute missing values: Yes

In [29]:
#change l1 to l2
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Embedded selection with an L2-penalized logistic regression, using the
# '1.25*median' importance threshold.
# `threshold` is passed by keyword: in current scikit-learn releases only
# `estimator` may be given positionally to SelectFromModel, so the positional
# form raises a TypeError.
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), threshold='1.25*median')
embeded_lr_selector.fit(X_norm, y)



SelectFromModel(estimator=LogisticRegression(), threshold='1.25*median')

In [30]:
# Boolean mask of the logistic-regression picks, and the matching column names.
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()
print(f'{len(embeded_lr_feature)} selected features')

348 selected features


#### 3.2 Random Forest
___Note___
- Normalization: No
- Impute missing values: Yes

In [31]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Embedded selection driven by a 100-tree random forest, fitted on the unscaled features.
# NOTE(review): the recorded output of the following cell reports 1153 selected
# features, i.e. every column of X, so this threshold does not appear to filter
# anything on this data (presumably the median importance is 0; confirm).
rf_estimator = RandomForestClassifier(n_estimators=100)
embeded_rf_selector = SelectFromModel(rf_estimator, threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(), threshold='1.25*median')

In [32]:
# Boolean mask of the random-forest picks, and the matching column names.
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.columns[embeded_rf_support].tolist()
print(f'{len(embeded_rf_feature)} selected features')

1153 selected features


#### 3.3 LightGBM
___Note___
- Normalization: No
- Impute missing values: No

In [33]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

# LightGBM classifier used as the importance source for the embedded selector.
# NOTE(review): the recorded output of the following cell reports 1153 selected
# features, i.e. every column of X, so this threshold does not appear to filter
# anything on this data (confirm the fitted importances are not all equal).
lgbc = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=32,
    colsample_bytree=0.2,
    reg_alpha=3,
    reg_lambda=1,
    min_split_gain=0.01,
    min_child_weight=40,
)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(colsample_bytree=0.2,
                                         learning_rate=0.05,
                                         min_child_weight=40,
                                         min_split_gain=0.01, n_estimators=500,
                                         num_leaves=32, reg_alpha=3,
                                         reg_lambda=1),
                threshold='1.25*median')

In [34]:
# Boolean mask of the LightGBM picks, and the matching column names.
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.columns[embeded_lgb_support].tolist()
print(f'{len(embeded_lgb_feature)} selected features')

1153 selected features


### Summary

In [35]:
pd.set_option('display.max_rows', None)
# put all selection together: one row per feature, one boolean column per selector
selector_votes = {
    'Pearson': cor_support,
    'Chi-2': chi_support,
    'RFE': rfe_support,
    'Logistics': embeded_lr_support,
    'Random Forest': embeded_rf_support,
    'LightGBM': embeded_lgb_support,
}
feature_selection_df = pd.DataFrame({'Feature': feature_name, **selector_votes})
# count the selected times for each feature.
# Sum only the boolean selector columns: a row-wise sum over the whole frame
# would also try to add the string 'Feature' column, which current pandas
# rejects instead of silently dropping it.
feature_selection_df['Total'] = feature_selection_df[list(selector_votes)].sum(axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,young,True,True,True,True,True,True,6
2,ye,True,True,True,True,True,True,6
3,won,True,True,True,True,True,True,6
4,whatev,True,True,True,True,True,True,6
5,us,True,True,True,True,True,True,6
6,unit,True,True,True,True,True,True,6
7,two,True,True,True,True,True,True,6
8,twice,True,True,True,True,True,True,6
9,tough,True,True,True,True,True,True,6
10,strong,True,True,True,True,True,True,6


### Embedded Random Forest Features

In [36]:
# Display the names of the features kept by the random-forest selector.
embeded_rf_feature

['a',
 'against',
 'also',
 'am',
 'an',
 'and',
 'anthoni',
 'be',
 'boxer',
 'but',
 'camp',
 'everyth',
 'excel',
 'fan',
 'fought',
 'give',
 'good',
 'had',
 'happi',
 'have',
 'heavyweight',
 'him',
 'i',
 'in',
 'is',
 'joshua',
 'klitschko',
 'm',
 'match',
 'meet',
 'nice',
 'now',
 'of',
 'one',
 'opportun',
 'ring',
 's',
 'shape',
 'show',
 'strong',
 'strongest',
 'than',
 'that',
 'the',
 'thi',
 'to',
 'train',
 'veri',
 'wa',
 'want',
 'weaker',
 'when',
 'whi',
 'will',
 'with',
 'world',
 'wors',
 'about',
 'accuraci',
 'advantag',
 'battl',
 'bean',
 'can',
 'case',
 'closer',
 'do',
 'everyon',
 'face',
 'focus',
 'for',
 'game',
 'get',
 'go',
 'gong',
 'he',
 'here',
 'hi',
 'hope',
 'import',
 'improvis',
 'it',
 'job',
 'just',
 'know',
 'll',
 'lot',
 'mani',
 'me',
 'mike',
 'most',
 'movement',
 'much',
 'my',
 'not',
 'nt',
 'on',
 'plan',
 'prepar',
 're',
 'readi',
 'see',
 'seen',
 'shock',
 'so',
 'some',
 'speed',
 'stay',
 'stick',
 'strike',
 'surpris

### Top Features

In [37]:
# Features whose vote count (Total) equals 6, i.e. picked by all six selectors.
top_1_features_df = feature_selection_df[feature_selection_df['Total'].eq(6)]
top_1_features = top_1_features_df['Feature']

In [38]:
# Features whose vote count (Total) equals 5, i.e. picked by five of the six selectors.
top_2_features_df = feature_selection_df[feature_selection_df['Total'].eq(5)]
top_2_features = top_2_features_df['Feature']

In [39]:
# Display the feature names with Total == 6.
top_1_features

1         young
2            ye
3           won
4        whatev
5            us
6          unit
7           two
8         twice
9         tough
10       strong
11        stand
12        speed
13        sofia
14        shape
15       second
16         same
17      respect
18         real
19      qualiti
20          put
21        pulev
22        prove
23      pressur
24        press
25      partner
26         over
27          out
28     otherwis
29           or
30         noth
31         need
32        motiv
33         mani
34          lot
35         look
36       kubrat
37        knock
38         keep
39         judg
40          hit
41         here
42      healthi
43         game
44       friend
45          fan
46         fact
47      everyth
48          due
49        drive
50        cours
51        coach
52         clay
53     children
54        chain
55    bulgarian
56        break
57         both
58          big
59      between
60       almost
61       achiev
Name: Feature, dtype: ob

In [40]:
# Display the feature names with Total == 5.
top_2_features

62         whole
63           who
64    underestim
65           thi
66         state
67       special
68         round
69         rival
70            re
71      particip
72         offer
73           now
74            my
75          more
76           may
77         manag
78           man
79          loss
80         least
81         learn
82            ha
83         guard
84           get
85       forward
86           did
87      competit
88          club
89          case
90          cage
91       appreci
92         after
Name: Feature, dtype: object

### Filters

In [42]:
# Materialize the top-1 feature names as a plain Python list.
top_1_features = [*top_1_features]

In [43]:
# Top-1 feature names followed by the target column name.
# Relies on top_1_features already being a list (converted in the previous cell).
top_1_features_class = top_1_features + ['Class']

In [44]:
# Materialize the top-2 feature names as a plain Python list.
top_2_features = [*top_2_features]

In [45]:
# Top-2 feature names followed by the target column name.
# Relies on top_2_features already being a list (converted in the previous cell).
top_2_features_class = top_2_features + ['Class']

In [46]:
# Concatenation of both name lists: top-1 names first, then top-2 names.
top_1_and_2_features = top_1_features + top_2_features

In [47]:
# Combined feature names followed by the target column name.
top_1_and_2_features_class = top_1_and_2_features + ['Class']

### New Dataframes

In [48]:
# Column subsets of the full table: the selected features plus the target column.
df_top_1 = df[top_1_features_class]
df_top_1_and_2 = df[top_1_and_2_features_class]

### Saving Dataframes

In [49]:
# Create the output directory via the shell; -p also creates missing parents
# and does not fail if the directory already exists.
!mkdir -p data/w2v_fs

In [50]:
# Write the top-1 subset as a UTF-8, comma-separated CSV, keeping header and index.
file_name = 'data/w2v_fs/df_top_1_stem_1.csv'
df_top_1.to_csv(
    file_name,
    index=True,
    header=True,
    encoding='utf-8',
    sep=',',
)

In [52]:
# Write the combined top-1 + top-2 subset as a UTF-8, comma-separated CSV, keeping header and index.
file_name = 'data/w2v_fs/df_top_1_and_2_stem_1.csv'
df_top_1_and_2.to_csv(
    file_name,
    index=True,
    header=True,
    encoding='utf-8',
    sep=',',
)