# Tutorial 5: RFE on all features - (GS)

---

### Introduction

Hello! This notebook shows how to use Recursive Feature Elimination (RFE) to perform further feature selection. We found that most features are very highly correlated; therefore, we need to remove the redundant ones using RFE.

First, let us load the stored data.

In [1]:
# Restore variables that were persisted with IPython's %store magic in an
# earlier session. df_input_GS is later used as the feature matrix and y_GS
# as the class labels for the train/test split.
%store -r df_input_GS
%store -r y_GS
%store -r df_GS

In [2]:
# Encode the class labels as integers: 'S' -> 1, 'B' -> 0.
# The guard makes the cell safe to re-run: applying .map() a second time to the
# already-encoded labels would silently turn every value into NaN.
label_to_int = {'S': 1, 'B': 0}
if not y_GS.isin(list(label_to_int.values())).all():
    y_GS = y_GS.map(label_to_int)
    # .map() yields NaN for any label missing from the mapping; fail loudly
    # instead of letting NaN targets propagate into the models.
    if y_GS.isna().any():
        raise ValueError("y_GS contains labels other than 'S' and 'B'")

---

Importing some packages

In [3]:
# Python packages 
import pandas as pd # for importing data into data frame format
import seaborn as sns # For drawing useful graphs, such as bar graphs
import numpy as np
import matplotlib.pyplot as plt

---

<b><i> Data splitting </i></b> 

In [4]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split: 30% of the samples go to the test set, with the
# class proportions of y_GS preserved in both parts. The fixed random_state
# makes the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    df_input_GS,
    y_GS,
    test_size=0.3,
    random_state=3,
    stratify=y_GS,
)


In [5]:
# Show (n_samples, n_features) for the training and test feature matrices,
# one per line.
print(Xtrain.shape, Xtest.shape, sep="\n")

(352, 2074)
(151, 2074)


---

<b><i> get the best N features </i></b> 

In [6]:
# Restore the feature-importance values persisted with %store in an earlier session.
%store -r featImp_GS

In [7]:
# Average the stored importance values over the first axis
# (presumably one row per repeat/fold -- TODO confirm), leaving one mean
# value per feature.
arrimp = np.mean(np.array(featImp_GS), axis=0)
# Feature positions sorted by ascending mean importance.
sorted_idx = np.argsort(arrimp)
# The last ten positions are the features with the highest mean importance.
print(sorted_idx[-10:])

[1793 1775 2048   31    3 1771  544 1781   17   45]


In [8]:
# Column labels rearranged by the positions in sorted_idx, so the label at the
# end corresponds to the last entry of sorted_idx.
# To keep only the last 200 of them instead:
#   cols_ordered = df_input_GS.columns[sorted_idx][-200:]
cols_ordered = df_input_GS.columns.take(sorted_idx)

cols_ordered

Index([2432.121, 1356.693, 2480.996, 2264.315, 1781.884,  2387.33, 2436.692,
       1391.648, 2455.152,  991.829,
       ...
       1968.596, 1942.053, 2441.281,   842.04,  834.451, 1936.251, 1010.382,
       1950.821,  838.229,  845.886],
      dtype='object', length=2074)

---

In [9]:
# Preview the training features with the columns arranged as in cols_ordered.
# Note: there is no need to use the features ordered according to the
# permutation importance.
Xtrain.loc[:, cols_ordered]

Unnamed: 0,2432.121,1356.693,2480.996,2264.315,1781.884,2387.330,2436.692,1391.648,2455.152,991.829,...,1968.596,1942.053,2441.281,842.040,834.451,1936.251,1010.382,1950.821,838.229,845.886
210,-0.133584,-0.554902,-0.081552,-0.221067,-0.370640,-0.140023,-0.128064,-0.388553,-0.129043,-0.783655,...,-0.103180,-0.064431,-0.126623,-0.852611,-0.868323,-0.056824,-0.791167,-0.077484,-0.863861,-0.859112
266,0.386911,0.757232,0.369436,0.498077,0.624460,0.413856,0.400203,0.552333,0.358630,0.974744,...,0.323973,0.292242,0.386553,1.040755,1.046849,0.284218,0.986368,0.303714,1.044837,1.048485
458,0.987044,0.614708,0.841419,1.031083,0.949595,0.983368,0.954004,0.915365,0.897222,0.153369,...,0.992127,0.954164,0.948222,-0.020894,-0.012900,0.947342,0.135750,0.966534,-0.014697,-0.009475
322,0.726899,0.915168,0.754887,0.800285,0.855494,0.747073,0.716623,0.820270,0.723298,1.081025,...,0.693932,0.669354,0.738082,1.176984,1.173067,0.661299,1.091540,0.674564,1.178287,1.173193
171,-1.294259,-1.305902,-1.255770,-1.329682,-1.333933,-1.313061,-1.298811,-1.337443,-1.308193,-1.173595,...,-1.295124,-1.280268,-1.305844,-1.101613,-1.111279,-1.276586,-1.157541,-1.285318,-1.109319,-1.106318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,-0.257176,-0.513379,-0.220794,-0.386509,-0.488748,-0.296421,-0.250719,-0.513112,-0.244562,-0.664593,...,-0.318073,-0.258179,-0.249466,-0.775807,-0.771421,-0.246058,-0.683964,-0.277660,-0.772976,-0.771858
430,1.860734,1.665808,1.734885,1.921995,1.860632,1.888557,1.828631,1.928577,1.883685,1.424222,...,1.950348,1.916986,1.834398,1.378890,1.385954,1.913913,1.412999,1.927220,1.381970,1.384165
5,-0.449107,-0.834204,-0.469491,-0.542428,-0.692639,-0.473065,-0.445028,-0.597362,-0.454035,-0.970991,...,-0.417211,-0.403042,-0.442860,-0.919328,-0.912298,-0.401600,-0.971304,-0.407482,-0.916736,-0.915225
310,1.145130,1.147228,1.160193,1.138276,1.144555,1.128844,1.147592,1.078369,1.149093,1.216298,...,1.100754,1.096170,1.103933,1.246935,1.246373,1.096653,1.219850,1.098692,1.251592,1.253352


### RFE

In [10]:
# explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression

In [12]:
# Adapted from https://machinelearningmastery.com/rfe-feature-selection-in-python/
#
# Run RFE once for every target subset size from 1 to 50 and record which
# features survive each time.
#   selected : selected[k] is the sorted list of positional column indices
#              (into Xtrain[cols_ordered]) kept by the k-th RFE run
#   imp_ind  : positional indices in order of first appearance across the runs
#   imp_freq : column labels matching imp_ind, entry for entry
#
# NOTE(review): each iteration refits RFE from the full feature set. With the
# default step the elimination path is presumably shared between runs, so one
# RFE(n_features_to_select=1) fit plus `ranking_ <= k` may give the same
# subsets far more cheaply -- verify before switching.
selected = []
imp_ind = []
imp_freq = []

# The reordered training frame does not change between iterations: build it once.
X_ordered = Xtrain[cols_ordered]
X_values = X_ordered.values

# A dedicated name for the outer loop variable; the inner column loop no longer
# shadows it.
for n_feat in range(1, 51, 1):
    print(n_feat)
    sfs = RFE(estimator=LogisticRegression(solver = 'newton-cg'), n_features_to_select=n_feat)
    # fit sfs
    sfs.fit(X_values, Ytrain)
    # summarize the selected features: positions where the support mask is True,
    # converted to plain Python ints so they print and index cleanly
    selected_feat = np.flatnonzero(sfs.support_).tolist()
    for col_idx in selected_feat:
        print('Column: %d, Selected %s,' % (col_idx, sfs.support_[col_idx]))

    selected.append(selected_feat)

    # Keep only the features that have not been seen in any earlier run.
    # A symmetric difference (^) would also re-add previously seen features that
    # dropped out of the current subset, creating duplicates in imp_ind/imp_freq.
    already_seen = set(imp_ind)
    result = [idx for idx in selected_feat if idx not in already_seen]
    imp_freq.extend(X_ordered.columns[result])
    imp_ind.extend(result)

    print(selected[-1])
    print(result)
    print(imp_ind)
    print(imp_freq)
    print()

    

1
Column: 76, Selected True,
[76]
[76]
[76]
[1309.415]

2
Column: 76, Selected True,
Column: 1909, Selected True,
[76, 1909]
[1909]
[76, 1909]
[1309.415, 894.322]

3
Column: 76, Selected True,
Column: 1561, Selected True,
Column: 1909, Selected True,
[76, 1561, 1909]
[1561]
[76, 1909, 1561]
[1309.415, 894.322, 894.014]

4
Column: 76, Selected True,
Column: 694, Selected True,
Column: 1561, Selected True,
Column: 1909, Selected True,
[76, 694, 1561, 1909]
[694]
[76, 1909, 1561, 694]
[1309.415, 894.322, 894.014, 1310.076]

5
Column: 70, Selected True,
Column: 76, Selected True,
Column: 694, Selected True,
Column: 1324, Selected True,
Column: 1345, Selected True,
Column: 1561, Selected True,
Column: 1909, Selected True,
[70, 76, 694, 1324, 1345, 1561, 1909]
[1345]
[76, 1909, 1561, 694, 70, 1324, 1345]
[1309.415, 894.322, 894.014, 1310.076, 1310.738, 1500.371, 1308.754]

8
Column: 70, Selected True,
Column: 76, Selected True,
Column: 362, Selected True,
Column: 694, Selected True,
Column: 

---

### LR

In [63]:
# Base logistic-regression estimator created with its default settings.
lr = LogisticRegression()

In [64]:
# Hyper-parameter grid for logistic regression: every combination of solver,
# penalty and inverse regularisation strength C listed here.
solvers = ['newton-cg', 'liblinear']
penalty = ['l2']
c_values = [1000, 100, 10, 1.0, 0.1, 0.01, 0.001]
lr_par = {
    'solver': solvers,
    'penalty': penalty,
    'C': c_values,
}

In [65]:
# One entry per model to evaluate: [estimator, short name, integer setting].
# The integer (14) is forwarded to get_accuracy_ml as its second argument;
# its meaning is defined by that function -- TODO confirm and name it.
models = [[lr, 'lr', 14]]
# Parameter grids, aligned one-to-one with the entries of `models`.
par = [lr_par]

In [66]:
from source.ml_acc import get_accuracy_ml

In [67]:
# Evaluate every model on every RFE feature subset and collect the results.
#   ml_dicts[model_name]["<model_name>,<n_features>"] holds the first three
#   values returned by get_accuracy_ml under the keys
#   'tot_acc', 'jack_train' and 'jack_test'.
num_ml_tools = len(par)
ml_dicts = {}

# The reordered training frame is identical for every subset: build it once.
X_ordered = Xtrain[cols_ordered]

# The grid gets its own loop name (m_par). Writing `for m, par in zip(models, par)`
# would overwrite the global list `par` with a single grid dict, so re-running
# this cell would iterate over the dict's keys instead of the grids.
for m, m_par in zip(models, par):
    key0 = str(m[1])
    ml_dicts[key0] = {}
    for f in selected:
        # f is a list of positional column indices into cols_ordered
        print(cols_ordered[f])

        xtr = X_ordered.iloc[:, f]
        # xte =  Xtest[cols_ordered].iloc[:, f]
        print(xtr)
        # NOTE(review): the training data is passed for both data/label pairs,
        # so the held-out test set is not used here -- confirm this is intended.
        results = get_accuracy_ml(m[0], m[2], m_par, np.array(xtr), np.array(Ytrain), np.array(xtr), np.array(Ytrain)) # to get the accuracies for the ml model

        key = key0 + "," + str(len(f))
        ml_dicts[key0][key] = {
            'tot_acc': results[0],
            'jack_train': results[1],
            'jack_test': results[2],
        }

Index([1309.415], dtype='object')
     1309.415
210 -0.680322
266  0.870632
458  0.418043
322  0.980841
171 -1.259652
..        ...
45  -0.579250
430  1.517541
5   -0.936200
310  1.172359
216 -0.873543

[352 rows x 1 columns]
Index([1309.415, 894.322], dtype='object')
     1309.415   894.322
210 -0.680322 -0.829390
266  0.870632  1.040786
458  0.418043 -0.027926
322  0.980841  1.160730
171 -1.259652 -1.098343
..        ...       ...
45  -0.579250 -0.763092
430  1.517541  1.391540
5   -0.936200 -0.930658
310  1.172359  1.247332
216 -0.873543 -0.907412

[352 rows x 2 columns]
Index([1309.415, 894.014, 894.322], dtype='object')
     1309.415   894.014   894.322
210 -0.680322 -0.830469 -0.829390
266  0.870632  1.039927  1.040786
458  0.418043 -0.026354 -0.027926
322  0.980841  1.161917  1.160730
171 -1.259652 -1.099038 -1.098343
..        ...       ...       ...
45  -0.579250 -0.762456 -0.763092
430  1.517541  1.392789  1.391540
5   -0.936200 -0.929073 -0.930658
310  1.172359  1.246734  1.

---

In [68]:
import json

In [69]:
# Optional: persist the results dictionary to disk as JSON (currently disabled).
# with open('ml_gs_rfs.txt', 'w') as file:
#      file.write(json.dumps(ml_dicts)) # use `json.loads` to do the reverse

---