In [1]:
#!/usr/bin/env python

##This script is for testing different feature selection methods in scikit-learn library to select the
##top 10 wavelenghts with and without a customized distance for the classification of peanut stem rot
##By Xing Wei 05/25/2021

##############################################################################################################
## 010421 - first draft
## 011421 - update and add the Recursive feature elimination and SelectFromModel,
##          both are from sklearn.feature_slection
## 011821 - update and add checking the versions of libraries
## 030121 - 1)update SelectFromModel and add univariate feature selection
##          2)add reduce X to only selected features and classification using only selected features
##          3)change the input X and output y from array to dataframe (for Univariable feature selection)
## 030221 - re-oragnize the cells and move the ones not work to the bottom
## 030421 - 1)update and add writting the feature scores into csv files
##          2)update the plotting for feature scores
## 030721 - 1)update the RFE section and adding the estimator of random forest for RFE
##        - 2)update and add gradient boosting and XGBoost classifier
## 031021 - 1)define the function of get_models() and evaluate_model() for examining the differet classifiers
##        - 2)add princinpal component analysis as one of the feature selection methods
##        - 3a)add the module to select wavelength with custom distance apart
##        - 3b)add seanborn package to plot the correlation heatmap
## 031321 - 1)define the function of band_select_distance() to select wavelengths with custom distance apart
## 031521 - 1)finalize the codes for each feature selection method adding band_select_distance() function
## 031621 - clean up the codes for the figure preparation
## 052521 - clean up the codes for code sharing on github
#############################################################################################################

##References: 
##1)https://scikit-learn.org/stable/modules/feature_selection.html
##2)https://machinelearningmastery.com/calculate-feature-importance-with-python/
##3)https://github.com/krishnaik06/Feature-Selection-techniques/blob/master/Feature%20Selection.ipynb

In [2]:
# check the versions of libraries
import sys
print('Python: {}'.format(sys.version))
# scipy
import scipy
print('scipy: {}'.format(scipy.__version__))
# numpy
import numpy
print('numpy: {}'.format(numpy.__version__))
# matplotlib
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))
# pandas
import pandas
print('pandas: {}'.format(pandas.__version__))
# scikit-learn
import sklearn
print('sklearn: {}'.format(sklearn.__version__))
import xgboost
# display version
print('xgboost: {}'.format(xgboost.__version__))

Python: 3.7.10 (default, Feb 26 2021, 18:47:35) 
[GCC 7.3.0]
scipy: 1.6.1
numpy: 1.19.2
matplotlib: 3.3.4
pandas: 1.2.3
sklearn: 0.24.1
xgboost: 1.3.3


In [3]:
# load libraries
import numpy as np
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA #unsupervised
from sklearn.naive_bayes import GaussianNB #generative and supervised
from sklearn.neighbors import KNeighborsClassifier #discriminative and supervised
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #discriminative and supervised
from sklearn.neural_network import MLPClassifier #discriminative and supervised
from sklearn.ensemble import RandomForestClassifier #discriminative and supervised
from sklearn.svm import SVC #discriminative and supervised
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from time import time

In [4]:
# load the dataset
dataset = read_csv('../inputs/input_gh2019_all_data.wo.out.binned.cut.csv')
# In first column of Type: 0 = Healthy; 1 = Presymptomatic; 2 = Lesion_only; 3 = Mild; 4 = Severe

In [5]:
# split into input (X) and output (y) variables
wl_all = dataset.columns[1:158]
X = dataset.iloc[:,1:158]
y = dataset.iloc[:,0]
print ("shape of dataset:", X.shape, y.shape)

shape of dataset: (399, 157) (399,)


In [6]:
# get a list of models to evaluate
def get_models():
    models = []
    #models.append(('RF', RandomForestClassifier()))
    models.append(('SVML', SVC(kernel='linear')))
    return models

# evaluate a give model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    scores = 100*scores
    return scores

In [7]:
# Version_031321: select the top wavelengths with custom distance apart

# define a function to filter out elements that were within the distance of an wavelength
def get_wl_range(wl_test, low_range, high_range):
    ## filters the list based on the condition
    new_list = list(filter(lambda x: int(float(x)) <= low_range or int(float(x)) >= high_range, wl_test))
    return new_list

# define the function of band_select_distance() to select wavelengths with custom distance apart
def band_select_distance(X, featureScores, distance, top_n_features):
    # sort the featureScores dataframe based on feature weight and ignore the original index
    feature_sorted = featureScores.sort_values(by = 'Score', ascending=False, ignore_index=True)
    # store the list of wavelengths after sorting based on their scores
    wl = feature_sorted['Wavelengths']
    
    # set the distance between each selected features
    distance = distance
    # set the number of top features to be selected
    top_n_features = top_n_features
    # intialize the final list
    wl_dist = wl
    
    # setup the loop to filter out elements based on the order of original list
    for j in range(len(wl_dist)):
        low_r = int(float(wl_dist[j]))- distance
        high_r = int(float(wl_dist[j]))+ distance
        x = get_wl_range(wl_dist, low_r, high_r)
        wl_dist = [wl_dist[j]] + x
        j += 1
        if j == len(wl_dist):
            break
    # the final list should be with the reversed order
    wl_dist = list(reversed(wl_dist))
                
    # build up the new dataset with spectral reflectance values from the selected wavelengths
    X_dist = []
    column_name = []
    for wl in wl_dist[0:top_n_features]:
        column_name.append(wl)
        X_dist.append(X[wl].values)
    
    # covert the X_dist from list to dataframe
    from pandas import DataFrame
    X_dist = DataFrame (X_dist)
    X_dist = X_dist.T
    X_dist.columns = column_name
    # return the new dataframe of top selected bands with custom distance and their reflectance values
    return X_dist

In [8]:
# Initial a list of feature seletion methods to evaluate
methods = []

# 1. Univariate feature selection

# 1.1 chi2 test

In [9]:
# set the number of features to be selected
top_n_features = 10
# initialize a list to store all the selected top number of features
top_features = []
# set the distance between each selected features
distance = 20
# initialize a list to store all the selected top number of features WITH custom distance
top_features_dist = []

In [10]:
# load libraries
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(chi2, k=top_n_features)
fit = bestfeatures.fit(X, y)

feature_names = np.array(wl_all)
print("Features selected by SelectKBest: "
      f"{feature_names[fit.get_support()]}")

Features selected by SelectKBest: ['586.088501' '590.30426' '594.515015' '598.720825' '602.921631'
 '611.308228' '694.069275' '698.154175' '702.233948' '706.308533']


In [11]:
# build a dataframe including scores for each feature
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Wavelengths','Score']  #naming the dataframe columns

# sort the featureScores dataframe based on feature weight and ignore the original index
feature_sorted = featureScores.sort_values(by = 'Score', ascending=False, ignore_index=True)
# only save the top n features into the list
df_top_n_features = feature_sorted['Wavelengths'][0:top_n_features]
top_features.append(df_top_n_features.values)

print("Top features ranked by their scores: \n"
      f"{featureScores.nlargest(10,'Score')}")  #print 10 selected features

Top features ranked by their scores: 
    Wavelengths       Score
105  698.154175  720.919667
106  702.233948  719.056195
107  706.308533  627.234419
104  694.069275  606.849653
80   594.515015  547.337937
79    590.30426  543.387562
81   598.720825  543.283568
82   602.921631  537.050359
78   586.088501  535.784726
84   611.308228  535.343478


In [12]:
# reduce X to the selected features
X_new = fit.transform(X)
X_new.shape

(399, 10)

In [13]:
# reduce X to the selected features WITH custom distance
X_dist = band_select_distance(X, featureScores, distance, top_n_features)
X_dist.shape
# save the top n features WITH custom distance into the list
top_features_dist.append(X_dist.columns)

In [14]:
# compare the classification accuracy using only top 10 selected wavelengths
models = get_models()
# evaluate the models and store results
results = []
names = []
methods.append ('Chi2')
for name, model in models:
    scores = evaluate_model(model, X_new, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 52.462 (7.159)


In [15]:
# compare the classification accuracy using the distanced top 10 selected wavelengths
models = get_models()
# evaluate the models and store results
results_dist = []
names = []
#methods.append ('Chi2')
for name, model in models:
    scores = evaluate_model(model, X_dist, y)
    results_dist.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 59.558 (6.231)


# 2. Feature selection using SelectFromModel

# 2.1 Random forest - feature importance and selection

In [16]:
# random forest for feature importance on a classification problem

# define the model
model = RandomForestClassifier()
feature_names = np.array(wl_all)

# select the features with the SelectFromModel from sklearn.feature_selection
tic = time()
sfm = SelectFromModel(model.fit(X,y), threshold=-np.inf, max_features=top_n_features).fit(X, y)
toc = time()

print("Features selected by SelectFromModel: "
      f"{feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

Features selected by SelectFromModel: ['274.382599' '496.418976' '500.737885' '619.674927' '644.654663'
 '648.800354' '652.940979' '689.979309' '826.149292' '884.301941']
Done in 0.447s


In [17]:
# build a dataframe including scores for each feature
dfscores = pd.DataFrame(sfm.estimator_.feature_importances_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Wavelengths','Score']  #naming the dataframe columns

# sort the featureScores dataframe based on feature weight and ignore the original index
feature_sorted = featureScores.sort_values(by = 'Score', ascending=False, ignore_index=True)
# only save the top n features into the list
df_top_n_features = feature_sorted['Wavelengths'][0:top_n_features]
top_features.append(df_top_n_features.values)

print("Top features ranked by their scores: \n"
      f"{featureScores.nlargest(10,'Score')}")  #print 10 selected features
print(f"Done in {toc - tic:.3f}s")

Top features ranked by their scores: 
    Wavelengths     Score
152  884.301941  0.021429
58   500.737885  0.020402
94   652.940979  0.019930
103  689.979309  0.017079
92   644.654663  0.015917
137  826.149292  0.015051
86   619.674927  0.014884
93   648.800354  0.014486
57   496.418976  0.014011
7    274.382599  0.012908
Done in 0.447s


In [18]:
# reduce X to the selected features
X_new = sfm.transform(X)
X_new.shape

(399, 10)

In [19]:
# reduce X to the selected features WITH custom distance
X_dist = band_select_distance(X, featureScores, distance, top_n_features)
X_dist.shape
# save the top n features WITH custom distance into the list
top_features_dist.append(X_dist.columns)

In [20]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
methods.append ('SFM_RF')
for name, model in models:
    scores = evaluate_model(model, X_new, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 63.073 (5.597)


In [21]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
#methods.append ('SFM_RF')
for name, model in models:
    scores = evaluate_model(model, X_dist, y)
    results_dist.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 65.327 (5.103)


# 2.2 Support vector machine linear -  weights of features (coefficients)

In [22]:
# feature importance via coefficients
# only apply to SVM with a linear kernel

# define the model
model = SVC(kernel = 'linear')
feature_names = np.array(wl_all)

# select the features with the SelectFromModel from sklearn.feature_selection
tic = time()
sfm = SelectFromModel(model.fit(X,y), threshold=-np.inf, max_features=top_n_features).fit(X, y)
toc = time()

print("Features selected by SelectFromModel: "
      f"{feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

Features selected by SelectFromModel: ['694.069275' '742.75' '758.812317' '762.814941' '766.8125' '802.555725'
 '806.501038' '837.875061' '849.553345' '884.301941']
Done in 0.436s


In [23]:
# build a dataframe including scores for each feature
coefficient = pd.DataFrame(np.abs(sfm.estimator_.coef_)) # use the absolute value of each cofficient
dfscores = pd.DataFrame(coefficient.mean()) # use the average of absolute coefficient as the score for each feature
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Wavelengths','Score']  #naming the dataframe columns

# sort the featureScores dataframe based on feature weight and ignore the original index
feature_sorted = featureScores.sort_values(by = 'Score', ascending=False, ignore_index=True)
# only save the top n features into the list
df_top_n_features = feature_sorted['Wavelengths'][0:top_n_features]
top_features.append(df_top_n_features.values)

print("Top features ranked by their scores: \n"
      f"{featureScores.nlargest(10,'Score')}")  #print 10 selected features
print(f"Done in {toc - tic:.3f}s")

Top features ranked by their scores: 
    Wavelengths     Score
152  884.301941  0.445491
120  758.812317  0.320164
132  806.501038  0.319294
122    766.8125  0.308200
116      742.75  0.290659
140  837.875061  0.283890
121  762.814941  0.281686
143  849.553345  0.274764
104  694.069275  0.274119
131  802.555725  0.273443
Done in 0.436s


In [24]:
# reduce X to the selected features
X_new = sfm.transform(X)
X_new.shape

(399, 10)

In [25]:
# reduce X to the selected features WITH custom distance
X_dist = band_select_distance(X, featureScores, distance, top_n_features)
X_dist.shape
# save the top n features WITH custom distance into the list
top_features_dist.append(X_dist.columns)

In [26]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
methods.append ('SFM_SVML')
for name, model in models:
    scores = evaluate_model(model, X_new, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 56.545 (7.726)


In [27]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
#methods.append ('SFM_SVML')
for name, model in models:
    scores = evaluate_model(model, X_dist, y)
    results_dist.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 65.237 (6.301)


# 3. Recursive feature elimination (RFE)

# 3.1 Estimator = random forest

In [28]:
# define RFE
estimator = RandomForestClassifier()
rfe = RFE(estimator, n_features_to_select=top_n_features)

# fit RFE
tic = time()
rfe_fit = rfe.fit(X, y)
toc = time()

# build a dataframe including ranking for each feature
dfranking = pd.DataFrame(rfe_fit.ranking_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
feature_ranking = pd.concat([dfcolumns,dfranking],axis=1)
feature_ranking.columns = ['Wavelengths','Ranking']  #naming the dataframe columns

print("Features selected by Recrusive Feature Elimination: \n"
      f"{feature_ranking.nsmallest(10,'Ranking')}")  #print 10 selected features (ranking == 1)
print(f"Done in {toc - tic:.3f}s")

Features selected by Recrusive Feature Elimination: 
    Wavelengths  Ranking
1     246.95195        1
7    274.382599        1
55   487.766541        1
58   500.737885        1
59   505.052002        1
86   619.674927        1
102  685.884216        1
103  689.979309        1
112  726.605103        1
152  884.301941        1
Done in 29.602s


In [29]:
# For RFE model, all features should be included to get all their scores
# Or only number of n_features_to_select will have scores
rfe2 = RFE(estimator, n_features_to_select=1)
# fit RFE
tic = time()
rfe_fit2 = rfe2.fit(X, y)
toc = time()

# For RFE model, set the score for each feature equal to (total number of features - ranking of each feature)
rfe_fit2_scores = rfe_fit2.ranking_
for i in range(len(rfe_fit2_scores)):
    rfe_fit2_scores[i] = len(rfe_fit2_scores) - rfe_fit2_scores[i]

# build a dataframe including scores for each feature
dfscores = pd.DataFrame(rfe_fit2_scores)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Wavelengths','Score']  #naming the dataframe columns

# sort the featureScores dataframe based on feature weight and ignore the original index
feature_sorted = featureScores.sort_values(by = 'Score', ascending=False, ignore_index=True)
# only save the top n features into the list
df_top_n_features = feature_sorted['Wavelengths'][0:top_n_features]
top_features.append(df_top_n_features.values)

In [30]:
# reduce X to the selected features
X_new = rfe_fit.transform(X)
X_new.shape

(399, 10)

In [31]:
# reduce X to the selected features WITH custom distance
X_dist = band_select_distance(X, featureScores, distance, top_n_features)
X_dist.shape
# save the top n features WITH custom distance into the list
top_features_dist.append(X_dist.columns)

In [32]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
methods.append ('RFE_RF')
for name, model in models:
    scores = evaluate_model(model, X_new, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 65.085 (5.077)


In [33]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
#methods.append ('RFE_RF')
for name, model in models:
    scores = evaluate_model(model, X_dist, y)
    results_dist.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 64.818 (6.101)


# 3.2 Estimator = Support vector machine with linear kernel

In [34]:
# define RFE
estimator = SVC(kernel = 'linear')
rfe = RFE(estimator, n_features_to_select=top_n_features)

# fit RFE
tic = time()
rfe_fit = rfe.fit(X, y)
toc = time()

# build a dataframe including ranking for each feature
dfranking = pd.DataFrame(rfe_fit.ranking_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
feature_ranking = pd.concat([dfcolumns,dfranking],axis=1)
feature_ranking.columns = ['Wavelengths','Ranking']  #naming the dataframe columns

print("Features selected by Recrusive Feature Elimination: \n"
      f"{feature_ranking.nsmallest(10,'Ranking')}")  #print 10 selected features (ranking == 1)
print(f"Done in {toc - tic:.3f}s")

Features selected by Recrusive Feature Elimination: 
    Wavelengths  Ranking
4    260.687927        1
13     301.6474        1
33   391.318542        1
34   395.752716        1
59   505.052002        1
61   513.665588        1
94   652.940979        1
121  762.814941        1
138   830.06311        1
152  884.301941        1
Done in 16.829s


In [35]:
# For RFE model, all features should be included to get all their scores
# Or only number of n_features_to_select will have scores
rfe2 = RFE(estimator, n_features_to_select=1)
# fit RFE
tic = time()
rfe_fit2 = rfe2.fit(X, y)
toc = time()

# For RFE model, set the score for each feature equal to (total number of features - ranking of each feature)
rfe_fit2_scores = rfe_fit2.ranking_
for i in range(len(rfe_fit2_scores)):
    rfe_fit2_scores[i] = len(rfe_fit2_scores) - rfe_fit2_scores[i]

# build a dataframe including scores for each feature
dfscores = pd.DataFrame(rfe_fit2_scores)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Wavelengths','Score']  #naming the dataframe columns

# sort the featureScores dataframe based on feature weight and ignore the original index
feature_sorted = featureScores.sort_values(by = 'Score', ascending=False, ignore_index=True)
# only save the top n features into the list
df_top_n_features = feature_sorted['Wavelengths'][0:top_n_features]
top_features.append(df_top_n_features.values)

In [36]:
# reduce X to the selected features
X_new = rfe_fit.transform(X)
X_new.shape

(399, 10)

In [37]:
# reduce X to the selected features WITH custom distance
X_dist = band_select_distance(X, featureScores, distance, top_n_features)
X_dist.shape
# save the top n features WITH custom distance into the list
top_features_dist.append(X_dist.columns)

In [38]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
methods.append ('RFE_SVML')
for name, model in models:
    scores = evaluate_model(model, X_new, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 69.748 (5.341)


In [39]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
#methods.append ('RFE_SVML')
for name, model in models:
    scores = evaluate_model(model, X_dist, y)
    results_dist.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 68.750 (5.606)


# 5. Principal components analysis (PCA)

In [40]:
# create the PCA instance
n_components = top_n_features # As n_components increases, classification accuracy will increase
pca = PCA(n_components=n_components)
# fit on data
pca.fit(X)
# transform data
X_new = pca.transform(X)

In [41]:
# compare the classification accuracy using only selected wavelengths
models = get_models()
methods.append ('PCA')
for name, model in models:
    scores = evaluate_model(model, X_new, y)
    results.append(scores)
    results_dist.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>SVML 63.494 (6.064)


In [42]:
# build a dataframe including cross validation accuracy for each machine learning methods
df_clf_accuracy = pd.DataFrame(results)
# transpose the dataframe
df_clf_accuracy = df_clf_accuracy.T
#naming the dataframe columns
df_clf_accuracy.columns = methods

# write the dataframe of cross validation accuracy to CVS file
df_clf_accuracy.to_csv(r'../results/R32_peanut_ssr_feature_selection_comparison_rf.csv', encoding='utf-8', header='true')
df_clf_accuracy

Unnamed: 0,Chi2,SFM_RF,SFM_SVML,RFE_RF,RFE_SVML,PCA
0,47.5,67.5,57.5,67.5,82.5,70.0
1,52.5,65.0,60.0,67.5,65.0,62.5
2,52.5,57.5,52.5,67.5,70.0,72.5
3,52.5,72.5,65.0,60.0,70.0,65.0
4,60.0,62.5,57.5,62.5,65.0,57.5
5,45.0,60.0,32.5,60.0,57.5,57.5
6,55.0,62.5,62.5,65.0,72.5,60.0
7,50.0,60.0,60.0,67.5,70.0,62.5
8,57.5,70.0,62.5,70.0,67.5,72.5
9,56.410256,61.538462,53.846154,66.666667,69.230769,64.102564


In [43]:
# build a dataframe including cross validation accuracy for each machine learning methods
df_clf_accuracy_dist = pd.DataFrame(results_dist)
# transpose the dataframe
df_clf_accuracy_dist = df_clf_accuracy_dist.T
#naming the dataframe columns
df_clf_accuracy_dist.columns = methods

# write the dataframe of cross validation accuracy to CVS file
df_clf_accuracy_dist.to_csv(r'../results/R32_peanut_ssr_feature_selection_distance_rf.csv', encoding='utf-8', header='true')
df_clf_accuracy_dist

Unnamed: 0,Chi2,SFM_RF,SFM_SVML,RFE_RF,RFE_SVML,PCA
0,70.0,72.5,72.5,72.5,80.0,70.0
1,57.5,57.5,65.0,60.0,70.0,62.5
2,65.0,72.5,70.0,65.0,67.5,72.5
3,65.0,67.5,75.0,75.0,72.5,65.0
4,60.0,65.0,67.5,60.0,65.0,57.5
5,45.0,60.0,50.0,52.5,55.0,57.5
6,62.5,62.5,65.0,60.0,67.5,60.0
7,65.0,62.5,57.5,65.0,70.0,62.5
8,62.5,75.0,62.5,72.5,70.0,72.5
9,56.410256,66.666667,61.538462,56.410256,66.666667,64.102564


In [44]:
# build a dataframe including top n features for each feature selection methods
df_top_features = pd.DataFrame(top_features)
# transpose the dataframe
df_top_features = df_top_features.T
#naming the dataframe columns
df_top_features.columns = methods[0:5]

# write the dataframe of cross validation accuracy to CVS file
df_top_features.to_csv(r'../results/R32_peanut_ssr_top_features_rf.csv', encoding='utf-8', header='true')
df_top_features

Unnamed: 0,Chi2,SFM_RF,SFM_SVML,RFE_RF,RFE_SVML
0,698.154175,884.301941,884.301941,500.737885,505.052002
1,702.233948,500.737885,758.812317,884.301941,395.752716
2,706.308533,652.940979,806.501038,619.674927,301.6474
3,694.069275,689.979309,766.8125,246.95195,391.318542
4,594.515015,644.654663,742.75,505.052002,260.687927
5,590.30426,826.149292,837.875061,689.979309,652.940979
6,598.720825,619.674927,762.814941,730.649109,513.665588
7,602.921631,648.800354,849.553345,274.382599,884.301941
8,586.088501,496.418976,694.069275,750.791504,762.814941
9,611.308228,274.382599,802.555725,496.418976,830.06311


In [45]:
# build a dataframe including distanced top n features for each feature selection methods
df_top_features_dist = pd.DataFrame(top_features_dist)
# transpose the dataframe
df_top_features_dist = df_top_features_dist.T
#naming the dataframe columns
df_top_features_dist.columns = methods[0:5]

# write the dataframe of cross validation accuracy to CVS file
df_top_features_dist.to_csv(r'../results/R32_peanut_ssr_top_features_distance_rf.csv', encoding='utf-8', header='true')
df_top_features_dist

Unnamed: 0,Chi2,SFM_RF,SFM_SVML,RFE_RF,RFE_SVML
0,698.154175,884.301941,884.301941,500.737885,505.052002
1,594.515015,500.737885,758.812317,884.301941,395.752716
2,632.187378,652.940979,806.501038,619.674927,301.6474
3,573.411438,689.979309,837.875061,246.95195,260.687927
4,526.549194,826.149292,694.069275,689.979309,652.940979
5,552.184326,619.674927,648.800354,730.649109,884.301941
6,657.07666,274.382599,242.36412,274.382599,762.814941
7,718.501892,457.330505,730.649109,750.791504,830.06311
8,505.052002,726.605103,673.568604,826.149292,431.054749
9,677.678833,246.95195,586.088501,644.654663,623.850769
