In [149]:
import pandas as pd
import numpy as np
import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from scipy import stats

warnings.filterwarnings('ignore') 

In [41]:
# Load the leaf dataset: every column except the last is used as a feature,
# the last column is used as the target.
data = pd.read_csv("../Dataset/leaf.csv", delimiter=",")
X_ = data.iloc[:, :-1]
# Take the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn expects y with shape (n_samples,) and otherwise emits a
# DataConversionWarning (which the global warnings filter would hide).
y_ = data.iloc[:, -1]
# 80/20 hold-out split with a fixed seed (not referenced by the cells visible here).
x_train, x_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)
# Number of candidate features, used when ranking them below.
num_features = len(X_.columns)

In [32]:
# Classifier and resampling scheme shared by every comparison in this notebook.
# The CV splitter is seeded so that all feature subsets are scored on the same folds.
clf = KNeighborsClassifier(n_neighbors=5)
cv_strat = RepeatedStratifiedKFold(
    n_splits=5,     # 5 stratified folds ...
    n_repeats=4,    # ... repeated 4 times -> 20 scores per evaluation
    random_state=42,
)

In [158]:
# Baseline: per-fold accuracy of the classifier using all features.
cv_results_full = cross_val_score(clf, X_, y_, scoring='accuracy', cv=cv_strat)

In [159]:
# Mean accuracy over all folds/repeats of the full-feature baseline.
np.mean(cv_results_full)

0.5477941176470588

In [85]:
# Score every feature with SelectKBest's default score function and rank them
# from highest to lowest score. k='all' keeps all features; only the scores
# are of interest here, so the selector is fitted without transforming X_.
fscore = SelectKBest(k='all')
fscore.fit(X_, y_)
# argsort is ascending, so reverse it to get the best-scoring feature first.
indices_fscore = np.argsort(fscore.scores_)[::-1][:num_features]
# Use X_ (the feature frame defined above); the previous `X` was never defined
# and raised a NameError on a fresh kernel.
print(X_.columns[indices_fscore].values)
print(fscore.scores_[indices_fscore])

['Aspect Ratio' 'Isoperimetric Factor' 'Solidity' 'Elongation'
 'Stochastic Convexity' 'Eccentricity' 'Maximal Indentation Depth'
 'Average Contrast' 'Smoothness' 'Average Intensity' 'Entropy'
 'Third moment' 'Lobedness' 'Uniformity']
[177.36851327 150.62706584 143.51971656 120.46404269  85.31945942
  66.08024337  44.42536511  37.38593165  33.2840698   29.12326416
  27.17029832  26.67072643  23.63650093  11.45661752]


In [97]:
# Cross-validate the classifier on the 10 highest-ranked F-score features.
# NOTE(review): the ranking was fitted on all of X_/y_, so every CV test fold
# influenced the feature choice -- the estimate below is optimistic.
top_fscore_columns = indices_fscore[:10]
cv_results_fscore = cross_val_score(
    clf,
    X_.iloc[:, top_fscore_columns],
    y_,
    scoring='accuracy',
    cv=cv_strat,
)
cv_results_fscore.mean()

0.5691176470588235

In [130]:
# Rank every feature by its estimated mutual information with the target.
# mutual_info_classif is stochastic, so the seed is fixed to make the ranking
# reproducible across kernel restarts.
mutual_info = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    k='all',
)
# Only the scores are needed, so fit without transforming X_.
mutual_info.fit(X_, y_)
# argsort is ascending, so reverse it to get the most informative feature first.
indices_mutual_info = np.argsort(mutual_info.scores_)[::-1][:num_features]
# Use X_ (the feature frame defined above); the previous `X` was never defined
# and raised a NameError on a fresh kernel.
print(X_.columns[indices_mutual_info].values)
print(mutual_info.scores_[indices_mutual_info])

['Aspect Ratio' 'Solidity' 'Elongation' 'Eccentricity'
 'Isoperimetric Factor' 'Maximal Indentation Depth' 'Lobedness'
 'Stochastic Convexity' 'Third moment' 'Average Contrast' 'Smoothness'
 'Average Intensity' 'Entropy' 'Uniformity']
[1.42470931 1.36514485 1.33009141 1.31396946 1.27632173 1.11460545
 1.09612128 0.80802782 0.79420408 0.78653559 0.77582558 0.77028467
 0.75241624 0.699652  ]


In [153]:
# Cross-validate the classifier on the 12 highest-ranked mutual-information features.
# NOTE(review): the ranking was fitted on all of X_/y_, so every CV test fold
# influenced the feature choice -- the estimate below is optimistic.
top_mutual_info_columns = indices_mutual_info[:12]
cv_results_mutual_info = cross_val_score(
    clf,
    X_.iloc[:, top_mutual_info_columns],
    y_,
    scoring='accuracy',
    cv=cv_strat,
)
cv_results_mutual_info.mean()

0.5647058823529412

In [154]:
# Rank every feature by random-forest feature importance.
# The forest is seeded so the importance ranking is reproducible across runs.
rfi = RandomForestClassifier(n_estimators=200, random_state=42)
rfi.fit(X_, y_)
# argsort is ascending, so reverse it to get the most important feature first.
indices_rfi = np.argsort(rfi.feature_importances_)[::-1][:num_features]
# Use X_ (the feature frame defined above); the previous `X` was never defined
# and raised a NameError on a fresh kernel.
print(X_.columns[indices_rfi].values)
print(rfi.feature_importances_[indices_rfi])

['Solidity' 'Aspect Ratio' 'Elongation' 'Eccentricity'
 'Isoperimetric Factor' 'Lobedness' 'Entropy' 'Maximal Indentation Depth'
 'Average Intensity' 'Uniformity' 'Third moment' 'Smoothness'
 'Stochastic Convexity' 'Average Contrast']
[0.12119304 0.08899132 0.08568173 0.08547711 0.0829449  0.07424862
 0.0722997  0.0666012  0.0561054  0.05533614 0.05517506 0.05291166
 0.05252473 0.05050939]


In [155]:
# Cross-validate the classifier on the 8 features with the highest forest importance.
# NOTE(review): the ranking was fitted on all of X_/y_, so every CV test fold
# influenced the feature choice -- the estimate below is optimistic.
top_rfi_columns = indices_rfi[:8]
cv_results_rfi = cross_val_score(
    clf,
    X_.iloc[:, top_rfi_columns],
    y_,
    scoring='accuracy',
    cv=cv_strat,
)
cv_results_rfi.mean()

0.5463235294117648

In [157]:
# Paired t-tests between the per-fold accuracies of the three selection methods.
# Pairing is meaningful because every evaluation used the same seeded cv_strat.
# NOTE(review): scores from repeated k-fold CV are not independent, so these
# p-values should be read with caution.
for scores_a, scores_b in (
    (cv_results_fscore, cv_results_rfi),
    (cv_results_mutual_info, cv_results_rfi),
    (cv_results_mutual_info, cv_results_fscore),
):
    print(stats.ttest_rel(scores_a, scores_b))

Ttest_relResult(statistic=1.8276667316931274, pvalue=0.08334819806466919)
Ttest_relResult(statistic=1.8276667316931274, pvalue=0.08334819806466919)
Ttest_relResult(statistic=nan, pvalue=nan)


In [160]:
# Paired t-test: top-10 F-score features versus the full-feature baseline.
fscore_vs_full = stats.ttest_rel(cv_results_fscore, cv_results_full)
print(fscore_vs_full)

Ttest_relResult(statistic=1.7803546183312935, pvalue=0.09101559261121525)


In [163]:
# Persist the 10 top-ranked F-score feature columns to a new CSV (no index column).
# NOTE(review): only the selected columns are written; the last column of
# `data`, which is used as the target above, is not part of this file --
# confirm that is intended.
selected_features = data.iloc[:, indices_fscore[:10]]
selected_features.to_csv("../Dataset/fe_leaf.csv", index=False)

In [184]:
# Show the positional indices (into X_'s columns) of the 10 exported features.
print([column_index for column_index in indices_fscore[:10]])

[1, 3, 2, 0, 5, 6, 7, 4, 11, 9]
