In [1]:
import numpy as np
import pandas as pd
import polars as pl
import pytomlpp as toml
from data_preprocessing import getXY, data_normalizeation
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [2]:
# read config
cfg = toml.load('/home/gyli/GP_HP/cfg/feature_selection.toml')
data_path = cfg['data_path']
variance_filter_thres = cfg['variance_filter_thres']

In [3]:
raw_x, raw_y = getXY(data_path)
# raw_x = data_normalizeation(raw_x)
future_names = raw_x.columns
pd_x, pd_y = raw_x.to_pandas(), raw_y.to_pandas()

In [4]:
svm_clf = SVC()
X, Y = raw_x.to_pandas(), np.ravel(raw_y.to_pandas())

In [5]:
# filter by variance threshold
from sklearn.feature_selection import VarianceThreshold
var_thres = VarianceThreshold(threshold=variance_filter_thres)
var_thres.fit(X)
var_thres_col_idx = var_thres.get_support(indices=True)
X_Var = X.iloc[:, var_thres_col_idx]
print("\033[34m使用方差阈值筛选出来的特征结果索引:",var_thres.get_support(indices=True))
print("\033[34m方差阈值选择前的结果:",cross_val_score(estimator=svm_clf,X=X,y=Y,cv=5,scoring='accuracy').mean())
print("\033[34m方差阈值选择后的结果:",cross_val_score(estimator=svm_clf,X=X_Var,y=Y,cv=5,scoring='accuracy').mean())
# X_Varthresh = X[:, var_thres_col_idx]

[34m使用方差阈值筛选出来的特征结果索引: [ 0  1  2  5  6  7  8  9 10 11 13 14 15 16 17 18 19 21 22]
[34m方差阈值选择前的结果: 0.7198593923225771
[34m方差阈值选择后的结果: 0.7201008896057328


In [None]:
# 归一化
X_Var = data_normalizeation(pl.from_pandas(X_Var))

In [6]:
list(X.columns)

['年龄',
 '中性粒细胞数',
 '淋巴细胞数',
 '嗜酸性粒细胞数',
 '嗜碱性粒细胞数',
 '总蛋白',
 '白蛋白',
 '球蛋白',
 '总胆红素',
 '直接胆红素',
 '钾',
 '钠',
 '钙',
 '尿素氮',
 '谷草谷丙',
 '甘油三酯',
 '高密度脂蛋白',
 '低密度脂蛋白',
 '红细胞平均体积',
 '凝血酶原时间',
 '国际标准化比值',
 '活化部分凝血活酶时间',
 '纤维蛋白原']

In [7]:
# filter by Chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
chi2_feature = SelectKBest(score_func=chi2, k='all')
chi2_selector = chi2_feature.fit(X_Var, Y)
chi2_selector.scores_
chi2_df = pd.DataFrame(np.arange(len(chi2_selector.scores_)), columns=['feature_idx'])
chi2_df['feature'] = list(X_Var.columns)
chi2_df['chi2_value'] = chi2_selector.scores_
sort_chi2 = chi2_df.sort_values(by='chi2_value', ascending=False).reset_index(drop=True)
print(sort_chi2)
X_Chi2_top10 = X_Var.iloc[:, sort_chi2[:10]['feature_idx'].to_list()]
X_Chi2_top14 = X_Var.iloc[:, sort_chi2[:14]['feature_idx'].to_list()]
print("\033[34m卡方选择前的结果:",cross_val_score(estimator=svm_clf,X=X_Var,y=Y,cv=5,scoring='accuracy').mean())
print("\033[34m卡方选择top14后的结果:",cross_val_score(estimator=svm_clf,X=X_Chi2_top10,y=Y,cv=5,scoring='accuracy').mean())
print("\033[34m卡方选择top10后的结果:",cross_val_score(estimator=svm_clf,X=X_Chi2_top14,y=Y,cv=5,scoring='accuracy').mean())

    feature_idx     feature   chi2_value
0             4         白蛋白  3607.793828
1             3         总蛋白  2232.444816
2             0          年龄  1620.557003
3             1      中性粒细胞数  1262.392571
4            17  活化部分凝血活酶时间   383.776717
5            11        谷草谷丙   246.632228
6            12        甘油三酯   196.700687
7             6        总胆红素   102.868027
8             7       直接胆红素    90.602259
9             2       淋巴细胞数    89.572173
10           14      低密度脂蛋白    48.415306
11           15     红细胞平均体积    43.963457
12           16      凝血酶原时间    43.712427
13            9           钠    41.472782
14           10         尿素氮    22.948253
15            8           钾    14.112563
16           18       纤维蛋白原     3.729141
17           13      高密度脂蛋白     0.603448
18            5         球蛋白     0.000358
[34m卡方选择前的结果: 0.7201008896057328
[34m卡方选择top14后的结果: 0.7238045203935901
[34m卡方选择top10后的结果: 0.7202619607103495


In [9]:
X_Chi2 = X_Var.iloc[:, sort_chi2[:14]['feature_idx'].to_list()]

In [32]:
# 相关系数筛选
corr = X_Chi2.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape),k=1).astype(np.bool_))
to_drop = [col for col in upper.columns if any(upper[col] > 0.8)]
print('deselect : ', to_drop)
upper

deselect :  ['总蛋白', '直接胆红素']


Unnamed: 0,白蛋白,总蛋白,年龄,中性粒细胞数,活化部分凝血活酶时间,谷草谷丙,甘油三酯,总胆红素,直接胆红素,淋巴细胞数,低密度脂蛋白,红细胞平均体积,凝血酶原时间,钠
白蛋白,,0.821454,0.06181,0.460003,0.188782,0.334556,0.006366,0.211114,0.020222,0.214614,0.046843,0.077524,0.120331,0.356134
总蛋白,,,0.129596,0.338535,0.166047,0.256663,0.048175,0.153805,0.030848,0.196337,0.095966,0.024375,0.097896,0.244804
年龄,,,,0.177365,0.131641,0.028208,0.138757,0.092833,0.104352,0.051793,0.075182,0.095411,0.169505,0.121082
中性粒细胞数,,,,,0.093932,0.147988,0.061344,0.111389,0.025697,0.038299,0.002683,0.054687,0.058327,0.351828
活化部分凝血活酶时间,,,,,,0.117652,0.150532,0.066922,0.032863,0.031075,0.061671,0.17434,0.740851,0.126529
谷草谷丙,,,,,,,0.086485,0.109115,0.038425,0.141833,0.036579,0.007635,0.077541,0.122886
甘油三酯,,,,,,,,0.085004,0.085252,0.064004,0.024384,0.064693,0.217243,0.092386
总胆红素,,,,,,,,,0.83941,0.006889,0.016982,0.049823,0.088564,0.004142
直接胆红素,,,,,,,,,,0.059185,0.083656,0.022396,0.070193,0.070869
淋巴细胞数,,,,,,,,,,,0.080059,0.025527,0.004276,0.131018


In [22]:
X_Corr = X_Chi2.drop(to_drop, axis=1)
print("Corr选择前的结果:",cross_val_score(estimator=svm_clf,X=X_Chi2,y=Y,cv=5,scoring='accuracy').mean())
print("Corr选择后的结果:",cross_val_score(estimator=svm_clf,X=X_Corr,y=Y,cv=5,scoring='accuracy').mean())

Corr选择前的结果: 0.7202619607103495
Corr选择后的结果: 0.7186114133067856


In [37]:
# 随机森林筛选
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_Corr,Y)
feature_importance = rf_clf.feature_importances_
print(feature_importance)
feature_importance_df = pd.DataFrame(feature_importance, columns=['importance'])
feature_importance_df['feature'] = list(X_Corr.columns)
feature_importance_df['featrue_idx'] = np.arange(feature_importance_df.shape[0])
feature_importance_df = feature_importance_df.sort_values(ascending=False, by='importance').reset_index(drop=True)
print(feature_importance_df)
X_forest_top1 = X_Corr.iloc[:, feature_importance_df[:1]['featrue_idx'].to_list()]
X_forest_top6 = X_Corr.iloc[:, feature_importance_df[:6]['featrue_idx'].to_list()]
X_forest_top9 = X_Corr.iloc[:, feature_importance_df[:9]['featrue_idx'].to_list()]
print("\033[34m随机森林选择前的结果:",cross_val_score(estimator=svm_clf,X=X_Corr,y=Y,cv=5,scoring='accuracy').mean())
print("\033[34m随机森林选择top1后的结果:",cross_val_score(estimator=svm_clf,X=X_forest_top1,y=Y,cv=5,scoring='accuracy').mean())
print("\033[34m随机森林选择top6后的结果:",cross_val_score(estimator=svm_clf,X=X_forest_top6,y=Y,cv=5,scoring='accuracy').mean())
print("\033[34m随机森林选择top9后的结果:",cross_val_score(estimator=svm_clf,X=X_forest_top9,y=Y,cv=5,scoring='accuracy').mean())

[0.16763199 0.07554009 0.08616053 0.09075083 0.04454144 0.07358761
 0.06446714 0.0882615  0.08970428 0.06743523 0.07320635 0.078713  ]
    importance     feature  featrue_idx
0     0.167632         白蛋白            0
1     0.090751  活化部分凝血活酶时间            3
2     0.089704      低密度脂蛋白            8
3     0.088261       淋巴细胞数            7
4     0.086161      中性粒细胞数            2
5     0.078713           钠           11
6     0.075540          年龄            1
7     0.073588        甘油三酯            5
8     0.073206      凝血酶原时间           10
9     0.067435     红细胞平均体积            9
10    0.064467        总胆红素            6
11    0.044541        谷草谷丙            4
[34m随机森林选择前的结果: 0.7186114133067856
[34m随机森林选择top1后的结果: 0.7138206801654963
[34m随机森林选择top6后的结果: 0.723844705126697
[34m随机森林选择top9后的结果: 0.7241669850658623


In [35]:
X_forest_top1 = X_Corr[:, feature_importance_df[:1]['featrue_idx'].to_list()]
# X_forest_top6 = X_Corr[:, feature_importance_df[:6]['featrue_idx'].to_list()]
# X_forest_top9 = X_Corr[:, feature_importance_df[:9]['featrue_idx'].to_list()]

[0]