In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which
# can execute arbitrary code -- only point these paths at trusted files.
data = np.load('../../data/oversampled_data.npy', allow_pickle=True)
gene = np.load('../../data/geneAfterDiscard_0.npy', allow_pickle=True)

# Every column except the last is a feature; the last column is the label.
feature, label = data[:, :-1], data[:, -1]

# Quick shape check, one shape per line.
print(gene.shape, feature.shape, label.shape, sep='\n')

(11959,)
(296, 11959)
(296,)


# 1 Calculate the importance of genes.
+ The feature importance of each gene is calculated using the maximal information coefficient (MIC).
+ The calculation is repeated 100 times on random train/test splits to mitigate the effect of randomness.

In [3]:
from minepy import MINE

def mic(x, y):
    """Return the MIC score between the samples ``x`` and ``y``.

    A fresh ``MINE`` estimator with default settings is created on every
    call, fed both samples through ``compute_score``, and its ``mic()``
    value is returned.
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    score = estimator.mic()
    return score


def gene_select(x, y):
    """Score every column of ``x`` against ``y`` with ``mic``.

    ``x`` is walked column by column (via its transpose) and each column is
    paired with ``y``.

    Returns:
        A NumPy array holding one score per column of ``x``, in column order.
    """
    scores = [mic(column, y) for column in x.T]
    return np.array(scores)

In [4]:
# Repeated experiments: score every feature column with MIC on many random
# 70/30 train/test splits so that no single split dominates the result.
runNum = 100
# Base seed for the splits. Run i uses baseSeed + i, so each run gets a
# different split while the whole experiment is reproducible on re-run.
baseSeed = 0

imp = []  # one array of per-column MIC scores per run
tic = time.perf_counter()  # monotonic clock, suited to measuring elapsed time

for i in range(runNum):
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, test_size=0.3, shuffle=True, random_state=baseSeed + i
    )
    # Only the training part is scored; the held-out part is not used here.
    imp.append(gene_select(X_train, y_train))
    print(i, imp[-1].sum(), imp[-1])

toc = time.perf_counter()
duration = toc-tic
print(f'duration:{duration}')

0 2873.2002587293364 [0.23821079 0.22200117 0.22250041 ... 0.27053736 0.2638531  0.24496671]
1 2860.442502112872 [0.23473665 0.24862866 0.24927856 ... 0.27330818 0.24240559 0.23539506]
2 2825.5596404965845 [0.29431976 0.27417728 0.21565954 ... 0.27824073 0.20121249 0.21519873]
3 2822.45491585958 [0.24647459 0.2507441  0.23314728 ... 0.24252579 0.18227697 0.24978633]
4 2878.1757996372344 [0.24139447 0.24910165 0.23218104 ... 0.27240196 0.26759099 0.26949847]
5 2830.544030009589 [0.25972194 0.28255852 0.22995845 ... 0.27029746 0.24845426 0.25739103]
6 2851.8731814125877 [0.26203174 0.26926052 0.22007653 ... 0.24923682 0.27190737 0.24072275]
7 2856.0835604223084 [0.24258991 0.21704726 0.24063357 ... 0.26443203 0.21337823 0.19568158]
8 2802.0651438196855 [0.20938685 0.22727107 0.21150657 ... 0.24333953 0.20120928 0.23267809]
9 2822.2423352277387 [0.25368031 0.24099192 0.21539854 ... 0.29801739 0.2309962  0.2459542 ]
10 2847.389699241293 [0.26813793 0.21956984 0.18795524 ... 0.25835751 0.23

In [5]:
# Normalise the per-run feature importances.
#
# The raw scores of different runs do not add up to the same total, so each
# run (one row of `imp`) is divided by its own sum. Every row then sums to 1
# and all runs carry equal weight.
imp = np.array(imp)

for run_idx, run_scores in enumerate(imp):
    imp[run_idx] = run_scores / run_scores.sum()

np.save("NormalizedImp100.npy", imp)