In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from minepy import MINE
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
X = np.loadtxt('feature_selection_X.txt',delimiter='\t')
y = np.loadtxt('feature_selection_Y.txt',delimiter='\t')

data = train_test_split(X, y, test_size=100, random_state=14)

In [3]:
def MIC(X, y):
    m = MINE()
    m.compute_score(X, y)
    return (m.mic(), 0.5)

In [4]:
def Fisher(X, y):
    benign_idx = list()
    malign_idx = list()
    fisher_scores = list()
    
    for i in range(len(y)):
        if y[i]==0:
            benign_idx.append(i)
        elif y[i]==1:
            malign_idx.append(i)
        else:
            raise ValueError('Unallowed Label.')
    
    X_benign = X[benign_idx,:]
    X_malign = X[malign_idx,:]
    
    feature_num = X.shape[0]
    for i in range(feature_num):
        mu1 = np.mean(X_benign[:,i])
        mu2 = np.mean(X_malign[:,i])
        s1_sq = 0
        s2_sq = 0
        for x in X_benign[:,i]:
            s1_sq += (x - mu1)**2
        for x in X_malign[:,i]:
            s2_sq += (x - mu2)**2
        score = (mu1-mu2)**2/(s1_sq + s2_sq)
        fisher_scores.append(score)   

    return np.asarray(fisher_scores)

In [5]:
def MICFeatSel(feature_num, data):
    X_train, X_test, y_train, y_test = data
    sel = SelectKBest(lambda X, y: tuple(map(tuple,np.array(list(map(lambda x:MIC(x,y), X.T))).T)), k=feature_num)
    X_train_new = sel.fit_transform(X_train,y_train)
    X_test_new = sel.transform(X_test)
    
    scores = sel.scores_
    sel_idxs = scores.argsort()[-feature_num:][::-1]
    sel_idxs.sort()
    
    return (X_train_new, X_test_new, y_train, y_test), sel_idxs

In [6]:
def FisFeatSel(feature_num, data):
    X_train, X_test, y_train, y_test = data
    scores = Fisher(X_train,y_train)
    sel_idxs = scores.argsort()[-feature_num:][::-1]
    sel_idxs.sort()
    
    X_train_new = X_train[:,sel_idxs]
    X_test_new = X_test[:,sel_idxs]
    
    return (X_train_new, X_test_new, y_train, y_test), sel_idxs

In [7]:
def LogReg(data):
    X_train, X_test, y_train, y_test = data
    
    model = LogisticRegression(penalty='none', random_state=14).fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    conf = confusion_matrix(y_test, y_pred)

    print('accuracy:',acc)
    print('confusion matrix:', conf.ravel())

In [8]:
def Filter_Methods(feature_num, data):
    print('\nk=',feature_num)
    
    data_MIC, score_MIC = MICFeatSel(feature_num=feature_num, data=data)
    print('Feature Selection Using MIC:')
    LogReg(data_MIC)
    
    data_Fis, score_Fis = FisFeatSel(feature_num=feature_num, data=data)
    print('Feature Selection Using Fisher:')
    LogReg(data_Fis)
    
    common_feats = set(score_MIC) & set (score_Fis)
    print('Common Feature Index:',common_feats)
    print('Common Feature Counts:',len(common_feats))
    
    return score_MIC, score_Fis

In [9]:
print('Without Feature Selection:')
LogReg(data)

score_MIC_1, score_Fis_1 = Filter_Methods(1,data)
score_MIC_5, score_Fis_5 = Filter_Methods(5,data)
score_MIC_10, score_Fis_10 = Filter_Methods(10,data)
score_MIC_20, score_Fis_20 = Filter_Methods(20,data)
score_MIC_50, score_Fis_50 = Filter_Methods(50,data)
score_MIC_100, score_Fis_100 = Filter_Methods(100,data)

Without Feature Selection:
accuracy: 0.86
confusion matrix: [37  9  5 49]

k= 1
Feature Selection Using MIC:
accuracy: 0.92
confusion matrix: [39  7  1 53]
Feature Selection Using Fisher:
accuracy: 0.92
confusion matrix: [39  7  1 53]
Common Feature Index: {47}
Common Feature Counts: 1

k= 5
Feature Selection Using MIC:
accuracy: 0.95
confusion matrix: [45  1  4 50]
Feature Selection Using Fisher:
accuracy: 0.95
confusion matrix: [44  2  3 51]
Common Feature Index: {219, 4, 47}
Common Feature Counts: 3

k= 10
Feature Selection Using MIC:
accuracy: 0.95
confusion matrix: [45  1  4 50]
Feature Selection Using Fisher:
accuracy: 0.92
confusion matrix: [44  2  6 48]
Common Feature Index: {219, 4, 47}
Common Feature Counts: 3

k= 20
Feature Selection Using MIC:
accuracy: 0.95
confusion matrix: [45  1  4 50]
Feature Selection Using Fisher:
accuracy: 0.91
confusion matrix: [44  2  7 47]
Common Feature Index: {4, 38, 47, 15, 219}
Common Feature Counts: 5

k= 50
Feature Selection Using MIC:
accu

In [10]:
def SFSFeatSel(feature_num, data):
    X_train, X_test, y_train, y_test = data
    est = LogisticRegression(penalty='none', max_iter=5000, random_state=14)
    model = SFS(estimator=est, k_features=feature_num)
    model.fit_transform(X_train, y_train)
    
    return model.k_feature_idx_

In [11]:
score_SFS_1 = SFSFeatSel(1, data)

In [12]:
def ComFeat(feature_num, score_SFS, score_MIC, score_Fis):
    common_feats_MIC = set(score_MIC) & set(score_SFS)
    common_feats_Fis = set (score_Fis) & set(score_SFS)
    common_feats_all = set(score_MIC) & set (score_Fis) & set(score_SFS)
    print('\nk=',feature_num)
    
    print('Common Feature Index for MIC:',common_feats_MIC)
    print('Common Feature Counts for MIC:',len(common_feats_MIC))
    
    print('Common Feature Index for Fisher:',common_feats_Fis)
    print('Common Feature Counts for Fisher:',len(common_feats_Fis))
    
    print('Common Feature Index for All:',common_feats_all)
    print('Common Feature Counts for All:',len(common_feats_all))

In [13]:
ComFeat(1, score_SFS_1, score_MIC_1, score_Fis_1)


k= 1
Common Feature Index for MIC: {47}
Common Feature Counts for MIC: 1
Common Feature Index for Fisher: {47}
Common Feature Counts for Fisher: 1
Common Feature Index for All: {47}
Common Feature Counts for All: 1


In [14]:
score_SFS_5 = SFSFeatSel(5, data)
ComFeat(5, score_SFS_5, score_MIC_5, score_Fis_5)


k= 5
Common Feature Index for MIC: {916, 219, 4, 47}
Common Feature Counts for MIC: 4
Common Feature Index for Fisher: {219, 4, 47}
Common Feature Counts for Fisher: 3
Common Feature Index for All: {219, 4, 47}
Common Feature Counts for All: 3


In [16]:
score_SFS_10 = SFSFeatSel(10, data)
ComFeat(10, score_SFS_10, score_MIC_10, score_Fis_10)


k= 10
Common Feature Index for MIC: {916, 219, 4, 47}
Common Feature Counts for MIC: 4
Common Feature Index for Fisher: {219, 4, 47}
Common Feature Counts for Fisher: 3
Common Feature Index for All: {219, 4, 47}
Common Feature Counts for All: 3


In [17]:
score_SFS_20 = SFSFeatSel(20, data)
ComFeat(20, score_SFS_20, score_MIC_20, score_Fis_20)


k= 20
Common Feature Index for MIC: {291, 4, 47, 916, 219}
Common Feature Counts for MIC: 5
Common Feature Index for Fisher: {219, 4, 46, 47}
Common Feature Counts for Fisher: 4
Common Feature Index for All: {219, 4, 47}
Common Feature Counts for All: 3


In [18]:
score_SFS_50 = SFSFeatSel(50, data)
ComFeat(50, score_SFS_50, score_MIC_50, score_Fis_50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist


k= 50
Common Feature Index for MIC: {291, 4, 356, 39, 15, 47, 210, 916, 219}
Common Feature Counts for MIC: 9
Common Feature Index for Fisher: {291, 4, 37, 39, 46, 15, 47, 115, 53, 26, 219}
Common Feature Counts for Fisher: 11
Common Feature Index for All: {291, 4, 39, 47, 15, 219}
Common Feature Counts for All: 6



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [19]:
score_SFS_100 = SFSFeatSel(100, data)
ComFeat(100, score_SFS_100, score_MIC_100, score_Fis_100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist


k= 100
Common Feature Index for MIC: {35, 4, 291, 99, 39, 219, 15, 47, 110, 210, 916, 118, 26, 411}
Common Feature Counts for MIC: 14
Common Feature Index for Fisher: {224, 291, 4, 37, 39, 46, 15, 47, 210, 115, 53, 26, 219}
Common Feature Counts for Fisher: 13
Common Feature Index for All: {291, 4, 39, 47, 15, 210, 26, 219}
Common Feature Counts for All: 8



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [20]:
def DecTree(data):
    X_train, X_test, y_train, y_test = data
    model = DecisionTreeClassifier(random_state=14)
    model.fit(X_train,y_train)
    
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    conf = confusion_matrix(y_test, y_pred)
    
    print('accuracy:',acc)
    print('confusion matrix:', conf.ravel())
    
    scores = model.feature_importances_
    return scores

In [21]:
def ComFeatAll(feature_num, score_Dec, score_SFS, score_MIC, score_Fis):
    sel_idxs = score_Dec.argsort()[-feature_num:][::-1]
    sel_idxs.sort()

    common_feats_MIC = set(score_MIC) & set(sel_idxs)
    common_feats_Fis = set(score_Fis) & set(sel_idxs)
    common_feats_SFS = set(score_SFS) & set(sel_idxs)
    
    common_feats_all = set(score_MIC) & set (score_Fis) & set(score_SFS) & set(sel_idxs)
    
    print('\nk=',feature_num)
    
    print('Common Feature Index for MIC:',common_feats_MIC)
    print('Common Feature Counts for MIC:',len(common_feats_MIC))
    
    print('Common Feature Index for Fisher:',common_feats_Fis)
    print('Common Feature Counts for Fisher:',len(common_feats_Fis))
    
    print('Common Feature Index for SFS:',common_feats_SFS)
    print('Common Feature Counts for SFS:',len(common_feats_SFS))
    
    print('Common Feature Index for All:',common_feats_all)
    print('Common Feature Counts for All:',len(common_feats_all))

In [22]:
score_Dec = DecTree(data)
ComFeatAll(1, score_Dec, score_SFS_1, score_MIC_1, score_Fis_1)
ComFeatAll(5, score_Dec, score_SFS_5, score_MIC_5, score_Fis_5)
ComFeatAll(10, score_Dec, score_SFS_10, score_MIC_10, score_Fis_10)
ComFeatAll(20, score_Dec, score_SFS_20, score_MIC_20, score_Fis_20)
ComFeatAll(50, score_Dec, score_SFS_50, score_MIC_50, score_Fis_50)
ComFeatAll(100, score_Dec, score_SFS_100, score_MIC_100, score_Fis_100)

accuracy: 0.93
confusion matrix: [44  2  5 49]

k= 1
Common Feature Index for MIC: {47}
Common Feature Counts for MIC: 1
Common Feature Index for Fisher: {47}
Common Feature Counts for Fisher: 1
Common Feature Index for SFS: {47}
Common Feature Counts for SFS: 1
Common Feature Index for All: {47}
Common Feature Counts for All: 1

k= 5
Common Feature Index for MIC: {916, 4, 47}
Common Feature Counts for MIC: 3
Common Feature Index for Fisher: {4, 47}
Common Feature Counts for Fisher: 2
Common Feature Index for SFS: {916, 4, 47}
Common Feature Counts for SFS: 3
Common Feature Index for All: {4, 47}
Common Feature Counts for All: 2

k= 10
Common Feature Index for MIC: {916, 4, 47}
Common Feature Counts for MIC: 3
Common Feature Index for Fisher: {4, 47}
Common Feature Counts for Fisher: 2
Common Feature Index for SFS: {916, 4, 47}
Common Feature Counts for SFS: 3
Common Feature Index for All: {4, 47}
Common Feature Counts for All: 2

k= 20
Common Feature Index for MIC: {916, 4, 47}
Common