In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from MLFeatureSelection import sequence_selection, importance_selection, coherence_selection,tools

In [2]:
def lossfunction(y_pred, y_test):
    """Score predictions as accuracy expressed in percent.

    Parameters
    ----------
    y_pred : predicted labels.
    y_test : reference labels.

    Returns
    -------
    ``accuracy_score(y_test, y_pred)`` multiplied by 100.
    """
    accuracy = accuracy_score(y_test, y_pred)
    return 100 * accuracy


def validate(X, y, features, clf, lossfunction):
    """Score a feature subset by cross-validation and return a fitted estimator.

    Parameters
    ----------
    X : DataFrame that contains (at least) the columns named in ``features``.
    y : target values aligned with the rows of ``X``.
    features : list of column names to evaluate.
    clf : estimator to evaluate; it is fitted in place.
    lossfunction : part of the five-parameter validation signature, but NOT
        used here: the score comes from ``cross_val_score`` with its default
        scoring, not from this callable.

    Returns
    -------
    (score, clf)
        ``score`` is the mean of the 5-fold ``cross_val_score`` results on
        ``X[features]``; ``clf`` is the same estimator object, fitted on a
        fixed (``random_state=42``) 80% training split.
    """
    selected = X[features]

    # Only the training part of the split is needed; the held-out part is
    # discarded because scoring is done by cross-validation below.
    train_X, _, train_y, _ = train_test_split(selected,
                                              y,
                                              test_size=0.2,
                                              random_state=42,
                                              shuffle=True)

    # cross_val_score works on clones of the estimator, so `clf` itself has to
    # be fitted explicitly for the caller to receive a trained model.
    clf.fit(train_X, train_y)

    fold_scores = cross_val_score(clf, selected, y, cv=5)
    return fold_scores.mean(), clf


def add(x, y):
    """Return ``x + y`` (the '+' candidate for ``CrossMethod``)."""
    total = x + y
    return total


def substract(x, y):
    """Return ``x - y`` (the '-' candidate for ``CrossMethod``)."""
    difference = x - y
    return difference


def times(x, y):
    """Return ``x * y`` (the '*' entry of ``CrossMethod``)."""
    product = x * y
    return product


def divide(x, y):
    """Return the ratio of ``x`` to ``y`` after adding 0.001 to each.

    The same constant is added to numerator and denominator, so an exact
    zero in ``y`` does not produce a division by zero.
    """
    offset = 0.001
    numerator = x + offset
    denominator = y + offset
    return numerator / denominator


def sq(x, y):
    """Return ``x`` squared.

    ``y`` is accepted so the function has the same two-argument signature as
    the other cross operators, but it is not used.
    """
    return pow(x, 2)


# Cross operators handed to the sequence selector: each key is the operator
# symbol and each value is the two-argument function implementing it.
# Only multiplication is active; the other operators are kept here, disabled.
CrossMethod = {
    '*': times,
    # '+': add,
    # '-': substract,
    # '/': divide,
    # '^': sq,
}


def seq(df, f, notusable, estimator):
    """Configure and run the sequence-based selector.

    Parameters
    ----------
    df : DataFrame imported into the selector; its target column is 'label'.
    f : list of feature names used as the initial combination.
    notusable : list of column names declared non-trainable.
    estimator : model assigned to the selector as ``clf``.

    Returns
    -------
    Whatever ``Select.run(validate)`` returns (the callers in this file feed
    it to the next stage as a feature list).
    """
    # Sequence search with feature crossing enabled; random search disabled.
    selector = sequence_selection.Select(Sequence=True,
                                         Random=False,
                                         Cross=True)

    # Data set and the name of its label column.
    selector.ImportDF(df, label='label')

    # Loss function together with the direction that counts as improvement.
    selector.ImportLossFunction(lossfunction, direction='ascend')

    # Dictionary of operators used to cross features.
    selector.ImportCrossMethod(CrossMethod)

    # Columns that must not be trained on, then the starting feature set.
    selector.InitialNonTrainableFeatures(notusable)
    selector.InitialFeatures(f)

    # Generate the list of candidate features.
    selector.GenerateCol()

    selector.clf = estimator
    selector.SetLogFile('record_seq.log')
    return selector.run(validate)


def imp(df, f, estimator):
    """Configure and run the importance-based selector.

    Parameters
    ----------
    df : DataFrame imported into the selector; its target column is 'label'.
    f : list of feature names used as the initial combination.
    estimator : model assigned to the selector as ``clf``.

    Returns
    -------
    Whatever ``Select.run(validate)`` returns (the callers in this file feed
    it to the next stage as a feature list).
    """
    selector = importance_selection.Select()

    # Data set and the name of its label column.
    selector.ImportDF(df, label='label')

    # Loss function together with the direction that counts as improvement.
    selector.ImportLossFunction(lossfunction, direction='ascend')

    # Starting feature set.
    selector.InitialFeatures(f)

    # Number of features removed per iteration.
    selector.SelectRemoveMode(batch=1)

    selector.clf = estimator
    selector.SetLogFile('record_imp.log')
    return selector.run(validate)


def coh(df, f, estimator):
    """Configure and run the coherence-based selector.

    Parameters
    ----------
    df : DataFrame imported into the selector; its target column is 'label'.
    f : list of feature names used as the initial combination.
    estimator : model assigned to the selector as ``clf``.

    Returns
    -------
    Whatever ``Select.run(validate)`` returns (``run`` in this file returns
    it as the final feature list).
    """
    selector = coherence_selection.Select()

    # Data set and the name of its label column.
    selector.ImportDF(df, label='label')

    # Loss function together with the direction that counts as improvement.
    selector.ImportLossFunction(lossfunction, direction='ascend')

    # Starting feature set.
    selector.InitialFeatures(f)

    # Number of features removed per iteration and the selection threshold.
    selector.SelectRemoveMode(batch=1, lowerbound=0.5)

    selector.clf = estimator
    selector.SetLogFile('record_coh.log')
    return selector.run(validate)


def run(df, bf, random_state=42):
    """Run sequence, importance and coherence selection one after another.

    Each stage starts from the result of the previous one; all three share
    the same random-forest estimator.

    Parameters
    ----------
    df : DataFrame with the candidate features and a 'label' column.
    bf : initial feature combination (iterable of column names). It is
        copied, so the caller's object is not modified by this function.
    random_state : seed passed to ``RandomForestClassifier``. A fixed seed
        keeps scores for the same feature combination repeatable between
        stages and between runs; pass ``None`` for the previous, unseeded
        behaviour.

    Returns
    -------
    The value returned by the final (coherence) stage.
    """
    notusable = ['label']  # columns that must never be used for training

    clf = RandomForestClassifier(n_estimators=800,
                                 max_leaf_nodes=64,
                                 n_jobs=-1,
                                 random_state=random_state)

    uf = list(bf)  # work on a copy of the initial combination

    print('sequence selection')
    uf = seq(df, uf, notusable, clf)
    print('importance selection')
    uf = imp(df, uf, clf)
    print('coherence selection')
    uf = coh(df, uf, clf)
    return uf


# Load the data set and drop the two columns that are not used afterwards.
df = pd.read_csv('newnewnew.csv')
df = df.drop(columns=['name', 'word count'])

# Standardise the feature columns only. The 'label' column is the
# classification target used by the selectors and must keep its original
# class values, so it is excluded from scaling.
feature_cols = [col for col in df.columns if col != 'label']
if feature_cols:
    # scale() standardises each column (axis=0) and returns a float array;
    # assigning by column name replaces the columns instead of writing floats
    # into possibly integer-typed columns through .iloc.
    df[feature_cols] = preprocessing.scale(df[feature_cols])

In [4]:
# NOTE: `bf1` is assigned in the cell labelled In [3], which appears after this
# cell in the file; that cell has to be executed before this call.
bf = run(df=df, bf=bf1)

sequence selection
Features Quantity Limit: inf
Time Limit: inf min(s)
100000000
test performance of initial features combination
Mean loss: 0.7894
--------------------start greedy--------------------
word ratio
GRF_8
WC_2
GRF_3
WC_7
base_10
base_9
WC_9
WC_22
base_13
******************** 11 round ********************
F_feature
0/42
Mean loss: 0.7846
WC_1
1/42
Mean loss: 0.7862000000000001
WC_3
2/42
Mean loss: 0.7858
WC_4
3/42
Mean loss: 0.7876
WC_5
4/42
Mean loss: 0.7852
WC_6
5/42
Mean loss: 0.7868
WC_8
6/42
Mean loss: 0.7872000000000001
WC_10
7/42
Mean loss: 0.7849999999999999
WC_11
8/42
Mean loss: 0.7858
WC_12
9/42
Mean loss: 0.7858
WC_13
10/42
Mean loss: 0.7868
WC_14
11/42
Mean loss: 0.788
WC_15
12/42
Mean loss: 0.7847999999999999
WC_16
13/42
Mean loss: 0.7862
WC_17
14/42
Mean loss: 0.7832
WC_18
15/42
Mean loss: 0.7876000000000001
WC_19
16/42
Mean loss: 0.7878000000000001
WC_20
17/42
Mean loss: 0.7882
WC_21
18/42
Mean loss: 0.7854000000000001
WC_23
19/42
Mean loss: 0.787800000000000

Mean loss: 0.7858
base_15
41/43
Mean loss: 0.7864000000000001
base_16
42/43
Mean loss: 0.7858
word ratio
reverse 0/9
Mean loss: 0.7424
GRF_8
reverse 1/9
Mean loss: 0.7856
WC_2
reverse 2/9
Mean loss: 0.7862
GRF_3
reverse 3/9
Mean loss: 0.7708
WC_7
reverse 4/9
Mean loss: 0.7852
base_10
reverse 5/9
Mean loss: 0.7876000000000001
base_9
reverse 6/9
Mean loss: 0.7848
WC_9
reverse 7/9
Mean loss: 0.7868
base_13
reverse 8/9
Mean loss: 0.7891999999999999
--------------------complete greedy--------------------
random select starts with:
 ['word ratio', 'GRF_8', 'WC_2', 'GRF_3', 'WC_7', 'base_10', 'base_9', 'WC_9', 'base_13', '(base_10*WC_9)']
 score: 0.7908
*-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-*
best score:0.7908
best features combination: ['word ratio', 'GRF_8', 'WC_2', 'GRF_3', 'WC_7', 'base_10', 'base_9', 'WC_9', 'base_13', '(base_10*WC_9)']
importance selection
Features Quantity Lim

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  df.ix[i,i] = 0
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  tempdelete = t[t.abs().max() == t.abs().max().max()].abs().sum(axis = 1).argmax()


remove features: ['WC_9']
Mean loss: 0.7874000000000001
Delete base_10 with coherence 0.5501168885032616


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  tempdelete = t[t.abs().max() == t.abs().max().max()].abs().sum(axis = 1).argmax()


remove features: ['base_10']
Mean loss: 0.7866
Delete GRF_8 with coherence 0.515786852168539


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  tempdelete = t[t.abs().max() == t.abs().max().max()].abs().sum(axis = 1).argmax()


remove features: ['GRF_8']
Mean loss: 0.7854000000000001
*-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-**-*
best score:0.7894000000000001
best features combination: ['word ratio', 'GRF_8', 'WC_2', 'GRF_3', 'WC_7', 'base_10', 'base_9', 'WC_9', 'base_13', '(base_10*WC_9)']


1. ['bias', 'word ratio', 'base_15', 'GRF_8', 'WC_2', 'GRF_1']
2. ['word ratio', 'GRF_8', 'WC_2', 'GRF_3', 'WC_7', 'base_10', 'base_9', 'WC_9', 'WC_22', 'base_13', '(word ratio-base_10)', '(GRF_8^WC_2)', '(GRF_3\*base_9)', '(GRF_3-WC_2)', '(GRF_3\*WC_2)', '(WC_7-WC_22)']
3. ['word ratio', 'GRF_8', 'WC_2', 'GRF_3', 'WC_7', 'base_10', 'base_9', 'WC_9', 'base_13', '(base_10\*WC_9)']

In [3]:
# Initial feature combination passed to run() as its starting point.
bf1 = [
    'word ratio',
    'GRF_8',
    'WC_2',
    'GRF_3',
    'WC_7',
    'base_10',
    'base_9',
    'WC_9',
    'WC_22',
    'base_13',
]