In [1]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_digits
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from pymfe.mfe import MFE
import pandas as pd
import numpy as np

In [2]:
# Header of the meta-dataset. The values between "name" and "algorithm" must
# line up positionally with the meta-feature values that test() places between
# the dataset name and the label.
_meta_feature_names = (
    "can_cor.mean", "can_cor.sd",
    "cor.mean", "cor.sd",
    "cov.mean", "cov.sd",
    "eigenvalues.mean", "eigenvalues.sd",
    "g_mean.mean", "g_mean.sd",
    "gravity",
    "h_mean.mean", "h_mean.sd",
    "iq_range.mean", "iq_range.sd",
    "kurtosis.mean", "kurtosis.sd",
    "lh_trace",
    "mad.mean", "mad.sd",
    "max.mean", "max.sd",
    "mean.mean", "mean.sd",
    "median.mean", "median.sd",
    "min.mean", "min.sd",
    "nr_cor_attr",
    "nr_disc",
    "nr_norm",
    "nr_outliers",
    "p_trace",
    "range.mean", "range.sd",
    "roy_root",
    "sd.mean", "sd.sd",
    "sd_ratio",
    "skewness.mean", "skewness.sd",
    "sparsity.mean", "sparsity.sd",
    "t_mean.mean", "t_mean.sd",
    "var.mean", "var.sd",
    "w_lambda",
)

# First column identifies the dataset, last column is the label to learn.
columns = ["name", *_meta_feature_names, "algorithm"]

# Fetch every benchmark as a (features, labels) pair, in the same order as the
# loaders are listed.
(iris_x, iris_y), (wine_x, wine_y), (cancer_x, cancer_y), (digits_x, digits_y) = (
    loader(return_X_y=True)
    for loader in (load_iris, load_wine, load_breast_cancer, load_digits)
)

In [3]:
def test(x, y, data_name):
    """Build one meta-dataset row for the dataset ``(x, y)``.

    The row holds the dataset name, the statistical meta-features extracted
    from the whole dataset, and a label saying which of two classifiers
    scored higher on a held-out split.

    Parameters
    ----------
    x : feature matrix handed to the extractor and to ``train_test_split``.
    y : target vector matching ``x``.
    data_name : value stored as the first element of the returned row.

    Returns
    -------
    list
        ``[data_name, <meta-feature values...>, label]`` where ``label`` is
        1 if the Perceptron scored strictly higher than the random forest on
        the test split and 0 otherwise (ties count as 0).
    """
    # Meta-features are computed on the full dataset, before any splitting.
    extractor = MFE(groups=["statistical"])
    extractor.fit(x, y)
    extracted = extractor.extract()
    meta_values = extracted[1]  # second element carries the numeric values

    # Fixed seed so both classifiers are compared on the same 67/33 split.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42
    )

    perceptron = Perceptron(tol=1e-3, random_state=0)
    perceptron_score = perceptron.fit(X_train, y_train).score(X_test, y_test)

    forest = RandomForestClassifier(max_depth=2, random_state=0)
    forest_score = forest.fit(X_train, y_train).score(X_test, y_test)

    winner = 1 if perceptron_score > forest_score else 0

    return [data_name] + meta_values + [winner]

In [4]:
# One meta-dataset row per benchmark used to train the meta-learner.
linhas = [
    test(features, labels, title)
    for features, labels, title in (
        (wine_x, wine_y, "wine"),
        (digits_x, digits_y, "digits"),
    )
]

# Missing meta-feature values are replaced by 0 so the frame can be fed to a
# classifier.
df = pd.DataFrame(linhas, columns=columns).fillna(0)
df

  c /= stddev[:, None]
  c /= stddev[None, :]
  np.log(np.linalg.det(S_i)) for S_i in sample_cov_matrices
  * np.log(np.linalg.det(pooled_cov_mat))
  (num_inst - num_classes)


Unnamed: 0,name,can_cor.mean,can_cor.sd,cor.mean,cor.sd,cov.mean,cov.sd,eigenvalues.mean,eigenvalues.sd,g_mean.mean,...,skewness.mean,skewness.sd,sparsity.mean,sparsity.sd,t_mean.mean,t_mean.sd,var.mean,var.sd,w_lambda,algorithm
0,wine,0.923167,0.036686,0.304957,0.190846,40.570766,208.403872,7645.500384,27509.281977,64.214526,...,0.344289,0.465443,0.006197,0.005509,65.071108,191.571123,7645.500384,27498.76029,0.019341,0
1,digits,0.802985,0.119817,0.0,0.0,2.233279,3.627786,18.783558,37.195949,0.0,...,4.307087,9.140055,0.141928,0.21901,4.733941,4.730131,18.783558,14.915131,1.8e-05,1


In [5]:
# Feature matrix: every column except the dataset name and the label.
x = df.drop(columns=["algorithm", "name"]).values

# Target vector: the `algorithm` label column.
y = df["algorithm"].values

print(x, y, sep="\n")

[[ 9.23166935e-01  3.66856864e-02  3.04957481e-01  1.90846486e-01
   4.05707659e+01  2.08403872e+02  7.64550038e+03  2.75092820e+04
   6.42145256e+01  1.88590229e+02  1.10610641e+02  5.97797835e+01
   1.73514597e+02  3.99298077e+01  1.33669884e+02 -6.02757289e-02
   8.98575345e-01  1.32097812e+01  2.50798897e+01  8.27635927e+01
   1.48290000e+02  4.62257236e+02  6.91336629e+01  2.05400096e+02
   6.33150000e+01  1.85223937e+02  2.89707692e+01  7.71979003e+01
   1.79487179e-01  2.00000000e+00  1.00000000e+00  7.00000000e+00
   1.70582022e+00  1.19319231e+02  3.86177141e+02  9.08118367e+00
   2.61777852e+01  8.68345703e+01  1.36246975e+00  3.44288886e-01
   4.65443485e-01  6.19675720e-03  5.50929953e-03  6.50711083e+01
   1.91571123e+02  7.64550038e+03  2.74987603e+04  1.93414866e-02]
 [ 8.02984551e-01  1.19816690e-01  0.00000000e+00  0.00000000e+00
   2.23327927e+00  3.62778561e+00  1.87835580e+01  3.71959495e+01
   0.00000000e+00  0.00000000e+00  2.55114618e+01  0.00000000e+00
   0.0000

In [6]:
# Meta-learner: maps a dataset's meta-features to the `algorithm` label.
clf3 = RandomForestClassifier(max_depth=2, random_state=0).fit(x, y)
clf3

RandomForestClassifier(max_depth=2, random_state=0)

In [7]:
# Describe iris (not part of the meta-training rows) with the same
# statistical meta-feature group used for the training datasets.
iris_x, iris_y = load_iris(return_X_y=True)

mfe = MFE(groups=["statistical"])
mfe.fit(iris_x, iris_y)

# `ft` stays bound because the prediction cell reads ft[1].
ft = mfe.extract()
iris_meta_values = ft[1]
print(iris_meta_values)

[0.7280089563896477, 0.363186923364524, 0.594116025760156, 0.3375443182856702, 0.5966542132736764, 0.5582672431248461, 1.1432392617449665, 2.058771301506975, 3.2230731578977903, 2.0229431040263726, 3.2082811597489393, 2.9783891110628673, 2.145948231748242, 1.7000000000000002, 1.2754084313139324, -0.8105361276250795, 0.7326910069728161, 32.477316568193444, 1.0934175, 0.5785781994035033, 5.425000000000001, 2.4431878083083722, 3.4645000000000006, 1.918485079431164, 3.6125000000000003, 1.919364043982624, 1.8499999999999999, 1.8083141320025125, 0.5, 2, 1.0, 1, 1.1918988224700764, 3.5750000000000006, 1.6500000000000001, 32.191925524309035, 0.9478670787835934, 0.5712994109375844, 1.2708666438750906, 0.06273198447775732, 0.29439896290757683, 0.0287147773948895, 0.011032357470087495, 3.4705555555555554, 1.9048021402275979, 1.1432392617449665, 1.3325463926454557, 0.0234386332222684]


In [8]:
# predict() takes a batch, so the single meta-feature vector is wrapped in a list.
iris_meta_batch = [ft[1]]
clf3.predict(iris_meta_batch)

array([1], dtype=int64)