In [95]:
from pymfe.mfe import MFE
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import os
import openml
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import time
from sklearn.metrics.pairwise import cosine_similarity
from scipy import stats
from sklearn.datasets import load_iris

In [162]:
class dataset_OpenML(object):
    """
    Create an OpenML specific version of the dataset class.
    """

    def __init__(self, data_id):
        # read dataset
        self.X, self.y, self.name = self.__get_data(data_id)

    def __get_data(self, data_id):
        # Retrieve dataset from OpenMl
        dataset = openml.datasets.get_dataset(data_id)
        X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute
        )
        X = self.impute_missing(X, categorical_indicator)

        return X, y, dataset.name

    def impute_missing(self, X, categorical_indicator):
        categorical_features = list(X.columns[categorical_indicator])
        numeric_features = list(X.columns[~np.array(categorical_indicator)])
        X = X.dropna(axis=1, how='all')
        
        categorical_features = [ft for ft in categorical_features if ft in X.columns]
        numeric_features = [ft for ft in numeric_features if ft in X.columns]
        
        object_cols = {col: 'object' for col in categorical_features}
        X = X.astype(object_cols)
        
        dtypes = dict(X.dtypes)

        numeric_transformer = SimpleImputer(strategy="mean")
        categorical_transformer = SimpleImputer(strategy="constant", fill_value="missing")
        type_specific_preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ]
        )
        prepr_np = type_specific_preprocessor.fit_transform(X)
        prepr_df = pd.DataFrame(prepr_np, columns=numeric_features + categorical_features)
        prepr_df = prepr_df.astype(dtypes)
        return prepr_df

    def get_arrays(self):
        return np.asarray(self.X), np.asarray(self.y)

In [27]:
dataset = openml.datasets.get_dataset(40996)
X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)

False

In [96]:
openml_df = openml.datasets.list_datasets(output_format="dataframe")
openml_df[openml_df['name'] == 'cjs']


Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
473,473,cjs,1,2,active,ARFF,680.0,57.0,274.0,6.0,35.0,2796.0,2795.0,68100.0,32.0,3.0
1024,1024,cjs,2,2,active,ARFF,2116.0,57.0,680.0,2.0,35.0,2796.0,2795.0,68100.0,32.0,3.0
23380,23380,cjs,3,2,active,ARFF,680.0,57.0,274.0,6.0,35.0,2796.0,2795.0,68100.0,32.0,3.0


In [148]:
sum(~X['TBG'].isna())

0

In [132]:
dataset = openml.datasets.get_dataset(38)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute)

In [151]:
X.shape
X.dropna(axis=1, how='all').shape

(3772, 28)

In [165]:
dat = dataset_OpenML(38)
X, y = dat.get_arrays()
X.shape

(3772, 28)

In [13]:
import os
from MFE.extract_features import extract_features_OpenML
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import argparse
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [126]:
openml_dataset = 24
rootdir = os.path.curdir
log_dir = os.path.join(rootdir, "D2V/checkpoints", f"split-{0}")
# configuration = json.load(open(os.path.join(log_dir, "configuration.txt"), "r"))
#
# metafeatures = pd.DataFrame(data=None)
# datasetmf = []

# batch = Batch(configuration['batch_size'])
# dataset = Dataset_OpenML(openml_dataset)
# testsampler = TestSampling(dataset=dataset)
#
# model = Dataset2VecModel(configuration)
#
# model.load_weights(os.path.join(log_dir, "weights"), by_name=False, skip_mismatch=False)
#
# for q in range(10):  # any number of samples
#     batch = testsampler.sample_from_one_dataset(batch)
#     batch.collect()
#     datasetmf.append(model(batch.input).numpy())
#
# metafeatures = np.vstack(datasetmf).mean(axis=0)[None]
# mf_df = pd.DataFrame(metafeatures, index=[openml_dataset])
# mf_df.insert(0, "dataset_name", value=dataset.name)

# Process MFE features
name, metafeatures_mfe = extract_features_OpenML(openml_dataset)

mfe_extracted = True if isinstance(metafeatures_mfe, pd.DataFrame) else False
if mfe_extracted:
    metafeatures_mfe.insert(0, "dataset_name", value=name)
    metafeatures_mfe = metafeatures_mfe.loc[openml_dataset]

    
extracted_mf = pd.read_csv("extracted_MF/OpenML-CC18_mfe.csv", index_col=0)
if openml_dataset in extracted_mf.index:
    print("Input index already in features")

else:
    extracted_mf = extracted_mf.append(metafeatures_mfe)
    

to_be_scaled_df = extracted_mf.iloc[:, 1:]
to_be_scaled_df = to_be_scaled_df.loc[:,~np.isinf(to_be_scaled_df).any()]
min_max_scaler = MinMaxScaler()
mf_scaled = min_max_scaler.fit_transform(to_be_scaled_df)
mf_scaled = pd.DataFrame(mf_scaled, index=extracted_mf.index)
# print("Initial Shape: " + str(mf_scaled.shape))
na_count_cols = mf_scaled.isna().sum() / len(mf_scaled)
mf_scaled = mf_scaled.loc[:, na_count_cols <= 0.20]
inp_ar = np.array(mf_scaled.loc[openml_dataset, :]).reshape(1, -1)
nan_col_inp = np.argwhere(np.isnan(inp_ar))[:, 1]
inp_ar = np.delete(inp_ar, nan_col_inp, axis=1)
filtered = mf_scaled[mf_scaled.index != openml_dataset]
comp_ind = filtered.index
filtered = np.delete(np.asarray(filtered), nan_col_inp, axis=1)
comp_ar = filtered[~np.isnan(filtered).any(axis=1), :]
comp_ind = comp_ind[~np.isnan(filtered).any(axis=1)]



mushroom has shape: (8124, 22)


  can_cor_eig_vals = sqr_can_cors / (1 - sqr_can_cors)


In [115]:
to_be_scaled_df.loc[24,][np.isinf(to_be_scaled_df.loc[24,])]

lh_trace    inf
roy_root    inf
Name: 24, dtype: float64

Unnamed: 0,attr_conc.mean,attr_conc.sd,attr_ent.mean,attr_ent.sd,attr_to_inst,best_node.mean,best_node.sd,can_cor.mean,can_cor.sd,cat_to_num,...,tree_imbalance.sd,tree_shape.mean,tree_shape.sd,var.mean,var.sd,var_importance.mean,var_importance.sd,w_lambda,worst_node.mean,worst_node.sd
3,1.058391e-02,0.014220,0.590148,0.338522,0.011264,0.489635,0.127226,0.799823,,,...,0.092559,0.052246,0.093870,0.130367,0.085100,0.026316,0.063278,3.602836e-01,0.476572,0.123835
6,3.980478e-02,0.058341,2.974428,0.255379,0.000800,0.071800,0.001585,0.467835,0.277919,0.00,...,0.053129,0.003276,0.013882,5.344024,2.054562,0.062500,0.040933,9.799904e-04,0.050400,0.002092
11,-6.250002e-08,0.000000,2.321928,0.000000,0.006400,0.446313,0.109058,0.423927,0.563810,0.00,...,0.074884,0.040688,0.038232,2.003205,0.000000,0.250000,0.017456,3.231216e-01,0.629007,0.101050
12,3.409325e-02,0.041214,3.524253,0.124568,0.108000,0.191000,0.005164,0.927064,0.048168,0.00,...,0.084058,0.040307,0.073197,4581.543057,5584.125416,0.004630,0.015216,3.206439e-09,0.195500,0.004972
14,1.024494e-02,0.005677,3.584956,0.000001,0.038000,0.191000,0.008097,0.721437,0.230074,0.00,...,0.075176,0.023157,0.049418,0.005520,0.005879,0.013158,0.027479,5.040808e-05,0.138000,0.007888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40979,2.839167e-02,0.039093,1.537901,0.316983,0.120000,0.197500,0.002635,0.847343,0.095695,0.00,...,0.102885,0.043073,0.034203,6.141881,1.734872,0.004167,0.013687,1.952524e-06,0.167500,0.015138
41027,8.456124e-03,0.018998,2.696971,0.436888,0.000134,0.615945,0.044473,0.416155,0.174517,0.00,...,0.042622,0.000927,0.004439,6.457388,1.582940,0.166667,0.054241,6.481243e-01,0.509382,0.119002
40670,7.438350e-04,0.001018,0.806063,0.063779,0.056497,0.623363,0.013370,0.847595,0.031322,,...,0.070289,0.031635,0.047764,0.186220,0.019957,0.005525,0.026701,7.760364e-02,0.519146,0.001230
40701,1.763699e-02,0.121693,3.323736,1.272905,0.004000,0.870000,0.010914,0.485715,,0.25,...,0.064081,0.018657,0.045526,69834.981597,380368.734343,0.033333,0.037577,7.640812e-01,0.859000,0.001414


In [98]:
comp_ind

Int64Index([    6,    11,    12,    15,    16,    18,    22,    23,    29,
               31,    32,    37,    44,    54,   151,   182,   188,    38,
              307,   458,  1063,  1590,  1510,  1489,  1494,  1497,  1501,
             1480,  1487,  1468,  1475,  1462,  1464,  6332,  1461,  4538,
            23381, 40499, 40966, 40982, 40994, 40983, 40984, 40979, 41027,
            40701],
           dtype='int64')

(46, 104)

In [None]:
MFE()

In [127]:
ft[1]

[0.0644665970503495,
 0.10681140835811544,
 1.8906490168129426,
 1.1292266412928182,
 0.06851851851851852,
 0.6388888888888888,
 0.14190819669842775,
 0.7834513256923659,
 nan,
 1.0555555555555556,
 0.05311932802798223,
 0.1138621702147878,
 0.9824740868386409,
 0.040506088515330614,
 0.06671764453615259,
 8.041616174393983,
 336.617403777315,
 489113.3109367248,
 6100467.726470403,
 0.437037037037037,
 0.04876152508180617,
 16.811619141368737,
 0.5,
 0.10999438818457402,
 236.98775706880298,
 2923.455265578917,
 985.2232856727205,
 232.81300286147047,
 2871.671854114693,
 14.594594594594595,
 22.046715944012288,
 237.50329471793214,
 2.8146829213837705,
 1.1201025958781865,
 135.14753809038834,
 167.02936594662572,
 80,
 9.425,
 2.919922835810667,
 0.0125,
 0.024691846377417833,
 20440.027572427574,
 34891.8882469992,
 0.5,
 0.053033008588991036,
 1.5893049981681275,
 0.6240740740740741,
 0.08510910898876127,
 16.316076910268155,
 176.80384252769804,
 587.796858974359,
 7065.420386307

In [130]:
ft[1]

[0.0644665970503495,
 0.10681140835811544,
 1.8906490168129426,
 1.1292266412928182,
 0.06851851851851852,
 0.6388888888888888,
 0.14190819669842775,
 0.7834513256923659,
 nan,
 1.0555555555555556,
 0.05311932802798223,
 0.1138621702147878,
 0.9824740868386409,
 0.040506088515330614,
 0.06671764453615259,
 8.041616174393983,
 336.617403777315,
 489113.3109367248,
 6100467.726470403,
 0.437037037037037,
 0.04876152508180617,
 16.811619141368737,
 0.5,
 0.10999438818457402,
 236.98775706880298,
 2923.455265578917,
 985.2232856727205,
 232.81300286147047,
 2871.671854114693,
 14.594594594594595,
 22.046715944012288,
 237.50329471793214,
 2.8146829213837705,
 1.1201025958781865,
 135.14753809038834,
 167.02936594662572,
 80,
 9.425,
 2.919922835810667,
 0.0125,
 0.024691846377417833,
 20440.027572427574,
 34891.8882469992,
 0.5,
 0.053033008588991036,
 1.5893049981681275,
 0.6240740740740741,
 0.08510910898876127,
 16.316076910268155,
 176.80384252769804,
 587.796858974359,
 7065.420386307

In [131]:
pd.DataFrame(data=[ft[1]], index=[dat.name], columns=ft[0])

Unnamed: 0,attr_conc.mean,attr_conc.sd,attr_ent.mean,attr_ent.sd,attr_to_inst,best_node.mean,best_node.sd,can_cor.mean,can_cor.sd,cat_to_num,...,tree_imbalance.sd,tree_shape.mean,tree_shape.sd,var.mean,var.sd,var_importance.mean,var_importance.sd,w_lambda,worst_node.mean,worst_node.sd
cylinder-bands,0.064467,0.106811,1.890649,1.129227,0.068519,0.638889,0.141908,0.783451,,1.055556,...,0.09947,0.041933,0.074902,489113.310937,6100466.0,0.00641,0.018849,0.386204,0.577778,0.007808


In [1]:
dat = dataset_OpenML(6332)
X, y = dat.get_arrays()
mfe = MFE()
mfe.fit(X, y)
ft = mfe.extract(cat_cols='auto', suppress_warnings=True)


NameError: name 'dataset_OpenML' is not defined

In [23]:
dataset = openml.datasets.get_dataset(40996)
X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)


<pymfe.mfe.MFE at 0x16f6e95ed00>