In [1]:
from collections import defaultdict, OrderedDict, deque
import copy
import sys
import pandas as pd

In [2]:
import numpy as np
import scipy.stats
from scipy.linalg import LinAlgError
import scipy.sparse
import sklearn
# TODO use balanced accuracy!
import sklearn.metrics
import sklearn.model_selection
from sklearn.utils import check_array
from sklearn.multiclass import OneVsRestClassifier

In [3]:
import numpy as np
import openml

In [4]:
from sklearn.preprocessing import Imputer
from autosklearn.pipeline.implementations.OneHotEncoder import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [5]:
# TODO Allow multiple dependencies for a metafeature
# TODO Add HelperFunction as an object
class HelperFunctions(object):
    """Registry of helper-function objects plus a cache of their results.

    ``functions`` maps a name to the registered helper instance, ``values``
    maps a name to the stored result object (expected to expose ``.value``),
    and ``computation_time`` is a per-name dict that is reset by ``clear()``.
    Iteration, indexing, deletion and ``in`` all operate on ``functions``.
    """

    def __init__(self):
        self.functions = OrderedDict()
        self.values = OrderedDict()
        # Created here as well, so the attribute exists before the first
        # call to clear() (previously it only appeared after clear()).
        self.computation_time = OrderedDict()

    def clear(self):
        """Drop all cached results and timings; registered helpers are kept."""
        self.values = OrderedDict()
        self.computation_time = OrderedDict()

    def __iter__(self):
        return self.functions.__iter__()

    def __getitem__(self, item):
        return self.functions.__getitem__(item)

    def __setitem__(self, key, value):
        return self.functions.__setitem__(key, value)

    def __delitem__(self, key):
        return self.functions.__delitem__(key)

    def __contains__(self, item):
        return self.functions.__contains__(item)

    def is_calculated(self, key):
        """Return if a helper function has already been executed.
        Necessary as get_value() can return None if the helper function hasn't
        been executed or if it returned None."""
        return key in self.values

    def get_value(self, key):
        """Return the ``.value`` of the result stored under ``key``.

        Returns None when nothing has been stored for ``key`` (use
        ``is_calculated`` to tell this apart from a stored None value).
        """
        entry = self.values.get(key)
        if entry is None:
            # Not computed yet: honour the documented "returns None" contract
            # instead of failing with AttributeError on None.value.
            return None
        return entry.value

    def set_value(self, key, item):
        """Store the result object ``item`` under ``key``."""
        self.values[key] = item

    def define(self, name):
        """Decorator for adding helper functions to a "dictionary".
        This behaves like a function decorating a function,
        not a class decorating a function"""
        def wrapper(metafeature_class):
            # The decorated class is instantiated once; the instance is both
            # registered under ``name`` and returned in place of the class.
            instance = metafeature_class()
            self.__setitem__(name, instance)
            return instance
        return wrapper


In [6]:
class MetafeatureFunctions(object):
    """Registry of metafeature objects, their dependencies and cached results.

    Container operations (iteration, indexing, deletion, ``in``) act on the
    ``functions`` mapping; ``dependencies`` and ``values`` are keyed by the
    same names.
    """

    def __init__(self):
        self.functions = OrderedDict()
        self.dependencies = OrderedDict()
        self.values = OrderedDict()

    def clear(self):
        """Forget all stored results; registrations are untouched."""
        self.values = OrderedDict()

    def __iter__(self):
        return iter(self.functions)

    def __getitem__(self, item):
        return self.functions[item]

    def __setitem__(self, key, value):
        self.functions[key] = value

    def __delitem__(self, key):
        del self.functions[key]

    def __contains__(self, item):
        return item in self.functions

    def get_value(self, key):
        """Return the ``.value`` attribute of the result stored under ``key``."""
        stored = self.values[key]
        return stored.value

    def set_value(self, key, item):
        """Store the result object ``item`` under ``key``."""
        self.values[key] = item

    def is_calculated(self, key):
        """Return whether a result has been stored for ``key``."""
        return key in self.values

    def get_dependency(self, name):
        """Return the dependency registered for ``name`` (None if absent)."""
        return self.dependencies.get(name)

    def define(self, name, dependency=None):
        """Return a decorator that registers a metafeature under ``name``.

        The decorated class is instantiated with no arguments; the instance is
        stored in the registry, ``dependency`` is recorded for ``name``, and
        the instance (not the class) is returned.
        """
        def wrapper(metafeature_class):
            registered = metafeature_class()
            self[name] = registered
            self.dependencies[name] = dependency
            return registered
        return wrapper
    
# Module-level registry instances, one per registry class defined above.
metafeatures = MetafeatureFunctions()
helper_functions = HelperFunctions()


In [7]:
def load_task(task_id):
    """Load an OpenML task and return its train/test split.

    Returns ``(X_train, y_train, X_test, y_test, cat)`` where the targets are
    re-encoded as integer indices into the sorted unique training labels and
    ``cat`` is a list of ``'categorical'`` / ``'numerical'`` strings, one per
    entry of the dataset's categorical indicator.
    """
    task = openml.tasks.get_task(task_id)
    features, targets = task.get_X_and_y()
    train_idx, test_idx = task.get_train_test_split_indices()
    X_train, y_train = features[train_idx], targets[train_idx]
    X_test, y_test = features[test_idx], targets[test_idx]

    # Only the categorical indicator of the dataset is needed here.
    dataset = openml.datasets.get_dataset(task.dataset_id)
    _data, _target, is_categorical = dataset.get_data(
        return_categorical_indicator=True, target=task.target_name)
    del _data, _target, dataset
    cat = []
    for flag in is_categorical:
        cat.append('categorical' if flag else 'numerical')

    # The label -> index mapping is built from the training labels only.
    classes = np.unique(y_train)
    label_to_index = dict(zip(classes, range(len(classes))))
    y_train = np.array([label_to_index[label] for label in y_train])
    y_test = np.array([label_to_index[label] for label in y_test])

    return X_train, y_train, X_test, y_test, cat

In [8]:
# Load OpenML task 236; per the outputs below this yields 13400 training rows
# with 16 features, all numerical.
X_train, y_train, X_test, y_test, cat = load_task(236)

In [15]:
X_train.shape

(13400, 16)

In [23]:
print(cat)

['numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical']


In [27]:
### Simple features
# Cache of metafeature results keyed by name. Several functions in this cell
# read earlier results from it, so entries must be filled in definition order.
computed_features={}
def NumberOfInstances(X, y, categorical):
    """Return the number of rows of ``X`` as a float.

    ``y`` and ``categorical`` are not used.
    """
    n_rows = X.shape[0]
    return float(n_rows)
computed_features['NumberOfInstances']=NumberOfInstances(X_train,y_train,cat)
print('NumberOfInstances')

def LogNumberOfInstances(X, y, categorical):
    """Return the natural log of the cached ``NumberOfInstances`` value.

    The count is read from the module-level ``computed_features`` dict; the
    arguments are not used.
    """
    n_instances = computed_features["NumberOfInstances"]
    return np.log(n_instances)
computed_features['LogNumberOfInstances']=LogNumberOfInstances(X_train,y_train,cat)
print('LogNumberOfInstances')
    
# Calculate the number of classes.
# Calls np.unique on the targets. If the dataset is a multilabel dataset,
# does this for each label separately and returns the mean.
def NumberofClasses(X, y, categorical):
    """Return the number of distinct target values.

    For a 2-D ``y`` the distinct values are counted per column and the mean
    of those counts is returned; otherwise the count over ``y`` is returned
    as a float. ``X`` and ``categorical`` are not used.
    """
    if len(y.shape) != 2:
        return float(len(np.unique(y)))
    classes_per_label = [len(np.unique(y[:, col])) for col in range(y.shape[1])]
    return np.mean(classes_per_label)
computed_features['NumberofClasses']=NumberofClasses(X_train,y_train,cat)
print('NumberofClasses')


def NumberOfFeatures(X, y, categorical):
    """Return the number of columns of ``X`` as a float.

    ``y`` and ``categorical`` are not used.
    """
    n_cols = X.shape[1]
    return float(n_cols)
computed_features['NumberOfFeatures'] = NumberOfFeatures(X_train,y_train,cat)
print('NumberOfFeatures')


def LogNumberOfFeatures(X,y,categorical):
    """Return the natural log of the cached ``NumberOfFeatures`` value.

    The count is read from the module-level ``computed_features`` dict; the
    arguments are not used.
    """
    n_features = computed_features['NumberOfFeatures']
    return np.log(n_features)
computed_features['LogNumberOfFeatures'] = LogNumberOfFeatures(X_train,y_train,cat)
print('LogNumberOfFeatures')

def MissingValues(X, y, categorical):
    """Return a boolean mask that is True where ``X`` is not finite.

    Non-finite means NaN or +/-infinity, as decided by ``np.isfinite``.
    ``y`` and ``categorical`` are not used.
    """
    finite_mask = np.isfinite(X)
    return np.logical_not(finite_mask)
computed_features['MissingValues'] = MissingValues(X_train,y_train,cat)
print('MissingValues')

def NumberOfInstancesWithMissingValues(X, y, categorical):
    """Return, as a float, how many rows contain at least one missing value.

    Uses the mask cached under ``computed_features["MissingValues"]``; the
    arguments are not used.
    """
    mask = computed_features["MissingValues"]
    missing_per_row = mask.sum(axis=1)
    rows_affected = sum(1 for count in missing_per_row if count > 0)
    return float(rows_affected)
computed_features['NumberOfInstancesWithMissingValues'] = NumberOfInstancesWithMissingValues(X_train,y_train,cat)
print('NumberOfInstancesWithMissingValues')


def PercentageOfInstancesWithMissingValues(X, y, categorical):
    """Return the fraction of instances that have missing values.

    Divides the cached ``NumberOfInstancesWithMissingValues`` by the cached
    ``NumberOfInstances`` (both from ``computed_features``); the result is a
    fraction, not scaled to 100. The arguments are not used.
    """
    affected = float(computed_features["NumberOfInstancesWithMissingValues"])
    total = float(computed_features["NumberOfInstances"])
    return affected / total
computed_features['PercentageOfInstancesWithMissingValues'] = PercentageOfInstancesWithMissingValues(X_train,y_train,cat)
print('PercentageOfInstancesWithMissingValues')



def NumberOfFeaturesWithMissingValues(X, y, categorical):
    """Return, as a float, how many columns contain at least one missing value.

    Uses the mask cached under ``computed_features["MissingValues"]``; the
    arguments are not used.
    """
    mask = computed_features["MissingValues"]
    missing_per_column = mask.sum(axis=0)
    columns_affected = sum(1 for count in missing_per_column if count > 0)
    return float(columns_affected)
computed_features['NumberOfFeaturesWithMissingValues'] = NumberOfFeaturesWithMissingValues(X_train,y_train,cat)
print('NumberOfFeaturesWithMissingValues')

def PercentageOfFeaturesWithMissingValues(X, y, categorical):
    """Return the fraction of features that have missing values.

    Divides the cached ``NumberOfFeaturesWithMissingValues`` by the cached
    ``NumberOfFeatures`` (both from ``computed_features``); the result is a
    fraction, not scaled to 100. The arguments are not used.
    """
    affected = float(computed_features["NumberOfFeaturesWithMissingValues"])
    total = float(computed_features["NumberOfFeatures"])
    return affected / total
computed_features['PercentageOfFeaturesWithMissingValues'] = PercentageOfFeaturesWithMissingValues(X_train,y_train,cat)
print('PercentageOfFeaturesWithMissingValues')


def NumberOfMissingValues(X, y, categorical):
    """Return the total number of missing entries as a float.

    Sums the mask cached under ``computed_features["MissingValues"]``; the
    arguments are not used.
    """
    total_missing = computed_features["MissingValues"].sum()
    return float(total_missing)
computed_features['NumberOfMissingValues'] = NumberOfMissingValues(X_train,y_train,cat)
print('NumberOfMissingValues')


def PercentageOfMissingValues(X, y, categorical):
    """Return the fraction of all entries of ``X`` that are missing.

    The numerator is the cached ``NumberOfMissingValues`` from
    ``computed_features``; the denominator is rows times columns of ``X``.
    The result is a fraction, not scaled to 100.
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    missing = float(computed_features["NumberOfMissingValues"])
    return missing / float(n_rows * n_cols)
computed_features['PercentageOfMissingValues'] = PercentageOfMissingValues(X_train,y_train,cat)
print('PercentageOfMissingValues')


def NumberOfNumericFeatures(X, y, categorical):
    """Return how many entries of ``categorical`` equal ``'numerical'``.

    The result is an int. ``X`` and ``y`` are not used.
    """
    return sum(1 for kind in categorical if kind == 'numerical')
computed_features['NumberOfNumericFeatures'] = NumberOfNumericFeatures(X_train,y_train,cat)
print('NumberOfNumericFeatures')

def NumberOfCategoricalFeatures(X, y, categorical):
    """Return how many entries of ``categorical`` equal ``'categorical'``.

    The result is an int. ``X`` and ``y`` are not used.
    """
    return sum(1 for kind in categorical if kind == 'categorical')
computed_features['NumberOfCategoricalFeatures'] = NumberOfCategoricalFeatures(X_train,y_train,cat)
print('NumberOfCategoricalFeatures')

def RatioNumericalToNominal(X, y, categorical):
    """Return the ratio of numeric to categorical feature counts.

    Both counts come from ``computed_features``. When there are no
    categorical features the function returns 0.0 instead of dividing by
    zero. The arguments are not used.
    """
    n_categorical = float(computed_features["NumberOfCategoricalFeatures"])
    n_numerical = float(computed_features["NumberOfNumericFeatures"])
    return n_numerical / n_categorical if n_categorical != 0.0 else 0.
computed_features['RatioNumericalToNominal'] = RatioNumericalToNominal(X_train,y_train,cat)
print('RatioNumericalToNominal')


# Number of attributes divided by number of samples

def DatasetRatio(X, y, categorical):
    """Return the cached number of features divided by the cached number of instances.

    Both values are read from ``computed_features``; the arguments are not
    used.
    """
    n_features = float(computed_features["NumberOfFeatures"])
    n_instances = float(computed_features["NumberOfInstances"])
    return n_features / n_instances
computed_features['DatasetRatio'] = DatasetRatio(X_train,y_train,cat)
print('DatasetRatio')
                     
def LogDatasetRatio(X, y, categorical):
    """Return the natural log of the cached ``DatasetRatio`` value.

    The ratio is read from ``computed_features``; the arguments are not used.
    """
    ratio = computed_features["DatasetRatio"]
    return np.log(ratio)
computed_features['LogDatasetRatio'] = LogDatasetRatio(X_train,y_train,cat)
print('LogDatasetRatio')                 
                     
def InverseDatasetRatio(X, y, categorical):
    """Return the cached number of instances divided by the cached number of features.

    Both values are read from ``computed_features``; the arguments are not
    used.
    """
    n_instances = float(computed_features["NumberOfInstances"])
    n_features = float(computed_features["NumberOfFeatures"])
    return n_instances / n_features
computed_features['InverseDatasetRatio'] = InverseDatasetRatio(X_train,y_train,cat)
print('InverseDatasetRatio')
                     
def LogInverseDatasetRatio(X, y, categorical):
    """Return the natural log of the cached ``InverseDatasetRatio`` value.

    The ratio is read from ``computed_features``; the arguments are not used.
    """
    inverse_ratio = computed_features["InverseDatasetRatio"]
    return np.log(inverse_ratio)
computed_features['LogInverseDatasetRatio'] = LogInverseDatasetRatio(X_train,y_train,cat)
print('LogInverseDatasetRatio')
                     
def ClassOccurences(X, y, categorical):
    """Count how often each target value occurs.

    Parameters
    ----------
    X : passed through to the per-column calls, otherwise unused.
    y : array with a ``shape``; 1-D for single-label targets, 2-D for
        multilabel targets (one column per label).
    categorical : passed through to the per-column calls, otherwise unused.

    Returns
    -------
    For 1-D ``y``: a ``defaultdict(float)`` mapping each value to its count.
    For 2-D ``y``: a list with one such mapping per column of ``y``.
    """
    if len(y.shape) == 2:
        # Recurse per label column. The previous code called
        # ``self._calculate`` (undefined in a plain function) and passed the
        # global ``cat`` rather than this function's ``categorical`` argument.
        return [ClassOccurences(X, y[:, i], categorical)
                for i in range(y.shape[1])]

    occurence_dict = defaultdict(float)
    for value in y:
        occurence_dict[value] += 1
    return occurence_dict
computed_features['ClassOccurences'] = ClassOccurences(X_train,y_train,cat)

def ClassProbabilityMin(X, y, categorical):
    """Return the smallest class count divided by the number of samples.

    Counts are read from ``computed_features["ClassOccurences"]``: a single
    mapping for 1-D ``y``, or one mapping per column for 2-D ``y`` (in which
    case the minimum is taken over all columns). ``X`` and ``categorical``
    are not used.
    """
    occurences = computed_features["ClassOccurences"]
    if len(y.shape) == 2:
        tables = [occurences[i] for i in range(y.shape[1])]
    else:
        tables = [occurences]

    # The int64 maximum acts as the starting value, as in a running-minimum loop.
    candidates = [np.iinfo(np.int64).max]
    for table in tables:
        candidates.extend(table.values())
    return float(min(candidates)) / float(y.shape[0])
                     
computed_features['ClassProbabilityMin'] = ClassProbabilityMin(X_train,y_train,cat)

# aka default accuracy

def ClassProbabilityMax(X, y, categorical):
    """Return the largest class count divided by the number of samples.

    Counts are read from ``computed_features["ClassOccurences"]``: a single
    mapping for 1-D ``y``, or one mapping per column for 2-D ``y`` (in which
    case the maximum is taken over all columns). ``X`` and ``categorical``
    are not used.
    """
    occurences = computed_features["ClassOccurences"]
    if len(y.shape) == 2:
        tables = [occurences[i] for i in range(y.shape[1])]
    else:
        tables = [occurences]

    # -1 acts as the starting value, as in a running-maximum loop.
    candidates = [-1]
    for table in tables:
        candidates.extend(table.values())
    return float(max(candidates)) / float(y.shape[0])
                     
computed_features['ClassProbabilityMax'] = ClassProbabilityMax(X_train,y_train,cat)

def ClassProbabilityMean(X, y, categorical):
    """Return the mean of the class counts divided by the number of samples.

    Counts are read from ``computed_features["ClassOccurences"]``. For 2-D
    ``y`` the counts of all columns are pooled before averaging. ``X`` and
    ``categorical`` are not used.
    """
    occurence_dict = computed_features["ClassOccurences"]
    if len(y.shape) == 2:
        pooled = []
        for col in range(y.shape[1]):
            pooled.extend(occurence_dict[col].values())
        counts = np.array(pooled)
    else:
        counts = np.array(list(occurence_dict.values()), dtype=np.float64)
    probabilities = counts / y.shape[0]
    return probabilities.mean()
computed_features['ClassProbabilityMean'] = ClassProbabilityMean(X_train,y_train,cat)

def ClassProbabilitySTD(X, y, categorical):
    """Return the standard deviation of the class counts divided by the number of samples.

    Counts are read from ``computed_features["ClassOccurences"]``. For 2-D
    ``y`` the standard deviation is computed per column and the mean of
    those values is returned. ``X`` and ``categorical`` are not used.
    """
    occurence_dict = computed_features["ClassOccurences"]
    n_samples = y.shape[0]

    if len(y.shape) != 2:
        counts = np.array(list(occurence_dict.values()), dtype=np.float64)
        return (counts / n_samples).std()

    per_label_std = []
    for col in range(y.shape[1]):
        counts = np.array(list(occurence_dict[col].values()), dtype=np.float64)
        per_label_std.append((counts / n_samples).std())
    return np.mean(per_label_std)
computed_features['ClassProbabilitySTD'] = ClassProbabilitySTD(X_train,y_train,cat)


NumberOfInstances
LogNumberOfInstances
NumberofClasses
NumberOfFeatures
LogNumberOfFeatures
MissingValues
NumberOfInstancesWithMissingValues
PercentageOfInstancesWithMissingValues
NumberOfFeaturesWithMissingValues
PercentageOfFeaturesWithMissingValues
NumberOfMissingValues
PercentageOfMissingValues
NumberOfNumericFeatures
NumberOfCategoricalFeatures
RatioNumericalToNominal
DatasetRatio
LogDatasetRatio
InverseDatasetRatio
LogInverseDatasetRatio


In [28]:
computed_features

{'NumberOfInstances': 13400.0,
 'LogNumberOfInstances': 9.503009985939002,
 'NumberofClasses': 26.0,
 'NumberOfFeatures': 16.0,
 'LogNumberOfFeatures': 2.772588722239781,
 'MissingValues': array([[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]),
 'NumberOfInstancesWithMissingValues': 0.0,
 'PercentageOfInstancesWithMissingValues': 0.0,
 'NumberOfFeaturesWithMissingValues': 0.0,
 'PercentageOfFeaturesWithMissingValues': 0.0,
 'NumberOfMissingValues': 0.0,
 'PercentageOfMissingValues': 0.0,
 'NumberOfNumericFeatures': 16,
 'NumberOfCategoricalFeatures': 0,
 'RatioNumericalToNominal': 0.0,
 'DatasetRatio': 0.0011940298507462687,
 'LogDatasetRatio': -6.730421263699221,
 'InverseDatasetRatio': 837