In [31]:
from collections import defaultdict, OrderedDict, deque
import copy
import sys
import pandas as pd

In [32]:
import numpy as np
import scipy.stats
from scipy.linalg import LinAlgError
import scipy.sparse
import sklearn
# TODO use balanced accuracy!
import sklearn.metrics
import sklearn.model_selection
from sklearn.utils import check_array
from sklearn.multiclass import OneVsRestClassifier

In [33]:
import numpy as np
import openml

In [34]:
from sklearn.preprocessing import Imputer
from autosklearn.pipeline.implementations.OneHotEncoder import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [35]:
# TODO Allow multiple dependencies for a metafeature
# TODO Add HelperFunction as an object
class HelperFunctions(object):
    def __init__(self):
        self.functions = OrderedDict()
        self.values = OrderedDict()

    def clear(self):
        self.values = OrderedDict()
        self.computation_time = OrderedDict()

    def __iter__(self):
        return self.functions.__iter__()

    def __getitem__(self, item):
        return self.functions.__getitem__(item)

    def __setitem__(self, key, value):
        return self.functions.__setitem__(key, value)

    def __delitem__(self, key):
        return self.functions.__delitem__(key)

    def __contains__(self, item):
        return self.functions.__contains__(item)

    def is_calculated(self, key):
        """Return if a helper function has already been executed.
        Necessary as get_value() can return None if the helper function hasn't
        been executed or if it returned None."""
        return key in self.values

    def get_value(self, key):
        return self.values.get(key).value

    def set_value(self, key, item):
        self.values[key] = item

    def define(self, name):
        """Decorator for adding helper functions to a "dictionary".
        This behaves like a function decorating a function,
        not a class decorating a function"""
        def wrapper(metafeature_class):
            instance = metafeature_class()
            self.__setitem__(name, instance)
            return instance
        return wrapper


In [36]:
class MetafeatureFunctions(object):
    def __init__(self):
        self.functions = OrderedDict()
        self.dependencies = OrderedDict()
        self.values = OrderedDict()

    def clear(self):
        self.values = OrderedDict()

    def __iter__(self):
        return self.functions.__iter__()

    def __getitem__(self, item):
        return self.functions.__getitem__(item)

    def __setitem__(self, key, value):
        return self.functions.__setitem__(key, value)

    def __delitem__(self, key):
        return self.functions.__delitem__(key)

    def __contains__(self, item):
        return self.functions.__contains__(item)

    def get_value(self, key):
        return self.values[key].value

    def set_value(self, key, item):
        self.values[key] = item

    def is_calculated(self, key):
        """Return if a helper function has already been executed.
        Necessary as get_value() can return None if the helper function hasn't
        been executed or if it returned None."""
        return key in self.values

    def get_dependency(self, name):
        """Return the dependency of metafeature "name".
        """
        return self.dependencies.get(name)

    def define(self, name, dependency=None):
        """Decorator for adding metafeature functions to a "dictionary" of
        metafeatures. This behaves like a function decorating a function,
        not a class decorating a function"""
        def wrapper(metafeature_class):
            instance = metafeature_class()
            self.__setitem__(name, instance)
            self.dependencies[name] = dependency
            return instance
        return wrapper
    
metafeatures = MetafeatureFunctions()
helper_functions = HelperFunctions()


In [37]:
def load_task(task_id):
    task = openml.tasks.get_task(task_id)
    X, y = task.get_X_and_y()
    train_indices, test_indices = task.get_train_test_split_indices()
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    dataset = openml.datasets.get_dataset(task.dataset_id)
    _, _, cat = dataset.get_data(return_categorical_indicator=True,
target=task.target_name)
    del _
    del dataset
    cat = ['categorical' if c else 'numerical' for c in cat]

    unique = np.unique(y_train)
    mapping = {unique_value: i for i, unique_value in enumerate(unique)}
    y_train = np.array([mapping[value] for value in y_train])
    y_test = np.array([mapping[value] for value in y_test])

    return X_train, y_train, X_test, y_test, cat

In [38]:
X_train, y_train, X_test, y_test, cat = load_task(75182)

In [39]:
%%time
### Simple features
computed_features={}
def NumberOfInstances(X, y, categorical):
    return float(X.shape[0])
computed_features['NumberOfInstances']=NumberOfInstances(X_train,y_train,cat)
print('NumberOfInstances')

def LogNumberOfInstances(X, y, categorical):
    return np.log(computed_features["NumberOfInstances"])
computed_features['LogNumberOfInstances']=LogNumberOfInstances(X_train,y_train,cat)
print('LogNumberOfInstances')
    
#Calculate the number of classes.
#Calls np.unique on the targets. If the dataset is a multilabel dataset,
#does this for each label seperately and returns the mean.    
def NumberofClasses(X, y, categorical):
    if len(y.shape) == 2:
        return np.mean([len(np.unique(y[:,i])) for i in range(y.shape[1])])
    else:
        return float(len(np.unique(y)))
computed_features['NumberofClasses']=NumberofClasses(X_train,y_train,cat)
print('NumberofClasses')


def NumberOfFeatures(X, y, categorical):
    return float(X.shape[1])
computed_features['NumberOfFeatures'] = NumberOfFeatures(X_train,y_train,cat)
print('NumberOfFeatures')


def LogNumberOfFeatures(X,y,categorical):
    return np.log(computed_features['NumberOfFeatures'] )
computed_features['LogNumberOfFeatures'] = LogNumberOfFeatures(X_train,y_train,cat)
print('LogNumberOfFeatures')

def MissingValues(X, y, categorical):
    missing = ~np.isfinite(X)
    return missing
computed_features['MissingValues'] = MissingValues(X_train,y_train,cat)
print('MissingValues')

def NumberOfInstancesWithMissingValues(X, y, categorical):
    missing = computed_features["MissingValues"]
    num_missing = missing.sum(axis=1)
    return float(np.sum([1 if num > 0 else 0 for num in num_missing]))
computed_features['NumberOfInstancesWithMissingValues'] = NumberOfInstancesWithMissingValues(X_train,y_train,cat)
print('NumberOfInstancesWithMissingValues')


def PercentageOfInstancesWithMissingValues(X, y, categorical):
    return float(computed_features["NumberOfInstancesWithMissingValues"]) / float(computed_features["NumberOfInstances"])
computed_features['PercentageOfInstancesWithMissingValues'] = PercentageOfInstancesWithMissingValues(X_train,y_train,cat)
print('PercentageOfInstancesWithMissingValues')



def NumberOfFeaturesWithMissingValues(X, y, categorical):
        missing = computed_features["MissingValues"]
        num_missing = missing.sum(axis=0)
        return float(np.sum([1 if num > 0 else 0 for num in num_missing]))
computed_features['NumberOfFeaturesWithMissingValues'] = NumberOfFeaturesWithMissingValues(X_train,y_train,cat)
print('NumberOfFeaturesWithMissingValues')

def PercentageOfFeaturesWithMissingValues(X, y, categorical):
        return float(computed_features["NumberOfFeaturesWithMissingValues"]) / float(computed_features["NumberOfFeatures"])
computed_features['PercentageOfFeaturesWithMissingValues'] = PercentageOfFeaturesWithMissingValues(X_train,y_train,cat)
print('PercentageOfFeaturesWithMissingValues')


def NumberOfMissingValues(X, y, categorical):
        return float(computed_features["MissingValues"].sum())
computed_features['NumberOfMissingValues'] = NumberOfMissingValues(X_train,y_train,cat)
print('NumberOfMissingValues')


def PercentageOfMissingValues(X, y, categorical):
        return float(computed_features["NumberOfMissingValues"]) / float(X.shape[0]*X.shape[1])
computed_features['PercentageOfMissingValues'] = PercentageOfMissingValues(X_train,y_train,cat)
print('PercentageOfMissingValues')


def NumberOfNumericFeatures(X, y, categorical):
    numerical_features=0
    for i in categorical:
        if i=='numerical':
            numerical_features+=1
    return numerical_features
computed_features['NumberOfNumericFeatures'] = NumberOfNumericFeatures(X_train,y_train,cat)
print('NumberOfNumericFeatures')

def NumberOfCategoricalFeatures(X, y, categorical):
    categorical_features=0
    for i in categorical:
        if i=='categorical':
            categorical_features+=1
    return categorical_features
computed_features['NumberOfCategoricalFeatures'] = NumberOfCategoricalFeatures(X_train,y_train,cat)
print('NumberOfCategoricalFeatures')

def RatioNumericalToNominal(X, y, categorical):
        num_categorical = float(computed_features["NumberOfCategoricalFeatures"])
        num_numerical = float(computed_features["NumberOfNumericFeatures"])
        if num_categorical == 0.0:
            return 0.
        return num_numerical / num_categorical
computed_features['RatioNumericalToNominal'] = RatioNumericalToNominal(X_train,y_train,cat)
print('RatioNumericalToNominal')


# Number of attributes divided by number of samples

def DatasetRatio(X, y, categorical):
        return float(computed_features["NumberOfFeatures"]) / float(computed_features["NumberOfInstances"])
computed_features['DatasetRatio'] = DatasetRatio(X_train,y_train,cat)
print('DatasetRatio')
                     
def LogDatasetRatio(X, y, categorical):
        return np.log(computed_features["DatasetRatio"])
computed_features['LogDatasetRatio'] = LogDatasetRatio(X_train,y_train,cat)
print('LogDatasetRatio')                 
                     
def InverseDatasetRatio(X, y, categorical):
        return float(computed_features["NumberOfInstances"]) / float(computed_features["NumberOfFeatures"])
computed_features['InverseDatasetRatio'] = InverseDatasetRatio(X_train,y_train,cat)
print('InverseDatasetRatio')
                     
def LogInverseDatasetRatio(X, y, categorical):
        return np.log(computed_features["InverseDatasetRatio"])
computed_features['LogInverseDatasetRatio'] = LogInverseDatasetRatio(X_train,y_train,cat)
print('LogInverseDatasetRatio')
                     
def ClassOccurences(X, y, categorical):
        if len(y.shape) == 2:
            occurences = []
            for i in range(y.shape[1]):
                occurences.append(self._calculate(X, y[:, i], cat))
            return occurences
        else:
            occurence_dict = defaultdict(float)
            for value in y:
                occurence_dict[value] += 1
            return occurence_dict
computed_features['ClassOccurences'] = ClassOccurences(X_train,y_train,cat)

def ClassProbabilityMin(X, y, categorical):
        occurences = computed_features["ClassOccurences"]

        min_value = np.iinfo(np.int64).max
        if len(y.shape) == 2:
            for i in range(y.shape[1]):
                for num_occurences in occurences[i].values():
                    if num_occurences < min_value:
                        min_value = num_occurences
        else:
            for num_occurences in occurences.values():
                if num_occurences < min_value:
                    min_value = num_occurences
        return float(min_value) / float(y.shape[0])
                     
computed_features['ClassProbabilityMin'] = ClassProbabilityMin(X_train,y_train,cat)

# aka default accuracy

def ClassProbabilityMax(X, y, categorical):
        occurences = computed_features["ClassOccurences"]
        max_value = -1

        if len(y.shape) == 2:
            for i in range(y.shape[1]):
                for num_occurences in occurences[i].values():
                    if num_occurences > max_value:
                        max_value = num_occurences
        else:
            for num_occurences in occurences.values():
                if num_occurences > max_value:
                    max_value = num_occurences
        return float(max_value) / float(y.shape[0])
                     
computed_features['ClassProbabilityMax'] = ClassProbabilityMax(X_train,y_train,cat)

def ClassProbabilityMean(X, y, categorical):
        occurence_dict = computed_features["ClassOccurences"]
        if len(y.shape) == 2:
            occurences = []
            for i in range(y.shape[1]):
                occurences.extend(
                    [occurrence for occurrence in occurence_dict[
                        i].values()])
            occurences = np.array(occurences)
        else:
            occurences = np.array([occurrence for occurrence in occurence_dict.values()],
                                  dtype=np.float64)
        return (occurences / y.shape[0]).mean()
computed_features['ClassProbabilityMean'] = ClassProbabilityMean(X_train,y_train,cat)

def ClassProbabilitySTD(X, y, categorical):
        occurence_dict = computed_features["ClassOccurences"]

        if len(y.shape) == 2:
            stds = []
            for i in range(y.shape[1]):
                std = np.array(
                    [occurrence for occurrence in occurence_dict[
                                                      i].values()],
                    dtype=np.float64)
                std = (std / y.shape[0]).std()
                stds.append(std)
            return np.mean(stds)
        else:
            occurences = np.array([occurrence for occurrence in occurence_dict.values()],
                                 dtype=np.float64)
            return (occurences / y.shape[0]).std()
computed_features['ClassProbabilitySTD'] = ClassProbabilitySTD(X_train,y_train,cat)



def NumSymbols(X, y, categorical):
        symbols_per_column = []
        for i, column in enumerate(X.T):
            if categorical[i]:
                unique_values = np.unique(column)
                num_unique = np.sum(np.isfinite(unique_values))
                symbols_per_column.append(num_unique)
        return symbols_per_column
    
computed_features['NumSymbols'] = NumSymbols(X_train,y_train,cat)  


def SymbolsMin(X, y, categorical):
        # The minimum can only be zero if there are no nominal features,
        # otherwise it is at least one
        # TODO: shouldn't this rather be two?
        minimum = None
        for unique in computed_features['NumSymbols']:
            if unique > 0 and (minimum is None or unique < minimum):
                minimum = unique
        return minimum if minimum is not None else 0
    
computed_features['SymbolsMin'] = SymbolsMin(X_train,y_train,cat)  

def SymbolsMax(X, y, categorical):
        values = computed_features['NumSymbols']
        if len(values) == 0:
            return 0
        return max(max(values), 0)
    
computed_features['SymbolsMax'] = SymbolsMax(X_train,y_train,cat)     

def SymbolsMean(X, y, categorical):
        # TODO: categorical attributes without a symbol don't count towards this
        # measure
        values = [val for val in computed_features['NumSymbols'] if val > 0]
        mean = np.nanmean(values)
        return mean if np.isfinite(mean) else 0
    
computed_features['SymbolsMean'] = SymbolsMean(X_train,y_train,cat)

def SymbolsSTD(X, y, categorical):
        values = [val for val in computed_features['NumSymbols'] if val > 0]
        std = np.nanstd(values)
        return std if np.isfinite(std) else 0
    
computed_features['SymbolsSTD'] = SymbolsSTD(X_train,y_train,cat)   


def SymbolsSum(X, y, categorical):
        sum = np.nansum(computed_features['NumSymbols'])
        return sum if np.isfinite(sum) else 0
    
computed_features['SymbolsSum'] = SymbolsSum(X_train,y_train,cat)   


def ClassEntropy(X, y, categorical):
        labels = 1 if len(y.shape) == 1 else y.shape[1]
        if labels == 1:
            y = y.reshape((-1, 1))

        entropies = []
        for i in range(labels):
            occurence_dict = defaultdict(float)
            for value in y[:, i]:
                occurence_dict[value] += 1
            entropies.append(scipy.stats.entropy([occurence_dict[key] for key in
                                                 occurence_dict], base=2))

        return np.mean(entropies)

computed_features['ClassEntropy'] = ClassEntropy(X_train,y_train,cat)   

def mutual_information(X, y, categorical):
    try:
        if categorical == -1:
            categorical = doane_cat(x)
        if categorical == np.inf:
            categorical = sturges_cat(x)
    except ValueError:
        categorical = 10.0
    # print "bins", bins
    try:
        c_Xy = np.histogram2d(x, y, categorical)[0]
        mi = metrics.mutual_info_score(None, None, contingency=c_xy)
        # print "success"
    except Exception: 
        #print "error with mi calc", str(e)
        mi = 0
    return mi

computed_features['mutual_information'] = mutual_information(X_train,y_train,cat)  


NumberOfInstances
LogNumberOfInstances
NumberofClasses
NumberOfFeatures
LogNumberOfFeatures
MissingValues
NumberOfInstancesWithMissingValues
PercentageOfInstancesWithMissingValues
NumberOfFeaturesWithMissingValues
PercentageOfFeaturesWithMissingValues
NumberOfMissingValues
PercentageOfMissingValues
NumberOfNumericFeatures
NumberOfCategoricalFeatures
RatioNumericalToNominal
DatasetRatio
LogDatasetRatio
InverseDatasetRatio
LogInverseDatasetRatio
CPU times: user 32.1 ms, sys: 99 µs, total: 32.2 ms
Wall time: 28.1 ms


In [40]:
def Kurtosisses(X, y, categorical):
        kurts = []
        for i in range(X.shape[1]):
            if not categorical[i]:
                kurts.append(scipy.stats.kurtosis(X[:, i]))
        return kurts
computed_features['Kurtosisses'] = Kurtosisses(X_train,y_train,cat)
    
def KurtosisMin(X, y, categorical):
        kurts = computed_features["Kurtosisses"]
        minimum = np.nanmin(kurts) if len(kurts) > 0 else 0
        return minimum if np.isfinite(minimum) else 0
computed_features['KurtosisMin'] = KurtosisMin(X_train,y_train,cat)


def KurtosisMax(X, y, categorical):
        kurts = computed_features["Kurtosisses"]
        maximum = np.nanmax(kurts) if len(kurts) > 0 else 0
        return maximum if np.isfinite(maximum) else 0
computed_features['KurtosisMax'] = KurtosisMax(X_train,y_train,cat)


def KurtosisMean(X, y, categorical):
        kurts = computed_features["Kurtosisses"]
        mean = np.nanmean(kurts) if len(kurts) > 0 else 0
        return mean if np.isfinite(mean) else 0
computed_features['KurtosisMean'] = KurtosisMean(X_train,y_train,cat) 
    
def KurtosisSTD(X, y, categorical):
        kurts = computed_features["Kurtosisses"]
        std = np.nanstd(kurts) if len(kurts) > 0 else 0
        return std if np.isfinite(std) else 0
computed_features['KurtosisSTD'] = KurtosisSTD(X_train,y_train,cat) 

def Skewnesses(X, y, categorical):
        skews = []
        for i in range(X.shape[1]):
            if not categorical[i]:
                skews.append(scipy.stats.skew(X[:, i]))
        return skews
computed_features['Skewnesses'] = Skewnesses(X_train,y_train,cat) 
    

def SkewnessMin(X, y, categorical):
        skews = computed_features["Skewnesses"]
        minimum = np.nanmin(skews) if len(skews) > 0 else 0
        return minimum if np.isfinite(minimum) else 0
computed_features['SkewnessMin'] = SkewnessMin(X_train,y_train,cat) 

def SkewnessMax(X, y, categorical):
        skews = computed_features["Skewnesses"]
        maximum = np.nanmax(skews) if len(skews) > 0 else 0
        return maximum if np.isfinite(maximum) else 0
computed_features['SkewnessMax'] = SkewnessMax(X_train,y_train,cat)    

def SkewnessMean(X, y, categorical):
        skews = computed_features["Skewnesses"]
        mean = np.nanmean(skews) if len(skews) > 0 else 0
        return mean if np.isfinite(mean) else 0
computed_features['SkewnessMean'] = SkewnessMean(X_train,y_train,cat)    

def SkewnessSTD(X, y, categorical):
        skews = computed_features["Skewnesses"]
        std = np.nanstd(skews) if len(skews) > 0 else 0
        return std if np.isfinite(std) else 0
computed_features['SkewnessSTD'] = SkewnessSTD(X_train,y_train,cat)  


def LandmarkLDA(X, y, categorical):
        import sklearn.discriminant_analysis
        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        try:
            for train, test in kf.split(X, y):
                lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()

                if len(y.shape) == 1 or y.shape[1] == 1:
                    lda.fit(X[train], y[train])
                else:
                    lda = OneVsRestClassifier(lda)
                    lda.fit(X[train], y[train])

                predictions = lda.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
            return accuracy / 10
        except scipy.linalg.LinAlgError as e:
            self.logger.warning("LDA failed: %s Returned 0 instead!" % e)
            return np.NaN
        except ValueError as e:
            self.logger.warning("LDA failed: %s Returned 0 instead!" % e)
            return np.NaN
computed_features['LandmarkLDA'] = LandmarkLDA(X_train,y_train,cat)  

# Naive Bayes
def LandmarkNaiveBayes(X, y, categorical):
        import sklearn.naive_bayes

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            nb = sklearn.naive_bayes.GaussianNB()

            if len(y.shape) == 1 or y.shape[1] == 1:
                nb.fit(X[train], y[train])
            else:
                nb = OneVsRestClassifier(nb)
                nb.fit(X[train], y[train])

            predictions = nb.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
computed_features['LandmarkNaiveBayes'] = LandmarkNaiveBayes(X_train,y_train,cat) 
   

def LandmarkDecisionTree(X, y, categorical):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)

            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])

            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
computed_features['LandmarkDecisionTree'] = LandmarkDecisionTree(X_train,y_train,cat)

"""If there is a dataset which has OneHotEncoded features it can happend that
a node learner splits at one of the attribute encodings. This should be fine
as the dataset is later on used encoded."""

# TODO: use the same tree, this has then to be computed only once and hence
#  saves a lot of time...

def LandmarkDecisionNodeLearner(X, y, categorical):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            node = sklearn.tree.DecisionTreeClassifier(
                criterion="entropy", max_depth=1, random_state=random_state,
                min_samples_split=2, min_samples_leaf=1,  max_features=None)
            if len(y.shape) == 1 or y.shape[1] == 1:
                node.fit(X[train], y[train])
            else:
                node = OneVsRestClassifier(node)
                node.fit(X[train], y[train])
            predictions = node.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
computed_features['LandmarkDecisionNodeLearner'] = LandmarkDecisionNodeLearner(X_train,y_train,cat)

def LandmarkRandomNodeLearner(X, y, categorical):
        import sklearn.tree

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)
        accuracy = 0.

        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            node = sklearn.tree.DecisionTreeClassifier(
                criterion="entropy", max_depth=1, random_state=random_state,
                min_samples_split=2, min_samples_leaf=1, max_features=1)
            node.fit(X[train], y[train])
            predictions = node.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10

computed_features['LandmarkRandomNodeLearner'] = LandmarkRandomNodeLearner(X_train,y_train,cat)


# Replace the Elite 1NN with a normal 1NN, this slightly changes the
# intuition behind this landmark, but Elite 1NN is used nowhere else...
def Landmark1NN(X, y, categorical):
        import sklearn.neighbors

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        for train, test in kf.split(X, y):
            kNN = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)
            if len(y.shape) == 1 or y.shape[1] == 1:
                kNN.fit(X[train], y[train])
            else:
                kNN = OneVsRestClassifier(kNN)
                kNN.fit(X[train], y[train])
            predictions = kNN.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10

computed_features['Landmark1NN'] = Landmark1NN(X_train,y_train,cat)

# Bardenet 2013 - Collaborative Hyperparameter Tuning
# K number of classes ("number_of_classes")
# log(d), log(number of attributes)
# log(n/d), log(number of training instances/number of attributes)
# p, how many principal components to keep in order to retain 95% of the
#     dataset variance
# skewness of a dataset projected onto one principal component...
# kurtosis of a dataset projected onto one principal component

def PCA(X, y, categorical):
        import sklearn.decomposition
        pca = sklearn.decomposition.PCA(copy=True)
        rs = np.random.RandomState(42)
        indices = np.arange(X.shape[0])
        for i in range(10):
            try:
                rs.shuffle(indices)
                pca.fit(X[indices])
                return pca
            except LinAlgError as e:
                pass
        self.logger.warning("Failed to compute a Principle Component Analysis")
        return None
computed_features['PCA'] = PCA(X_train,y_train,cat)

# Maybe define some more...

def PCAFractionOfComponentsFor95PercentVariance(X, y, categorical):
        pca_ = computed_features["PCA"]
        if pca_ is None:
            return np.NaN
        sum_ = 0.
        idx = 0
        while sum_ < 0.95 and idx < len(pca_.explained_variance_ratio_):
            sum_ += pca_.explained_variance_ratio_[idx]
            idx += 1
        return float(idx)/float(X.shape[1])
computed_features['PCAFractionOfComponentsFor95PercentVariance'] = PCAFractionOfComponentsFor95PercentVariance(X_train,y_train,cat)

# Kurtosis of first PC
def PCAKurtosisFirstPC(X, y, categorical):
        pca_ = computed_features["PCA"]
        if pca_ is None:
            return np.NaN
        components = pca_.components_
        pca_.components_ = components[:1]
        transformed = pca_.transform(X)
        pca_.components_ = components

        kurtosis = scipy.stats.kurtosis(transformed)
        return kurtosis[0]
computed_features['PCAKurtosisFirstPC'] = PCAKurtosisFirstPC(X_train,y_train,cat)

# Skewness of first PC
def PCASkewnessFirstPC(X, y, categorical):
        pca_ = computed_features["PCA"]
        if pca_ is None:
            return np.NaN
        components = pca_.components_
        pca_.components_ = components[:1]
        transformed = pca_.transform(X)
        pca_.components_ = components

        skewness = scipy.stats.skew(transformed)
        return skewness[0]
computed_features['PCASkewnessFirstPC'] = PCASkewnessFirstPC(X_train,y_train,cat)




def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None, densify_threshold=1000):
    logger = get_logger(__name__)

    """Calculate all metafeatures."""
    #helper_functions.clear()
    computed_features.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                if any(categorical):
                    ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
                    X_transformed = ohe.fit_transform(X)
                else:
                    X_transformed = X
                imputer = Imputer(strategy='mean', copy=False)
                X_transformed = imputer.fit_transform(X_transformed)
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            #is_helper_function = dependency in helper_functions
            is_computed_features = dependency in computed_features

            if is_metafeature and is_computed_features:
                raise NotImplementedError()
            elif not is_metafeature and not is_computed_features:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_computed_features and not computed_features.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = computed_features[dependency](X_, y_, categorical_)
                computed_features.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_


npy_metafeatures = set(["LandmarkLDA",
                        "LandmarkNaiveBayes",
                        "LandmarkDecisionTree",
                        "LandmarkDecisionNodeLearner",
                        "LandmarkRandomNodeLearner",
                        "LandmarkWorstNodeLearner",
                        "Landmark1NN",
                        "PCAFractionOfComponentsFor95PercentVariance",
                        "PCAKurtosisFirstPC",
                        "PCASkewnessFirstPC",
                        "Skewnesses",
                        "SkewnessMin",
                        "SkewnessMax",
                        "SkewnessMean",
                        "SkewnessSTD",
                        "Kurtosisses",
                        "KurtosisMin",
                        "KurtosisMax",
                        "KurtosisMean",
                        "KurtosisSTD"])

subsets = dict()
# All implemented metafeatures
subsets["all"] = set(computed_features.keys())

# Metafeatures used by Pfahringer et al. (2000) in the first experiment
subsets["pfahringer_2000_experiment1"] = set(["number_of_features",
                                             "number_of_numeric_features",
                                             "number_of_categorical_features",
                                             "number_of_classes",
                                             "class_probability_max",
                                             "landmark_lda",
                                             "landmark_naive_bayes",
                                             "landmark_decision_tree"])

# Metafeatures used by Pfahringer et al. (2000) in the second experiment
# worst node learner not implemented yet
"""
pfahringer_2000_experiment2 = set(["landmark_decision_node_learner",
                                   "landmark_random_node_learner",
                                   "landmark_worst_node_learner",
                                   "landmark_1NN"])
"""

# Metafeatures used by Yogatama and Mann (2014)
subsets["yogotama_2014"] = set(["log_number_of_features",
                               "log_number_of_instances",
                               "number_of_classes"])

# Metafeatures used by Bardenet et al. (2013) for the AdaBoost.MH experiment
subsets["bardenet_2013_boost"] = set(["number_of_classes",
                                     "log_number_of_features",
                                     "log_inverse_dataset_ratio",
                                     "pca_95percent"])

# Metafeatures used by Bardenet et al. (2013) for the Neural Net experiment
subsets["bardenet_2013_nn"] = set(["number_of_classes",
                                  "log_number_of_features",
                                  "log_inverse_dataset_ratio",
                                  "pca_kurtosis_first_pc",
                                  "pca_skewness_first_pc"])



In [41]:
computed_features

{'NumberOfInstances': 15266.0,
 'LogNumberOfInstances': 9.633383412358416,
 'NumberofClasses': 2.0,
 'NumberOfFeatures': 8.0,
 'LogNumberOfFeatures': 2.0794415416798357,
 'MissingValues': array([[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]),
 'NumberOfInstancesWithMissingValues': 0.0,
 'PercentageOfInstancesWithMissingValues': 0.0,
 'NumberOfFeaturesWithMissingValues': 0.0,
 'PercentageOfFeaturesWithMissingValues': 0.0,
 'NumberOfMissingValues': 0.0,
 'PercentageOfMissingValues': 0.0,
 'NumberOfNumericFeatures': 8,
 'NumberOfCategoricalFeatures': 0,
 'RatioNumericalToNominal': 0.0,
 'DatasetRatio': 0.0005240403511070353,
 'LogDatasetRatio': -7.55394187067858,
 'InverseDatasetRatio': 1908.2

In [42]:
#classification_tasks = [233, 236, 242, 244, 246, 75090, 248, 251, 75124, 253,
                        #75092, 258,
                        #260, 261, 262, 75095, 266, 75097,
                        #75099, 75159,
                        #75100, 275, 288, 75103, 75108,
                        #75109, 75110,
                        #75112, 75113, 75114, 75115, 75116, 75117, 75119, 75120,
                        #75121, 75123,
                        #252, 75125, 75126, 75127, 75129,2119,
                        #2120, 2122,
                        #75132, 75133, 75139, 75141, 75142, 75143, 75146,
                        #75148, 75150,
                        #75153, 75154, 75230, 75156, 75157, 273,
                        #75163, 75166,
                        #75169, 75171,75173, 75174, 75175, 75176,
                        #75179,
                        #75184, 75185,75192, 75193, 75195,
                        #75234, 75161,
                        #75210,
                        #75213, 75215, 75217, 75219, 75221,75223, 75134,
                        #75225, 75226,
                        #75227, 75231, 75232, 75233, 75101, 75235, 75236, 75237,
                        #75178, 75239,
75181, 75187, 75250,75243, 75182,75096]

In [133]:
#classification_tasks =[75106, 75107, 75128, 2117,2123,2350,75197, 75172, 75177,75240,75189,
#75196,75198,75168, 75201,75202, 75203, 75188, 75205, 75207,75212,
#75222,75249,75248,75244, 254,3043, 75093,5108 ,75191,75098]

In [134]:
t_id = classification_tasks

In [None]:
for i in t_id:
    computed_features1={}
    computed_features1['task_id'] = (i)
    print(i)
    X_train, y_train, X_test, y_test, cat = load_task(i)
    computed_features1['NumberOfInstances']=NumberOfInstances(X_train,y_train,cat)
    computed_features1['LogNumberOfInstances']=LogNumberOfInstances(X_train,y_train,cat)
    computed_features1['NumberofClasses']=NumberofClasses(X_train,y_train,cat)
    computed_features1['NumberOfFeatures'] = NumberOfFeatures(X_train,y_train,cat)
    computed_features1['LogNumberOfFeatures'] = LogNumberOfFeatures(X_train,y_train,cat)
    computed_features['MissingValues'] = MissingValues(X_train,y_train,cat)
    computed_features1['NumberOfInstancesWithMissingValues'] = NumberOfInstancesWithMissingValues(X_train,y_train,cat)
    computed_features1['PercentageOfInstancesWithMissingValues'] = PercentageOfInstancesWithMissingValues(X_train,y_train,cat)
    computed_features1['NumberOfFeaturesWithMissingValues'] = NumberOfFeaturesWithMissingValues(X_train,y_train,cat)
    computed_features1['PercentageOfFeaturesWithMissingValues'] = PercentageOfFeaturesWithMissingValues(X_train,y_train,cat)
    computed_features1['NumberOfMissingValues'] = NumberOfMissingValues(X_train,y_train,cat)
    computed_features1['PercentageOfMissingValues'] = PercentageOfMissingValues(X_train,y_train,cat)
    computed_features1['NumberOfNumericFeatures'] = NumberOfNumericFeatures(X_train,y_train,cat)
    computed_features1['NumberOfCategoricalFeatures'] = NumberOfCategoricalFeatures(X_train,y_train,cat)
    computed_features1['RatioNumericalToNominal'] = RatioNumericalToNominal(X_train,y_train,cat)
    computed_features1['DatasetRatio'] = DatasetRatio(X_train,y_train,cat)
    computed_features1['LogDatasetRatio'] = LogDatasetRatio(X_train,y_train,cat)
    computed_features1['InverseDatasetRatio'] = InverseDatasetRatio(X_train,y_train,cat)
    computed_features1['LogInverseDatasetRatio'] = LogInverseDatasetRatio(X_train,y_train,cat)
    computed_features['ClassOccurences'] = ClassOccurences(X_train,y_train,cat)
    computed_features1['ClassProbabilityMin'] = ClassProbabilityMin(X_train,y_train,cat)
    computed_features1['ClassProbabilityMax'] = ClassProbabilityMax(X_train,y_train,cat)
    computed_features1['ClassProbabilityMean'] = ClassProbabilityMean(X_train,y_train,cat)
    computed_features1['ClassProbabilitySTD'] = ClassProbabilitySTD(X_train,y_train,cat)
    computed_features1['Kurtosisses'] = Kurtosisses(X_train,y_train,cat)
    computed_features1['KurtosisMin'] = KurtosisMin(X_train,y_train,cat)
    computed_features1['KurtosisMax'] = KurtosisMax(X_train,y_train,cat)
    computed_features1['KurtosisMean'] = KurtosisMean(X_train,y_train,cat) 
    computed_features1['KurtosisSTD'] = KurtosisSTD(X_train,y_train,cat) 
    computed_features1['Skewnesses'] = Skewnesses(X_train,y_train,cat) 
    computed_features1['SkewnessMin'] = SkewnessMin(X_train,y_train,cat) 
    computed_features1['SkewnessMax'] = SkewnessMax(X_train,y_train,cat)    
    computed_features1['SkewnessMean'] = SkewnessMean(X_train,y_train,cat) 
    computed_features1['SkewnessSTD'] = SkewnessSTD(X_train,y_train,cat)  
    computed_features1['LandmarkLDA'] = LandmarkLDA(X_train,y_train,cat)  
    computed_features1['LandmarkNaiveBayes'] = LandmarkNaiveBayes(X_train,y_train,cat) 
    computed_features1['LandmarkDecisionTree'] = LandmarkDecisionTree(X_train,y_train,cat)
    computed_features1['LandmarkDecisionNodeLearner'] = LandmarkDecisionNodeLearner(X_train,y_train,cat)
    computed_features1['LandmarkRandomNodeLearner'] = LandmarkRandomNodeLearner(X_train,y_train,cat)
    computed_features1['Landmark1NN'] = Landmark1NN(X_train,y_train,cat)
    computed_features['PCA'] = PCA(X_train,y_train,cat)
    computed_features1['PCAFractionOfComponentsFor95PercentVariance'] = PCAFractionOfComponentsFor95PercentVariance(X_train,y_train,cat)
    computed_features1['PCAKurtosisFirstPC'] = PCAKurtosisFirstPC(X_train,y_train,cat)
    computed_features1['PCASkewnessFirstPC'] = PCASkewnessFirstPC(X_train,y_train,cat)  
    computed_features1['mutual_information'] = mutual_information(X_train,y_train,cat)
    computed_features1['ClassEntropy'] = ClassEntropy(X_train,y_train,cat)  
    computed_features1['SymbolsMin'] = SymbolsMin(X_train,y_train,cat)  
    computed_features1['SymbolsMax'] = SymbolsMax(X_train,y_train,cat) 
    computed_features1['SymbolsMean'] = SymbolsMean(X_train,y_train,cat)
    computed_features1['SymbolsSTD'] = SymbolsSTD(X_train,y_train,cat)   
    computed_features1['SymbolsSum'] = SymbolsSum(X_train,y_train,cat)  
   
    metaData=[computed_features1]
    df = pd.DataFrame(metaData)
    #print(df) 
    # count = count + 1;
    # print(count)
    # df.append(df1)
    #df.to_csv("/home/humaira/my_data/meta_data_temp6.csv")
    df.to_csv("/home/humaira/my_data/meta_data_temp6.csv",mode='a', header = False)
    

75098


