In [4]:
import numpy as np

def _entropy(dist):
    """
    Weighted entropy (in bits) of a class-distribution array.

    For a 1-d ``dist`` this is the Shannon entropy of the normalised
    values.  For a 2-d ``dist`` every column is normalised on its own and
    the per-column entropies are averaged, each weighted by the column's
    share of the grand total.

    Parameters
    ----------
    dist : array_like
        1-d or 2-d array of counts (expected to be non-negative).

    Returns
    -------
    float
        The weighted entropy; ``0.0`` when ``dist`` sums to zero.

    Notes
    -----
    Columns whose total is zero carry no weight and contribute nothing,
    instead of turning the whole result into NaN through a 0/0 division.
    Probabilities are clipped to ``[1e-15, 1]`` inside the logarithm only,
    so that ``0 * log2(0)`` evaluates to 0.  A smaller lower bound may be
    needed if probabilities below 1e-15 have to be resolved.
    """
    counts = np.asarray(dist, dtype=float)
    col_totals = np.sum(counts, axis=0)
    total = np.sum(counts)
    if total == 0:
        # An empty distribution carries no information.
        return 0.0
    # Divide empty columns by 1 instead of 0: their probabilities stay 0
    # and their weight (col_totals / total) is 0 anyway.
    safe_totals = np.where(col_totals == 0, 1, col_totals)
    p = counts / safe_totals
    q = np.clip(p, 1e-15, 1)
    col_entropy = np.sum(-p * np.log2(q), axis=0)
    return np.sum(col_entropy * col_totals / total)


    """
    Information gain ratio is the ratio between information gain and
    the entropy of the feature's value distribution. The score was
    introduced in [Quinlan1986]_ to alleviate overestimation for
    multi-valued features. See `Wikipedia entry on gain ratio
    <http://en.wikipedia.org/wiki/Information_gain_ratio>`_.

    .. [Quinlan1986] J R Quinlan: Induction of Decision Trees, Machine Learning, 1986.
    """
def GainRatio(examples, nan_adjustment):
    """
    Return the information gain ratio: information gain / intrinsic value.

    Parameters
    ----------
    examples : 2-d numpy array
        All training examples as a table of counts.  Row totals are used
        as the class distribution and column totals as the distribution
        of the attribute's values.
    nan_adjustment : float
        Factor the score is multiplied with; it shall take care of all
        NaN values (presumably the share of examples whose attribute
        value is known -- confirm with the caller).

    Returns
    -------
    float
        ``nan_adjustment * (h_class - h_residual) / h_attribute`` where

        * ``h_class`` is the entropy H(EX) of all examples,
        * ``h_residual`` is the entropy that remains after splitting on
          the attribute, so ``h_class - h_residual`` is the information
          gain,
        * ``h_attribute`` is the intrinsic value, i.e. the entropy of the
          attribute's value distribution.

        If ``h_attribute`` is 0, the information gain ratio is the same
        as the (adjusted) information gain.
    """
    value_totals = np.sum(examples, axis=0)
    h_class = _entropy(np.sum(examples, axis=1))
    # Columns without any example are dropped first; they would otherwise
    # be normalised by a zero total inside _entropy.
    h_residual = _entropy(np.compress(value_totals, examples, axis=1))
    h_attribute = _entropy(value_totals)
    if h_attribute == 0:
        # Avoid dividing by zero: fall back to the plain information gain.
        h_attribute = 1
    return nan_adjustment * (h_class - h_residual) / h_attribute

In [5]:
def _gini(dist):
    """
    Weighted Gini index of a class-distribution array.

    For a 1-d ``dist`` this is ``1 - sum(p ** 2)`` of the normalised
    values.  For a 2-d ``dist`` every column is normalised on its own and
    the per-column indices are averaged, each weighted by the column's
    share of the grand total.

    Parameters
    ----------
    dist : array_like
        1-d or 2-d array of counts (expected to be non-negative).

    Returns
    -------
    float
        The weighted Gini index; ``0.0`` when ``dist`` sums to zero.

    Notes
    -----
    Columns whose total is zero carry no weight and contribute nothing,
    instead of turning the whole result into NaN through a 0/0 division.
    """
    counts = np.asarray(dist, dtype=float)
    col_totals = np.sum(counts, axis=0)
    total = np.sum(counts)
    if total == 0:
        # An empty distribution is treated as perfectly pure.
        return 0.0
    # Divide empty columns by 1 instead of 0; their weight is 0 anyway.
    safe_totals = np.where(col_totals == 0, 1, col_totals)
    p = counts / safe_totals
    col_impurity = 1 - np.sum(p ** 2, axis=0)
    return np.sum(col_impurity * col_totals / total)
"""
Gini impurity is the probability that two randomly chosen instances will have different
classes. See `Wikipedia entry on Gini impurity
<https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity>`_.
"""
def Gini(examples, nan_adjustment):
    """
    Return the decrease in Gini impurity achieved by the attribute.

    The score is the Gini index of the overall class distribution minus
    the weighted Gini index that remains after splitting the examples by
    attribute value, multiplied with ``nan_adjustment``.

    Parameters
    ----------
    examples : 2-d numpy array
        All training examples as a table of counts.  The orientation is
        assumed to be the same as for GainRatio.
    nan_adjustment : float
        Factor the score is multiplied with; it shall take care of all
        NaN values.

    Returns
    -------
    float
        ``(gini(class totals) - gini(examples)) * nan_adjustment``.
    """
    # Columns without any example would be normalised by a zero total and
    # make the whole score NaN, so drop them first (GainRatio does the
    # same for its residual entropy).
    populated = np.compress(np.sum(examples, axis=0), examples, axis=1)
    return (_gini(np.sum(examples, axis=1)) - _gini(populated)) * nan_adjustment