In [4]:
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from __future__ import division

### Decision Tree functions

In [31]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the points a majority-vote prediction would misclassify in a node.

    Returns 0 for an empty node; otherwise the number of labels that differ
    from the most frequent label.
    """
    n_labels = len(labels_in_node)
    if n_labels == 0:
        return 0

    # Only the per-class counts matter here; the class ids are discarded.
    _, class_counts = np.unique(labels_in_node, return_counts=True)
    return n_labels - class_counts.max()


def reached_minimum_node_size(y, min_node_size):
    """Tell whether a node is too small to be split any further.

    Parameters
    ----------
    y : array whose first dimension is the number of data points in the node.
    min_node_size : threshold on the number of data points.

    Returns
    -------
    bool
        True when the node holds `min_node_size` points or fewer, False
        otherwise (the previous version fell off the end and returned None
        in the latter case).
    """
    return y.shape[0] <= min_node_size

    
# X matrix of features (p datapoints x N features)
# y vector of labels (p x 1)

def best_splitting_feature(X, y, Nbins):
    """Search every column of X for the threshold split with the highest information gain.

    Parameters
    ----------
    X : 2-D array, one row per data point and one column per candidate feature.
    y : 1-D array of labels, one per row of X.
    Nbins : stride used to thin out the candidate thresholds. When the node
        holds more than `Nbins` points, only every `Nbins`-th sorted feature
        value is tried.
        NOTE(review): despite the name this is a step, not a number of bins,
        so the number of candidates grows with the node size -- confirm that
        this is intended.

    Returns
    -------
    (best_feature, best_threshold)
        Column index into X and the threshold value of the best split found.
        Points with value < threshold go left, the others go right.
        Both are None when X has no columns or no rows.
    """
    best_feature = None
    best_threshold = None
    best_I = -1  # below any real gain, so the first candidate is always accepted

    # number of data points in the parent node
    num_data_points = y.shape[0]

    for feature in range(X.shape[1]):
        column = X[:, feature]

        # Candidate thresholds: the sorted feature values, sub-sampled for large nodes.
        candidates = np.sort(column)
        if num_data_points > Nbins:
            candidates = candidates[0:num_data_points:Nbins]

        for threshold in candidates:
            # Left: strictly smaller than the threshold. Right: larger or equal.
            ind_left = column < threshold
            ind_right = column >= threshold

            # Only the labels on each side are needed to score the split.
            I = infogain(y, y[ind_left], y[ind_right])

            if I > best_I:
                best_feature = feature
                best_threshold = threshold
                best_I = I

    return best_feature, best_threshold


def infogain(yparent,yleft,yright):
    """Information gain of splitting `yparent` into `yleft` and `yright`.

    The gain is the parent entropy minus the size-weighted entropies of the
    two children. A split that leaves either side empty scores 0.
    """
    n_parent, n_left, n_right = len(yparent), len(yleft), len(yright)

    # Degenerate split: nothing was separated, so nothing was gained.
    if n_left == 0 or n_right == 0:
        return 0

    weight_left = n_left/n_parent
    weight_right = n_right/n_parent
    children_entropy = weight_left*entropy(yleft) + weight_right*entropy(yright)
    return entropy(yparent) - children_entropy


#entropy for multiple classes
def entropy(y):
    """Shannon entropy (natural log) of the empirical class distribution of `y`."""
    # np.unique only reports classes that occur, so every probability is > 0
    # and the logarithm is always finite.
    _, class_counts = np.unique(y, return_counts=True)
    probabilities = class_counts/len(y)
    return -np.sum(probabilities*np.log(probabilities))


def create_leaf(target_values,C):
    """Build a leaf node from the labels that ended up in it.

    The leaf stores the majority label under 'prediction' and, under
    'labels_distribution', a length-C vector with the fraction of points per
    class. Labels are used directly as indices into that vector.
    """
    # Classes present in the node and how many points each one has.
    present_classes, class_counts = np.unique(target_values, return_counts=True)
    majority_label = present_classes[np.argmax(class_counts)]

    # Classes absent from the node keep probability 0.
    distribution = np.zeros(C)
    distribution[present_classes] = class_counts/len(target_values)

    return {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True,
            'prediction': majority_label,
            'labels_distribution': distribution}


def decision_tree_create(X, y, N_features_to_sample, C, min_node_size, Nbins, Verbose, current_depth = 0, max_depth = 10):
    """Recursively grow a decision tree, drawing a fresh random feature subset at every node.

    Parameters
    ----------
    X : 2-D array, one row per data point and one column per feature.
    y : 1-D array of labels, one per row of X.
    N_features_to_sample : number of columns drawn (without replacement) as
        split candidates at each node.
    C : length of the class-distribution vector stored in every leaf
        (forwarded to create_leaf).
    min_node_size : a node with this many points or fewer becomes a leaf.
    Nbins : forwarded to best_splitting_feature to thin out candidate thresholds.
    Verbose : when truthy, print a trace of the construction.
    current_depth : depth of the node being built (0 for the root).
    max_depth : nodes at this depth are not split any further.

    Returns
    -------
    dict
        A leaf as built by create_leaf, or an internal node with the keys
        'is_leaf' (False), 'prediction' (None), 'splitting_feature' (column
        index into X), 'threshold', 'left', 'right' and 'labels_distribution'
        (None). Points with feature value < threshold belong to 'left'.
    """
    # The random subset is drawn before the stopping checks, so every visited
    # node (leaf or not) consumes one draw from NumPy's global RNG.
    Nfeatures = X.shape[1]
    features = np.random.choice(Nfeatures, N_features_to_sample, replace=False)

    # Restrict the split search to the sampled columns only.
    Xcurrent = X[:,features]

    if Verbose:
        print("--------------------------------------------------------------------")
        print("Subtree, depth = %s (%s data points)." % (current_depth, len(y)))
        print("Features selected = %s" % features)

    # Stopping condition 1: the node is pure (a majority vote makes no mistakes).
    if intermediate_node_num_mistakes(y) == 0:
        if Verbose:
            print("No Mistakes at current node - Stopping.")
        return create_leaf(y,C)

    # Stopping condition 2: minimum node size reached.
    if reached_minimum_node_size(y, min_node_size):
        if Verbose:
            print("Minimum node size reached - Stopping")
        return create_leaf(y,C)

    # Stopping condition 3: maximum depth reached.
    if current_depth >= max_depth:
        if Verbose:
            print("Reached maximum depth. Stopping.")
        return create_leaf(y,C)

    # Best split among the sampled columns; the returned index refers to
    # Xcurrent and is mapped back to a column of the full X.
    splitting_feature,splitting_thres = best_splitting_feature(Xcurrent,y,Nbins)
    splitting_feature = features[splitting_feature]

    # Partition the data on the chosen feature / threshold.
    ind_left = X[:,splitting_feature] < splitting_thres
    left_split = X[ind_left,:]
    y_left = y[ind_left]

    ind_right = X[:,splitting_feature] >= splitting_thres
    right_split = X[ind_right,:]
    y_right = y[ind_right]

    if Verbose:
        print("Split on feature %s. (%s, %s), Threshold = %s" % (
            splitting_feature, y_left.shape, y_right.shape, splitting_thres))

    # Degenerate split: every point fell on one side, so recursing would not
    # make progress. Turn the node into a leaf instead.
    if len(y_left) == len(y) or len(y_right) == len(y):
        if Verbose:
            print('One split empty: Creating Leaf')
        return create_leaf(y,C)

    # Recurse on both children, one level deeper.
    left_tree = decision_tree_create(left_split, y_left, N_features_to_sample, C, min_node_size, Nbins, Verbose, current_depth + 1, max_depth)
    right_tree = decision_tree_create(right_split, y_right, N_features_to_sample, C, min_node_size, Nbins, Verbose, current_depth + 1, max_depth)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'threshold'        : splitting_thres,
            'left'             : left_tree,
            'right'            : right_tree,
            'labels_distribution': None
            }

def count_nodes(tree):
    """Return the total number of nodes (internal nodes plus leaves) in `tree`."""
    total = 0
    pending = [tree]  # explicit stack instead of recursion
    while pending:
        node = pending.pop()
        total += 1
        if not node['is_leaf']:
            pending.append(node['left'])
            pending.append(node['right'])
    return total

def count_leaves(tree):
    """Return the number of leaf nodes in `tree`."""
    leaves = 0
    pending = [tree]  # explicit stack instead of recursion
    while pending:
        node = pending.pop()
        if node['is_leaf']:
            leaves += 1
        else:
            pending.extend((node['left'], node['right']))
    return leaves

def classify(tree, x):
    """Route the sample `x` down `tree` and return the class distribution of the leaf it reaches."""
    node = tree
    while not node['is_leaf']:
        # Strictly smaller values go left, everything else goes right.
        if x[node['splitting_feature']] < node['threshold']:
            node = node['left']
        else:
            node = node['right']
    return node['labels_distribution']
        
def evaluate_classification_error_tree(tree, X, y):
    """Fraction of the rows of X that a single tree misclassifies.

    Parameters
    ----------
    tree : tree dict accepted by classify().
    X : iterable of samples (e.g. the rows of a 2-D array).
    y : array of true labels, one per sample.

    Returns
    -------
    float
        Number of wrong predictions divided by len(y).
    """
    # A list (not map()) so that np.argmax receives a real sequence on
    # Python 3 as well, where map() yields a lazy iterator.
    P = [classify(tree, x) for x in X]

    # The predicted label is the most probable class of the reached leaf.
    prediction = np.argmax(P,axis=1)

    mistakes = np.sum(prediction != y)
    # Explicit float division: does not depend on `from __future__ import division`.
    return mistakes/float(len(y))


### Random Forest

In [96]:
def forest_create(X,y,ntrees,nvarsample=None, min_node_size = 5, Nbins = 10,max_depth=20):
    """Train a random forest: `ntrees` decision trees, each on a bootstrap resample of (X, y).

    Parameters
    ----------
    X : 2-D array, one row per data point and one column per feature.
    y : 1-D array of integer labels; labels are used as indices into the
        per-leaf class distribution.
    ntrees : number of trees to train.
    nvarsample : number of features sampled at each node; defaults to the
        rounded square root of the number of features.
    min_node_size, Nbins, max_depth : forwarded to decision_tree_create.

    Returns
    -------
    list
        The trained trees, in training order.
    """
    if nvarsample is None:
        nvarsample = int(np.round(np.sqrt(X.shape[1])))
        print('Nfeatures = %s' % nvarsample)

    # The number of classes is inferred from the data. Leaves index their
    # distribution vector by label, so it must be long enough for the largest
    # label even when some smaller label ids never occur.
    labels = np.unique(y)  # sorted, so the last entry is the largest label
    C = max(len(labels), int(labels[-1]) + 1)

    nptrain = X.shape[0]  # each tree is trained on as many points as X has
    RF = []
    for t in range(ntrees):
        print('current trained tree = %s' % t)
        # Bootstrap: sample row indices with replacement.
        indbootstrap = np.random.choice(X.shape[0],nptrain)
        Xtree = X[indbootstrap,:]
        ytree = y[indbootstrap]

        tree1 = decision_tree_create(Xtree,ytree,nvarsample,C,min_node_size,Nbins,Verbose=False,max_depth = max_depth)
        RF.append(tree1)

    print('Forest Trained!')
    return RF
    

#outputs the posterior prob of each tree and the corresponding class
def forest_posterior(RF,x):
    """Collect the class distribution that every tree of the forest assigns to `x`.

    Returns a (number of trees x number of classes) array with one row per
    tree, in the order of `RF`.
    """
    # The first tree is classified up front: its output fixes the number of classes.
    first_posterior = classify(RF[0],x)
    n_trees = len(RF)
    n_classes = len(first_posterior)

    posteriors = np.zeros((n_trees,n_classes))
    posteriors[0,:] = first_posterior
    for row in range(1, n_trees):
        posteriors[row,:] = classify(RF[row],x)
    return posteriors
 
    
#classify input based on majority voting of each tree prediction
def forest_classify_majority(RF,x):
    """Classify `x` by majority vote: every tree votes for its most probable class.

    Ties are resolved in favour of the smallest class id.
    """
    tree_votes = np.argmax(forest_posterior(RF,x),axis=1)
    # bincount tallies the votes per class id; argmax returns the first
    # (i.e. smallest) class that reaches the highest tally.
    return np.bincount(tree_votes).argmax()
    
#classify input by averaging posteriors 
def forest_classify_ensemble(RF,x):
    """Classify `x` with the class that maximises the posterior averaged over all trees."""
    # Arithmetic mean over the tree axis gives one averaged probability per class.
    forest_average = forest_posterior(RF,x).mean(axis=0)
    return np.argmax(forest_average)

def forest_classify_geo_mean(RF,x):
    """Classify `x` with the class that maximises the geometric mean of the tree posteriors.

    The geometric mean is evaluated in log space: the winning class has the
    largest mean log-probability over the trees. The previous version computed
    the mean negative log-probability but then returned the arithmetic-mean
    argmax, which made it identical to forest_classify_ensemble.
    """
    Pt = forest_posterior(RF,x)

    # Leaves assign probability 0 to classes they never saw. Clamp to the
    # smallest positive float so log() stays finite: such a class is heavily
    # penalised, but classes can still be ranked when every class has a zero
    # somewhere.
    floor = np.finfo(float).tiny
    mean_log_posterior = np.log(np.maximum(Pt, floor)).mean(axis=0)

    return np.argmax(mean_log_posterior)
    
def evaluate_classification_error(RF, X, y, method = None):
    """Fraction of the rows of X that the forest misclassifies.

    Parameters
    ----------
    RF : forest (list of trees) handed to the per-sample classifier.
    X : iterable of samples (e.g. the rows of a 2-D array).
    y : array of true labels, one per sample.
    method : how the trees are combined:
        None or 'ensemble' -- forest_classify_ensemble (average posterior),
        'majority'         -- forest_classify_majority,
        'geo_mean'         -- forest_classify_geo_mean,
        or any callable f(RF, x) returning the predicted label.

    Returns
    -------
    float
        Number of wrong predictions divided by len(y).

    Raises
    ------
    ValueError
        If `method` is none of the above (the previous version failed with an
        UnboundLocalError for every value other than None).
    """
    if callable(method):
        classify_one = method
    elif method is None or method == 'ensemble':
        classify_one = forest_classify_ensemble
    elif method == 'majority':
        classify_one = forest_classify_majority
    elif method == 'geo_mean':
        classify_one = forest_classify_geo_mean
    else:
        raise ValueError("Unknown method %r: expected None, 'ensemble', 'majority', "
                         "'geo_mean' or a callable" % (method,))

    # Materialise the predictions as an array: on Python 3 a bare map() object
    # compared against `y` would flag every sample as a mistake.
    ypred = np.array([classify_one(RF, x) for x in X])

    mistakes = np.sum(ypred != y)
    # Explicit float division: does not depend on `from __future__ import division`.
    return mistakes/float(len(y))

### Generate a fake dataset 

In [6]:
# 1000 uniform random points with 10 features; only features 0-2 determine the label.
Xtrain = np.random.rand(1000, 10)

# Every point starts as class 1. The rules below are applied in order, so a
# later rule overrides an earlier one wherever both hold.
y = np.ones(1000, dtype=int)
y = np.where(Xtrain[:,0] + Xtrain[:,1] + Xtrain[:,2] < 1, 0, y)
y = np.where(Xtrain[:,0] + 3*Xtrain[:,2] >= 1.5, 2, y)
y = np.where(1.5*Xtrain[:,0] - 2*Xtrain[:,1] + Xtrain[:,2] < 1, 3, y)

### Another fake dataset (2 features, 3 classes)

In [9]:
# 1000 uniform random points in the unit square, labelled by two linear rules.
Xtrain = np.random.rand(1000, 2)

# Every point starts as class 1; the second rule overrides the first wherever both hold.
y = np.ones(1000, dtype=int)
y = np.where(Xtrain[:,0] + Xtrain[:,1] < 1, 0, y)
y = np.where(Xtrain[:,0] - 1*Xtrain[:,1] >= 0.5, 2, y)

In [29]:
# Fit one shallow tree on the current (Xtrain, y) and report its error on the
# same data it was trained on.
tree1 = decision_tree_create(
    Xtrain, y,
    N_features_to_sample = 2,
    C = 4,
    min_node_size = 5,
    Nbins = 10,
    Verbose = False,
    current_depth = 0,
    max_depth = 2)
evaluate_classification_error_tree(tree1, Xtrain, y)

0.17299999999999999

In [35]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("../MNIST_data/", one_hot=False)

# NOTE(review): the names are crossed relative to the MNIST splits -- the
# `train` split is stored as Xtest/ytest and the `test` split as Xtrain/ytrain.
# Presumably this keeps the training set small; confirm it is intentional.
Xtest, ytest = mnist.train.images, mnist.train.labels
Xtrain, ytrain = mnist.test.images, mnist.test.labels

# Hold out 30% of (Xtrain, ytrain) as the evaluation ("target") set.
Xsource, Xtarget, ysource, ytarget = train_test_split(Xtrain, ytrain, test_size=0.3)

Extracting ../MNIST_data/train-images-idx3-ubyte.gz
Extracting ../MNIST_data/train-labels-idx1-ubyte.gz
Extracting ../MNIST_data/t10k-images-idx3-ubyte.gz
Extracting ../MNIST_data/t10k-labels-idx1-ubyte.gz


In [61]:
# Fit a single tree on the source split (28 features sampled per node) and
# report its error on the held-out target split.
tree1 = decision_tree_create(
    Xsource, ysource,
    N_features_to_sample = 28,
    C = 10,
    min_node_size = 10,
    Nbins = 10,
    Verbose = False,
    current_depth = 0,
    max_depth = 9)
evaluate_classification_error_tree(tree1, Xtarget, ytarget)

0.22700000000000001

In [62]:
# Number of leaf nodes in `tree1`.
count_leaves(tree1)

283

In [63]:
# Total number of nodes (internal nodes plus leaves) in `tree1`.
count_nodes(tree1)

565

In [88]:
# Train an 8-tree forest on the source split, sampling 100 features per node.
RF = forest_create(
    Xsource, ysource,
    ntrees = 8,
    nvarsample = 100,
    min_node_size = 5,
    Nbins = 10,
    max_depth = 12)

current trained tree = 0
current trained tree = 1
current trained tree = 2
current trained tree = 3
current trained tree = 4
current trained tree = 5
current trained tree = 6
current trained tree = 7
Forest Trained!


In [104]:
# Held-out error of the sub-forest made of trees 3..7 (5 of the 8 trained
# trees), using the default way of combining the trees.
evaluate_classification_error(
    RF[3:8], Xtarget, ytarget,
    method = None)

0.13633333333333333

In [76]:
# Held-out error of one individual tree of the forest (RF[1]) on the target split.
evaluate_classification_error_tree(RF[1], Xtarget, ytarget)

0.26000000000000001