from sklearn.tree import DecisionTreeClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

from sklearn.tree import DecisionTreeRegressor

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor

from sklearn.ensemble import BaggingClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

from sklearn.ensemble import RandomForestClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

## Definition
A decision tree is a classifier structured as a DAG (directed acyclic graph) in which each branch node represents a choice between a number of alternatives and each leaf node represents a classification. An unknown (or test) instance is routed down the tree according to the values of its attributes at the successive nodes. When the instance reaches a leaf, it is classified according to the label assigned to that leaf. Feature importance matters because selecting the right feature for each split determines the complexity and effectiveness of the classification process.

## Entropy and Information Gain raw code

    from math import log
#### Entropy
    def entropy(pi):
        '''
        Return the Shannon entropy, in bits, of a class distribution:
        entropy(p) = - SUM( Pi * log2(Pi) )

        pi is a sequence of non-negative class counts (or weights). It is
        normalised by its own sum, so it does not have to add up to 1.
        Returns 0.0 for an empty or all-zero input.
        '''

        # Normalise once, outside the loop, instead of re-summing per element.
        norm = sum(pi)
        if norm == 0:
            # No observations: nothing is uncertain (and avoids dividing by zero).
            return 0.0

        total = 0.0
        for count in pi:
            p = count / norm
            if p != 0:
                # 0 * log(0) is taken as 0, so empty classes are skipped.
                # Subtracting each term (rather than negating the sum at the
                # end) keeps a pure distribution at 0.0 instead of -0.0.
                total -= p * log(p, 2)
        return total

    print(entropy([1, 1]))   # Maximum entropy, e.g. a fair coin toss -> 1.0
    print(entropy([0, 6]))   # Pure distribution, no entropy -> 0.0
    print(entropy([2, 10]))  # An uneven mix of classes -> ~0.65

    # Information Gain
    def IG(D, a):
        '''
        Return the information gain of splitting D on an attribute:
        gain(D, A) = entropy(D) − SUM( |Di| / |D| * entropy(Di) )

        D is the list of class counts before the split. a is a list of
        partitions, one per attribute value, each holding the class counts
        that fall into that partition. |Di| and |D| denote the number of
        samples, i.e. the sum of the counts (not an absolute value).
        '''

        n_total = sum(D)  # |D| is loop-invariant, so compute it once

        remainder = 0
        for Di in a:
            n_part = sum(Di)  # |Di|
            if n_part == 0:
                # An empty partition has zero weight and contributes nothing.
                continue
            remainder += n_part / n_total * entropy(Di)

        return entropy(D) - remainder

    test_dist = [6, 6]  # Yes, No
    # class1, class2, class3 of attr1, each given as [YES, NO] counts matching test_dist
    test_attr = [[4, 0], [2, 4], [0, 2]]
    print(IG(test_dist, test_attr))  # ~0.541
    # The attribute with the highest gain is chosen for the split, and the process is
    # repeated on each resulting partition until no more splits can be made, i.e. every
    # leaf is 'pure' (contains a single class).

## Decision Tree Classifier
#### Import libraries
#### Obtain Data
#### Scrub Data
#### Explore Data (normalize and scale)
#### Create target and features
    from sklearn.model_selection import train_test_split

    # separate the predictor columns from the column to be predicted
    features = dataset.drop('target', axis=1)
    target = dataset['target']
    # create train/test split (30% held out, fixed seed so the split is repeatable)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.30, random_state=0)
    # instantiate classifier
    # NOTE: min_impurity_split and presort are deliberately not passed. Both were
    # removed from scikit-learn and now raise TypeError; leaving them out keeps
    # the old default behaviour on every version.
    classifier = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=0,
            splitter='best')
    # fit classifier
    classifier.fit(X_train, y_train)
    # predict data (train predictions are kept to compare against test and spot overfitting)
    y_train_pred = classifier.predict(X_train)
    y_pred = classifier.predict(X_test)
    # run classification report, confusion matrix and visualize auc


## Decision Tree Regressor
#### Import libraries
#### Obtain Data
#### Scrub Data
#### Explore Data (normalize and scale)
#### Create target and features
    from sklearn.model_selection import train_test_split

    # separate the predictor columns from the column to be predicted
    features = dataset.drop('target', axis=1)
    target = dataset['target']
    # create train/test split (30% held out, fixed seed so the split is repeatable)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.30, random_state=0)
    # instantiate regressor
    from sklearn.tree import DecisionTreeRegressor
    # NOTE: criterion is left at its default (mean squared error). The old spelling
    # 'mse' was renamed to 'squared_error' and later removed, so omitting it is the
    # only form that works on every version. min_impurity_split and presort were
    # removed from scikit-learn as well and are not passed.
    regressor = DecisionTreeRegressor(max_depth=None, max_features=None,
               max_leaf_nodes=None, min_impurity_decrease=0.0,
               min_samples_leaf=1,
               min_samples_split=2, min_weight_fraction_leaf=0.0,
               random_state=None, splitter='best')
    # fit regressor
    regressor.fit(X_train, y_train)
    # predict data (train predictions are kept to compare against test and spot overfitting)
    y_train_pred = regressor.predict(X_train)
    y_pred = regressor.predict(X_test)
    # run regressor evaluations


## Bagged Tree
#### Bootstrap resampling and aggregation. Bootstrapping refers to creating subsets of your dataset by sampling with replacement. Aggregation refers to the practice of combining all the different estimates to arrive at a single estimate. Use it with classifiers that have no built-in bootstrap option.
    # The first positional argument is the estimator to bag. A decision tree is
    # used here; swap in any other classifier instance as needed.
    bagged_tree = BaggingClassifier(
        DecisionTreeClassifier(),
        bootstrap=True, bootstrap_features=False,
        max_features=1.0, max_samples=1.0,
        n_estimators=20, n_jobs=None, oob_score=False,
        random_state=None, verbose=0, warm_start=False)
    # fit data (X_train / y_train come from the train/test split created earlier)
    bagged_tree.fit(X_train, y_train)
    # run evaluation metrics (accuracy, f1, recall, confusion matrix, feature importance)

## Random Forests
    # NOTE: max_features='sqrt' is what 'auto' meant for classifiers; 'auto' has been
    # removed from scikit-learn. min_impurity_split was removed as well and is not passed.
    forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
    # fit data (X_train / y_train come from the train/test split created earlier)
    forest.fit(X_train, y_train)
    # run evaluation metrics (accuracy, f1, recall, confusion matrix, feature importance)