# Outline
- Implement Random Forest using sklearn.tree.DecisionTreeClassifier
- Out-of-Bag Error
- Learn bagging and boosting in sklearn
- Visualize bias and variance for bagging and boosting
- Stacking
- Play with the California Housing dataset

In [None]:
import numpy as np
from matplotlib import pyplot as plt

## Generate Toy Dataset

In [None]:
from sklearn.datasets import make_classification

# Toy classification problem used by the bagging, boosting and stacking demos:
# 2000 samples described by 16 features, 6 of which are informative.
X, y = make_classification(
    n_samples=2000,
    n_features=16,
    n_informative=6,
    n_clusters_per_class=2,
    flip_y=0.01,
)

## Random Forest
Build a random forest on top of `sklearn.tree.DecisionTreeClassifier`.

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC

class RandomForest():
    """Random forest classifier assembled from sklearn decision trees.

    Every tree is trained on a bootstrap sample (drawn with replacement, same
    size as the training set) and the forest averages the per-tree class
    probabilities.

    n_trees: number of decision trees in the forest
    max_features: forwarded to each DecisionTreeClassifier
    oob_score: if True, ``fit`` stores the out-of-bag accuracy in ``oob_score_``
    random_state: optional seed for the bootstrap sampling only (the trees
        keep their own default randomness)
    """

    def __init__(self, n_trees=10, max_features='sqrt', oob_score=False,
                 random_state=None):
        self.n_trees = n_trees
        self.oob_score = oob_score
        self.random_state = random_state
        self.trees = [DTC(max_features=max_features) for _ in range(n_trees)]

    def _tree_proba(self, tree, X):
        """Return ``tree.predict_proba(X)`` laid out over the forest's classes.

        A bootstrap sample can miss a class entirely, in which case the tree
        reports fewer columns; the missing classes get probability zero.
        """
        proba = np.zeros((X.shape[0], len(self.classes_)))
        columns = np.searchsorted(self.classes_, tree.classes_)
        proba[:, columns] = tree.predict_proba(X)
        return proba

    # compatible with Sklearn fit api, train model with X and y
    # X: training data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: self
    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit RandomForest on an empty dataset")

        self.classes_ = np.unique(y)
        rng = np.random.RandomState(self.random_state)
        # Summed class probabilities from the trees that did NOT see a sample.
        oob_proba = np.zeros((n_samples, len(self.classes_)))

        for tree in self.trees:
            sample_idx = rng.randint(0, n_samples, size=n_samples)
            tree.fit(X[sample_idx], y[sample_idx])
            if self.oob_score:
                left_out = np.ones(n_samples, dtype=bool)
                left_out[sample_idx] = False
                if left_out.any():
                    oob_proba[left_out] += self._tree_proba(tree, X[left_out])

        if self.oob_score:
            # Only samples left out by at least one tree can be scored.
            scored = oob_proba.sum(axis=1) > 0
            if scored.any():
                oob_pred = self.classes_[np.argmax(oob_proba[scored], axis=1)]
                self.oob_score_ = float(np.mean(oob_pred == y[scored]))
            else:
                self.oob_score_ = float('nan')
        return self

    # compatible with Sklearn predict api, get class prediction with X
    # X: data of shape [n_sample, d_feature]
    # return: prediction of shape [n_sample]
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    # compatible with Sklearn predict_proba api, get class predict probability with X
    # X: data of shape [n_sample, d_feature]
    # return: predict probability of shape [n_sample, n_class]
    def predict_proba(self, X):
        X = np.asarray(X)
        total = np.zeros((X.shape[0], len(self.classes_)))
        for tree in self.trees:
            total += self._tree_proba(tree, X)
        return total / len(self.trees)

    # compatible with Sklearn score api, compute classification accuracy using trained model
    # X: data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: accuracy
    def score(self, X, y):
        return float(np.mean(self.predict(X) == np.asarray(y)))

In [None]:
from sklearn.ensemble import BaggingClassifier

# sklearn reference: bag ten decision trees (sqrt feature subsampling) and
# report the out-of-bag accuracy.
bc = BaggingClassifier(
    DTC(max_features='sqrt'),
    n_estimators=10,
    oob_score=True,
).fit(X, y)
print(bc.oob_score_)

In [None]:
# Visualization for Bagging (reduce variance):
# track out-of-bag and training accuracy as the forest grows from 1 to 49 trees.
n_trees = list(range(1, 50))
oob_score = []
train_score = []
for n_tree in n_trees:
    rf = RandomForest(n_trees=n_tree, oob_score=True)
    rf.fit(X, y)
    oob_score.append(rf.oob_score_)
    train_score.append(rf.score(X, y))

# Draw both accuracy curves against the ensemble size.
plt.plot(n_trees, oob_score, label='oob_score')
plt.plot(n_trees, train_score, label='train_score')
plt.xlabel('n_trees')
plt.ylabel('score')
plt.legend(loc="upper right")
plt.show()

## Sklearn GradientBoostingClassifier

In [None]:
# Visualization for Boosting (reduce bias)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier as GBC

# Hold out 20% of the samples to measure test accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Boosting adds trees one after another, so a single model with the largest
# ensemble size already contains every smaller ensemble as a prefix. Fit once
# and read the prediction after each stage via staged_predict instead of
# refitting a fresh model for every size.
max_estimators = 99
gbc = GBC(n_estimators=max_estimators, max_depth=3)
gbc.fit(X_train, y_train)

n_trees = list(range(1, max_estimators + 1))
train_score = [np.mean(stage_pred == y_train)
               for stage_pred in gbc.staged_predict(X_train)]
test_score = [np.mean(stage_pred == y_test)
              for stage_pred in gbc.staged_predict(X_test)]

plt.plot(n_trees, train_score, label='train_score')
plt.plot(n_trees, test_score, label='test_score')
plt.ylabel('score')
plt.xlabel('n_trees')
plt.legend(loc="upper right")
plt.show()

## Build Ensemble Classifier
- **AverageClassifier**: average the predicted probabilities of the base classifiers
- **StackingClassifier**: use the predicted probabilities of the base classifiers (optionally combined with the original features) as new features, then train a meta classifier on them

In [None]:
from sklearn.model_selection import KFold

class EnsembleClassifier():
    """Common base for ensembles built from a list of base classifiers.

    Subclasses provide ``fit``, ``predict`` and ``score``; this base class only
    stores the classifiers and reports their individual scores.
    """

    # classifiers: list of base classifiers (with method fit, score, predict_proba)
    def __init__(self, classifiers):
        self.classifiers = classifiers

    def fit(self, X, y):
        raise NotImplementedError

    def predict(self, X):
        raise NotImplementedError

    def score(self, X, y):
        raise NotImplementedError

    # get score of each base classifier
    # X: data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: list of scores, in the same order as self.classifiers
    def score_classifiers(self, X, y):
        return [clf.score(X, y) for clf in self.classifiers]
        
class AverageClassifier(EnsembleClassifier):
    """Ensemble that averages the predicted probabilities of its base classifiers."""

    # train every base classifier on the same data
    # X: training data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: self
    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        # Assumes each base classifier orders its predict_proba columns by
        # sorted label (the sklearn convention), which matches np.unique.
        self.classes_ = np.unique(y)
        return self

    # X: data of shape [n_sample, d_feature]
    # return: mean predict probability of shape [n_sample, n_class]
    def predict_proba(self, X):
        return np.mean([clf.predict_proba(X) for clf in self.classifiers], axis=0)

    # X: data of shape [n_sample, d_feature]
    # return: prediction of shape [n_sample]
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    # X: data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: accuracy
    def score(self, X, y):
        return float(np.mean(self.predict(X) == np.asarray(y)))
    
class StackingClassifier(EnsembleClassifier):
    """Two-level ensemble: a meta classifier trained on base-classifier probabilities."""

    # meta_classifier: classifier for second-level prediction
    # concat_feature: whether to use original feature
    # kfold: split training data into kfold, train on k-1 folds, get prediction on the rest one
    def __init__(self, classifiers, meta_classifier, concat_feature=False, kfold=5):
        super().__init__(classifiers)
        self.meta_classifier = meta_classifier
        self.concat_feature = concat_feature
        self.kfold = kfold

    def _aligned_proba(self, clf, X):
        """Return ``clf.predict_proba(X)`` laid out over all training classes.

        A training fold can miss a class, in which case the classifier reports
        fewer columns; the missing classes get probability zero.
        """
        proba = np.zeros((X.shape[0], len(self.classes_)))
        # Falls back to the full class list for classifiers without classes_.
        seen = getattr(clf, "classes_", self.classes_)
        proba[:, np.searchsorted(self.classes_, seen)] = clf.predict_proba(X)
        return proba

    def _meta_features(self, X, base_proba):
        """Build the meta classifier's input, optionally appending the raw features."""
        if self.concat_feature:
            return np.hstack([base_proba, X])
        return base_proba

    # X: training data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: self
    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        # Out-of-fold probabilities: every row is predicted by models that did
        # not train on it, so the meta classifier never sees leaked labels.
        oof_proba = np.zeros((X.shape[0], n_classes * len(self.classifiers)))
        folds = KFold(n_splits=self.kfold, shuffle=True)
        for train_idx, holdout_idx in folds.split(X):
            for k, clf in enumerate(self.classifiers):
                clf.fit(X[train_idx], y[train_idx])
                block = slice(k * n_classes, (k + 1) * n_classes)
                oof_proba[holdout_idx, block] = self._aligned_proba(clf, X[holdout_idx])

        # Refit the base classifiers on all data for use at prediction time.
        for clf in self.classifiers:
            clf.fit(X, y)
        self.meta_classifier.fit(self._meta_features(X, oof_proba), y)
        return self

    # X: data of shape [n_sample, d_feature]
    # return: prediction of shape [n_sample]
    def predict(self, X):
        X = np.asarray(X)
        base_proba = np.hstack([self._aligned_proba(clf, X) for clf in self.classifiers])
        return self.meta_classifier.predict(self._meta_features(X, base_proba))

    # X: data of shape [n_sample, d_feature]
    # y: class label of shape [n_sample]
    # return: accuracy
    def score(self, X, y):
        return float(np.mean(self.predict(X) == np.asarray(y)))

In [None]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNC

# Base learners for the ensembles.
rf = RFC(n_estimators=10)
knc = KNC()
# Meta learner for stacking. The liblinear solver only supports one-vs-rest,
# so the deprecated `multi_class="ovr"` argument is dropped; the fitted model
# is unchanged.
lr = LR(C=2, solver='liblinear')

# Plain averaging of the base classifiers' predicted probabilities.
ac = AverageClassifier([rf, knc])
ac.fit(X_train, y_train)
print("BaseClassifiers: ", ac.score_classifiers(X_test, y_test))
print("AverageClassifiers: %.6f" % ac.score(X_test, y_test))

# Stacking on the base classifiers' probabilities only.
sc = StackingClassifier([rf, knc], lr, concat_feature=False)
sc.fit(X_train, y_train)
print("StackingClassifiers: %.6f" % sc.score(X_test, y_test))

# Stacking on the probabilities plus the original features.
sc_concat = StackingClassifier([rf, knc], lr, concat_feature=True)
sc_concat.fit(X_train, y_train)
print("StackingClassifiers with original features: %.6f" % sc_concat.score(X_test, y_test))

## Prepare California Housing Dataset
This dataset consists of 20,640 samples and 8 features.
The original target is a real number; we first transform it into classes for demonstration purposes.

In [None]:
from sklearn import datasets

cal_housing = datasets.fetch_california_housing()
X, y_real = cal_housing.data, cal_housing.target

plt.hist(y_real, bins=50)
plt.show()

# Integer class labels. Creating the array as int32 up front replaces the
# original trailing `y.astype(...)` call, whose result was discarded and
# therefore left y as float64.
y = np.zeros(y_real.shape[0], dtype=np.int32)

# According to the histogram, the samples with value > 4.9 form a skewed
# spike, so they get a class of their own.
y[y_real > 4.9] = 4

# Divide the rest of the samples into 4 equal-width bins (classes 0..3).
# np.histogram returns the 5 bin edges; dropping the leftmost edge and using
# right=True maps a value on the top edge to class 3 rather than to a new bin.
# See np.digitize and np.histogram in the numpy documentation.
# Note: sklearn >= 0.20 also offers sklearn.preprocessing.KBinsDiscretizer.
y_normal_idx = y_real <= 4.9
y_normal = y_real[y_normal_idx]
bin_edges = np.histogram(y_normal, bins=4)[1]
y[y_normal_idx] = np.digitize(y_normal, bins=bin_edges[1:], right=True)