# Ensemble Learning using Random Forests
We will be using tree-based ensemble methods on the [Covertype dataset](https://www.openml.org/d/180).

In [3]:
%matplotlib inline
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openml
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

In [4]:
# Download (or load from the local OpenML cache) the Covertype dataset, id 180.
covertype = openml.datasets.get_dataset(180)
# get_data returns (X, y, categorical_indicator, attribute_names). Using the
# returned attribute names keeps the column labels aligned with X without
# assuming that the target is the last entry of `covertype.features`.
X, y, _, features = covertype.get_data(
    target=covertype.default_target_attribute, dataset_format='array'
)
X = pd.DataFrame(X, columns=features)
# Human-readable names of the target classes.
# NOTE(review): presumably y holds integer codes indexing into this list — confirm.
classes = covertype.retrieve_class_labels()

In [5]:
# Preview the first rows of the feature table to sanity-check the column names.
X.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,soil_type_40
0,2754.0,146.0,5.0,150.0,2.0,1790.0,227.0,239.0,146.0,700.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3219.0,21.0,8.0,67.0,-1.0,2869.0,215.0,223.0,145.0,1825.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2965.0,337.0,16.0,42.0,7.0,4288.0,184.0,217.0,171.0,324.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2368.0,14.0,15.0,150.0,65.0,1006.0,205.0,208.0,137.0,812.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2366.0,165.0,3.0,390.0,156.0,1165.0,222.0,240.0,154.0,582.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Show the class labels obtained from covertype.retrieve_class_labels().
classes

['Aspen',
 'Cottonwood_Willow',
 'Douglas_fir',
 'Krummholz',
 'Lodgepole_Pine',
 'Ponderosa_Pine',
 'Spruce_Fir']

## Exercise 1: Random Forests

Implement a function `evaluate_RF` that measures the performance of a Random Forest Classifier, using trees of (max) depth 2, 8, and 32, for any number of trees in the ensemble (`n_estimators`). For each model, store the 3-fold cross-validation score (k=3).


In [None]:
def evaluate_RF(X, y, n_estimators, max_depth=[2,8,32], scoring='accuracy'):
    """Cross-validate one random forest per maximum tree depth.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix passed to ``cross_val_score``.
    y : array-like
        Target labels.
    n_estimators : int
        Number of trees in every forest.
    max_depth : iterable of int, default [2, 8, 32]
        Maximum tree depths to evaluate. The default list is only read,
        never mutated, so sharing it between calls is safe.
    scoring : str, default 'accuracy'
        Scoring name forwarded to ``cross_val_score``.

    Returns
    -------
    dict
        Maps ``'rf_<depth>'`` (e.g. ``'rf_32'``) to the mean 3-fold
        cross-validation score of the forest with that depth. An empty
        ``max_depth`` yields an empty dict.
    """
    scores = {}
    for depth in max_depth:
        # Fixed seed so repeated evaluations are comparable.
        forest = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=depth, random_state=0
        )
        fold_scores = cross_val_score(forest, X, y, cv=3, scoring=scoring)
        # Store a scalar so callers can format it directly.
        scores['rf_' + str(depth)] = fold_scores.mean()
    return scores

## Exercise 2: Feature importance
Retrieve the feature importances according to the (tuned) random forest model. Which features are most important?

Plot the results.

## Exercise 3: Feature selection
Rebuild your tuned random forest, but this time using only the 10 most important features found in Exercise 2.
Return both the balanced accuracy and training time. Interpret the results.

In [None]:
# Model Solution
# Rank the features with the tuned forest (25 trees, max depth 32) and keep the
# 10 most important ones. This is computed inside the cell so that it also runs
# on a fresh kernel; the ranking fit is deliberately kept outside the timings.
importance_forest = RandomForestClassifier(random_state=0, n_estimators=25, max_depth=32).fit(X, y)
top_features = X.columns[np.argsort(importance_forest.feature_importances_)[::-1][:10]]

# Evaluate the same forest on all features and on the selected subset,
# reporting balanced accuracy and wall-clock evaluation time for each.
for label, X_used in [("Normal RF", X), ("Feature Selection RF", X[top_features])]:
    # perf_counter is a monotonic clock, better suited to elapsed-time measurement.
    start = time.perf_counter()
    score = evaluate_RF(X_used, y, 25, max_depth=[32], scoring='balanced_accuracy')
    elapsed = time.perf_counter() - start
    print("{}: {:.2f} balanced ACC, {:.2f} seconds".format(label, score['rf_32'], elapsed))

## Exercise 4: Confusion matrix
Do a standard stratified holdout and generate the confusion matrix of the tuned random forest. Which classes are still often confused?

In [None]:
# Model Solution
# Stratified holdout: class proportions are preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
# Fit the tuned forest (25 trees, depth up to 32) on the training partition.
tuned_forest = RandomForestClassifier(n_estimators=25, max_depth=32, random_state=0)
tuned_forest = tuned_forest.fit(X_train, y_train)

In [None]:
# Model Solution
# Rows are true classes, columns are predicted classes on the held-out test set.
confusion_matrix(y_true=y_test, y_pred=tuned_forest.predict(X_test))

In [None]:
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_true=y_test, y_pred=tuned_forest.predict(X_test)))