## Random Forest

As in the Decision Tree example, we will again compare the performance of our own implementation to the one from the sklearn library. To avoid redundancy, this notebook focuses only on the examples where the single Decision Tree struggled and skips the simple datasets where it performed well. The datasets we will focus on are the digits dataset for classification, as well as the diabetes dataset for regression.
Finally, we will also use the large diamonds dataset to compare the relative performance of the two implementations.

In [1]:
# Load modules
from models.random_forest import RandomForestClassifier as OwnRandomForestClassifier, RandomForestRegressor as OwnRandomForestRegressor
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier, RandomForestRegressor as SklearnRandomForestRegressor

from utils.reports import evaluate_classification, evaluate_regression
from utils.grid_search_cv import GridSearchCV
from sklearn.model_selection import train_test_split, ParameterGrid

# Hyperparameter search space. ParameterGrid expands it into every
# combination of the candidate values listed per parameter.
params = dict(
    max_depth=[3, 5, 7, 11],
    min_samples_split=[2, 5, 7],
    min_samples_leaf=[2, 5, 7],
)
param_grid = ParameterGrid(params)

from sklearn.datasets import load_digits
from sklearn.datasets import load_diabetes
from datasets.diamonds import load_diamonds

# Shared split configuration: every dataset uses the same hold-out fraction
# and the same seed, so the train/test partitions are identical on each run
# and can be tuned in one place.
TEST_SIZE = 0.2
SPLIT_SEED = 42

# Digits -> classification task.
ds_c_hard = load_digits()
X, Y = ds_c_hard.data, ds_c_hard.target
X_c_hard_train, X_c_hard_test, Y_c_hard_train, Y_c_hard_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# Diabetes -> regression task.
ds_r_medium = load_diabetes()
X, Y = ds_r_medium.data, ds_r_medium.target
X_r_medium_train, X_r_medium_test, Y_r_medium_train, Y_r_medium_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# Diamonds (project-local loader) -> regression task.
# Note: X and Y are reused as scratch names, so after this cell they hold the
# diamonds features and targets.
ds_r_hard = load_diamonds()
X, Y = ds_r_hard.data, ds_r_hard.target
X_r_hard_train, X_r_hard_test, Y_r_hard_train, Y_r_hard_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

In [2]:
# Own implementation, constructed with its default arguments, on the digits split.
# NOTE(review): despite the `dt_` prefix this is a random forest; the name looks
# carried over from the Decision Tree notebook.
# NOTE(review): no seed is passed here, so the metrics may differ between runs if
# the implementation samples randomly; confirm whether the constructor accepts one.
dt_classifier = OwnRandomForestClassifier()

# Train on the training split, then predict labels for the held-out test split.
dt_classifier.fit(X_c_hard_train, Y_c_hard_train)
Y_c_hard_pred = dt_classifier.predict(X_c_hard_test)

# Project helper from utils.reports; called with (true labels, predicted labels).
evaluate_classification(Y_c_hard_test, Y_c_hard_pred)

Precision: 0.96, Recall: 0.96, F1-Score: 0.96


In [3]:
# Reference model: scikit-learn's random forest with default hyperparameters on
# the digits split. random_state pins the bootstrap sampling and the per-split
# feature sub-sampling, so the reported metrics are reproducible under
# Restart & Run All.
dt_classifier = SklearnRandomForestClassifier(random_state=42)

# Train on the training split, then predict labels for the held-out test split.
dt_classifier.fit(X_c_hard_train, Y_c_hard_train)
Y_c_hard_pred = dt_classifier.predict(X_c_hard_test)

# Project helper from utils.reports; called with (true labels, predicted labels).
evaluate_classification(Y_c_hard_test, Y_c_hard_pred)

Precision: 0.98, Recall: 0.98, F1-Score: 0.98


### Diabetes dataset (regression)

In [4]:
# Own implementation, constructed with its default arguments, on the diabetes split.
# NOTE(review): no seed is passed here, so the metrics may differ between runs if
# the implementation samples randomly; confirm whether the constructor accepts one.
dt_regressor = OwnRandomForestRegressor()

# Train on the training split, then predict targets for the held-out test split.
dt_regressor.fit(X_r_medium_train, Y_r_medium_train)
Y_r_medium_pred = dt_regressor.predict(X_r_medium_test)

# Project helper from utils.reports; called with (true targets, predicted targets).
evaluate_regression(Y_r_medium_test, Y_r_medium_pred)

MAE: 44.63, MSE: 2928.43, R²: 0.45


In [5]:
# Reference model: scikit-learn's random forest regressor with default
# hyperparameters on the diabetes split. random_state pins the bootstrap
# sampling so the reported metrics are reproducible under Restart & Run All.
dt_regressor = SklearnRandomForestRegressor(random_state=42)

# Train on the training split, then predict targets for the held-out test split.
dt_regressor.fit(X_r_medium_train, Y_r_medium_train)
Y_r_medium_pred = dt_regressor.predict(X_r_medium_test)

# Project helper from utils.reports; called with (true targets, predicted targets).
evaluate_regression(Y_r_medium_test, Y_r_medium_pred)

MAE: 44.09, MSE: 2941.96, R²: 0.44


### Diamonds dataset (regression)

In [6]:
# Own implementation, constructed with its default arguments, on the diamonds split.
# NOTE(review): no seed is passed here, so the metrics may differ between runs if
# the implementation samples randomly; confirm whether the constructor accepts one.
dt_regressor = OwnRandomForestRegressor()

# Train on the training split, then predict targets for the held-out test split.
dt_regressor.fit(X_r_hard_train, Y_r_hard_train)
Y_r_hard_pred = dt_regressor.predict(X_r_hard_test)

# Project helper from utils.reports; called with (true targets, predicted targets).
evaluate_regression(Y_r_hard_test, Y_r_hard_pred)

MAE: 294.72, MSE: 332031.56, R²: 0.98


In [7]:
# Reference model: scikit-learn's random forest regressor with default
# hyperparameters on the diamonds split. random_state pins the bootstrap
# sampling so the reported metrics are reproducible under Restart & Run All.
dt_regressor = SklearnRandomForestRegressor(random_state=42)

# Train on the training split, then predict targets for the held-out test split.
dt_regressor.fit(X_r_hard_train, Y_r_hard_train)
Y_r_hard_pred = dt_regressor.predict(X_r_hard_test)

# Project helper from utils.reports; called with (true targets, predicted targets).
evaluate_regression(Y_r_hard_test, Y_r_hard_pred)

MAE: 265.88, MSE: 292494.18, R²: 0.98
