In [75]:
import warnings

import pandas as pd

from sklearn.datasets import (make_friedman1, make_classification)

from sklearn.model_selection import (
    train_test_split,
    ParameterGrid,
    GridSearchCV
)

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import (
    mean_squared_error,
    f1_score
)

The objective of this project is to demonstrate the use of the OOB (out-of-bag) score in the random forest.


# **Info**
---
**@By:** **Kaiziferr**

**@Git:** https://github.com/Kaiziferr

# **Config**
---

In [43]:
random_seed = 12354
pd.set_option('display.float_format', '{:,.5f}'.format)
warnings.filterwarnings('ignore')

A set of non-linear synthetic data was generated for the regression problem, along with data for the classification problem. Since the goal is to demonstrate a proof of concept, this approach was adopted to minimize processing time.

# **Regression**
---

## **Data**
---

The make_friedman1 function from scikit-learn allows you to generate a non-linear dataset.

In [44]:
X, y = make_friedman1(
    n_samples=1000,
    n_features=8,
    noise=1.8,
    random_state=random_seed
)

In [45]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.55196,0.10971,0.02975,0.42802,0.56019,0.79467,0.46556,0.34588
1,0.33711,0.20498,0.45069,0.14943,0.78478,0.29625,0.86917,0.4528
2,0.21115,0.90488,0.33384,0.78674,0.49532,0.44739,0.8207,0.3717
3,0.47144,0.02144,0.23761,0.70976,0.57599,0.41125,0.71222,0.16422
4,0.55229,0.84667,0.78529,0.98003,0.8633,0.05351,0.08885,0.50807


# **Split**
---

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=random_seed
)

## **Model**
---

**Apply Cross Validation**

To validate the performance of the random forest, cross-validation can be used; however, depending on the case, it may be computationally expensive due to the multiple iterations involved. I usually place greater importance on the following parameters:

- n_estimators: number of estimators (number of trees)
- max_features: number of features to consider for each estimator (the features will be random)
- criterion: division  criterion for each estimator

While these are the ones I typically use, they depend on the context of the problem and what I want to find.

In [47]:
dict_params = ParameterGrid(
    {
        "n_estimators": [50, 100, 150, 200],
        'max_features': [0.75, None, 'sqrt', 'log2'],
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
    }
)

In [48]:
dict_params.param_grid[0]

{'n_estimators': [50, 100, 150, 200],
 'max_features': [0.75, None, 'sqrt', 'log2'],
 'criterion': ['squared_error', 'friedman_mse', 'absolute_error']}

The test is carried out with five folds using GridSearchCV.

In [49]:
grid = GridSearchCV(
    estimator = RandomForestRegressor(
    n_jobs = -1,
    random_state = random_seed
    ),
    cv = 5,
    param_grid = dict_params.param_grid[0],
    scoring = "neg_root_mean_squared_error",
    refit = True,
    verbose = 0,
    return_train_score = True
)

grid.fit(X, y)

In [50]:
results = pd.DataFrame(grid.cv_results_)
results = results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

results

Unnamed: 0,param_criterion,param_max_features,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
15,squared_error,log2,200,-2.51016,0.09462,-0.93769,0.00557
31,friedman_mse,log2,200,-2.51028,0.09608,-0.93818,0.0058
14,squared_error,log2,150,-2.51042,0.09521,-0.94169,0.00675
30,friedman_mse,log2,150,-2.51208,0.09878,-0.94158,0.00698


The best hyperparameters are:
- param_criterion: squared_error
- param_max_features: log2
- param_n_estimators: 200

The lowest average error was:

In [51]:
-1*grid.best_score_

np.float64(2.5101564279184703)

**Apply Oob score**

• When applying oob_score, the parameter oob_score must be set to True.

• The default metric for regression is the coefficient of determination (R²).

In [52]:
results = {
    'params': [],
    'oob_r2': []
}

for params in dict_params:
  model_oobscore = RandomForestRegressor(
    oob_score=True,
    n_jobs=-1,
    random_state=random_seed,
    **params
  )
  model_oobscore.fit(X, y)
  results['params'].append(params)
  results['oob_r2'].append(model_oobscore.oob_score_)

In [53]:
results_score = pd.DataFrame(results)
results_score = pd.concat(
    [results_score, results_score['params'].apply(pd.Series)], axis=1
)

results_score = results_score.drop(columns = 'params')
results_score = results_score.sort_values('oob_r2', ascending=False)
results_score.head(4)

Unnamed: 0,oob_r2,criterion,max_features,n_estimators
19,0.77945,friedman_mse,0.75,200
3,0.77911,squared_error,0.75,200
35,0.77816,absolute_error,0.75,200
7,0.77759,squared_error,,200


The best hyperparameters are:
- criterion: friedman_mse
- max_features: 0.75
- n_estimators: 200

The highest score is:

In [54]:
results_score.head(1)['oob_r2'].values[0]

np.float64(0.779451121062925)

**Apply Oob score other function**


Although the default metric is R², an error function can be used through a callback. For this callback, an error measure such as MAE can be configured to be used as the oob_score.

In [55]:
def metrica_oob_score(y, y_predict, **kwards):
  score = mean_squared_error(y, y_predict,**kwards)
  return score

In [56]:
resultados = {
    'params': [],
    'mae': []
}

In [57]:
for params in dict_params:
  model_oobscore = RandomForestRegressor(
      oob_score       = metrica_oob_score,
      n_jobs          =-1,
      random_state    = random_seed,
      **params
  )

  model_oobscore.fit(X, y)
  resultados['params'].append(params)
  resultados['mae'].append(model_oobscore.oob_score_)

In [58]:
resultados_scores = pd.DataFrame(resultados)
resultados_scores = pd.concat(
    [resultados_scores, resultados_scores['params'].apply(pd.Series)], axis=1)

resultados_scores = resultados_scores.drop(columns = 'params')
resultados_scores = resultados_scores.sort_values('mae', ascending=True)
resultados_scores.head(4)

Unnamed: 0,mae,criterion,max_features,n_estimators
19,6.16085,friedman_mse,0.75,200
3,6.17029,squared_error,0.75,200
35,6.19696,absolute_error,0.75,200
7,6.2128,squared_error,,200


The best hyperparameters are:
- criterion: friedman_mse
- max_features: 0.75
- n_estimators: 200

The lowest average error was:

In [59]:
resultados_scores.head(1)['mae'].values[0]

np.float64(6.160847335550337)

# **Classification**

## **Data**
---

A class-imbalanced dataset with three categories is generated. This is done to justify not using the default metric that would be applied in the oob_score. However, it can be used for any classification problem.

°°°°

Se genera un conjunto de datos desequilibrado por clase con tres categorías. Esto se hace para justificar la no utilización de la métrica predeterminada que se aplicaría en oob_score. Sin embargo, puede utilizarse para cualquier problema de clasificación.

In [60]:
X, y = make_classification(
    n_samples = 1000,
    n_features = 10,
    n_informative = 7,
    n_redundant = 2,
    n_repeated = 1,
    n_classes = 3,
    weights = [0.5, 0.35, 0.15],
    class_sep = 0.8,
    random_state=random_seed
)

In [61]:
pd.Series(y).value_counts() / 1000

Unnamed: 0,count
0,0.494
1,0.352
2,0.154


In [62]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.55972,3.05397,3.05397,3.18851,-1.03085,-0.23431,4.48372,0.15694,-0.81657,4.7132
1,1.58635,0.95818,0.95818,3.65735,4.07767,-1.47404,-2.08,0.98687,-1.78331,-0.60233
2,2.15958,1.33346,1.33346,-0.94225,2.20403,3.13712,0.20985,2.53849,-1.28125,0.48116
3,-0.26474,2.05311,2.05311,1.74693,-1.15899,-0.88624,2.35354,-2.32645,-3.21542,5.13135
4,0.24866,-1.63643,-1.63643,-3.2794,-0.86734,1.16182,-2.69982,1.89413,0.01785,-0.58824


# **Split**
---

In [63]:
x_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.8,
    stratify=y,
    random_state = random_seed
)

## **Model**

**Apply Cross Validation**

To validate the performance of the random forest, cross-validation can be used; however, depending on the case, it can be computationally expensive due to multiple iterations. I usually place more importance on the following parameters:

- n_estimators: number of estimators (number of trees)
- max_features: number of features to consider for each estimator (the features will be random)
- criterion: division  criterion for each estimator

While these are the ones I typically use, they depend on the context of the problem and what I want to find.

In [64]:
dict_params = ParameterGrid(
    {
        "n_estimators": [50, 100, 150, 200],
        'max_features': [0.75, None, 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy', 'log_loss']
    }
)

In [68]:
grid = GridSearchCV(
    estimator=RandomForestClassifier(
        n_jobs = -1,
        random_state = random_seed,

    ),
    cv = 5,
    param_grid  = dict_params.param_grid[0],
    scoring     = 'f1_micro',
    refit      = True,
    verbose    = 0,
    return_train_score = True)
grid.fit(X, y)

In [69]:
results = pd.DataFrame(grid.cv_results_)
results = results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)
results

Unnamed: 0,param_criterion,param_max_features,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
45,log_loss,log2,100,0.837,0.01806,1.0,0.0
41,log_loss,sqrt,100,0.837,0.01806,1.0,0.0
25,entropy,sqrt,100,0.837,0.01806,1.0,0.0
29,entropy,log2,100,0.837,0.01806,1.0,0.0


The best hyperparameters are:
- criterion: log_loss
- max_features: log2
- n_estimators: 100

The highest score in the classification of the three categories is:

In [70]:
results.head(1)['mean_test_score'].values

array([0.837])

**Apply Oob score**

For classification problems, the default metric for oob_score is accuracy. This metric may not be useful for imbalanced problems or when you want to emphasize one or more specific classes. However, we will continue using the standard metric for this section of code.

In [71]:
results = {
    'params': [],
    'oob_score': []
}

for params in dict_params:
  model_oobscore = RandomForestClassifier(
      oob_score = True,
      n_jobs = -1,
      random_state = random_seed,
      **params
  )
  model_oobscore.fit(X_train, y_train)
  results['params'].append(params)
  results['oob_score'].append(model_oobscore.oob_score_)

In [72]:
results_score = pd.DataFrame(results)
results_score = pd.concat(
    [results_score, results_score['params'].apply(pd.Series)], axis=1
)

results_score = results_score.drop(columns = 'params')
results_score = results_score.sort_values('oob_score', ascending=False)
results_score.head(4)

Unnamed: 0,oob_score,criterion,max_features,n_estimators
40,0.46,log_loss,sqrt,50
24,0.46,entropy,sqrt,50
8,0.45875,gini,sqrt,50
20,0.45875,entropy,,50


The best hyperparameters are:
- criterion: log_loss
- max_features: sqrt
- n_estimators: 50



The highest score in the classification of the three categories is:

In [73]:
results_score.head(1)['oob_score'].values[0]

np.float64(0.46)

The metric allows us to determine that, despite having a high score, the performance is poor because the rating is below a typical baseline of 0.5, which would suggest that the model is deficient; however, this is not a definitive conclusion.

**Apply Oob score other function**

Through a callback, the f1 score  metric is defined.

In [76]:
def metrica_oob_score(y, y_predict, **kwards):
  score = f1_score(y, y_predict, average='micro')
  return score

In [77]:
resultados = {
    'params': [],
    'recall-score': []
}

for params in dict_params:
  model_oobscore = RandomForestClassifier(
      oob_score       = metrica_oob_score,
      n_jobs          =-1,
      random_state    = random_seed,
      **params
  )

  model_oobscore.fit(X_train, y_train)
  resultados['params'].append(params)
  resultados['recall-score'].append(model_oobscore.oob_score_)

In [78]:
results_score = pd.DataFrame(resultados)
results_score = pd.concat(
    [results_score, results_score['params'].apply(pd.Series)], axis=1
)

results_score = results_score.drop(columns = 'params')
results_score = results_score.sort_values('recall-score', ascending=False)
results_score.head(4)

Unnamed: 0,recall-score,criterion,max_features,n_estimators
40,0.46,log_loss,sqrt,50
24,0.46,entropy,sqrt,50
8,0.45875,gini,sqrt,50
20,0.45875,entropy,,50


The best hyperparameters are:
- criterion: log_loss
- max_features: sqrt
- n_estimators: 50

#**Info**
---
**@By:** **Kaiziferr**

**@Git:** https://github.com/Kaiziferr