<a href="https://colab.research.google.com/github/Kaiziferr/machine_learning/blob/main/randomforest/01_random_forest_oob_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import warnings

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    ParameterGrid,
    RepeatedKFold)
from sklearn.datasets import make_regression, make_friedman1, make_classification
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error
)

#**Info**
---
@By: **Steven Bernal**

@Nickname: **Kaiziferr**

@Git: https://github.com/Kaiziferr

# **Config**
---

In [71]:
sns.set(style="darkgrid")
pd.set_option('display.float_format', '{:,.5f}'.format)
random_seed = 12354
warnings.filterwarnings('ignore')

# **Regression**
---

## **Data**
---

In [72]:
X, y = make_friedman1(
    n_samples=1000,
    n_features=8,
    noise=1.8,
    random_state=random_seed)

In [73]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.55196,0.10971,0.02975,0.42802,0.56019,0.79467,0.46556,0.34588
1,0.33711,0.20498,0.45069,0.14943,0.78478,0.29625,0.86917,0.4528
2,0.21115,0.90488,0.33384,0.78674,0.49532,0.44739,0.8207,0.3717
3,0.47144,0.02144,0.23761,0.70976,0.57599,0.41125,0.71222,0.16422
4,0.55229,0.84667,0.78529,0.98003,0.8633,0.05351,0.08885,0.50807


In [74]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=random_seed)

## **Model**

**Apply Cross Validation**

In [75]:
dict_params = ParameterGrid(
    {
        "n_estimators": [50, 100, 150, 200],
        'max_features': [0.75, None, 'sqrt', 'log2'],
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
    }
)



In [76]:
dict_params.param_grid[0]

{'n_estimators': [50, 100, 150, 200],
 'max_features': [0.75, None, 'sqrt', 'log2'],
 'criterion': ['squared_error', 'friedman_mse', 'absolute_error']}

In [77]:
grid = GridSearchCV(
    estimator=RandomForestRegressor(
        n_jobs = -1,
        random_state = random_seed,

    ),
    cv = 5,
    param_grid  = dict_params.param_grid[0],
    scoring = "neg_root_mean_squared_error",
    refit      = True,
    verbose    = 0,
    return_train_score = True
  )
grid.fit(X_train, y_train)

In [78]:
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_criterion,param_max_features,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
30,friedman_mse,log2,150,-2.55307,0.15691,-0.95681,0.0096
31,friedman_mse,log2,200,-2.55344,0.15825,-0.95166,0.00759
14,squared_error,log2,150,-2.55589,0.15466,-0.95696,0.0096
15,squared_error,log2,200,-2.55632,0.15678,-0.95208,0.00761


**Apply Oob score**

In [79]:
results = {
    'params': [],
    'oob_r2': []
}

for params in dict_params:
  model_oobscore = RandomForestRegressor(
      oob_score = True,
      n_jobs = -1,
      random_state = random_seed,
      **params
  )
  model_oobscore.fit(X_train, y_train)
  results['params'].append(params)
  results['oob_r2'].append(model_oobscore.oob_score_)

In [80]:
results_score = pd.DataFrame(results)
results_score = pd.concat(
    [results_score, results_score['params'].apply(pd.Series)], axis=1
)

results_score = results_score.drop(columns = 'params')
results_score = results_score.sort_values('oob_r2', ascending=False)
results_score.head(4)

Unnamed: 0,oob_r2,criterion,max_features,n_estimators
15,0.77529,squared_error,log2,200
31,0.77473,friedman_mse,log2,200
14,0.77261,squared_error,log2,150
30,0.77221,friedman_mse,log2,150


**Apply Oob score other function**

In [81]:
def metrica_oob_score(y, y_predict, **kwards):
  score = mean_absolute_error(y, y_predict,**kwards)
  return score

In [82]:
resultados = {
    'params': [],
    'mae': []
}

In [83]:
for params in dict_params:
  model_oobscore = RandomForestRegressor(
      oob_score       = metrica_oob_score,
      n_jobs          =-1,
      random_state    = random_seed,
      **params
  )

  model_oobscore.fit(X_train, y_train)
  resultados['params'].append(params)
  resultados['mae'].append(model_oobscore.oob_score_)

In [84]:
resultados_scores = pd.DataFrame(resultados)
resultados_scores = pd.concat(
    [resultados_scores, resultados_scores['params'].apply(pd.Series)], axis=1)

resultados_scores = resultados_scores.drop(columns = 'params')
resultados_scores = resultados_scores.sort_values('mae', ascending=True)
resultados_scores.head(4)

Unnamed: 0,mae,criterion,max_features,n_estimators
15,1.97455,squared_error,log2,200
31,1.97553,friedman_mse,log2,200
30,1.98761,friedman_mse,log2,150
14,1.98819,squared_error,log2,150


# **Classification**

## **Data**
---

In [85]:
asdasdasd

NameError: name 'asdasdasd' is not defined

#**Info**
---
@By: **Steven Bernal**

@Nickname: **Kaiziferr**

@Git: https://github.com/Kaiziferr