# Random Forest Classification

In [5]:
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterSampler, ParameterGrid, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from Transformer import ReplaceZeroWithMean

# Locations of the input data set and of the persisted models.
DATAPATH = "../Data"
MODELPATH = "../Data/Models/RFC"

# Load the data set and separate the "Outcome" target column from the features.
data = pd.read_csv(f"{DATAPATH}/diabetes.csv")
y = data["Outcome"]
X = data.drop(columns="Outcome")

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=1,
)

# The project-local imputer is fitted on the training split only and then
# applied unchanged to the test split.
imputer = ReplaceZeroWithMean(["Glucose", "BloodPressure", "SkinThickness", "BMI"])
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

## WITHOUT Hyperparameter Tuning

In [None]:
# Baseline: a forest that keeps scikit-learn's default hyperparameters
# (only the parallelism and the seed are set), fitted on the training split.
rfc_base = RandomForestClassifier(random_state=1, n_jobs=-1).fit(X_train, y_train)

# Persist the fitted baseline model.
joblib.dump(rfc_base, f"{MODELPATH}/RFC_no_hyper.pkl")

## WITH Hyperparameter Tuning

In [None]:
# np.logspace(-2, 2, 50),
# np.linspace(50, 1000, 10),
from pprint import pprint
import sys
from joblib import Parallel, delayed


# Search space for the hold-out hyperparameter search.
# TODO: 0.857 (presumably the score reached with this space -- confirm)
param_dist = dict(
    n_estimators=list(range(50, 150, 10)),
    criterion=["gini", "entropy", "log_loss"],
    # linspace with num=2 yields only the two endpoints, i.e. depths 2 and 50.
    max_depth=np.linspace(2, 50, 2, dtype=np.int8),
    max_features=["sqrt", "log2"],
    max_leaf_nodes=[None, 5, 8, 9, 10, 11, 12, 13, 14, 15, 20, 50, 100],
    class_weight=[None, "balanced", "balanced_subsample"],
)

# param_dist = {  # TODO: 0.837
#     "n_estimators": list(range(50, 500, 50)),  # Number of trees in the forest
#     "criterion": ["gini", "entropy", "log_loss"],  # Splitting criteria
#     "max_depth": [None] + list(range(10, 110, 10)),  # Maximum depth of the trees
#     "min_samples_split": list(range(2, 21, 2)),  # Minimum samples required to split a node
#     "min_samples_leaf": list(range(1, 21, 2)),  # Minimum samples required to form a leaf
#     "min_weight_fraction_leaf": np.linspace(0.0, 0.5, 6),  # Minimum weighted fraction of the sum of weights at a leaf
#     "max_features": ["sqrt", "log2", None, 0.5, 0.75],  # Number of features to consider for the best split
#     "max_leaf_nodes": [None] + list(range(10, 200, 20)),  # Maximum number of leaf nodes
#     "bootstrap": [True],  # Whether to use bootstrap samples
#     "class_weight": [None, "balanced", "balanced_subsample"],  # Weights associated with classes
#     "ccp_alpha": np.linspace(0.0, 0.1, 5),  # Complexity parameter for pruning
#     "max_samples": [None] + list(np.linspace(0.5, 1.0, 6)),  # Fraction of samples to draw when bootstrap is True
# }

# Every entry of param_dist is a finite list/array, so ParameterSampler draws
# without replacement from the corresponding grid.  Asking for more draws than
# the grid holds only makes scikit-learn warn and fall back to the grid size,
# so cap the request explicitly: the sampled candidates are the same, without
# the warning, and the real search budget is visible here.
n_candidates = min(10000, len(ParameterGrid(param_dist)))
param_sampler = list(ParameterSampler(param_dist, n_iter=n_candidates, random_state=1))


def train_and_evaluate(params):
    """Fit a random forest with the given hyperparameters and score it.

    Args:
        params: Mapping of keyword arguments forwarded to
            ``RandomForestClassifier`` (in addition to ``n_jobs=-1`` and
            ``random_state=1``).

    Returns:
        A ``(test_score, model)`` tuple: the score of the fitted model on
        ``X_test``/``y_test`` and the fitted model itself.

    NOTE(review): the score is computed on the test split, so picking the
    best candidate by this value uses test data for model selection and the
    winning score is optimistic.
    """
    forest = RandomForestClassifier(**params, n_jobs=-1, random_state=1)
    forest.fit(X_train, y_train)
    return forest.score(X_test, y_test), forest


# Fit every sampled candidate in parallel; each entry is a (test_score, model) pair.
results = Parallel(n_jobs=-1)(
    delayed(train_and_evaluate)(candidate) for candidate in param_sampler
)

# Keep the pair with the highest score (max() returns the first one on ties).
rfc_hpt_score, rfc_hpt = max(results, key=lambda scored: scored[0])
print(rfc_hpt_score)

# Persist the winning model.
joblib.dump(rfc_hpt, f"{MODELPATH}/RFC_hyper.pkl")

## WITH Hyperparameter Tuning AND Cross Validation

In [6]:
# Search space for the cross-validated randomized search.
param_dist = {  # 0.831
    "n_estimators": list(range(50, 150, 15)),
    "criterion": ["gini", "log_loss"],
    "max_depth": np.linspace(2, 50, 3, dtype=np.int8),
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": [None, 5, 8, 9, 10, 11, 12, 20, 50, 100],
    "class_weight": [None, "balanced", "balanced_subsample"],
}

# The space above is a finite grid (7 * 2 * 3 * 2 * 10 * 3 = 2520 combinations).
# Requesting more iterations than that only triggers a scikit-learn warning
# before it falls back to the grid size, so cap the budget explicitly.
n_candidates = min(5000, len(ParameterGrid(param_dist)))

rfc = RandomForestClassifier(n_jobs=-1, random_state=1)
# random_state fixes the order in which candidates are drawn (and therefore how
# ties between equally scoring candidates resolve), so reruns give the same
# best estimator -- consistent with the seeded split, forests and
# ParameterSampler used in the rest of this notebook.
models = RandomizedSearchCV(
    rfc,
    param_distributions=param_dist,
    n_iter=n_candidates,
    cv=6,
    n_jobs=-1,
    random_state=1,
)
models.fit(X_train, y_train)

# Best estimator according to 6-fold CV on the training split, then its score
# on the held-out test split.
rfc_hpt_cv = models.best_estimator_
print(rfc_hpt_cv.score(X_test, y_test))

joblib.dump(rfc_hpt_cv, f"{MODELPATH}/RFC_hyper_cv.pkl")



0.8311688311688312


['../Data/Models/RFC/RFC_hyper_cv.pkl']

## Further Parameter Adjustments

In [None]:
# Base estimator for the refinement grid search: max_features and class_weight
# are fixed here, the remaining hyperparameters are varied by the grid below.
rfc = RandomForestClassifier(
    n_jobs=-1,
    random_state=1,
    class_weight="balanced",
    max_features="log2",
)
# Centre value of each hyperparameter that the refinement grid is built around.
centres = {
    "n_estimators": 125,
    "max_leaf_nodes": 20,
    "max_depth": 26,
}

# Step width, in percent of the centre value, between neighbouring candidates.
percent = 5

# For every hyperparameter: 105%..125% of the centre (ascending), the centre
# itself, then 95%..75% (descending); scaled values are truncated to int.
param_grid = {}
for name, centre in centres.items():
    above = [int(centre * pct / 100) for pct in range(105, 130, percent)]
    below = [int(centre * pct / 100) for pct in range(95, 70, -percent)]
    param_grid[name] = above + [centre] + below
pprint(param_grid)  # TODO: 0.837

# Exhaustive 10-fold cross-validated search over the refinement grid.
models = GridSearchCV(estimator=rfc, param_grid=param_grid, n_jobs=-1, cv=10).fit(
    X_train, y_train
)

# Best estimator from the search and its score on the held-out test split.
rfc_best = models.best_estimator_
print(rfc_best.score(X_test, y_test))

# Persist the tuned model.
joblib.dump(rfc_best, f"{MODELPATH}/RFC_hyper_cv_tuned.pkl")

In [None]:
# Best cross-validation score of the grid search and the winning model's parameters.
best_cv_score = models.best_score_
best_params = rfc_best.get_params()
print(best_cv_score, best_params)

In [None]:
# Test-split scores of all model variants, printed side by side.
base_score = rfc_base.score(X_test, y_test)
hpt_cv_score = rfc_hpt_cv.score(X_test, y_test)
best_score = rfc_best.score(X_test, y_test)

print(f"Base      : {base_score}")
print(f"HPT       : {rfc_hpt_score}")
print(f"HPT+CV    : {hpt_cv_score}")
print(f"HPT+CV+Opt: {best_score}")

In [None]:
# TODO: Make visualizations!!