# Random Forrest Classification

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import (
    ParameterSampler,
    train_test_split,
    RandomizedSearchCV,
    GridSearchCV,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from Transformer import ReplaceZeroWithMean
from utils import print_model_data

DATAPATH = "../Data"
MODELPATH = "../Data/Models/RFC"

data = pd.read_csv(f"{DATAPATH}/diabetes.csv")
X = data.drop("Outcome", axis=1)
y = data["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=1)
imputer = ReplaceZeroWithMean(["Glucose", "BloodPressure", "SkinThickness", "BMI"])
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# .
# .
# .
# .
# .
# .

## WITHOUT Hyperparameter Tuning

In [2]:
rfc_base = RandomForestClassifier(n_jobs=-1, random_state=1)
rfc_base.fit(X_train, y_train)
rfc_base.score(X_test, y_test)

joblib.dump(rfc_base, f"{MODELPATH}/RFC_no_hyper.pkl")
print_model_data(
    rfc_base.get_params(), rfc_base.score(X_test, y_test), accuracy_score(y_test, rfc_base.predict(X_test))
)

# .
# .
# .
# .
# .
# .

## WITH Hyperparameter Tuning

In [None]:
from joblib import Parallel, delayed

# param_dist = {  # NOTE: 0.837
#     "n_estimators": list(range(50, 500, 50)),  # Number of trees in the forest
#     "criterion": ["gini", "entropy", "log_loss"],  # Splitting criteria
#     "max_depth": [None] + list(range(10, 110, 10)),  # Maximum depth of the trees
#     "min_samples_split": list(range(2, 21, 2)),  # Minimum samples required to split a node
#     "min_samples_leaf": list(range(1, 21, 2)),  # Minimum samples required to form a leaf
#     "min_weight_fraction_leaf": np.linspace(0.0, 0.5, 6),  # Minimum weighted fraction of the sum of weights at a leaf
#     "max_features": ["sqrt", "log2", None, 0.5, 0.75],  # Number of features to consider for the best split
#     "max_leaf_nodes": [None] + list(range(10, 200, 20)),  # Maximum number of leaf nodes
#     "bootstrap": [True],  # Whether to use bootstrap samples
#     "class_weight": [None, "balanced", "balanced_subsample"],  # Weights associated with classes
#     "ccp_alpha": np.linspace(0.0, 0.1, 5),  # Complexity parameter for pruning
#     "max_samples": [None] + list(np.linspace(0.5, 1.0, 6)),  # Fraction of samples to draw when bootstrap is True
# }

param_dist = {  # NOTE: 0.857 with an estimated gess
    "n_estimators": list(range(50, 150, 10)),
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": np.linspace(2, 50, 2, dtype=np.int8),
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": [None, 5, 8, 9, 10, 11, 12, 13, 14, 15, 20, 50, 100],
    "class_weight": [None, "balanced", "balanced_subsample"],
}
param_sampler = list(ParameterSampler(param_dist, n_iter=4500, random_state=1))


def train_and_evaluate(params):
    model = RandomForestClassifier(n_jobs=-1, random_state=1, **params)
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score, model


results = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(params) for params in param_sampler)
rfc_hyper_score, rfc_hyper = max(results, key=lambda x: x[0])
print(rfc_hyper_score)

joblib.dump(rfc_hyper, f"{MODELPATH}/RFC_hyper.pkl")
print_model_data(rfc_hyper.get_params(), rfc_hyper_score, accuracy_score(y_test, rfc_hyper.predict(X_test)))

0.8571428571428571


# .
# .
# .
# .
# .
# .

## WITH Hyperparamerter Tuning AND Cross Validation

In [12]:
param_dist = {  # NOTE: 0.831
    "n_estimators": list(range(50, 150, 15)),
    "criterion": ["gini", "log_loss"],
    "max_depth": np.linspace(2, 50, 3, dtype=np.int8),
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": [None, 5, 8, 9, 10, 11, 12, 20, 50, 100],
    "class_weight": [None, "balanced", "balanced_subsample"],
}


rfc = RandomForestClassifier(n_jobs=-1, random_state=1)
models = RandomizedSearchCV(rfc, param_distributions=param_dist, n_iter=2000, cv=6, n_jobs=-1)
models.fit(X_train, y_train)

rfc_hyper_cv = models.best_estimator_
print(rfc_hyper_cv.score(X_test, y_test))

joblib.dump(rfc_hyper_cv, f"{MODELPATH}/RFC_hyper_cv.pkl")
print_model_data(
    rfc_hyper_cv.get_params(), rfc_hyper_cv.score(X_test, y_test), accuracy_score(y_test, rfc_hyper_cv.predict(X_test))
)

0.7857142857142857


# .
# .
# .
# .
# .
# .

## WITH Hyperparamerter Tuning AND Cross Validation (Stratisfied)

In [None]:
from sklearn.model_selection import StratifiedKFold


param_dist = {  # 0.831
    "n_estimators": list(range(50, 150, 15)),
    "criterion": ["gini", "log_loss"],
    "max_depth": np.linspace(2, 50, 3, dtype=np.int8),
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": [None, 5, 8, 9, 10, 11, 12, 20, 50, 100],
    "class_weight": [None, "balanced", "balanced_subsample"],
}


rfc = RandomForestClassifier(n_jobs=-1, random_state=1)
cv_split = StratifiedKFold(n_splits=5, shuffle=True)  # NOTE <-- used different cv strategy
models = RandomizedSearchCV(rfc, param_distributions=param_dist, n_iter=2500, cv=cv_split, n_jobs=-1)
models.fit(X_train, y_train)

rfc_hyper_cv = models.best_estimator_
print(rfc_hyper_cv.score(X_test, y_test))

joblib.dump(rfc_hyper_cv, f"{MODELPATH}/RFC_hyper_cv.pkl")
print_model_data(
    rfc_hyper_cv.get_params(), rfc_hyper_cv.score(X_test, y_test), accuracy_score(y_test, rfc_hyper_cv.predict(X_test))
)

0.7987012987012987


# .
# .
# .
# .
# .
# .

## Further Parameter Adjustments

In [17]:
rfc = RandomForestClassifier(max_features="log2", class_weight="balanced", n_jobs=-1, random_state=1)  # NOTE: 0.8376
rfc = RandomForestClassifier(
    class_weight="balanced",
    max_features="log2",
    n_jobs=-1,
    random_state=1,
    max_depth=50,
    max_leaf_nodes=10,
    n_estimators=140,
)
param_grid = {
    "max_depth": 50,
    "max_leaf_nodes": 10,
    "n_estimators": 140,
}
for param, value in param_grid.items():
    percent = 2
    param_grid.update(
        {
            param: [int(value * i / 100) for i in range(100, 100 + percent * 5, percent)]
            + [value]
            + [int(value * i / 100) for i in range(100, 100 - percent * 5, -percent)]
        }
    )

models = GridSearchCV(rfc, param_grid, cv=10, n_jobs=-1)
models.fit(X_train, y_train)

rfc_best = models.best_estimator_
print(rfc_best.score(X_test, y_test))

joblib.dump(rfc_best, f"{MODELPATH}/RFC_hyper_cv_tuned.pkl")
print_model_data(
    rfc_best.get_params(), rfc_best.score(X_test, y_test), accuracy_score(y_test, rfc_best.predict(X_test))
)

0.8441558441558441


# .
# .
# .
# .
# .
# .

In [45]:
from sklearn.metrics import f1_score, average_precision_score

RFC_URL = "../Data/Models/RFC"
SVM_URL = "../Data/Models/SVM"

models = {
    "RFC": f"{RFC_URL}/RFC_no_hyper.pkl",
    "RFC_hyper": f"{RFC_URL}/RFC_hyper.pkl",
    "RFC_hyper_cv": f"{RFC_URL}/RFC_hyper_cv.pkl",
    "RFC_hyper_cv_tuned": f"{RFC_URL}/RFC_hyper_cv_tuned.pkl",
}

model_scores = {}
for name, model in models.items():
    model = joblib.load(model)
    model.fit(X_train, y_train)
    model_scores.update(
        {
            name: {
                "Score": model.score(X_test, y_test),
                "F1-Score": f1_score(y_test, model.predict(X_test)),
                "Precision": average_precision_score(y_test, model.predict(X_test)),
            }
        }
    )

for model_name, scores in model_scores.items():
    print(f"{model_name}:")
    for score_name, score in scores.items():
        print(f"{score_name:10}: {round(score,4)}")
    print()

RFC:
Score     : 0.8052
F1-Score  : 0.7222
Precision : 0.6257

RFC_hyper:
Score     : 0.8571
F1-Score  : 0.8167
Precision : 0.7106

RFC_hyper_cv:
Score     : 0.7987
F1-Score  : 0.748
Precision : 0.6242

RFC_hyper_cv_tuned:
Score     : 0.8442
F1-Score  : 0.8033
Precision : 0.6905

